├── requirements.txt
├── crawler.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml
beautifulsoup4

/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3
import re

import requests
from bs4 import BeautifulSoup


def normalize(proburl, domain, path):
    """Return an absolute in-domain URL for proburl, or None if the link is
    external, a javascript: link, or already queued in path."""
    if not re.match(r"^https?:|^//|^javascript:", proburl):
        # Relative link: anchor it to the site root, then prepend the domain.
        if not proburl.startswith("/"):
            proburl = "/" + proburl
        proburl = domain + proburl
        if proburl not in path:
            return proburl
    elif proburl.startswith(domain):
        # Absolute link that already points inside the target domain.
        if proburl not in path:
            return proburl
    return None


class Crawl(object):
    def crawl(self, url, output=None, limitreqs=20, verbose=False):
        # Scheme plus host, e.g. "http://www.inseguro.com.br".
        domain = re.findall(r"https?://[a-z0-9.][a-z0-9-.]{0,61}[a-z0-9.]*", url)[0]
        path = [url]           # internal URLs queued for crawling
        done = []              # URLs already requested
        self.url = []          # every internal link extracted
        self.status_code = []  # status code of the page each link was found on
        self.text = []         # body of the page each link was found on
        totalreqs = 0
        # TODO: skip binary files such as
        # (".jpg", ".gif", ".jpeg", ".ico", ".tiff", ".png", ".bmp")
        extract = {
            "a": "href",
            "img": "src",
            "form": "action",
            "script": "src",
            "iframe": "src",
            "div": "src",
            "frame": "src",
            "embed": "src",
            "link": "href",
        }

        # Iterating over path while appending to it yields a simple
        # breadth-first crawl: newly found internal links are visited in turn.
        for urlvalue in path:
            if urlvalue not in done and totalreqs < limitreqs:
                httpreq = requests.get(urlvalue)
                totalreqs += 1
                if verbose:
                    print("[" + str(totalreqs) + "][" + str(len(path)) + "] " + urlvalue)
                soup = BeautifulSoup(httpreq.text, "lxml")
                for tag in extract:
                    for htmltag in soup.find_all(tag):
                        try:
                            link = htmltag[extract[tag]]
                        except KeyError:
                            continue  # tag lacks the attribute; skip it
                        internallink = normalize(link, domain, path)
                        if internallink is not None:
                            path.append(internallink)
                            self.url.append(link)
                            self.status_code.append(httpreq.status_code)
                            self.text.append(httpreq.text)
                done.append(urlvalue)
        print("Total URLs found: " + str(len(path)))

        if output is not None:
            # One URL per line.
            with open(output, "w") as f:
                for reg in done:
                    print(reg)
                    f.write(reg + "\n")
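

# ---------------------------------------------------------------------------
# Minimal usage sketch (hypothetical; not part of the library's API): run the
# module directly to crawl a site and print each extracted link alongside the
# status code of the page it was found on.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    target = sys.argv[1] if len(sys.argv) > 1 else "http://www.inseguro.com.br"
    crawler = Crawl()
    crawler.crawl(target, limitreqs=5, verbose=True)
    for link, code in zip(crawler.url, crawler.status_code):
        print(code, link)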
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# python-crawl
Personal library to extract internal links from a domain.

# Usage

### Basics

```
python3
Python 3.5.2 (default, Nov 23 2017, 16:37:01)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from crawler import Crawl

>>> out = Crawl()
>>> out.crawl("http://www.inseguro.com.br", verbose=True, limitreqs=5)
[1] http://www.inseguro.com.br
[2] http://www.inseguro.com.br/search
[3] http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html
[4] http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#more
[5] http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#comment-form
Total URLs found: 37


>>> out.status_code[0]
200

>>> for url in out.url:
...     print(url)
... 
http://www.inseguro.com.br
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#more
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#comment-form
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#links
http://www.inseguro.com.br/search/label/bWAPP
http://www.inseguro.com.br/search/label/Code%20Review
http://www.inseguro.com.br/search/label/hacking
http://www.inseguro.com.br/search/label/penetration%20testing
http://www.inseguro.com.br/search/label/xss
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html#more
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html#comment-form
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html#links
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html#more
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html#comment-form
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html#links
http://www.inseguro.com.br/search/label/cultura
http://www.inseguro.com.br/2017/08/seo-e-suas-armadilhas-de-seguranca.html
http://www.inseguro.com.br/2017/08/seo-e-suas-armadilhas-de-seguranca.html#more
http://www.inseguro.com.br/2017/08/seo-e-suas-armadilhas-de-seguranca.html#comment-form
http://www.inseguro.com.br/search/label/Google%20Dork
http://www.inseguro.com.br/search/label/OSINT
[...]

```
### Export to txt
```
>>> Crawl().crawl("http://www.inseguro.com.br", output="www.inseguro.com.br.txt")
```
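### Reading the export back

The file written by `output=` holds one crawled URL per line, so it can be loaded straight back into a list. A minimal sketch, assuming the export above has already run:

```
>>> with open("www.inseguro.com.br.txt") as f:
...     urls = f.read().splitlines()
... 
>>> urls[0]
'http://www.inseguro.com.br'
```
--------------------------------------------------------------------------------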