├── requirements.txt
├── crawler.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml
beautifulsoup4

/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3
import re

import requests
from bs4 import BeautifulSoup


def normalize(proburl, domain, path):
    """Return an absolute in-domain URL for proburl, or None if the link is
    external, a javascript: link, or already queued in path."""
    if not re.match(r"^https?:|^//|^javascript:", proburl):
        # Relative link: anchor it to the site root, then prepend the domain.
        if not proburl.startswith("/"):
            proburl = "/" + proburl
        proburl = domain + proburl
        if proburl not in path:
            return proburl
    elif proburl.startswith(domain):
        # Absolute link that already points inside the target domain.
        if proburl not in path:
            return proburl
    return None


class Crawl(object):
    def crawl(self, url, output=None, limitreqs=20, verbose=False):
        # Scheme plus host, e.g. "http://www.inseguro.com.br".
        domain = re.findall(r"https?://[a-z0-9.][a-z0-9-.]{0,61}[a-z0-9.]*", url)[0]
        path = [url]           # internal URLs queued for crawling
        done = []              # URLs already requested
        self.url = []          # every internal link extracted
        self.status_code = []  # status code of the page each link was found on
        self.text = []         # body of the page each link was found on
        totalreqs = 0
        # TODO: skip binary files such as
        # (".jpg", ".gif", ".jpeg", ".ico", ".tiff", ".png", ".bmp")
        extract = {
            "a": "href",
            "img": "src",
            "form": "action",
            "script": "src",
            "iframe": "src",
            "div": "src",
            "frame": "src",
            "embed": "src",
            "link": "href",
        }

        # Iterating over path while appending to it yields a simple
        # breadth-first crawl: newly found internal links are visited in turn.
        for urlvalue in path:
            if urlvalue not in done and totalreqs < limitreqs:
                httpreq = requests.get(urlvalue)
                totalreqs += 1
                if verbose:
                    print("[" + str(totalreqs) + "][" + str(len(path)) + "] " + urlvalue)
                soup = BeautifulSoup(httpreq.text, "lxml")
                for tag in extract:
                    for htmltag in soup.find_all(tag):
                        try:
                            link = htmltag[extract[tag]]
                        except KeyError:
                            continue  # tag lacks the attribute; skip it
                        internallink = normalize(link, domain, path)
                        if internallink is not None:
                            path.append(internallink)
                            self.url.append(link)
                            self.status_code.append(httpreq.status_code)
                            self.text.append(httpreq.text)
                done.append(urlvalue)
        print("Total URLs found: " + str(len(path)))

        if output is not None:
            # One URL per line.
            with open(output, "w") as f:
                for reg in done:
                    print(reg)
                    f.write(reg + "\n")
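

# ---------------------------------------------------------------------------
# Minimal usage sketch (hypothetical; not part of the library's API): run the
# module directly to crawl a site and print each extracted link alongside the
# status code of the page it was found on.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    target = sys.argv[1] if len(sys.argv) > 1 else "http://www.inseguro.com.br"
    crawler = Crawl()
    crawler.crawl(target, limitreqs=5, verbose=True)
    for link, code in zip(crawler.url, crawler.status_code):
        print(code, link)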
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# python-crawl
Personal library to extract internal links from a domain.

# Usage

### Basics

```
python3
Python 3.5.2 (default, Nov 23 2017, 16:37:01)
[GCC 5.4.0 20160609] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from crawler import Crawl

>>> out = Crawl()
>>> out.crawl("http://www.inseguro.com.br", verbose=True, limitreqs=5)
[1] http://www.inseguro.com.br
[2] http://www.inseguro.com.br/search
[3] http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html
[4] http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#more
[5] http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#comment-form
Total URLs found: 37


>>> out.status_code[0]
200

>>> for url in out.url:
...     print(url)
... 
http://www.inseguro.com.br
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#more
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#comment-form
http://www.inseguro.com.br/2017/10/bwapp-1-xss-reflected-overview.html#links
http://www.inseguro.com.br/search/label/bWAPP
http://www.inseguro.com.br/search/label/Code%20Review
http://www.inseguro.com.br/search/label/hacking
http://www.inseguro.com.br/search/label/penetration%20testing
http://www.inseguro.com.br/search/label/xss
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html#more
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html#comment-form
http://www.inseguro.com.br/2017/10/bwapp-buggy-web-application-ambiente.html#links
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html#more
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html#comment-form
http://www.inseguro.com.br/2017/09/paralelos-descubra-como-pirataria-e-o.html#links
http://www.inseguro.com.br/search/label/cultura
http://www.inseguro.com.br/2017/08/seo-e-suas-armadilhas-de-seguranca.html
http://www.inseguro.com.br/2017/08/seo-e-suas-armadilhas-de-seguranca.html#more
http://www.inseguro.com.br/2017/08/seo-e-suas-armadilhas-de-seguranca.html#comment-form
http://www.inseguro.com.br/search/label/Google%20Dork
http://www.inseguro.com.br/search/label/OSINT
[...]

```
### Export to txt
```
>>> Crawl().crawl("http://www.inseguro.com.br", output="www.inseguro.com.br.txt")
```
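### Reading the export back

The file written by `output=` holds one crawled URL per line, so it can be loaded straight back into a list. A minimal sketch, assuming the export above has already run:

```
>>> with open("www.inseguro.com.br.txt") as f:
...     urls = f.read().splitlines()
... 
>>> urls[0]
'http://www.inseguro.com.br'
```
--------------------------------------------------------------------------------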