├── README.md
└── crawler.py

/README.md:
--------------------------------------------------------------------------------
# crawler
Crawl a website and extract its links.

--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
import argparse
import queue
import threading

import requests
import urllib3
from bs4 import BeautifulSoup

# Suppress the warning emitted for every request made with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class discoveryWebCrawlerClass():

    def __init__(self, domain, level):
        self.domain = domain
        self.q = queue.Queue()
        self.urls = []
        self.urlsLock = threading.Lock()
        self.levelsToCrawl = level

    def crawlURL(self, crawlUrl, currentLevel):
        # Fetch the page and collect every anchor tag it contains.
        s = requests.Session()
        r = s.get(crawlUrl, verify=False, timeout=10)
        soup = BeautifulSoup(r.content, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            try:
                url = link.get('href')
                if not url:
                    continue
                # Some href values are relative paths such as /login.php;
                # prefix them with the crawl domain to build a full URL.
                if url[0] == '/':
                    url = self.domain + url
                # Only follow links whose host matches the crawl domain,
                # and skip URLs that have already been recorded.
                if url.split('/')[2] == self.domain.split('/')[2]:
                    with self.urlsLock:
                        if url in self.urls:
                            continue
                        self.urls.append(url)
                    # Queue the new URL for crawling at the next depth level.
                    if currentLevel + 1 < self.levelsToCrawl:
                        self.q.put({'url': url, 'level': currentLevel + 1})
            except Exception:
                # Malformed or non-HTTP hrefs (mailto:, javascript:, fragments)
                # are simply skipped.
                pass

    def worker(self):
        while True:
            crawlUrlDict = self.q.get()
            try:
                self.crawlURL(crawlUrlDict['url'], crawlUrlDict['level'])
            except Exception:
                # A failed request must not kill the worker or stall q.join().
                pass
            finally:
                self.q.task_done()

    def start(self):
        # Seed the queue with the starting domain at depth 0, then spin up
        # daemon worker threads and block until the queue drains.
        self.q.put({'url': self.domain, 'level': 0})
        for _ in range(100):
            t = threading.Thread(target=self.worker)
            t.daemon = True
            t.start()
        self.q.join()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--domain", help="Domain name; EX: https://test.com")
    parser.add_argument("-l", "--level", type=int, help="Levels deep to crawl; EX: 2")
    args = parser.parse_args()

    if args.domain and args.level:
        webcrawler = discoveryWebCrawlerClass(args.domain, args.level)
        webcrawler.start()
        for i, url in enumerate(webcrawler.urls):
            print("{0}\t{1}".format(i, url))
--------------------------------------------------------------------------------
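
As a quick usage reference, the crawler can also be driven from another Python script instead of the command line. The sketch below is a minimal example and not part of the repository: it assumes crawler.py sits in the working directory (or is otherwise on the import path), and the target URL and crawl depth are placeholder values taken from the argparse examples above.

    # Minimal usage sketch: drive the class defined in crawler.py directly.
    # "https://test.com" and the depth of 2 are placeholder values.
    from crawler import discoveryWebCrawlerClass

    webcrawler = discoveryWebCrawlerClass("https://test.com", 2)  # same values as -d / -l
    webcrawler.start()  # blocks until the crawl queue drains
    for i, url in enumerate(webcrawler.urls):  # urls holds every in-scope link found
        print("{0}\t{1}".format(i, url))

This is equivalent to running python crawler.py -d https://test.com -l 2 from the shell.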