├── README.md
├── requirement.txt
├── domain.py
├── link_finder.py
├── torconfig.py
├── main.py
├── torbootstrap.py
├── general.py
└── spider.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Dark-WebCrawler

A multithreaded crawler for .onion sites that routes every request through a local Tor SOCKS proxy.

--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
bs4
PySocks
stem
requests
fake-useragent

--------------------------------------------------------------------------------
/domain.py:
--------------------------------------------------------------------------------
from urllib.parse import urlparse


# Returns the registered domain of a URL, e.g. "example.onion"
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except Exception:
        return ''


# Returns the full network location (sub-domain included) of a URL
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except Exception:
        return ''

--------------------------------------------------------------------------------
/link_finder.py:
--------------------------------------------------------------------------------
from html.parser import HTMLParser
from urllib import parse


# Collects every href found in <a> tags, resolved against the base URL
class LinkFinder(HTMLParser):

    def error(self, message):
        pass

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (attribute, value) in attrs:
                if attribute == 'href':
                    url = parse.urljoin(self.base_url, value)
                    self.links.add(url)

    def page_links(self):
        return self.links

--------------------------------------------------------------------------------
/torconfig.py:
--------------------------------------------------------------------------------
from stem import Signal
from stem.control import Controller
from requests import get
from fake_useragent import UserAgent


# Ask the local Tor control port for a new circuit (new exit identity)
def new_tor_id(passw):
    with Controller.from_port(port=9051) as controller:
        controller.authenticate(password=passw)
        controller.signal(Signal.NEWNYM)


# Fetch a URL through the local Tor SOCKS proxy with a random User-Agent
def new_identity(url):
    tor_proxy = {
        "http": "socks5h://localhost:9050",
        "https": "socks5h://localhost:9050"
    }
    headers = {
        "User-Agent": UserAgent().random
    }
    resp = get(url, headers=headers, proxies=tor_proxy)
    return resp


# Fetch the URL through Tor, then rotate to a fresh circuit
def connect(url, passw):
    new_identity(url)
    new_tor_id(passw)
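Note (not part of the repository): a minimal sketch of how the two helpers above could be sanity-checked, assuming a local Tor daemon with SocksPort 9050, ControlPort 9051 and a control password already configured. The check URL and the password are placeholders.

# Sketch: confirm traffic leaves through Tor and that NEWNYM yields a new circuit.
import time
from torconfig import new_identity, new_tor_id

CHECK_URL = "https://check.torproject.org/"  # placeholder test endpoint
CONTROL_PASSWORD = "my-control-password"     # placeholder; must match the torrc HashedControlPassword

before = new_identity(CHECK_URL)             # first fetch through the SOCKS proxy
print("first fetch :", before.status_code)

new_tor_id(CONTROL_PASSWORD)                 # ask the control port for a new circuit
time.sleep(5)                                # give Tor a moment to build it

after = new_identity(CHECK_URL)              # second fetch should use the new circuit
print("second fetch:", after.status_code)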
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import threading
from queue import Queue
from spider import Spider
from domain import *
from general import *
from torbootstrap import *

# Kill any Tor instance that is already running, then start our own
process()
bootstrap()

PROJECT_NAME = input("Enter project name: ")
HOMEPAGE = input("Enter URL: ")
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + "/notyetcrawled.txt"
CRAWLED_FILE = PROJECT_NAME + "/crawled.txt"
print(f"Threads currently running: {threading.active_count()}")
NUMBER_OF_THREADS = int(input("Enter number of threads to be used: "))

queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Put every queued link on the thread-safe Queue and wait for the workers
def create_jobs():
    for links in convert_file_to_set(QUEUE_FILE):
        queue.put(links)
    queue.join()
    crawl()


# Re-read the queue file; if anything is left, hand it to the workers again
def crawl():
    queued_links = convert_file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(f"{len(queued_links)} links yet to be crawled")
        create_jobs()


# Spawn the daemon worker threads
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each worker pulls a URL from the queue and crawls it
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()


create_workers()
crawl()

--------------------------------------------------------------------------------
/torbootstrap.py:
--------------------------------------------------------------------------------
import os
import signal

import stem.process
from stem.util import term

SOCKS_PORT = 9050


# Kill any Tor process already running so our own instance can bind the ports
# (note: this matches any command line containing "tor")
def process():
    name = "tor"
    try:
        # iterate over every running instance of the process
        for line in os.popen("ps ax | grep " + name + " | grep -v grep"):
            fields = line.split()

            # the process ID is the first column of the ps output
            pid = fields[0]

            # terminate the process
            os.kill(int(pid), signal.SIGKILL)
            print("Process successfully terminated")

    except Exception:
        print("Error encountered while terminating existing Tor processes")


def print_bootstrap_lines(line):
    if "Bootstrapped " in line:
        print(term.format(line, term.Color.BLUE))


# Start an instance of Tor configured to only exit through Russia. This prints
# Tor's bootstrap information as it starts. Note that this likely will not
# work if you have another Tor instance running.
def bootstrap():
    print(term.format("Starting Tor:\n", term.Attr.BOLD))

    tor_process = stem.process.launch_tor_with_config(
        config={
            "SocksPort": str(SOCKS_PORT),
            "ExitNodes": "{ru}",
        },
        init_msg_handler=print_bootstrap_lines,
    )
    # Return the handle so callers can terminate this Tor instance later
    return tor_process
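Note (not part of the repository): stem's launch_tor_with_config returns a subprocess.Popen handle, so a caller can keep the value returned by bootstrap() above and shut the managed Tor instance down once the crawl is finished. A minimal sketch:

from torbootstrap import process, bootstrap

process()                  # clear out any Tor instance already bound to the ports
tor_process = bootstrap()  # launch our own Tor and wait for it to bootstrap
try:
    pass                   # ... run the crawl here ...
finally:
    tor_process.kill()     # terminate the Tor subprocess when finished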
--------------------------------------------------------------------------------
/general.py:
--------------------------------------------------------------------------------
import os


# Create a project directory for the entered website
def project_dir(folder):
    if not os.path.exists(folder):
        print(f"Creating project {folder} ...")
        os.makedirs(folder)


# Create the crawled.txt and notyetcrawled.txt data files
def create_datafile(project_name, base_url):

    # Paths of the project files
    crawled = os.path.join(project_name, "crawled.txt")
    notyetcrawled = os.path.join(project_name, "notyetcrawled.txt")

    # Only create the project files if they do not exist yet
    if not os.path.isfile(crawled):
        write_file(crawled, "")
    if not os.path.isfile(notyetcrawled):
        write_file(notyetcrawled, base_url)


# Create a new file and write data to it
def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


# Append a line of data to a file
def append_file(path, data):
    with open(path, 'a') as fp:
        fp.write(data + "\n")


# Remove all lines from a file
def remove_file_content(path):
    with open(path, 'w') as fp:
        pass


# Convert each line of a file into an element of a set
def convert_file_to_set(file_name):
    converted = set()
    with open(file_name, 'rt') as fp:
        for line in fp:
            converted.add(line.replace('\n', ''))
    return converted


# Write the elements of a set to a file, one per line
def convert_set_to_file(file_name, converted):
    with open(file_name, "w") as fp:
        for link in sorted(converted):
            fp.write(f"{link}\n")
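Note (not part of the repository): a short sketch of how these file helpers fit together; the project name and the .onion address are placeholders.

from general import (project_dir, create_datafile,
                     convert_file_to_set, convert_set_to_file)

project_dir("demo_project")                                     # creates the folder if missing
create_datafile("demo_project", "http://exampleaddress.onion")  # seeds notyetcrawled.txt with the base URL

queue = convert_file_to_set("demo_project/notyetcrawled.txt")   # {'http://exampleaddress.onion'}
queue.add("http://exampleaddress.onion/about")
convert_set_to_file("demo_project/notyetcrawled.txt", queue)    # write the updated queue back, sorted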
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
from urllib.request import urlopen  # only used by the backup implementation below
from link_finder import LinkFinder
from domain import *
from general import *
import requests
from bs4 import BeautifulSoup
#from torconfig import *
from getpass import getpass
#passw = getpass("Enter Password for the Tor:")


class Spider:

    # Class-level state shared by every worker thread
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + '/notyetcrawled.txt'
        Spider.crawled_file = Spider.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', Spider.base_url)

    # Creates directory and files for the project on first run and starts the spider
    @staticmethod
    def boot():
        project_dir(Spider.project_name)
        create_datafile(Spider.project_name, Spider.base_url)
        Spider.queue = convert_file_to_set(Spider.queue_file)
        Spider.crawled = convert_file_to_set(Spider.crawled_file)

    # Updates the user display, fills the queue and updates the project files
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(f"Crawling URL ==> {page_url}")
            #print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.discard(page_url)  # discard: another thread may already have removed it
            Spider.crawled.add(page_url)
            Spider.update_files()

    # Fetches the page through the Tor proxy, parses the HTML and extracts its links
    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            # Route the request through the local Tor SOCKS proxy
            session = requests.session()
            session.proxies["http"] = "socks5h://localhost:9050"
            session.proxies["https"] = "socks5h://localhost:9050"
            response = session.get(page_url)
            soup = BeautifulSoup(response.content, 'html.parser')
            html_string = str(soup)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()

    # Adds new links from the current domain to the in-memory queue
    @staticmethod
    def add_links_to_queue(links):
        for url in links:
            if (url in Spider.queue) or (url in Spider.crawled):
                continue
            if Spider.domain_name != get_domain_name(url):
                continue
            Spider.queue.add(url)

    @staticmethod
    def update_files():
        convert_set_to_file(Spider.queue_file, Spider.queue)
        convert_set_to_file(Spider.crawled_file, Spider.crawled)


"""
BACKUP
@staticmethod
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
"""
--------------------------------------------------------------------------------
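Note (not part of the repository): the parsing step inside gather_links can be exercised on its own by feeding LinkFinder a hand-written HTML fragment; the .onion addresses below are placeholders. Spider.add_links_to_queue would then keep only the links whose domain matches the project's domain.

from link_finder import LinkFinder

# Relative hrefs are resolved against the base URL; absolute hrefs pass through unchanged.
html = '<p><a href="/about">About</a> <a href="http://otherexample.onion/">External</a></p>'
finder = LinkFinder("http://exampleaddress.onion", "http://exampleaddress.onion/index.html")
finder.feed(html)
print(finder.page_links())
# {'http://exampleaddress.onion/about', 'http://otherexample.onion/'}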