├── README.md
├── requirement.txt
├── domain.py
├── link_finder.py
├── torconfig.py
├── main.py
├── torbootstrap.py
├── general.py
└── spider.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Dark-WebCrawler

A multithreaded crawler for .onion sites that routes every request through a local Tor SOCKS proxy.

--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
bs4
PySocks
stem
requests
fake-useragent

--------------------------------------------------------------------------------
/domain.py:
--------------------------------------------------------------------------------
from urllib.parse import urlparse


# Returns the registered domain of a URL, e.g. "example.onion"
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except Exception:
        return ''


# Returns the full network location (sub-domain included) of a URL
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except Exception:
        return ''

--------------------------------------------------------------------------------
/link_finder.py:
--------------------------------------------------------------------------------
from html.parser import HTMLParser
from urllib import parse


# Collects every href found in <a> tags, resolved against the base URL
class LinkFinder(HTMLParser):

    def error(self, message):
        pass

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (attribute, value) in attrs:
                if attribute == 'href':
                    url = parse.urljoin(self.base_url, value)
                    self.links.add(url)

    def page_links(self):
        return self.links

--------------------------------------------------------------------------------
/torconfig.py:
--------------------------------------------------------------------------------
from stem import Signal
from stem.control import Controller
from requests import get
from fake_useragent import UserAgent


# Ask the local Tor control port for a new circuit (new exit identity)
def new_tor_id(passw):
    with Controller.from_port(port=9051) as controller:
        controller.authenticate(password=passw)
        controller.signal(Signal.NEWNYM)


# Fetch a URL through the local Tor SOCKS proxy with a random User-Agent
def new_identity(url):
    tor_proxy = {
        "http": "socks5h://localhost:9050",
        "https": "socks5h://localhost:9050"
    }
    headers = {
        "User-Agent": UserAgent().random
    }
    resp = get(url, headers=headers, proxies=tor_proxy)
    return resp


# Fetch the URL through Tor, then rotate to a fresh circuit
def connect(url, passw):
    new_identity(url)
    new_tor_id(passw)
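Note (not part of the repository): a minimal sketch of how the two helpers above could be sanity-checked, assuming a local Tor daemon with SocksPort 9050, ControlPort 9051 and a control password already configured. The check URL and the password are placeholders.

# Sketch: confirm traffic leaves through Tor and that NEWNYM yields a new circuit.
import time
from torconfig import new_identity, new_tor_id

CHECK_URL = "https://check.torproject.org/"  # placeholder test endpoint
CONTROL_PASSWORD = "my-control-password"     # placeholder; must match the torrc HashedControlPassword

before = new_identity(CHECK_URL)             # first fetch through the SOCKS proxy
print("first fetch :", before.status_code)

new_tor_id(CONTROL_PASSWORD)                 # ask the control port for a new circuit
time.sleep(5)                                # give Tor a moment to build it

after = new_identity(CHECK_URL)              # second fetch should use the new circuit
print("second fetch:", after.status_code)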
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import threading
from queue import Queue
from spider import Spider
from domain import *
from general import *
from torbootstrap import *

# Kill any Tor instance that is already running, then start our own
process()
bootstrap()

PROJECT_NAME = input("Enter project name: ")
HOMEPAGE = input("Enter URL: ")
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + "/notyetcrawled.txt"
CRAWLED_FILE = PROJECT_NAME + "/crawled.txt"
print(f"Threads currently running: {threading.active_count()}")
NUMBER_OF_THREADS = int(input("Enter number of threads to be used: "))

queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Put every queued link on the thread-safe Queue and wait for the workers
def create_jobs():
    for links in convert_file_to_set(QUEUE_FILE):
        queue.put(links)
    queue.join()
    crawl()


# Re-read the queue file; if anything is left, hand it to the workers again
def crawl():
    queued_links = convert_file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(f"{len(queued_links)} links yet to be crawled")
        create_jobs()


# Spawn the daemon worker threads
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each worker pulls a URL from the queue and crawls it
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()


create_workers()
crawl()

--------------------------------------------------------------------------------
/torbootstrap.py:
--------------------------------------------------------------------------------
import os
import signal

import stem.process
from stem.util import term

SOCKS_PORT = 9050


# Kill any Tor process already running so our own instance can bind the ports
# (note: this matches any command line containing "tor")
def process():
    name = "tor"
    try:
        # iterate over every running instance of the process
        for line in os.popen("ps ax | grep " + name + " | grep -v grep"):
            fields = line.split()

            # the process ID is the first column of the ps output
            pid = fields[0]

            # terminate the process
            os.kill(int(pid), signal.SIGKILL)
            print("Process successfully terminated")

    except Exception:
        print("Error encountered while terminating existing Tor processes")


def print_bootstrap_lines(line):
    if "Bootstrapped " in line:
        print(term.format(line, term.Color.BLUE))


# Start an instance of Tor configured to only exit through Russia. This prints
# Tor's bootstrap information as it starts. Note that this likely will not
# work if you have another Tor instance running.
def bootstrap():
    print(term.format("Starting Tor:\n", term.Attr.BOLD))

    tor_process = stem.process.launch_tor_with_config(
        config={
            "SocksPort": str(SOCKS_PORT),
            "ExitNodes": "{ru}",
        },
        init_msg_handler=print_bootstrap_lines,
    )
    # Return the handle so callers can terminate this Tor instance later
    return tor_process
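Note (not part of the repository): stem's launch_tor_with_config returns a subprocess.Popen handle, so a caller can keep the value returned by bootstrap() above and shut the managed Tor instance down once the crawl is finished. A minimal sketch:

from torbootstrap import process, bootstrap

process()                  # clear out any Tor instance already bound to the ports
tor_process = bootstrap()  # launch our own Tor and wait for it to bootstrap
try:
    pass                   # ... run the crawl here ...
finally:
    tor_process.kill()     # terminate the Tor subprocess when finished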
--------------------------------------------------------------------------------
/general.py:
--------------------------------------------------------------------------------
import os


# Create a project directory for the entered website
def project_dir(folder):
    if not os.path.exists(folder):
        print(f"Creating project {folder} ...")
        os.makedirs(folder)


# Create the crawled.txt and notyetcrawled.txt data files
def create_datafile(project_name, base_url):

    # Paths of the project files
    crawled = os.path.join(project_name, "crawled.txt")
    notyetcrawled = os.path.join(project_name, "notyetcrawled.txt")

    # Only create the project files if they do not exist yet
    if not os.path.isfile(crawled):
        write_file(crawled, "")
    if not os.path.isfile(notyetcrawled):
        write_file(notyetcrawled, base_url)


# Create a new file and write data to it
def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


# Append a line of data to a file
def append_file(path, data):
    with open(path, 'a') as fp:
        fp.write(data + "\n")


# Remove all lines from a file
def remove_file_content(path):
    with open(path, 'w') as fp:
        pass


# Convert each line of a file into an element of a set
def convert_file_to_set(file_name):
    converted = set()
    with open(file_name, 'rt') as fp:
        for line in fp:
            converted.add(line.replace('\n', ''))
    return converted


# Write the elements of a set to a file, one per line
def convert_set_to_file(file_name, converted):
    with open(file_name, "w") as fp:
        for link in sorted(converted):
            fp.write(f"{link}\n")
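Note (not part of the repository): a short sketch of how these file helpers fit together; the project name and the .onion address are placeholders.

from general import (project_dir, create_datafile,
                     convert_file_to_set, convert_set_to_file)

project_dir("demo_project")                                     # creates the folder if missing
create_datafile("demo_project", "http://exampleaddress.onion")  # seeds notyetcrawled.txt with the base URL

queue = convert_file_to_set("demo_project/notyetcrawled.txt")   # {'http://exampleaddress.onion'}
queue.add("http://exampleaddress.onion/about")
convert_set_to_file("demo_project/notyetcrawled.txt", queue)    # write the updated queue back, sorted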
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
from urllib.request import urlopen  # only used by the backup implementation below
from link_finder import LinkFinder
from domain import *
from general import *
import requests
from bs4 import BeautifulSoup
#from torconfig import *
from getpass import getpass
#passw = getpass("Enter Password for the Tor:")


class Spider:

    # Class-level state shared by every worker thread
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + '/notyetcrawled.txt'
        Spider.crawled_file = Spider.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', Spider.base_url)

    # Creates directory and files for the project on first run and starts the spider
    @staticmethod
    def boot():
        project_dir(Spider.project_name)
        create_datafile(Spider.project_name, Spider.base_url)
        Spider.queue = convert_file_to_set(Spider.queue_file)
        Spider.crawled = convert_file_to_set(Spider.crawled_file)

    # Updates the user display, fills the queue and updates the project files
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(f"Crawling URL ==> {page_url}")
            #print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.discard(page_url)  # discard: another thread may already have removed it
            Spider.crawled.add(page_url)
            Spider.update_files()

    # Fetches the page through the Tor proxy, parses the HTML and extracts its links
    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            # Route the request through the local Tor SOCKS proxy
            session = requests.session()
            session.proxies["http"] = "socks5h://localhost:9050"
            session.proxies["https"] = "socks5h://localhost:9050"
            response = session.get(page_url)
            soup = BeautifulSoup(response.content, 'html.parser')
            html_string = str(soup)
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()

    # Adds new links from the current domain to the in-memory queue
    @staticmethod
    def add_links_to_queue(links):
        for url in links:
            if (url in Spider.queue) or (url in Spider.crawled):
                continue
            if Spider.domain_name != get_domain_name(url):
                continue
            Spider.queue.add(url)

    @staticmethod
    def update_files():
        convert_set_to_file(Spider.queue_file, Spider.queue)
        convert_set_to_file(Spider.crawled_file, Spider.crawled)


"""
BACKUP
@staticmethod
def gather_links(page_url):
    html_string = ''
    try:
        response = urlopen(page_url)
        if 'text/html' in response.getheader('Content-Type'):
            html_bytes = response.read()
            html_string = html_bytes.decode("utf-8")
        finder = LinkFinder(Spider.base_url, page_url)
        finder.feed(html_string)
    except Exception as e:
        print(str(e))
        return set()
    return finder.page_links()
"""
--------------------------------------------------------------------------------
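Note (not part of the repository): the parsing step inside gather_links can be exercised on its own by feeding LinkFinder a hand-written HTML fragment; the .onion addresses below are placeholders. Spider.add_links_to_queue would then keep only the links whose domain matches the project's domain.

from link_finder import LinkFinder

# Relative hrefs are resolved against the base URL; absolute hrefs pass through unchanged.
html = '<p><a href="/about">About</a> <a href="http://otherexample.onion/">External</a></p>'
finder = LinkFinder("http://exampleaddress.onion", "http://exampleaddress.onion/index.html")
finder.feed(html)
print(finder.page_links())
# {'http://exampleaddress.onion/about', 'http://otherexample.onion/'}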