├── Modules
│   ├── Crawler
│   │   ├── crawl.py
│   │   ├── crawl_bot.py
│   │   ├── file_manage.py
│   │   ├── get_domains.py
│   │   ├── link_finder.py
│   │   └── link_finder.pyc
│   └── Scraper
│       └── Scrape.py
├── README.md
├── Tor.md
├── TorScrapper.py
├── onions.txt
└── requirements.txt

--------------------------------------------------------------------------------
/Modules/Crawler/crawl.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31).

from crawl_bot import Crawl_bot
from file_manage import *
from queue import Queue
import threading, sys, os
from get_domains import *
import tldextract


def input_url(base_url):
    global BASE_URL, regex
    BASE_URL = base_url

    url_extract = tldextract.extract(BASE_URL)
    regex = url_extract.domain

if __name__ == '__main__':
    if len(sys.argv) == 2:
        input_url(sys.argv[1])
    else:
        print("Invalid input")
        sys.exit(1)  # Without a URL there is nothing to crawl


GET_DOMAIN = get_domain_name(BASE_URL)
FOLDER_NAME = str(os.path.abspath("") + '/Output/Crawled-' + regex)
print(FOLDER_NAME)
data_crawled = FOLDER_NAME + '/crawled.txt'
data_in_queue = FOLDER_NAME + '/queue.txt'
thread_count = 50
queue = Queue()

Crawl_bot(FOLDER_NAME, BASE_URL, GET_DOMAIN)

def do_job():  # Get the job done
    while True:
        url = queue.get()
        Crawl_bot.crawl_page(threading.current_thread().name, url)
        queue.task_done()

def queue_jobs():  # Define each queued link as a new job
    for url_link in convert_to_set(data_in_queue):
        queue.put(url_link)
    queue.join()
    initiate_bot()

def get_links_to_queue():  # Create the worker threads
    for _ in range(thread_count):
        thread = threading.Thread(target=do_job)
        thread.daemon = True
        thread.start()

def initiate_bot():  # Does the crawling job
    links_in_queue = convert_to_set(data_in_queue)
    if len(links_in_queue) > 0:
        print(str(len(links_in_queue)) + ' queued links')
        queue_jobs()

get_links_to_queue()
initiate_bot()

# https://www.iocbucket.com/search
# https://www.iocbucket.com/feeds/rss2/openioc/1.0/latestten
# https://www.iocbucket.com/feeds/rss2/openioc/1.1/latestten
# https://www.iocbucket.com/feeds/rss2/yara/latestten
# https://www.metaflows.com/codeigniter/stats/content_md5_hash/
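# Note (editor's sketch, not part of the original module): the Crawled-<domain>
# folder name above comes from tldextract, which splits a URL into
# subdomain / domain / suffix. For the sample link shipped in onions.txt the
# result looks roughly like this (field names per tldextract's ExtractResult):
#
#   >>> tldextract.extract('http://torlinkbgs6aabns.onion/')
#   ExtractResult(subdomain='', domain='torlinkbgs6aabns', suffix='onion')
#
# so the crawl output lands in Output/Crawled-torlinkbgs6aabns.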
--------------------------------------------------------------------------------
/Modules/Crawler/crawl_bot.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31) and Shivam Kapoor (ConanKapoor).

from get_domains import *
from file_manage import *
from link_finder import link_crawler
from urllib.request import urlopen

#######################################################################################################################
################################################ TOR CONNECTION BELOW #################################################
#######################################################################################################################

# Importing Stem libraries
from stem import Signal
from stem.control import Controller
import socks, socket

# Initiating connection to the Tor ControlPort.
# NOTE: stem's authenticate() expects the plain ControlPort password here;
# the "16:..." hash is what belongs in torrc as HashedControlPassword.
with Controller.from_port(port=9051) as controller:
    controller.authenticate("16:AE80E3930E42F7A3606823FA19CD0A3E721813EF8798ABFE86DB91DD09")
    controller.signal(Signal.NEWNYM)

# TOR SETUP GLOBAL Vars
SOCKS_PORT = 9050  # Default Tor SOCKS port from torrc; change it to whatever your torrc is configured to use
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT)
socket.socket = socks.socksocket

# Perform DNS resolution through the socket
def getaddrinfo(*args):
    return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]

socket.getaddrinfo = getaddrinfo

#######################################################################################################################
################################################ TOR CONNECTION ABOVE #################################################
#######################################################################################################################
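# Note (editor's sketch, not part of the original module): with socket.socket
# patched above, a plain urlopen() call is tunnelled through the local Tor
# SOCKS proxy, e.g. urlopen('http://torlinkbgs6aabns.onion/') for the sample
# link in onions.txt. The getaddrinfo() override keeps the hostname
# unresolved so that Tor performs name resolution itself - .onion addresses
# cannot be resolved by the local DNS resolver anyway.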
class Crawl_bot:

    folder_name, start_link, domain_name, queued_data, crawled_data = '', '', '', '', ''
    queue = set()
    data_crawled = set()

    def __init__(self, folder_name, start_link, domain_name):
        Crawl_bot.folder_name = folder_name
        Crawl_bot.start_link = start_link
        Crawl_bot.domain_name = domain_name
        Crawl_bot.queued_data = Crawl_bot.folder_name + '/queue.txt'
        Crawl_bot.crawled_data = Crawl_bot.folder_name + '/crawled.txt'
        self.initiate_directory()
        self.crawl_page('Spider starts here', Crawl_bot.start_link)

    @staticmethod
    def initiate_directory():  # Define and create a new directory on the first run
        create_project_folder(Crawl_bot.folder_name)
        create_data_files(Crawl_bot.folder_name, Crawl_bot.start_link)
        Crawl_bot.queue = convert_to_set(Crawl_bot.queued_data)
        Crawl_bot.data_crawled = convert_to_set(Crawl_bot.crawled_data)

    @staticmethod
    def crawl_page(thread_name, web_url):  # Fill the queue, update the files and refresh the user display
        if web_url not in Crawl_bot.data_crawled:
            print(thread_name + ' now crawling ' + web_url)
            print('Queue_url ' + str(len(Crawl_bot.queue)) + ' | Crawled_url ' + str(len(Crawl_bot.data_crawled)))
            Crawl_bot.add_url_to_queue(Crawl_bot.collect_url(web_url))
            Crawl_bot.queue.discard(web_url)  # discard() avoids a KeyError if another thread already removed it
            Crawl_bot.data_crawled.add(web_url)
            Crawl_bot.update_folder()

    # Converts the raw response into readable text and checks for proper HTML formatting
    @staticmethod
    def collect_url(web_url):
        html_data_string = ''
        try:
            received_response = urlopen(web_url)
            if 'text/html' in received_response.getheader('Content-Type'):
                data_bytes = received_response.read()
                html_data_string = data_bytes.decode("latin-1")
            link_finder = link_crawler(Crawl_bot.start_link, web_url)
            link_finder.feed(html_data_string)
        except Exception as e:
            print(str(e))
            return set()
        return link_finder.page_urls()

    @staticmethod
    def add_url_to_queue(links):  # Queue data is saved to the project files
        for url in links:
            if (url in Crawl_bot.queue) or (url in Crawl_bot.data_crawled):
                continue
            # if Crawl_bot.domain_name != get_domain_name(url):
            #     continue
            Crawl_bot.queue.add(url)

    @staticmethod
    def update_folder():  # Update the project directory
        set_to_file(Crawl_bot.queue, Crawl_bot.queued_data)
        set_to_file(Crawl_bot.data_crawled, Crawl_bot.crawled_data)

--------------------------------------------------------------------------------
/Modules/Crawler/file_manage.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31).

import os

def create_project_folder(dir):  # Create a separate folder for each website
    if not os.path.exists(dir):
        print('Creating directory ' + dir)
        os.makedirs(dir)

def create_data_files(folder_name, start_link):  # Create the queue and crawled lists
    queue = os.path.join(folder_name, 'queue.txt')
    data_crawled = os.path.join(folder_name, "crawled.txt")
    if not os.path.isfile(queue):
        write_to_file(queue, start_link)
    if not os.path.isfile(data_crawled):
        write_to_file(data_crawled, '')

def write_to_file(path, url):  # Create a new file for the task
    with open(path, 'w') as f:
        f.write(url)

def append_file(path, url):  # Append new data to an existing file
    with open(path, 'a') as file:
        file.write(url + '\n')

def empty_queue(path):  # Delete the contents of a file
    open(path, 'w').close()

def convert_to_set(file_name):  # Read a file and convert each line to a set item
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results

def set_to_file(urls, file_name):  # Iterate through a set; each item becomes a line in the file
    with open(file_name, "w") as f:
        for l in sorted(urls):
            f.write(l + "\n")

--------------------------------------------------------------------------------
/Modules/Crawler/get_domains.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31) and Shivam Kapoor (ConanKapoor).

import tldextract

def get_domain_name(link):
    url_extract = tldextract.extract(link)
    site_name = url_extract.domain + '.' + url_extract.suffix
    return site_name
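# Note (editor's sketch, not part of the original module): get_domain_name()
# keeps only the registered domain plus its suffix, for example:
#
#   >>> get_domain_name('https://www.torproject.org/download/')
#   'torproject.org'
#   >>> get_domain_name('http://torlinkbgs6aabns.onion/')
#   'torlinkbgs6aabns.onion'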
--------------------------------------------------------------------------------
/Modules/Crawler/link_finder.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31).

import urllib
from html.parser import HTMLParser
from urllib.parse import urljoin

class link_crawler(HTMLParser):

    def __init__(self, start_link, web_url):
        super().__init__()
        self.start_link = start_link
        self.web_url = web_url
        self.urls = set()

    def handle_starttag(self, tag, found_attributes):  # The main link-extraction logic
        if tag == 'a':
            for (attr, value) in found_attributes:
                if attr == 'href':
                    url = urljoin(self.start_link, value)
                    self.urls.add(url)

    def page_urls(self):
        return self.urls

    def error(self, message):
        pass
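# Note (editor's sketch, not part of the original module): relative hrefs are
# resolved against start_link via urljoin, so a hypothetical session such as
#
#   finder = link_crawler('http://example.onion/', 'http://example.onion/page')
#   finder.feed('<a href="/about">About</a> <a href="contact.html">Contact</a>')
#   finder.page_urls()
#
# would return {'http://example.onion/about', 'http://example.onion/contact.html'}.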
--------------------------------------------------------------------------------
/Modules/Crawler/link_finder.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/little-endian-0x01/TorScrapper/2fc4452de4ae88dab1f0aeedca0e4ab240e9dea9/Modules/Crawler/link_finder.pyc
--------------------------------------------------------------------------------
/Modules/Scraper/Scrape.py:
--------------------------------------------------------------------------------
# Author - Shivam Kapoor (ConanKapoor).

# Importing Essentials
import urllib.request
from bs4 import BeautifulSoup
import sys, re, os

#######################################################################################################################
################################################ TOR CONNECTION BELOW #################################################
#######################################################################################################################

# Importing Stem libraries
from stem import Signal
from stem.control import Controller
import socks, socket

# Initiating connection to the Tor ControlPort.
# NOTE: stem's authenticate() expects the plain ControlPort password here;
# the "16:..." hash is what belongs in torrc as HashedControlPassword.
with Controller.from_port(port=9051) as controller:
    controller.authenticate("16:AE80E3930E42F7A3606823FA19CD0A3E721813EF8798ABFE86DB91DD09")
    controller.signal(Signal.NEWNYM)

# TOR SETUP GLOBAL Vars
SOCKS_PORT = 9050  # Default Tor SOCKS port from torrc; change it to whatever your torrc is configured to use
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT)
socket.socket = socks.socksocket

# Perform DNS resolution through the socket
def getaddrinfo(*args):
    return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]

socket.getaddrinfo = getaddrinfo

#######################################################################################################################
################################################ TOR CONNECTION ABOVE #################################################
#######################################################################################################################

# Scraping Onion links.
def Scrape(url):
    timeout = 10
    socket.setdefaulttimeout(timeout)

    # Collecting html content.
    headers = {'User-Agent': 'TorScrapper - Onion scrapper | github.com/ConanKapoor/TorScrapper.git'}
    req = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(req)

    # Using BeautifulSoup to parse the html response.
    page = BeautifulSoup(response.read(), 'html.parser')

    # Saving output
    token = re.sub(r'[^\w]', '', url)
    name = os.path.abspath("") + '/Output/Scraped-' + token + '.html'
    with open(name, 'w') as file:
        file.write(str(page))

# Taking input.
if __name__ == '__main__':
    if len(sys.argv) == 2:
        url = sys.argv[1]
        Scrape(url)
    else:
        print("Invalid input")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TorScrapper
A basic scraper made in Python with BeautifulSoup and Tor support to -

* Scrape onion and normal links.
* Save the output in HTML format in the Output folder.
* Filter the HTML output and strip out useful data only (work in progress).
* Strip out IOCs and other related data (on the to-do list).

## Getting Started

These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See Deployment for notes on how to deploy the project on a live system.

### Prerequisites

* You will need **Python3** to run this project smoothly. Go to your terminal and execute the following command, or visit the [Python3](https://www.python.org/download/releases/3.0/) website.

```
[sudo] apt-get install python3 python3-dev
```

* You can install **Tor** from the official website - https://www.torproject.org/

* Furthermore, install the **requirements.txt** using pip3 -

```
[sudo] pip3 install -r requirements.txt
```

TL;DR: We recommend installing TorScrapper inside a **virtual environment** on all platforms.

Python packages can be installed either globally (a.k.a. system-wide) or in user-space. We do not recommend installing TorScrapper system-wide.

Instead, we recommend that you install TorScrapper within a so-called "virtual environment" (virtualenv). Virtualenvs allow you to avoid conflicts with already-installed Python system packages (which could break some of your system tools and scripts) and still install packages normally with pip (without sudo and the likes).

To get started with virtual environments, see the virtualenv installation instructions. To install it globally (having it globally installed actually helps here), it should be a matter of running:

```
[sudo] pip install virtualenv
```
## Basic setup
Before you run the scraper, make sure the following things are done properly:

* Run the Tor service
  `sudo service tor start`

* Generate a hashed control password for torrc
  `tor --hash-password "my_password"`

* Put the ControlPort password inside /Modules/Scraper/Scrape.py (stem expects the plain password here; the hashed value goes into torrc)

  ```
  from stem import Signal
  from stem.control import Controller
  with Controller.from_port(port=9051) as controller:
      controller.authenticate("your_password")
      controller.signal(Signal.NEWNYM)
  ```

* Go to /etc/tor/torrc and uncomment - _**ControlPort 9051**_

Read more about torrc here: [Torrc](https://github.com/ConanKapoor/TorScrapper/blob/master/Tor.md)
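Before running anything, you can sanity-check both halves of the setup - the ControlPort authentication and the SOCKS proxy - with a short standalone script. This is only an illustrative sketch (the filename, password and port values are placeholders for your own configuration), not part of the repository:

```python
# tor_check.py - illustrative sketch, not part of this repository
import socks, socket
from stem.control import Controller
from urllib.request import urlopen

with Controller.from_port(port=9051) as controller:
    controller.authenticate("my_password")  # the plain password, not the 16:... hash
    print("ControlPort OK - tor version:", controller.get_version())

# Route ordinary sockets through the Tor SOCKS proxy, as the modules in this repo do.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
socket.socket = socks.socksocket
page = urlopen("https://check.torproject.org/").read()
print("Traffic is using Tor" if b"Congratulations" in page else "Traffic is NOT using Tor")
```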
### Deployment

A step-by-step series of examples that tells you what to do to get this project running -

* Enter the project directory.
* Copy all the onion and normal links you want to scrape into _onions.txt_

```
[nano]/[vim]/[gedit]/[Your choice of editor] onions.txt
```

* Run TorScrapper.py using Python3

```
[sudo] python3 TorScrapper.py
```

* Check the scraped outputs in the Output folder.


## Built With

* [Python](https://www.python.org/) - Python programming language.
* [Tor](https://www.torproject.org/) - If you don't know about Tor then you probably shouldn't be here :)
* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) - Beautiful Soup is a Python library for pulling data out of HTML and XML files.

## Contributing

If you have new ideas that are worth implementing, mention them by starting a new issue with the title [FEATURE_REQUEST]. If the idea is worth implementing, congrats - you are now a contributor.

## Versioning

Version 1.something Mehh...

## Authors

* **Shivam Kapoor** - An avid learner who likes to know every tiny detail of how real-life systems work. A real enthusiast of cyber security and the underlying networking concepts. (Email - kapoor.shivam88@gmail.com)

## License

Too lazy to decide on a License. zZzZ

--------------------------------------------------------------------------------
/Tor.md:
--------------------------------------------------------------------------------
# Tor Configuration

Look at your torrc for the following configuration options...

Tor uses a text file called torrc that contains configuration instructions for how your Tor program should behave. The default configuration should work fine for most Tor users.

If you installed Tor Browser on Windows or Linux, look for Browser/TorBrowser/Data/Tor/torrc inside your Tor Browser directory. If you're on macOS, the torrc is in ~/Library/Application Support/TorBrowser-Data/Tor. To get to it, press cmd-shift-g while in Finder and copy/paste that directory into the box that appears.

Otherwise, if you are using Tor without Tor Browser, it looks for the torrc file in /usr/local/etc/tor/torrc if you compiled Tor from source, and in /etc/tor/torrc or /etc/torrc if you installed a pre-built package.

Once you've created or changed your torrc file, you will need to restart Tor for the changes to take effect. (For advanced users, note that you actually only need to send Tor a HUP signal, not restart it.)
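Since stem is already a dependency of this project, that HUP can also be sent over the ControlPort instead of restarting the service. A minimal sketch, assuming ControlPort 9051 and the password you configured:

```python
from stem import Signal
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate("my_password")  # your ControlPort password
    controller.signal(Signal.RELOAD)        # equivalent to sending tor a SIGHUP
```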
## torrc

    # This provides a port for our script to talk with. If you set this then be
    # sure to also set either CookieAuthentication *or* HashedControlPassword!
    #
    # You could also use ControlSocket instead of ControlPort, which provides a
    # file-based socket. You don't need to have authentication if you use
    # ControlSocket. For this example, however, we'll use a port.

    ControlPort 9051

    # Setting this will make Tor write an authentication cookie. Anything with
    # permission to read this file can connect to Tor. If you're going to run
    # your script with the same user or permission group as Tor then this is the
    # easiest method of authentication to use.

    CookieAuthentication 1

    # Alternatively we can authenticate with a password. To set a password, first
    # get its hash...
    #
    # % tor --hash-password "my_password"
    # 16:E600ADC1B52C80BB6022A0E999A7734571A451EB6AE50FED489B72E3DF
    #
    # ... and use that for the HashedControlPassword in your torrc.

    HashedControlPassword 16:E600ADC1B52C80BB6022A0E999A7734571A451EB6AE50FED489B72E3DF

--------------------------------------------------------------------------------
/TorScrapper.py:
--------------------------------------------------------------------------------
# Author - Shivam Kapoor (ConanKapoor).

# Importing Essentials
from multiprocessing import Pool
from pyfiglet import Figlet
import os

# Opening the onions list. To scrape more links, add them to onions.txt.
with open("onions.txt", "r") as onion:
    content = onion.read().splitlines()

# Terminal process to edit onions.txt using nano. (Only for GNOME at the moment.)
def ExecuteEditor():
    execute = "nano onions.txt"
    os.system(execute)

# Terminal process for the Crawler. (Only for GNOME at the moment.)
def ExecuteCrawler(url):
    execute = str('gnome-terminal -e \'python3 Modules/Crawler/crawl.py ' + url + '\'')
    os.system(execute)

# Terminal process for the Scraper. (Only for GNOME at the moment.)
def ExecuteScraper(url):
    execute = str('gnome-terminal -e \'python3 Modules/Scraper/Scrape.py ' + url + '\'')
    print(execute)
    os.system(execute)

# Terminal process for scraping only the latest links. (Under construction.)
def ExecuteDiff():
    print("\n------------> Work in progress. The developer is lazy af. <------------\n")

# Multiprocessing implementation (limit - 5 processes at a time).
def Multiprocessing(task):
    if os.path.exists("Output"):
        delete = str('rm -r Output')
        os.system(delete)
        os.makedirs("Output")
    else:
        os.makedirs("Output")

    with Pool(processes=5) as pool:
        for onion in range(0, len(content)):
            pool.apply(task, args=(content[onion],))

# Banner for the program.
def Banner():
    banner = Figlet(font='slant')
    print(banner.renderText('TorScraper'))
    print("<---------WELCOME TO TORSCRAPER PROGRAM--------->")
    print("<---------v1.0 - Author - Conan Kapoor--------->")
    print("\n")

# Menu given to users. Eat away!
def Menu():
    print("Please select the mode of operation:- \n")
    print("----> 1) Edit onions.txt to add links.")
    print("----> 2) Crawl given links :].")
    print("----> 3) Scrape given links :].")
    print("----> 4) Compare the latest crawl and scrape only the latest links.")
    print("----> 5) Exit the program :[.\n")

if __name__ == '__main__':
    die = 1
    try:
        while die:
            os.system("clear")
            Banner()
            Menu()
            choice = int(input("Enter your choice: "))
            print("\n")

            if choice == 1:
                ExecuteEditor()
            elif choice == 2:
                Multiprocessing(ExecuteCrawler)
            elif choice == 3:
                Multiprocessing(ExecuteScraper)
            elif choice == 4:
                ExecuteDiff()
            else:
                die = 0
                quit()

    except KeyboardInterrupt:
        print("\n\nInterrupt received! Exiting cleanly...\n")
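# Note (editor's sketch, not part of the original script): both modules read the
# target URL from sys.argv, so on systems without gnome-terminal they can also
# be launched directly, e.g.
#
#   python3 Modules/Scraper/Scrape.py http://torlinkbgs6aabns.onion/
#   python3 Modules/Crawler/crawl.py http://torlinkbgs6aabns.onion/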
Exiting cleanly...\n") 86 | 87 | 88 | -------------------------------------------------------------------------------- /onions.txt: -------------------------------------------------------------------------------- 1 | http://torlinkbgs6aabns.onion/ 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.0 2 | PySocks==1.6.7 3 | stem==1.5.4 4 | tldextract 5 | pyfiglet 6 | --------------------------------------------------------------------------------