├── Modules
│   ├── Crawler
│   │   ├── crawl.py
│   │   ├── crawl_bot.py
│   │   ├── file_manage.py
│   │   ├── get_domains.py
│   │   ├── link_finder.py
│   │   └── link_finder.pyc
│   └── Scraper
│       └── Scrape.py
├── README.md
├── Tor.md
├── TorScrapper.py
├── onions.txt
└── requirements.txt

--------------------------------------------------------------------------------
/Modules/Crawler/crawl.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31).

from crawl_bot import Crawl_bot
from file_manage import *
from queue import Queue
import threading, sys, os
from get_domains import *
import tldextract


def input_url(base_url):
    global BASE_URL, regex
    BASE_URL = base_url

    url_extract = tldextract.extract(BASE_URL)
    regex = url_extract.domain

if __name__ == '__main__':
    if len(sys.argv) == 2:
        input_url(sys.argv[1])
    else:
        print("Invalid input")
        sys.exit(1)  # Without a URL there is nothing to crawl


GET_DOMAIN = get_domain_name(BASE_URL)
FOLDER_NAME = str(os.path.abspath("") + '/Output/Crawled-' + regex)
print(FOLDER_NAME)
data_crawled = FOLDER_NAME + '/crawled.txt'
data_in_queue = FOLDER_NAME + '/queue.txt'
thread_count = 50
queue = Queue()

Crawl_bot(FOLDER_NAME, BASE_URL, GET_DOMAIN)

def do_job():  # Get the job done
    while True:
        url = queue.get()
        Crawl_bot.crawl_page(threading.current_thread().name, url)
        queue.task_done()

def queue_jobs():  # Define each queued link as a new job
    for url_link in convert_to_set(data_in_queue):
        queue.put(url_link)
    queue.join()
    initiate_bot()

def get_links_to_queue():  # Create the worker threads
    for _ in range(thread_count):
        thread = threading.Thread(target=do_job)
        thread.daemon = True
        thread.start()

def initiate_bot():  # Does the crawling job
    links_in_queue = convert_to_set(data_in_queue)
    if len(links_in_queue) > 0:
        print(str(len(links_in_queue)) + ' queued links')
        queue_jobs()

get_links_to_queue()
initiate_bot()

# https://www.iocbucket.com/search
# https://www.iocbucket.com/feeds/rss2/openioc/1.0/latestten
# https://www.iocbucket.com/feeds/rss2/openioc/1.1/latestten
# https://www.iocbucket.com/feeds/rss2/yara/latestten
# https://www.metaflows.com/codeigniter/stats/content_md5_hash/
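# Note (editor's sketch, not part of the original module): the Crawled-<domain>
# folder name above comes from tldextract, which splits a URL into
# subdomain / domain / suffix. For the sample link shipped in onions.txt the
# result looks roughly like this (field names per tldextract's ExtractResult):
#
#   >>> tldextract.extract('http://torlinkbgs6aabns.onion/')
#   ExtractResult(subdomain='', domain='torlinkbgs6aabns', suffix='onion')
#
# so the crawl output lands in Output/Crawled-torlinkbgs6aabns.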
--------------------------------------------------------------------------------
/Modules/Crawler/crawl_bot.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31) and Shivam Kapoor (ConanKapoor).

from get_domains import *
from file_manage import *
from link_finder import link_crawler
from urllib.request import urlopen

#######################################################################################################################
################################################ TOR CONNECTION BELOW #################################################
#######################################################################################################################

# Importing Stem libraries
from stem import Signal
from stem.control import Controller
import socks, socket

# Initiating connection to the Tor ControlPort.
# NOTE: stem's authenticate() expects the plain ControlPort password here;
# the "16:..." hash is what belongs in torrc as HashedControlPassword.
with Controller.from_port(port=9051) as controller:
    controller.authenticate("16:AE80E3930E42F7A3606823FA19CD0A3E721813EF8798ABFE86DB91DD09")
    controller.signal(Signal.NEWNYM)

# TOR SETUP GLOBAL Vars
SOCKS_PORT = 9050  # Default Tor SOCKS port from torrc; change it to whatever your torrc is configured to use
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT)
socket.socket = socks.socksocket

# Perform DNS resolution through the socket
def getaddrinfo(*args):
    return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]

socket.getaddrinfo = getaddrinfo

#######################################################################################################################
################################################ TOR CONNECTION ABOVE #################################################
#######################################################################################################################
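# Note (editor's sketch, not part of the original module): with socket.socket
# patched above, a plain urlopen() call is tunnelled through the local Tor
# SOCKS proxy, e.g. urlopen('http://torlinkbgs6aabns.onion/') for the sample
# link in onions.txt. The getaddrinfo() override keeps the hostname
# unresolved so that Tor performs name resolution itself - .onion addresses
# cannot be resolved by the local DNS resolver anyway.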
class Crawl_bot:

    folder_name, start_link, domain_name, queued_data, crawled_data = '', '', '', '', ''
    queue = set()
    data_crawled = set()

    def __init__(self, folder_name, start_link, domain_name):
        Crawl_bot.folder_name = folder_name
        Crawl_bot.start_link = start_link
        Crawl_bot.domain_name = domain_name
        Crawl_bot.queued_data = Crawl_bot.folder_name + '/queue.txt'
        Crawl_bot.crawled_data = Crawl_bot.folder_name + '/crawled.txt'
        self.initiate_directory()
        self.crawl_page('Spider starts here', Crawl_bot.start_link)

    @staticmethod
    def initiate_directory():  # Define and create a new directory on the first run
        create_project_folder(Crawl_bot.folder_name)
        create_data_files(Crawl_bot.folder_name, Crawl_bot.start_link)
        Crawl_bot.queue = convert_to_set(Crawl_bot.queued_data)
        Crawl_bot.data_crawled = convert_to_set(Crawl_bot.crawled_data)

    @staticmethod
    def crawl_page(thread_name, web_url):  # Fill the queue, update the files and refresh the user display
        if web_url not in Crawl_bot.data_crawled:
            print(thread_name + ' now crawling ' + web_url)
            print('Queue_url ' + str(len(Crawl_bot.queue)) + ' | Crawled_url ' + str(len(Crawl_bot.data_crawled)))
            Crawl_bot.add_url_to_queue(Crawl_bot.collect_url(web_url))
            Crawl_bot.queue.discard(web_url)  # discard() avoids a KeyError if another thread already removed it
            Crawl_bot.data_crawled.add(web_url)
            Crawl_bot.update_folder()

    # Converts the raw response into readable text and checks for proper HTML formatting
    @staticmethod
    def collect_url(web_url):
        html_data_string = ''
        try:
            received_response = urlopen(web_url)
            if 'text/html' in received_response.getheader('Content-Type'):
                data_bytes = received_response.read()
                html_data_string = data_bytes.decode("latin-1")
            link_finder = link_crawler(Crawl_bot.start_link, web_url)
            link_finder.feed(html_data_string)
        except Exception as e:
            print(str(e))
            return set()
        return link_finder.page_urls()

    @staticmethod
    def add_url_to_queue(links):  # Queue data is saved to the project files
        for url in links:
            if (url in Crawl_bot.queue) or (url in Crawl_bot.data_crawled):
                continue
            # if Crawl_bot.domain_name != get_domain_name(url):
            #     continue
            Crawl_bot.queue.add(url)

    @staticmethod
    def update_folder():  # Update the project directory
        set_to_file(Crawl_bot.queue, Crawl_bot.queued_data)
        set_to_file(Crawl_bot.data_crawled, Crawl_bot.crawled_data)

--------------------------------------------------------------------------------
/Modules/Crawler/file_manage.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31).

import os

def create_project_folder(dir):  # Create a separate folder for each website
    if not os.path.exists(dir):
        print('Creating directory ' + dir)
        os.makedirs(dir)

def create_data_files(folder_name, start_link):  # Create the queue and crawled lists
    queue = os.path.join(folder_name, 'queue.txt')
    data_crawled = os.path.join(folder_name, "crawled.txt")
    if not os.path.isfile(queue):
        write_to_file(queue, start_link)
    if not os.path.isfile(data_crawled):
        write_to_file(data_crawled, '')

def write_to_file(path, url):  # Create a new file for the task
    with open(path, 'w') as f:
        f.write(url)

def append_file(path, url):  # Append new data to an existing file
    with open(path, 'a') as file:
        file.write(url + '\n')

def empty_queue(path):  # Delete the contents of a file
    open(path, 'w').close()

def convert_to_set(file_name):  # Read a file and convert each line to a set item
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results

def set_to_file(urls, file_name):  # Iterate through a set; each item becomes a line in the file
    with open(file_name, "w") as f:
        for l in sorted(urls):
            f.write(l + "\n")

--------------------------------------------------------------------------------
/Modules/Crawler/get_domains.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31) and Shivam Kapoor (ConanKapoor).

import tldextract

def get_domain_name(link):
    url_extract = tldextract.extract(link)
    site_name = url_extract.domain + '.' + url_extract.suffix
    return site_name
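# Note (editor's sketch, not part of the original module): get_domain_name()
# keeps only the registered domain plus its suffix, for example:
#
#   >>> get_domain_name('https://www.torproject.org/download/')
#   'torproject.org'
#   >>> get_domain_name('http://torlinkbgs6aabns.onion/')
#   'torlinkbgs6aabns.onion'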
--------------------------------------------------------------------------------
/Modules/Crawler/link_finder.py:
--------------------------------------------------------------------------------
# Author - Abhishek Singh (absingh31).

import urllib
from html.parser import HTMLParser
from urllib.parse import urljoin

class link_crawler(HTMLParser):

    def __init__(self, start_link, web_url):
        super().__init__()
        self.start_link = start_link
        self.web_url = web_url
        self.urls = set()

    def handle_starttag(self, tag, found_attributes):  # The main link-extraction logic
        if tag == 'a':
            for (attr, value) in found_attributes:
                if attr == 'href':
                    url = urljoin(self.start_link, value)
                    self.urls.add(url)

    def page_urls(self):
        return self.urls

    def error(self, message):
        pass
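# Note (editor's sketch, not part of the original module): relative hrefs are
# resolved against start_link via urljoin, so a hypothetical session such as
#
#   finder = link_crawler('http://example.onion/', 'http://example.onion/page')
#   finder.feed('<a href="/about">About</a> <a href="contact.html">Contact</a>')
#   finder.page_urls()
#
# would return {'http://example.onion/about', 'http://example.onion/contact.html'}.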
--------------------------------------------------------------------------------
/Modules/Crawler/link_finder.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/little-endian-0x01/TorScrapper/2fc4452de4ae88dab1f0aeedca0e4ab240e9dea9/Modules/Crawler/link_finder.pyc
--------------------------------------------------------------------------------
/Modules/Scraper/Scrape.py:
--------------------------------------------------------------------------------
# Author - Shivam Kapoor (ConanKapoor).

# Importing Essentials
import urllib.request
from bs4 import BeautifulSoup
import sys, re, os

#######################################################################################################################
################################################ TOR CONNECTION BELOW #################################################
#######################################################################################################################

# Importing Stem libraries
from stem import Signal
from stem.control import Controller
import socks, socket

# Initiating connection to the Tor ControlPort.
# NOTE: stem's authenticate() expects the plain ControlPort password here;
# the "16:..." hash is what belongs in torrc as HashedControlPassword.
with Controller.from_port(port=9051) as controller:
    controller.authenticate("16:AE80E3930E42F7A3606823FA19CD0A3E721813EF8798ABFE86DB91DD09")
    controller.signal(Signal.NEWNYM)

# TOR SETUP GLOBAL Vars
SOCKS_PORT = 9050  # Default Tor SOCKS port from torrc; change it to whatever your torrc is configured to use
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT)
socket.socket = socks.socksocket

# Perform DNS resolution through the socket
def getaddrinfo(*args):
    return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]

socket.getaddrinfo = getaddrinfo

#######################################################################################################################
################################################ TOR CONNECTION ABOVE #################################################
#######################################################################################################################

# Scraping Onion links.
def Scrape(url):
    timeout = 10
    socket.setdefaulttimeout(timeout)

    # Collecting html content.
    headers = {'User-Agent': 'TorScrapper - Onion scrapper | github.com/ConanKapoor/TorScrapper.git'}
    req = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(req)

    # Using BeautifulSoup to parse the html response.
    page = BeautifulSoup(response.read(), 'html.parser')

    # Saving output
    token = re.sub(r'[^\w]', '', url)
    name = os.path.abspath("") + '/Output/Scraped-' + token + '.html'
    with open(name, 'w') as file:
        file.write(str(page))

# Taking input.
if __name__ == '__main__':
    if len(sys.argv) == 2:
        url = sys.argv[1]
        Scrape(url)
    else:
        print("Invalid input")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TorScrapper
A basic scraper made in Python with BeautifulSoup and Tor support to -

* Scrape onion and normal links.
* Save the output in HTML format in the Output folder.
* Filter the HTML output and strip out useful data only (work in progress).
* Strip out IOCs and other related data (on the to-do list).

## Getting Started

These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See Deployment for notes on how to deploy the project on a live system.

### Prerequisites

* You will need **Python3** to run this project smoothly. Go to your terminal and execute the following command, or visit the [Python3](https://www.python.org/download/releases/3.0/) website.

```
[sudo] apt-get install python3 python3-dev
```

* You can install **Tor** from the official website - https://www.torproject.org/

* Furthermore, install the **requirements.txt** using pip3 -

```
[sudo] pip3 install -r requirements.txt
```

TL;DR: We recommend installing TorScrapper inside a **virtual environment** on all platforms.

Python packages can be installed either globally (a.k.a. system-wide) or in user-space. We do not recommend installing TorScrapper system-wide.

Instead, we recommend that you install TorScrapper within a so-called "virtual environment" (virtualenv). Virtualenvs allow you to avoid conflicts with already-installed Python system packages (which could break some of your system tools and scripts) and still install packages normally with pip (without sudo and the likes).

To get started with virtual environments, see the virtualenv installation instructions. To install it globally (having it globally installed actually helps here), it should be a matter of running:

```
[sudo] pip install virtualenv
```
## Basic setup
Before you run the scraper, make sure the following things are done properly:

* Run the Tor service
  `sudo service tor start`

* Generate a hashed control password for torrc
  `tor --hash-password "my_password"`

* Put the ControlPort password inside /Modules/Scraper/Scrape.py (stem expects the plain password here; the hashed value goes into torrc)

  ```
  from stem import Signal
  from stem.control import Controller
  with Controller.from_port(port=9051) as controller:
      controller.authenticate("your_password")
      controller.signal(Signal.NEWNYM)
  ```

* Go to /etc/tor/torrc and uncomment - _**ControlPort 9051**_

Read more about torrc here: [Torrc](https://github.com/ConanKapoor/TorScrapper/blob/master/Tor.md)
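Before running anything, you can sanity-check both halves of the setup - the ControlPort authentication and the SOCKS proxy - with a short standalone script. This is only an illustrative sketch (the filename, password and port values are placeholders for your own configuration), not part of the repository:

```python
# tor_check.py - illustrative sketch, not part of this repository
import socks, socket
from stem.control import Controller
from urllib.request import urlopen

with Controller.from_port(port=9051) as controller:
    controller.authenticate("my_password")  # the plain password, not the 16:... hash
    print("ControlPort OK - tor version:", controller.get_version())

# Route ordinary sockets through the Tor SOCKS proxy, as the modules in this repo do.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
socket.socket = socks.socksocket
page = urlopen("https://check.torproject.org/").read()
print("Traffic is using Tor" if b"Congratulations" in page else "Traffic is NOT using Tor")
```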
### Deployment

A step-by-step series of examples that tells you what to do to get this project running -

* Enter the project directory.
* Copy all the onion and normal links you want to scrape into _onions.txt_

```
[nano]/[vim]/[gedit]/[Your choice of editor] onions.txt
```

* Run TorScrapper.py using Python3

```
[sudo] python3 TorScrapper.py
```

* Check the scraped outputs in the Output folder.


## Built With

* [Python](https://www.python.org/) - Python programming language.
* [Tor](https://www.torproject.org/) - If you don't know about Tor then you probably shouldn't be here :)
* [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) - Beautiful Soup is a Python library for pulling data out of HTML and XML files.

## Contributing

If you have new ideas that are worth implementing, mention them by starting a new issue with the title [FEATURE_REQUEST]. If the idea is worth implementing, congrats - you are now a contributor.

## Versioning

Version 1.something Mehh...

## Authors

* **Shivam Kapoor** - An avid learner who likes to know every tiny detail of how real-life systems work. A real enthusiast of cyber security and the underlying networking concepts. (Email - kapoor.shivam88@gmail.com)

## License

Too lazy to decide on a License. zZzZ

--------------------------------------------------------------------------------
/Tor.md:
--------------------------------------------------------------------------------
# Tor Configuration

Look at your torrc for the following configuration options...

Tor uses a text file called torrc that contains configuration instructions for how your Tor program should behave. The default configuration should work fine for most Tor users.

If you installed Tor Browser on Windows or Linux, look for Browser/TorBrowser/Data/Tor/torrc inside your Tor Browser directory. If you're on macOS, the torrc is in ~/Library/Application Support/TorBrowser-Data/Tor. To get to it, press cmd-shift-g while in Finder and copy/paste that directory into the box that appears.

Otherwise, if you are using Tor without Tor Browser, it looks for the torrc file in /usr/local/etc/tor/torrc if you compiled Tor from source, and in /etc/tor/torrc or /etc/torrc if you installed a pre-built package.

Once you've created or changed your torrc file, you will need to restart Tor for the changes to take effect. (For advanced users, note that you actually only need to send Tor a HUP signal, not restart it.)
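Since stem is already a dependency of this project, that HUP can also be sent over the ControlPort instead of restarting the service. A minimal sketch, assuming ControlPort 9051 and the password you configured:

```python
from stem import Signal
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate("my_password")  # your ControlPort password
    controller.signal(Signal.RELOAD)        # equivalent to sending tor a SIGHUP
```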
## torrc

    # This provides a port for our script to talk with. If you set this then be
    # sure to also set either CookieAuthentication *or* HashedControlPassword!
    #
    # You could also use ControlSocket instead of ControlPort, which provides a
    # file-based socket. You don't need to have authentication if you use
    # ControlSocket. For this example, however, we'll use a port.

    ControlPort 9051

    # Setting this will make Tor write an authentication cookie. Anything with
    # permission to read this file can connect to Tor. If you're going to run
    # your script with the same user or permission group as Tor then this is the
    # easiest method of authentication to use.

    CookieAuthentication 1

    # Alternatively we can authenticate with a password. To set a password, first
    # get its hash...
    #
    # % tor --hash-password "my_password"
    # 16:E600ADC1B52C80BB6022A0E999A7734571A451EB6AE50FED489B72E3DF
    #
    # ... and use that for the HashedControlPassword in your torrc.

    HashedControlPassword 16:E600ADC1B52C80BB6022A0E999A7734571A451EB6AE50FED489B72E3DF

--------------------------------------------------------------------------------
/TorScrapper.py:
--------------------------------------------------------------------------------
# Author - Shivam Kapoor (ConanKapoor).

# Importing Essentials
from multiprocessing import Pool
from pyfiglet import Figlet
import os

# Opening the onions list. To scrape more links, add them to onions.txt.
with open("onions.txt", "r") as onion:
    content = onion.read().splitlines()

# Terminal process to edit onions.txt using nano. (Only for GNOME at the moment.)
def ExecuteEditor():
    execute = "nano onions.txt"
    os.system(execute)

# Terminal process for the Crawler. (Only for GNOME at the moment.)
def ExecuteCrawler(url):
    execute = str('gnome-terminal -e \'python3 Modules/Crawler/crawl.py ' + url + '\'')
    os.system(execute)

# Terminal process for the Scraper. (Only for GNOME at the moment.)
def ExecuteScraper(url):
    execute = str('gnome-terminal -e \'python3 Modules/Scraper/Scrape.py ' + url + '\'')
    print(execute)
    os.system(execute)

# Terminal process for scraping only the latest links. (Under construction.)
def ExecuteDiff():
    print("\n------------> Work in progress. The developer is lazy af. <------------\n")

# Multiprocessing implementation (limit - 5 processes at a time).
def Multiprocessing(task):
    if os.path.exists("Output"):
        delete = str('rm -r Output')
        os.system(delete)
        os.makedirs("Output")
    else:
        os.makedirs("Output")

    with Pool(processes=5) as pool:
        for onion in range(0, len(content)):
            pool.apply(task, args=(content[onion],))

# Banner for the program.
def Banner():
    banner = Figlet(font='slant')
    print(banner.renderText('TorScraper'))
    print("<---------WELCOME TO TORSCRAPER PROGRAM--------->")
    print("<---------v1.0 - Author - Conan Kapoor--------->")
    print("\n")

# Menu given to users. Eat away!
def Menu():
    print("Please select the mode of operation:- \n")
    print("----> 1) Edit onions.txt to add links.")
    print("----> 2) Crawl given links :].")
    print("----> 3) Scrape given links :].")
    print("----> 4) Compare the latest crawl and scrape only the latest links.")
    print("----> 5) Exit the program :[.\n")

if __name__ == '__main__':
    die = 1
    try:
        while die:
            os.system("clear")
            Banner()
            Menu()
            choice = int(input("Enter your choice: "))
            print("\n")

            if choice == 1:
                ExecuteEditor()
            elif choice == 2:
                Multiprocessing(ExecuteCrawler)
            elif choice == 3:
                Multiprocessing(ExecuteScraper)
            elif choice == 4:
                ExecuteDiff()
            else:
                die = 0
                quit()

    except KeyboardInterrupt:
        print("\n\nInterrupt received! Exiting cleanly...\n")
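# Note (editor's sketch, not part of the original script): both modules read the
# target URL from sys.argv, so on systems without gnome-terminal they can also
# be launched directly, e.g.
#
#   python3 Modules/Scraper/Scrape.py http://torlinkbgs6aabns.onion/
#   python3 Modules/Crawler/crawl.py http://torlinkbgs6aabns.onion/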
Exiting cleanly...\n") 86 | 87 | 88 | -------------------------------------------------------------------------------- /onions.txt: -------------------------------------------------------------------------------- 1 | http://torlinkbgs6aabns.onion/ 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.0 2 | PySocks==1.6.7 3 | stem==1.5.4 4 | tldextract 5 | pyfiglet 6 | --------------------------------------------------------------------------------