├── .gitignore
├── README.MD
├── __init__.py
├── main.py
├── rarbg
│   ├── __init__.py
│   ├── captcha_handler.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── rarbg_spider.py
├── requirements.txt
├── results
│   └── torrentz.xml
└── scrapy.cfg

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
.static_storage/
.media/
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.idea/*

--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
# RARBG scraper

Scraping RARBG for torrents using Scrapy.<br>
Including headless browsing with Selenium and CAPTCHA solving with pytesseract and Pillow.<br>
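
# Usage

A minimal way to run the scraper — a sketch, assuming a local chromedriver binary (its path is hardcoded in `rarbg/middlewares.py`, so adjust it for your machine) and a system Tesseract install, which pytesseract requires:

```bash
pip install -r requirements.txt
python main.py
```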

# Log example
![scrapy log](http://oi68.tinypic.com/28mmkiu.jpg)

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evyatarmeged/RARBG-scraper/306edaaf4474d00b74201146ed37699d79bf7146/__init__.py

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from scrapy import crawler
from rarbg.spiders.rarbg_spider import TorrentSpider


def main():
    spider = crawler.CrawlerProcess()
    spider.crawl(TorrentSpider)
    spider.start()


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/rarbg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evyatarmeged/RARBG-scraper/306edaaf4474d00b74201146ed37699d79bf7146/rarbg/__init__.py

--------------------------------------------------------------------------------
/rarbg/captcha_handler.py:
--------------------------------------------------------------------------------
import os
import pytesseract
import requests
from PIL import Image


# Get CAPTCHA image & extract text
class CaptchaHandler:

    def __init__(self):
        self.filename = 'solved_captcha.png'

    def get_captcha(self, src):
        img = requests.get(src)
        with open(self.filename, 'wb') as captcha_image:
            captcha_image.write(img.content)
        return self.solve_captcha(self.filename)

    @staticmethod
    def solve_captcha(img_path):
        try:
            solution = pytesseract.image_to_string(Image.open(img_path))
            os.remove(img_path)  # Remove the file after solving
            return solution
        except FileNotFoundError:
            return None
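
A quick sketch of exercising CaptchaHandler on its own (the image URL below is hypothetical, purely for illustration):

    from rarbg.captcha_handler import CaptchaHandler

    handler = CaptchaHandler()
    # Downloads the image, OCRs it, and deletes the temp file;
    # the result may be an empty string if pytesseract cannot read the image
    text = handler.get_captcha('https://example.com/captcha.png')  # hypothetical URL
    print(text)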

--------------------------------------------------------------------------------
/rarbg/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


# Scrapy torrent item model
class Torrent(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    upload_date = scrapy.Field()
    size = scrapy.Field()
    seeders = scrapy.Field()
    leechers = scrapy.Field()
    uploader = scrapy.Field()

--------------------------------------------------------------------------------
/rarbg/middlewares.py:
--------------------------------------------------------------------------------
import time
import logging
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from .captcha_handler import CaptchaHandler


LOGGER = logging.getLogger(__name__)
options = Options()
options.add_argument('--headless')


class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
    """
    Custom RedirectMiddleware
    Using Selenium and chromedriver with a --headless flag
    Checks if redirected to a CAPTCHA page or a browser identification page and acts accordingly
    """
    def __init__(self, settings):
        self.threat_defence = 'threat_defence.php'
        # Local chromedriver path; adjust for your environment
        self.driver = webdriver.Chrome('/home/mr_evya/idaide/chromedriver', chrome_options=options)
        self.tries = 0
        self.captcha_handler = CaptchaHandler()
        self.cookies = None
        super().__init__(settings)

    # Test for `threat_defence` in the redirect url, otherwise defer to the superclass' redirect
    def _redirect(self, redirected, request, spider, reason):
        time.sleep(3)
        if self.threat_defence not in redirected.url:
            return super()._redirect(redirected, request, spider, reason)
        LOGGER.info('Threat defense triggered for {0}'.format(request.url))
        LOGGER.info('Redirected to: {0}'.format(redirected.url))
        request.cookies = self.bypass_threat_defense(redirected.url)
        self.driver.close()
        return request  # With cookies of solved CAPTCHA session

    def bypass_threat_defense(self, url):
        LOGGER.info('Number of tries: #{0}'.format(self.tries))
        self.driver.get(url)
        # Loop to decide whether we are on a browser detection (redirect) page or a CAPTCHA page
        while self.tries <= 5:  # A limit of 5 tries gives pytesseract a reasonable chance of success
            LOGGER.info('Waiting for browser detection')
            time.sleep(3)
            try:
                self.cookies = self.find_solve_submit_captcha()
                break
            except NoSuchElementException:
                LOGGER.info('No CAPTCHA found in page')
                try:
                    self.redirect_retry()
                    break
                except NoSuchElementException:
                    LOGGER.info('No Link in page either. EXITING')
                    break
        # If the solution was wrong and we are prompted with another CAPTCHA, call the method again
        if self.threat_defence in self.driver.current_url:
            self.tries += 1
            LOGGER.info('CAPTCHA solution was wrong. Trying again')
            self.bypass_threat_defense(self.driver.current_url)
        if self.cookies:
            return self.cookies
        exit('Something went wrong')

    # Press the retry link if we reached a redirect page without a CAPTCHA
    def redirect_retry(self):
        LOGGER.info('Looking for `retry` link in page')
        link = self.driver.find_element_by_partial_link_text('Click')
        LOGGER.info('Retrying to get CAPTCHA page')
        self.tries += 1
        self.bypass_threat_defense(link.get_attribute('href'))

    def find_solve_submit_captcha(self):
        LOGGER.info('Looking for CAPTCHA image in page')
        # Find
        captcha = self.driver.find_element_by_xpath("//img[contains(@src, 'captcha')]")
        LOGGER.info('Found CAPTCHA image: {0}'.format(captcha.get_attribute('src')))
        # Solve
        solved_captcha = self.captcha_handler.get_captcha(src=captcha.get_attribute('src'))
        LOGGER.info('CAPTCHA solved: {0}'.format(solved_captcha))
        input_field = self.driver.find_element_by_id('solve_string')
        input_field.send_keys(solved_captcha)
        LOGGER.info('Submitting solution')
        # Submit
        self.driver.find_element_by_id('button_submit').click()
        return self.driver.get_cookies()

--------------------------------------------------------------------------------
/rarbg/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class RarbgPipeline(object):
    def process_item(self, item, spider):
        return item
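
The `results/torrentz.xml` file in the tree is consistent with Scrapy's built-in feed exports rather than this pass-through pipeline; a plausible invocation (an assumption about the author's workflow — `-o` is Scrapy's standard output-feed flag) would be:

    scrapy crawl rarbg -o results/torrentz.xml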

--------------------------------------------------------------------------------
/rarbg/settings.py:
--------------------------------------------------------------------------------
BOT_NAME = 'rarbg'

SPIDER_MODULES = ['rarbg.spiders']
NEWSPIDER_MODULE = 'rarbg.spiders'

LOG_LEVEL = 'INFO'
LOG_SHORT_NAMES = True
FEED_EXPORT_INDENT = 4

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko)' \
             ' Chrome/27.0.1453.93 Safari/537.36'

ROBOTSTXT_OBEY = False


DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'rarbg.middlewares.ThreatDefenceRedirectMiddleware': 600,
}

CONCURRENT_REQUESTS = 2

# DOWNLOAD_DELAY = 4

COOKIES_ENABLED = True

--------------------------------------------------------------------------------
/rarbg/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/rarbg/spiders/rarbg_spider.py:
--------------------------------------------------------------------------------
import scrapy
import logging
from ..items import Torrent


LOGGER = logging.getLogger(__name__)


class TorrentSpider(scrapy.Spider):
    name = "rarbg"
    # URL of movie torrents sorted by seeders in descending order
    start_urls = \
        ['http://rarbgproxy.org/torrents.php?category=14;17;42;44;45;46;47;48;50;51;52&search=&order=seeders&by=DESC']

    def parse(self, response):
        for page_url in response.css('#pager_links > a::attr(href)').extract():
            page_url = response.urljoin(page_url)
            yield scrapy.Request(url=page_url, callback=self.parse)

        for tr in response.css('tr.lista2'):
            tds = tr.css('td')
            yield Torrent(
                title=tds[1].css('a')[0].css('::attr(title)').extract_first(),
                url=response.urljoin(tds[1].css('a')[0].css('::attr(href)').extract_first()),
                upload_date=tds[2].css('::text').extract_first(),
                size=tds[3].css('::text').extract_first(),
                seeders=int(tds[4].css('::text').extract_first()),
                leechers=int(tds[5].css('::text').extract_first()),
                uploader=tds[7].css('::text').extract_first()
            )

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.18.4
selenium==3.6.0
Scrapy==1.4.0
pytesseract==0.1.7
Pillow==4.2.1

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = rarbg.settings

[deploy]
#url = http://localhost:6800/
project = rarbg

--------------------------------------------------------------------------------
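
For the [deploy] section in scrapy.cfg, a typical workflow (an assumption — it needs a running scrapyd instance and the separate scrapyd-client package, neither of which is in requirements.txt) would be to uncomment the `url` line and run:

    pip install scrapyd-client
    scrapyd-deploy -p rarbg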