├── .gitignore
├── README.MD
├── __init__.py
├── main.py
├── rarbg
│   ├── __init__.py
│   ├── captcha_handler.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── rarbg_spider.py
├── requirements.txt
├── results
│   └── torrentz.xml
└── scrapy.cfg
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | .static_storage/
58 | .media/
59 | local_settings.py
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | .idea/*
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | # RARBG scraper
2 |
3 | Scraping RARBG for torrents using Scrapy.
4 | Includes headless browsing with Selenium and CAPTCHA solving with pytesseract and Pillow.
5 |
6 | # Log example
7 | 
--------------------------------------------------------------------------------
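A usage sketch, assuming chromedriver and tesseract-ocr are installed and on PATH (neither command appears in the original README): install dependencies with `pip install -r requirements.txt`, then start the crawl with `python main.py`, or through the Scrapy CLI with an XML feed export matching the results/torrentz.xml file in the tree: `scrapy crawl rarbg -o results/torrentz.xml`.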
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evyatarmeged/RARBG-scraper/306edaaf4474d00b74201146ed37699d79bf7146/__init__.py
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from scrapy import crawler
2 | from scrapy.utils.project import get_project_settings
3 | from rarbg.spiders.rarbg_spider import TorrentSpider
4 |
5 |
6 | def main():
7 |     # Load rarbg/settings.py so the custom redirect middleware is installed;
8 |     # a bare CrawlerProcess() would ignore the project settings
9 |     process = crawler.CrawlerProcess(get_project_settings())
10 |     process.crawl(TorrentSpider)
11 |     process.start()
12 |
13 |
14 | if __name__ == '__main__':
15 |     main()
--------------------------------------------------------------------------------
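For reference, a minimal sketch (not part of the repo) of running the spider programmatically with an explicit XML feed export, which is one way a file like results/torrentz.xml could be produced; FEED_FORMAT and FEED_URI are Scrapy 1.x feed-export settings:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from rarbg.spiders.rarbg_spider import TorrentSpider


def run_with_feed():
    # Project settings first, so the custom redirect middleware is active
    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'xml')                # export items via XmlItemExporter
    settings.set('FEED_URI', 'results/torrentz.xml')  # matches the path in the repo tree
    process = CrawlerProcess(settings)
    process.crawl(TorrentSpider)
    process.start()  # blocks until the crawl finishes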
/rarbg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evyatarmeged/RARBG-scraper/306edaaf4474d00b74201146ed37699d79bf7146/rarbg/__init__.py
--------------------------------------------------------------------------------
/rarbg/captcha_handler.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytesseract
3 | import requests
4 | from PIL import Image
5 |
6 |
7 | # Get CAPTCHA image & extract text
8 | class CaptchaHandler:
9 |
10 |     def __init__(self):
11 |         self.filename = 'solved_captcha.png'
12 |
13 |     def get_captcha(self, src):
14 |         img = requests.get(src)
15 |         with open(self.filename, 'wb') as captcha_image:
16 |             captcha_image.write(img.content)
17 |         return self.solve_captcha(self.filename)
18 |
19 |     @staticmethod
20 |     def solve_captcha(img_path):
21 |         try:
22 |             solution = pytesseract.image_to_string(Image.open(img_path))
23 |             os.remove(img_path)  # Remove the file after solving
24 |             return solution
25 |         except FileNotFoundError:
26 |             return None
27 |
--------------------------------------------------------------------------------
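A hedged sketch of using CaptchaHandler on its own (the image URL is a placeholder), plus an optional Pillow preprocessing step that often helps pytesseract on noisy CAPTCHAs; the 128 threshold is an assumption, not a value from the repo:

from PIL import Image
from rarbg.captcha_handler import CaptchaHandler

# Download, OCR, and clean up in one call; returns None if the file vanished
text = CaptchaHandler().get_captcha(src='https://example.com/captcha.png')  # placeholder URL

# Optional preprocessing before OCR: grayscale, then binarize at a fixed threshold
img = Image.open('solved_captcha.png').convert('L')
img = img.point(lambda p: 255 if p > 128 else 0)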
/rarbg/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 |
5 | # Scrapy torrent item model
6 | class Torrent(scrapy.Item):
7 |     title = scrapy.Field()
8 |     url = scrapy.Field()
9 |     upload_date = scrapy.Field()
10 |     size = scrapy.Field()
11 |     seeders = scrapy.Field()
12 |     leechers = scrapy.Field()
13 |     uploader = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/rarbg/middlewares.py:
--------------------------------------------------------------------------------
1 | import time
2 | import logging
3 | from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
4 | from selenium import webdriver
5 | from selenium.webdriver.chrome.options import Options
6 | from selenium.common.exceptions import NoSuchElementException
7 | from .captcha_handler import CaptchaHandler
8 |
9 |
10 | LOGGER = logging.getLogger(__name__)
11 | options = Options()
12 | options.add_argument('--headless')
13 |
14 |
15 | class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
16 |     """
17 |     Custom RedirectMiddleware
18 |     Using Selenium and chromedriver with a --headless flag
19 |     Checks if redirected to a CAPTCHA page or a browser identification page and acts accordingly
20 |     """
21 |     def __init__(self, settings):
22 |         self.threat_defence = 'threat_defence.php'
23 |         # Path to a local chromedriver binary; adjust for your environment
24 |         self.driver = webdriver.Chrome('/home/mr_evya/idaide/chromedriver', chrome_options=options)
25 |         self.tries = 0
26 |         self.captcha_handler = CaptchaHandler()
27 |         self.cookies = None
28 |         super().__init__(settings)
29 |
30 |     # Test for `threat_defence` in the redirect URL; otherwise defer to the superclass
31 |     def _redirect(self, redirected, request, spider, reason):
32 |         time.sleep(3)
33 |         if self.threat_defence not in redirected.url:
34 |             return super()._redirect(redirected, request, spider, reason)
35 |         LOGGER.info('Threat defense triggered for {0}'.format(request.url))
36 |         LOGGER.info('Redirected to: {0}'.format(redirected.url))
37 |         request.cookies = self.bypass_threat_defense(redirected.url)
38 |         self.driver.close()
39 |         return request  # With cookies of the solved CAPTCHA session
40 |
41 |     def bypass_threat_defense(self, url):
42 |         LOGGER.info('Number of tries: #{0}'.format(self.tries))
43 |         self.driver.get(url)
44 |         # Decide whether we are on a browser-detection (redirect) page or a CAPTCHA page
45 |         while self.tries <= 5:  # Capped at 5 tries, since pytesseract solves only a fraction of CAPTCHAs
46 |             LOGGER.info('Waiting for browser detection')
47 |             time.sleep(3)
48 |             try:
49 |                 self.cookies = self.find_solve_submit_captcha()
50 |                 break
51 |             except NoSuchElementException:
52 |                 LOGGER.info('No CAPTCHA found in page')
53 |                 try:
54 |                     self.redirect_retry()
55 |                     break
56 |                 except NoSuchElementException:
57 |                     LOGGER.info('No link in page either. EXITING')
58 |                     break
59 |         # If the solution was wrong and we are prompted with another CAPTCHA, call the method again
60 |         if self.threat_defence in self.driver.current_url:
61 |             self.tries += 1
62 |             LOGGER.info('CAPTCHA solution was wrong. Trying again')
63 |             self.bypass_threat_defense(self.driver.current_url)
64 |         if self.cookies:
65 |             return self.cookies
66 |         exit('Something went wrong')
67 |
68 |     # Press the retry link if we reached a redirect page without a CAPTCHA
69 |     def redirect_retry(self):
70 |         LOGGER.info('Looking for `retry` link in page')
71 |         link = self.driver.find_element_by_partial_link_text('Click')
72 |         LOGGER.info('Retrying to get CAPTCHA page')
73 |         self.tries += 1
74 |         self.bypass_threat_defense(link.get_attribute('href'))
75 |
76 |     def find_solve_submit_captcha(self):
77 |         LOGGER.info('Looking for CAPTCHA image in page')
78 |         # Find
79 |         captcha = self.driver.find_element_by_xpath("//img[contains(@src, 'captcha')]")
80 |         LOGGER.info('Found CAPTCHA image: {0}'.format(captcha.get_attribute('src')))
81 |         # Solve
82 |         solved_captcha = self.captcha_handler.get_captcha(src=captcha.get_attribute('src'))
83 |         LOGGER.info('CAPTCHA solved: {0}'.format(solved_captcha))
84 |         input_field = self.driver.find_element_by_id('solve_string')
85 |         input_field.send_keys(solved_captcha)
86 |         LOGGER.info('Submitting solution')
87 |         # Submit
88 |         self.driver.find_element_by_id('button_submit').click()
89 |         return self.driver.get_cookies()
--------------------------------------------------------------------------------
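One possible refinement, offered as a sketch rather than the author's method: the fixed time.sleep(3) calls could be replaced with Selenium's explicit waits, which return as soon as the element appears or raise TimeoutException once the timeout elapses:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_captcha(driver, timeout=10):
    # Blocks until the CAPTCHA <img> targeted by find_solve_submit_captcha
    # is present, or raises selenium.common.exceptions.TimeoutException
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, "//img[contains(@src, 'captcha')]"))
    )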
/rarbg/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class RarbgPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 |
--------------------------------------------------------------------------------
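As the boilerplate comment above notes, this pipeline only runs once registered in ITEM_PIPELINES, and rarbg/settings.py below does not currently register it. A sketch of the registration it would need:

ITEM_PIPELINES = {
    'rarbg.pipelines.RarbgPipeline': 300,  # lower values run earlier in the pipeline chain
}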
/rarbg/settings.py:
--------------------------------------------------------------------------------
1 |
2 | BOT_NAME = 'rarbg'
3 |
4 | SPIDER_MODULES = ['rarbg.spiders']
5 | NEWSPIDER_MODULE = 'rarbg.spiders'
6 |
7 | LOG_LEVEL = 'INFO'
8 | LOG_SHORT_NAMES = True
9 | FEED_EXPORT_INDENT = 4
10 |
11 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko)' \
12 |              ' Chrome/27.0.1453.93 Safari/537.36'
13 |
14 | ROBOTSTXT_OBEY = False
15 |
16 |
17 | DOWNLOADER_MIDDLEWARES = {
18 | 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
19 | 'rarbg.middlewares.ThreatDefenceRedirectMiddleware': 600,
20 | }
21 |
22 | CONCURRENT_REQUESTS = 2
23 |
24 | # DOWNLOAD_DELAY = 4
25 |
26 | COOKIES_ENABLED = True
27 |
28 |
--------------------------------------------------------------------------------
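A note on the DOWNLOADER_MIDDLEWARES pair: mapping the stock RedirectMiddleware to None removes it from Scrapy's DOWNLOADER_MIDDLEWARES_BASE, and 600 is its default order there, so the custom subclass slots into the same position in the chain. The effective mapping can be inspected as a sketch like this:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.getdict('DOWNLOADER_MIDDLEWARES'))
# {'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
#  'rarbg.middlewares.ThreatDefenceRedirectMiddleware': 600}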
/rarbg/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/rarbg/spiders/rarbg_spider.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | import logging
3 | from ..items import Torrent
4 |
5 |
6 | LOGGER = logging.getLogger(__name__)
7 |
8 |
9 | class TorrentSpider(scrapy.Spider):
10 |     name = "rarbg"
11 |     # URL of movie torrents sorted by seeders in descending order
12 |     start_urls = \
13 |         ['http://rarbgproxy.org/torrents.php?category=14;17;42;44;45;46;47;48;50;51;52&search=&order=seeders&by=DESC']
14 |
15 |     def parse(self, response):
16 |         # Follow pagination links, re-entering this same callback
17 |         for page_url in response.css('#pager_links > a::attr(href)').extract():
18 |             page_url = response.urljoin(page_url)
19 |             yield scrapy.Request(url=page_url, callback=self.parse)
20 |
21 |         # One <tr class="lista2"> per torrent row
22 |         for tr in response.css('tr.lista2'):
23 |             tds = tr.css('td')
24 |             yield Torrent(
25 |                 title=tds[1].css('a')[0].css('::attr(title)').extract_first(),
26 |                 url=response.urljoin(tds[1].css('a')[0].css('::attr(href)').extract_first()),
27 |                 upload_date=tds[2].css('::text').extract_first(),
28 |                 size=tds[3].css('::text').extract_first(),
29 |                 seeders=int(tds[4].css('::text').extract_first()),
30 |                 leechers=int(tds[5].css('::text').extract_first()),
31 |                 uploader=tds[7].css('::text').extract_first()
32 |             )
--------------------------------------------------------------------------------
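A hedged, more defensive variant of the row extraction above (same module assumed; parse_row is a hypothetical helper): extract_first() returns None for a missing cell, which would make the int() casts raise TypeError, whereas extract_first(default='0') guards against that:

def parse_row(tds, response):
    # tds is the list produced by tr.css('td') in parse() above
    link = tds[1].css('a')[0]
    return Torrent(
        title=link.css('::attr(title)').extract_first(),
        url=response.urljoin(link.css('::attr(href)').extract_first()),
        upload_date=tds[2].css('::text').extract_first(),
        size=tds[3].css('::text').extract_first(),
        seeders=int(tds[4].css('::text').extract_first(default='0')),
        leechers=int(tds[5].css('::text').extract_first(default='0')),
        uploader=tds[7].css('::text').extract_first(),
    )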
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.18.4
2 | selenium==3.6.0
3 | Scrapy==1.4.0
4 | pytesseract==0.1.7
5 | Pillow==4.2.1
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = rarbg.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = rarbg
12 |
--------------------------------------------------------------------------------