├── .gitignore ├── LICENSE ├── README.rst ├── random_useragent.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # IDEs 57 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Srinivasan Rangarajan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Scrapy Random User-Agent 2 | ======================== 3 | 4 | Does your scrapy spider get identified and blocked by servers because 5 | you use the default user-agent or a generic one? 6 | 7 | Use this ``random_useragent`` module and set a random user-agent for 8 | every request. You are limited only by the number of different 9 | user-agents you set in a text file. 10 | 11 | Installing 12 | ---------- 13 | 14 | Installing it is pretty simple. 15 | 16 | .. code-block:: bash 17 | 18 | pip install scrapy-random-useragent 19 | 20 | Usage 21 | ----- 22 | 23 | In your ``settings.py`` file, update the ``DOWNLOADER_MIDDLEWARES`` 24 | variable like this. 25 | 26 | .. code-block:: python 27 | 28 | DOWNLOADER_MIDDLEWARES = { 29 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 30 | 'random_useragent.RandomUserAgentMiddleware': 400 31 | } 32 | 33 | This disables the default ``UserAgentMiddleware`` and enables the 34 | ``RandomUserAgentMiddleware``. 35 | 36 | Then, create a new variable ``USER_AGENT_LIST`` with the path to your 37 | text file which has the list of all user-agents 38 | (one user-agent per line). 39 | 40 | .. 
class RandomUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent on every request.

    The pool of candidate user-agents is built once, when the middleware is
    constructed.  It is read from the text file named by the
    ``USER_AGENT_LIST`` setting (one user-agent per line; blank lines are
    ignored).  When that setting is missing, or the file holds no usable
    entry, the pool contains a single user-agent: the ``USER_AGENT`` setting
    if present, otherwise the ``user_agent`` constructor argument.
    """

    def __init__(self, settings, user_agent='Scrapy'):
        """Build the pool of user-agents to choose from.

        :param settings: object exposing ``get(name[, default])``; the
            ``USER_AGENT_LIST`` and ``USER_AGENT`` keys are read from it.
        :param user_agent: user-agent used when neither setting yields one.
        :raises IOError: (``OSError`` on Python 3) when the file named by
            ``USER_AGENT_LIST`` cannot be opened.
        """
        super(RandomUserAgentMiddleware, self).__init__()
        self.user_agent = user_agent

        # Single-entry pool used whenever the list file gives us nothing.
        fallback = settings.get('USER_AGENT', user_agent)

        user_agent_list_file = settings.get('USER_AGENT_LIST')
        if not user_agent_list_file:
            # USER_AGENT_LIST is not set: use the default USER_AGENT or
            # whatever was passed to the middleware.
            self.user_agent_list = [fallback]
            return

        with open(user_agent_list_file, 'r') as f:
            stripped = (line.strip() for line in f)
            # Drop blank lines: an empty entry would make process_request
            # skip the header entirely for that request.
            self.user_agent_list = [ua for ua in stripped if ua]

        if not self.user_agent_list:
            # An empty pool would make random.choice() raise IndexError on
            # every request, so degrade to the single fallback instead.
            self.user_agent_list = [fallback]

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from ``crawler`` and hook ``spider_opened``.

        ``spider_opened`` is not defined in this class, so it is expected to
        be provided by ``UserAgentMiddleware``.

        :param crawler: object providing ``settings`` and ``signals``.
        :return: the new middleware instance.
        """
        obj = cls(crawler.settings)
        crawler.signals.connect(obj.spider_opened,
                                signal=signals.spider_opened)
        return obj

    def process_request(self, request, spider):
        """Give ``request`` a randomly chosen User-Agent header.

        ``setdefault`` is used, so a ``User-Agent`` header that is already
        present on the request is left untouched.  ``spider`` is not used.
        """
        user_agent = random.choice(self.user_agent_list)
        # The fallback entry may still be empty/None (e.g. USER_AGENT set to
        # an empty value); in that case leave the request headers alone.
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)
22 | """ 23 | regex = "__{0}__ = ['\"]([^'\"]+)['\"]".format(meta_name) 24 | return re.search(regex, package_file).group(1) 25 | 26 | 27 | version = get_package_meta('version') 28 | author = get_package_meta('author') 29 | email = get_package_meta('email') 30 | license = get_package_meta('license') 31 | 32 | 33 | setup( 34 | name='scrapy-random-useragent', 35 | version=version, 36 | 37 | description='Scrapy Middleware to set a random User-Agent for every Request.', 38 | long_description=long_description, 39 | 40 | author=author, 41 | author_email=email, 42 | url='https://github.com/cnu/scrapy-random-useragent', 43 | 44 | license=license, 45 | 46 | py_modules=['random_useragent'], 47 | platforms=['Any'], 48 | 49 | keywords="scrapy random user-agent ", 50 | classifiers=[ 51 | 'Development Status :: 4 - Beta', 52 | 'Intended Audience :: Developers', 53 | 'Environment :: Console', 54 | 'License :: OSI Approved :: MIT License', 55 | 'Operating System :: OS Independent', 56 | 'Programming Language :: Python', 57 | 'Framework :: Scrapy', 58 | ] 59 | ) 60 | --------------------------------------------------------------------------------