├── .coveragerc ├── .flake8 ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── finders ├── __init__.py ├── domains.py └── oldstyle.py ├── flake8_scrapy.py ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── samples ├── allowed_domains.py └── url_in_allowed_domains.py ├── test_domains.py └── test_oldstyle.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit=venv/*,setup.py 3 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [3.8] 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt 25 | pip install -r requirements-dev.txt 26 | - name: Lint with Flake8 27 | run: | 28 | flake8 . 29 | - name: Run unit tests 30 | run: | 31 | pytest 32 | - name: Run coverage 33 | run: | 34 | pytest --cov=. 
--cov-fail-under=95 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 
| dmypy.json 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Valdir Stumm Junior 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # flake8-scrapy 2 | ![](https://github.com/stummjr/flake8-scrapy/workflows/CI/badge.svg) 3 | [![Downloads](https://pepy.tech/badge/flake8-scrapy)](https://pepy.tech/project/flake8-scrapy) 4 | 5 | A Flake8 plugin to catch common issues on Scrapy spiders.
6 | 7 | ## Issue types 8 | 9 | | Code | Meaning | 10 | | --- | --- | 11 | | SCP01 | There are URLs in `start_urls` whose netloc is not in `allowed_domains` | 12 | | SCP02 | There are URLs in `allowed_domains` | 13 | | SCP03 | Usage of `urljoin(response.url, '/foo')` instead of `response.urljoin('/foo')` | 14 | | SCP04 | Usage of `Selector(response)` in callback | 15 | 16 | This is a work in progress, so new issues will be added to this list. 17 | 18 | 19 | ## Installation 20 | 21 | To run this in your project, please make sure you have flake8 installed first: 22 | 23 | ``` 24 | $ pip install flake8 25 | ``` 26 | 27 | And then install flake8-scrapy: 28 | 29 | ``` 30 | $ pip install flake8-scrapy 31 | ``` 32 | 33 | Now, all you have to do is run it on your project: 34 | 35 | ``` 36 | $ flake8 37 | ``` 38 | 39 | And Flake8 will run the checks defined in this plugin. -------------------------------------------------------------------------------- /finders/__init__.py: -------------------------------------------------------------------------------- 1 | class IssueFinder(object): 2 | msg_code = '' 3 | msg_info = '' 4 | 5 | @property 6 | def message(self): 7 | return '{} {}'.format(self.msg_code, self.msg_info) 8 | 9 | def find_issues(self, node): 10 | raise NotImplementedError 11 | -------------------------------------------------------------------------------- /finders/domains.py: -------------------------------------------------------------------------------- 1 | import ast 2 | from six.moves.urllib.parse import urlparse 3 | 4 | from finders import IssueFinder 5 | 6 | 7 | def get_list_metadata(node): 8 | return [ 9 | (subnode.lineno, subnode.col_offset, subnode.s) 10 | for subnode in node.value.elts 11 | if isinstance(subnode, ast.Str) 12 | ] 13 | 14 | 15 | def is_list_assignment(node, var_name): 16 | return ( 17 | isinstance(node.targets[0], ast.Name) and 18 | isinstance(node.value, (ast.List, ast.Tuple)) and 19 | node.targets[0].id == var_name 20 | ) 21 | 22 | 23 | 
class UnreachableDomainIssueFinder(IssueFinder): 24 | msg_code = 'SCP01' 25 | msg_info = "allowed_domains doesn't allow this URL from start_urls" 26 | 27 | def __init__(self, *args, **kwargs): 28 | super(UnreachableDomainIssueFinder, self).__init__(*args, **kwargs) 29 | self.allowed_domains = [] 30 | self.start_urls = [] 31 | 32 | def url_in_allowed_domains(self, url): 33 | netloc = urlparse(url).netloc 34 | return any( 35 | domain in netloc 36 | for _, _, domain in self.allowed_domains 37 | ) 38 | 39 | def find_issues(self, node): 40 | if is_list_assignment(node, var_name='allowed_domains'): 41 | self.allowed_domains = get_list_metadata(node) 42 | 43 | if is_list_assignment(node, var_name='start_urls'): 44 | self.start_urls = get_list_metadata(node) 45 | 46 | if not all((self.allowed_domains, self.start_urls)): 47 | return 48 | 49 | for line, col, url in self.start_urls: 50 | if not self.url_in_allowed_domains(url): 51 | yield (line, col, self.message) 52 | 53 | 54 | class UrlInAllowedDomainsIssueFinder(IssueFinder): 55 | msg_code = 'SCP02' 56 | msg_info = 'allowed_domains should not contain URLs' 57 | 58 | def is_url(self, domain): 59 | # when it's just a domain (as 'example.com'), the parsed URL contains 60 | # only the 'path' component 61 | forbidden_components = [ 62 | 'scheme', 'netloc', 'params', 'query', 'fragment', 63 | ] 64 | parts = urlparse(domain) 65 | return any( 66 | getattr(parts, comp, None) for comp in forbidden_components 67 | ) 68 | 69 | def find_issues(self, node): 70 | if is_list_assignment(node, var_name='allowed_domains'): 71 | allowed_domains = get_list_metadata(node) 72 | 73 | for line, col, url in allowed_domains: 74 | if self.is_url(url): 75 | yield (line, col, self.message) 76 | -------------------------------------------------------------------------------- /finders/oldstyle.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | from finders import IssueFinder 4 | 5 | 6 | class 
UrlJoinIssueFinder(IssueFinder): 7 | msg_code = 'SCP03' 8 | msg_info = 'urljoin(response.url, "/foo") can be replaced by response.urljoin("/foo")' 9 | 10 | def find_issues(self, node): 11 | if not self.issue_applies(node): 12 | return 13 | 14 | first_param = node.args[0] 15 | if not isinstance(first_param, ast.Attribute) or not isinstance(first_param.value, ast.Name): 16 | return 17 | 18 | if first_param.value.id == 'response' and first_param.attr == 'url': 19 | # found it: first param to urljoin is response.url 20 | yield (node.lineno, node.col_offset, self.message) 21 | 22 | def issue_applies(self, node): 23 | return ( 24 | isinstance(node.func, ast.Name) and 25 | node.func.id == 'urljoin' and 26 | node.args 27 | ) 28 | 29 | 30 | class OldSelectorIssueFinder(IssueFinder): 31 | msg_code = 'SCP04' 32 | msg_info = 'use response.selector or response.xpath or response.css instead' 33 | 34 | def is_response_dot_body_as_unicode(self, node): 35 | """ Returns True if node represents response.body_as_unicode() 36 | """ 37 | return ( 38 | isinstance(node, ast.Call) and 39 | isinstance(node.func, ast.Attribute) and 40 | node.func.value.id == 'response' and 41 | node.func.attr == 'body_as_unicode' 42 | ) 43 | 44 | def is_response_dot_text_or_body(self, node): 45 | """ Return whether or not a node represents response.text or 46 | response.body 47 | """ 48 | return ( 49 | isinstance(node, ast.Attribute) and 50 | node.value.id == 'response' and 51 | node.attr in ('text', 'body') 52 | ) 53 | 54 | def is_response(self, node): 55 | """ Check if node represents an object named as response 56 | """ 57 | return ( 58 | isinstance(node, ast.Name) and 59 | node.id == 'response' 60 | ) 61 | 62 | def has_response_for_keyword_parameter(self, node): 63 | """ Check if response or response.text is passed as a keyword parameter 64 | as in: Selector(text=response.text) or Selector(response=response) 65 | """ 66 | return ( 67 | ( 68 | node.arg == 'text' and 69 | 
self.is_response_dot_text_or_body(node.value) or 70 | self.is_response_dot_body_as_unicode(node.value) 71 | ) or ( 72 | node.arg == 'response' and 73 | self.is_response(node.value) 74 | ) 75 | ) 76 | 77 | def issue_applies(self, node): 78 | return ( 79 | isinstance(node.value, ast.Call) and 80 | isinstance(node.value.func, ast.Name) and 81 | node.value.func.id == 'Selector' 82 | ) 83 | 84 | def find_issues(self, node): 85 | if not self.issue_applies(node): 86 | return 87 | 88 | # look for: Selector(response) 89 | if node.value.args: 90 | param = node.value.args[0] 91 | if self.is_response(param): 92 | return [(node.lineno, node.col_offset, self.message)] 93 | 94 | # look for: Selector(response=response) or Selector(text=response.text) 95 | for kw in node.value.keywords: 96 | if self.has_response_for_keyword_parameter(kw): 97 | return [(node.lineno, node.col_offset, self.message)] 98 | -------------------------------------------------------------------------------- /flake8_scrapy.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | from finders.domains import ( 4 | UnreachableDomainIssueFinder, UrlInAllowedDomainsIssueFinder, 5 | ) 6 | from finders.oldstyle import OldSelectorIssueFinder, UrlJoinIssueFinder 7 | 8 | 9 | __version__ = '0.0.2' 10 | 11 | 12 | class ScrapyStyleIssueFinder(ast.NodeVisitor): 13 | 14 | def __init__(self, *args, **kwargs): 15 | super(ScrapyStyleIssueFinder, self).__init__(*args, **kwargs) 16 | self.issues = [] 17 | self.finders = { 18 | 'Assign': [ 19 | UnreachableDomainIssueFinder(), 20 | UrlInAllowedDomainsIssueFinder(), 21 | OldSelectorIssueFinder(), 22 | ], 23 | 'Call': [ 24 | UrlJoinIssueFinder(), 25 | ] 26 | } 27 | 28 | def find_issues_visitor(self, visitor, node): 29 | """Find issues for the provided visitor 30 | """ 31 | for finder in self.finders[visitor]: 32 | issues = finder.find_issues(node) 33 | if issues: 34 | self.issues.extend(list(issues)) 35 | self.generic_visit(node) 36 
| 37 | def visit_Assign(self, node): 38 | self.find_issues_visitor('Assign', node) 39 | 40 | def visit_Call(self, node): 41 | self.find_issues_visitor('Call', node) 42 | 43 | 44 | class ScrapyStyleChecker(object): 45 | options = None 46 | name = 'flake8-scrapy' 47 | version = __version__ 48 | 49 | def __init__(self, tree, filename): 50 | self.tree = tree 51 | self.filename = filename 52 | 53 | def run(self): 54 | finder = ScrapyStyleIssueFinder() 55 | finder.visit(self.tree) 56 | 57 | for line, col, msg in finder.issues: 58 | yield (line, col, msg, ScrapyStyleChecker) 59 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | flake8==3.7.9 2 | pytest==5.4.1 3 | pytest-cov==2.8.1 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six==1.11.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as f: 4 | long_description = f.read() 5 | 6 | 7 | setuptools.setup( 8 | name='flake8-scrapy', 9 | license='MIT', 10 | version='0.0.2', 11 | long_description=long_description, 12 | long_description_content_type='text/markdown', 13 | author='Valdir Stumm Junior', 14 | author_email='stummjr@gmail.com', 15 | url='http://github.com/stummjr/flake8-scrapy', 16 | py_modules=[ 17 | 'flake8_scrapy', 18 | 'finders', 19 | 'finders.domains', 20 | 'finders.oldstyle', 21 | ], 22 | entry_points={ 23 | 'flake8.extension': [ 24 | 'SCP0 = flake8_scrapy:ScrapyStyleChecker', 25 | ], 26 | }, 27 | install_requires=['flake8'], 28 | tests_require=['pytest'], 29 | classifiers=[ 30 | 'Framework :: Flake8', 31 | 'License :: OSI Approved :: MIT License', 32 | 
'Programming Language :: Python', 33 | 'Programming Language :: Python :: 3', 34 | 'Programming Language :: Python :: 3.5', 35 | 'Programming Language :: Python :: 3.6', 36 | 'Programming Language :: Python :: 3.7', 37 | 'Topic :: Software Development :: Libraries :: Python Modules', 38 | 'Topic :: Software Development :: Quality Assurance', 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | 4 | from flake8_scrapy import ScrapyStyleChecker 5 | 6 | 7 | def load_sample_file(filename): 8 | path = os.path.join( 9 | os.path.dirname(__file__), 10 | 'samples', 11 | filename 12 | ) 13 | return open(path).read() 14 | 15 | 16 | def run_checker(code): 17 | tree = ast.parse(code) 18 | checker = ScrapyStyleChecker(tree, None) 19 | return list(checker.run()) 20 | -------------------------------------------------------------------------------- /tests/samples/allowed_domains.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class AllowedDomainsSpider(scrapy.Spider): 5 | """ Sample that demonstrates the issue of having start_urls 6 | for domains out of allowed_domains. 7 | """ 8 | # name = 'allowed_domains' 9 | allowed_domains = [ 10 | 'example.com', 11 | 'scrapy.org', 12 | ] 13 | start_urls = [ 14 | 'http://quotes.toscrape.com', 15 | 'http://httpbin.org', 16 | ] 17 | 18 | def parse(self, response): 19 | self.do_nothing() 20 | 21 | def do_nothing(self): 22 | pass 23 | -------------------------------------------------------------------------------- /tests/samples/url_in_allowed_domains.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class URLINAllowedDomainsSpider(scrapy.Spider): 5 | """ Sample that demonstrates the issue of having URLs 6 | in allowd_domains. 
7 | """ 8 | name = 'url_not_in_allowed_domains' 9 | allowed_domains = [ 10 | 'http://example.com', 11 | 'scrapy.org', 12 | ] 13 | -------------------------------------------------------------------------------- /tests/test_domains.py: -------------------------------------------------------------------------------- 1 | from . import load_sample_file, run_checker 2 | from finders.domains import ( 3 | UnreachableDomainIssueFinder, UrlInAllowedDomainsIssueFinder, 4 | ) 5 | 6 | 7 | def test_url_not_in_allowed_domains(): 8 | code = load_sample_file('allowed_domains.py') 9 | issues = run_checker(code) 10 | 11 | assert len(issues) == 2 12 | # first issue 13 | assert issues[0][0] == 14 # line 14 | assert issues[0][1] == 8 # col 15 | assert UnreachableDomainIssueFinder.msg_code in issues[0][2] 16 | assert UnreachableDomainIssueFinder.msg_info in issues[0][2] 17 | # second issue 18 | assert issues[1][0] == 15 # line 19 | assert issues[1][1] == 8 # col 20 | assert UnreachableDomainIssueFinder.msg_code in issues[1][2] 21 | assert UnreachableDomainIssueFinder.msg_info in issues[1][2] 22 | 23 | 24 | def test_url_in_allowed_domains(): 25 | code = load_sample_file('url_in_allowed_domains.py') 26 | issues = run_checker(code) 27 | 28 | assert len(issues) == 1 29 | assert issues[0][0] == 10 # line 30 | assert issues[0][1] == 8 # col 31 | assert UrlInAllowedDomainsIssueFinder.msg_code in issues[0][2] 32 | assert UrlInAllowedDomainsIssueFinder.msg_info in issues[0][2] 33 | -------------------------------------------------------------------------------- /tests/test_oldstyle.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from . 
import run_checker 4 | from finders.oldstyle import UrlJoinIssueFinder 5 | 6 | 7 | @pytest.mark.parametrize('code', [ 8 | ('urljoin(response.url, "/foo")'), 9 | ('url = urljoin(response.url, "/foo")'), 10 | ]) 11 | def test_finds_old_style_urljoin(code): 12 | issues = run_checker(code) 13 | assert len(issues) == 1 14 | assert UrlJoinIssueFinder.msg_code in issues[0][2] 15 | 16 | 17 | @pytest.mark.parametrize('code', [ 18 | ('response.urljoin("/foo")'), 19 | ('url = urljoin()'), 20 | ('urljoin(x, "/foo")'), 21 | ('urljoin(x.y.z, "/foo")'), 22 | ]) 23 | def test_dont_find_old_style_urljoin(code): 24 | issues = run_checker(code) 25 | assert len(issues) == 0 26 | 27 | 28 | @pytest.mark.parametrize('code,expected', [ 29 | ('sel = Selector(response)', 1), 30 | ('sel = Selector(response, type="html")', 1), 31 | ('sel = Selector(response=response, type="html")', 1), 32 | ('sel = Selector(response=response)', 1), 33 | ('sel = Selector(text=response.text)', 1), 34 | ('sel = Selector(text=response.body)', 1), 35 | ('sel = Selector(text=response.body_as_unicode())', 1), 36 | ('sel = Selector(text=response.text, type="html")', 1), 37 | ('sel = Selector(get_text())', 0), 38 | ('sel = Selector(self.get_text())', 0), 39 | ]) 40 | def test_find_old_style_selector(code, expected): 41 | issues = run_checker(code) 42 | assert len(issues) == expected 43 | --------------------------------------------------------------------------------