├── .coveragerc ├── .flake8 ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── finders ├── __init__.py ├── domains.py └── oldstyle.py ├── flake8_scrapy.py ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── samples ├── allowed_domains.py └── url_in_allowed_domains.py ├── test_domains.py └── test_oldstyle.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit=venv/*,setup.py 3 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [3.8] 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt 25 | pip install -r requirements-dev.txt 26 | - name: Lint with Flake8 27 | run: | 28 | flake8 . 29 | - name: Run unit tests 30 | run: | 31 | pytest 32 | - name: Run coverage 33 | run: | 34 | pytest --cov=. 
--cov-fail-under=95 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 
| dmypy.json 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Valdir Stumm Junior 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # flake8-scrapy 2 | ![](https://github.com/stummjr/flake8-scrapy/workflows/CI/badge.svg) 3 | [![Downloads](https://pepy.tech/badge/flake8-scrapy)](https://pepy.tech/project/flake8-scrapy) 4 | 5 | A Flake8 plugin to catch common issues on Scrapy spiders.
6 | 7 | ## Issue types 8 | 9 | | Code | Meaning | 10 | | --- | --- | 11 | | SCP01 | There are URLs in `start_urls` whose netloc is not in `allowed_domains` | 12 | | SCP02 | There are URLs in `allowed_domains` | 13 | | SCP03 | Usage of `urljoin(response.url, '/foo')` instead of `response.urljoin('/foo')` | 14 | | SCP04 | Usage of `Selector(response)` in callback | 15 | 16 | This is a work in progress, so new issues will be added to this list. 17 | 18 | 19 | ## Installation 20 | 21 | To run this in your project, please make sure you have flake8 installed first: 22 | 23 | ``` 24 | $ pip install flake8 25 | ``` 26 | 27 | And then install flake8-scrapy: 28 | 29 | ``` 30 | $ pip install flake8-scrapy 31 | ``` 32 | 33 | Now, all you have to do is run it on your project: 34 | 35 | ``` 36 | $ flake8 37 | ``` 38 | 39 | And Flake8 will run the checks defined in this plugin. -------------------------------------------------------------------------------- /finders/__init__.py: -------------------------------------------------------------------------------- 1 | class IssueFinder(object): 2 | msg_code = '' 3 | msg_info = '' 4 | 5 | @property 6 | def message(self): 7 | return '{} {}'.format(self.msg_code, self.msg_info) 8 | 9 | def find_issues(self, node): 10 | raise NotImplementedError 11 | -------------------------------------------------------------------------------- /finders/domains.py: -------------------------------------------------------------------------------- 1 | import ast 2 | from six.moves.urllib.parse import urlparse 3 | 4 | from finders import IssueFinder 5 | 6 | 7 | def get_list_metadata(node): 8 | return [ 9 | (subnode.lineno, subnode.col_offset, subnode.s) 10 | for subnode in node.value.elts 11 | if isinstance(subnode, ast.Str) 12 | ] 13 | 14 | 15 | def is_list_assignment(node, var_name): 16 | return ( 17 | isinstance(node.targets[0], ast.Name) and 18 | isinstance(node.value, (ast.List, ast.Tuple)) and 19 | node.targets[0].id == var_name 20 | ) 21 | 22 | 23 | 
class UnreachableDomainIssueFinder(IssueFinder): 24 | msg_code = 'SCP01' 25 | msg_info = "allowed_domains doesn't allow this URL from start_urls" 26 | 27 | def __init__(self, *args, **kwargs): 28 | super(UnreachableDomainIssueFinder, self).__init__(*args, **kwargs) 29 | self.allowed_domains = [] 30 | self.start_urls = [] 31 | 32 | def url_in_allowed_domains(self, url): 33 | netloc = urlparse(url).netloc 34 | return any( 35 | domain in netloc 36 | for _, _, domain in self.allowed_domains 37 | ) 38 | 39 | def find_issues(self, node): 40 | if is_list_assignment(node, var_name='allowed_domains'): 41 | self.allowed_domains = get_list_metadata(node) 42 | 43 | if is_list_assignment(node, var_name='start_urls'): 44 | self.start_urls = get_list_metadata(node) 45 | 46 | if not all((self.allowed_domains, self.start_urls)): 47 | return 48 | 49 | for line, col, url in self.start_urls: 50 | if not self.url_in_allowed_domains(url): 51 | yield (line, col, self.message) 52 | 53 | 54 | class UrlInAllowedDomainsIssueFinder(IssueFinder): 55 | msg_code = 'SCP02' 56 | msg_info = 'allowed_domains should not contain URLs' 57 | 58 | def is_url(self, domain): 59 | # when it's just a domain (as 'example.com'), the parsed URL contains 60 | # only the 'path' component 61 | forbidden_components = [ 62 | 'scheme', 'netloc', 'params', 'query', 'fragment', 63 | ] 64 | parts = urlparse(domain) 65 | return any( 66 | getattr(parts, comp, None) for comp in forbidden_components 67 | ) 68 | 69 | def find_issues(self, node): 70 | if is_list_assignment(node, var_name='allowed_domains'): 71 | allowed_domains = get_list_metadata(node) 72 | 73 | for line, col, url in allowed_domains: 74 | if self.is_url(url): 75 | yield (line, col, self.message) 76 | -------------------------------------------------------------------------------- /finders/oldstyle.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | from finders import IssueFinder 4 | 5 | 6 | class 
UrlJoinIssueFinder(IssueFinder): 7 | msg_code = 'SCP03' 8 | msg_info = 'urljoin(response.url, "/foo") can be replaced by response.urljoin("/foo")' 9 | 10 | def find_issues(self, node): 11 | if not self.issue_applies(node): 12 | return 13 | 14 | first_param = node.args[0] 15 | if not isinstance(first_param, ast.Attribute) or not isinstance(first_param.value, ast.Name): 16 | return 17 | 18 | if first_param.value.id == 'response' and first_param.attr == 'url': 19 | # found it: first param to urljoin is response.url 20 | yield (node.lineno, node.col_offset, self.message) 21 | 22 | def issue_applies(self, node): 23 | return ( 24 | isinstance(node.func, ast.Name) and 25 | node.func.id == 'urljoin' and 26 | node.args 27 | ) 28 | 29 | 30 | class OldSelectorIssueFinder(IssueFinder): 31 | msg_code = 'SCP04' 32 | msg_info = 'use response.selector or response.xpath or response.css instead' 33 | 34 | def is_response_dot_body_as_unicode(self, node): 35 | """ Returns True if node represents response.body_as_unicode() 36 | """ 37 | return ( 38 | isinstance(node, ast.Call) and 39 | isinstance(node.func, ast.Attribute) and 40 | node.func.value.id == 'response' and 41 | node.func.attr == 'body_as_unicode' 42 | ) 43 | 44 | def is_response_dot_text_or_body(self, node): 45 | """ Return whether or not a node represents response.text or 46 | response.body 47 | """ 48 | return ( 49 | isinstance(node, ast.Attribute) and 50 | node.value.id == 'response' and 51 | node.attr in ('text', 'body') 52 | ) 53 | 54 | def is_response(self, node): 55 | """ Check if node represents an object named as response 56 | """ 57 | return ( 58 | isinstance(node, ast.Name) and 59 | node.id == 'response' 60 | ) 61 | 62 | def has_response_for_keyword_parameter(self, node): 63 | """ Check if response or response.text is passed as a keyword parameter 64 | as in: Selector(text=response.text) or Selector(response=response) 65 | """ 66 | return ( 67 | ( 68 | node.arg == 'text' and 69 | 
self.is_response_dot_text_or_body(node.value) or 70 | self.is_response_dot_body_as_unicode(node.value) 71 | ) or ( 72 | node.arg == 'response' and 73 | self.is_response(node.value) 74 | ) 75 | ) 76 | 77 | def issue_applies(self, node): 78 | return ( 79 | isinstance(node.value, ast.Call) and 80 | isinstance(node.value.func, ast.Name) and 81 | node.value.func.id == 'Selector' 82 | ) 83 | 84 | def find_issues(self, node): 85 | if not self.issue_applies(node): 86 | return 87 | 88 | # look for: Selector(response) 89 | if node.value.args: 90 | param = node.value.args[0] 91 | if self.is_response(param): 92 | return [(node.lineno, node.col_offset, self.message)] 93 | 94 | # look for: Selector(response=response) or Selector(text=response.text) 95 | for kw in node.value.keywords: 96 | if self.has_response_for_keyword_parameter(kw): 97 | return [(node.lineno, node.col_offset, self.message)] 98 | -------------------------------------------------------------------------------- /flake8_scrapy.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | from finders.domains import ( 4 | UnreachableDomainIssueFinder, UrlInAllowedDomainsIssueFinder, 5 | ) 6 | from finders.oldstyle import OldSelectorIssueFinder, UrlJoinIssueFinder 7 | 8 | 9 | __version__ = '0.0.2' 10 | 11 | 12 | class ScrapyStyleIssueFinder(ast.NodeVisitor): 13 | 14 | def __init__(self, *args, **kwargs): 15 | super(ScrapyStyleIssueFinder, self).__init__(*args, **kwargs) 16 | self.issues = [] 17 | self.finders = { 18 | 'Assign': [ 19 | UnreachableDomainIssueFinder(), 20 | UrlInAllowedDomainsIssueFinder(), 21 | OldSelectorIssueFinder(), 22 | ], 23 | 'Call': [ 24 | UrlJoinIssueFinder(), 25 | ] 26 | } 27 | 28 | def find_issues_visitor(self, visitor, node): 29 | """Find issues for the provided visitor 30 | """ 31 | for finder in self.finders[visitor]: 32 | issues = finder.find_issues(node) 33 | if issues: 34 | self.issues.extend(list(issues)) 35 | self.generic_visit(node) 36 
| 37 | def visit_Assign(self, node): 38 | self.find_issues_visitor('Assign', node) 39 | 40 | def visit_Call(self, node): 41 | self.find_issues_visitor('Call', node) 42 | 43 | 44 | class ScrapyStyleChecker(object): 45 | options = None 46 | name = 'flake8-scrapy' 47 | version = __version__ 48 | 49 | def __init__(self, tree, filename): 50 | self.tree = tree 51 | self.filename = filename 52 | 53 | def run(self): 54 | finder = ScrapyStyleIssueFinder() 55 | finder.visit(self.tree) 56 | 57 | for line, col, msg in finder.issues: 58 | yield (line, col, msg, ScrapyStyleChecker) 59 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | flake8==3.7.9 2 | pytest==5.4.1 3 | pytest-cov==2.8.1 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six==1.11.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as f: 4 | long_description = f.read() 5 | 6 | 7 | setuptools.setup( 8 | name='flake8-scrapy', 9 | license='MIT', 10 | version='0.0.2', 11 | long_description=long_description, 12 | long_description_content_type='text/markdown', 13 | author='Valdir Stumm Junior', 14 | author_email='stummjr@gmail.com', 15 | url='http://github.com/stummjr/flake8-scrapy', 16 | py_modules=[ 17 | 'flake8_scrapy', 18 | 'finders', 19 | 'finders.domains', 20 | 'finders.oldstyle', 21 | ], 22 | entry_points={ 23 | 'flake8.extension': [ 24 | 'SCP0 = flake8_scrapy:ScrapyStyleChecker', 25 | ], 26 | }, 27 | install_requires=['flake8'], 28 | tests_require=['pytest'], 29 | classifiers=[ 30 | 'Framework :: Flake8', 31 | 'License :: OSI Approved :: MIT License', 32 | 
'Programming Language :: Python', 33 | 'Programming Language :: Python :: 3', 34 | 'Programming Language :: Python :: 3.5', 35 | 'Programming Language :: Python :: 3.6', 36 | 'Programming Language :: Python :: 3.7', 37 | 'Topic :: Software Development :: Libraries :: Python Modules', 38 | 'Topic :: Software Development :: Quality Assurance', 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | 4 | from flake8_scrapy import ScrapyStyleChecker 5 | 6 | 7 | def load_sample_file(filename): 8 | path = os.path.join( 9 | os.path.dirname(__file__), 10 | 'samples', 11 | filename 12 | ) 13 | return open(path).read() 14 | 15 | 16 | def run_checker(code): 17 | tree = ast.parse(code) 18 | checker = ScrapyStyleChecker(tree, None) 19 | return list(checker.run()) 20 | -------------------------------------------------------------------------------- /tests/samples/allowed_domains.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class AllowedDomainsSpider(scrapy.Spider): 5 | """ Sample that demonstrates the issue of having start_urls 6 | for domains out of allowed_domains. 7 | """ 8 | # name = 'allowed_domains' 9 | allowed_domains = [ 10 | 'example.com', 11 | 'scrapy.org', 12 | ] 13 | start_urls = [ 14 | 'http://quotes.toscrape.com', 15 | 'http://httpbin.org', 16 | ] 17 | 18 | def parse(self, response): 19 | self.do_nothing() 20 | 21 | def do_nothing(self): 22 | pass 23 | -------------------------------------------------------------------------------- /tests/samples/url_in_allowed_domains.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class URLINAllowedDomainsSpider(scrapy.Spider): 5 | """ Sample that demonstrates the issue of having URLs 6 | in allowd_domains. 
7 | """ 8 | name = 'url_not_in_allowed_domains' 9 | allowed_domains = [ 10 | 'http://example.com', 11 | 'scrapy.org', 12 | ] 13 | -------------------------------------------------------------------------------- /tests/test_domains.py: -------------------------------------------------------------------------------- 1 | from . import load_sample_file, run_checker 2 | from finders.domains import ( 3 | UnreachableDomainIssueFinder, UrlInAllowedDomainsIssueFinder, 4 | ) 5 | 6 | 7 | def test_url_not_in_allowed_domains(): 8 | code = load_sample_file('allowed_domains.py') 9 | issues = run_checker(code) 10 | 11 | assert len(issues) == 2 12 | # first issue 13 | assert issues[0][0] == 14 # line 14 | assert issues[0][1] == 8 # col 15 | assert UnreachableDomainIssueFinder.msg_code in issues[0][2] 16 | assert UnreachableDomainIssueFinder.msg_info in issues[0][2] 17 | # second issue 18 | assert issues[1][0] == 15 # line 19 | assert issues[1][1] == 8 # col 20 | assert UnreachableDomainIssueFinder.msg_code in issues[1][2] 21 | assert UnreachableDomainIssueFinder.msg_info in issues[1][2] 22 | 23 | 24 | def test_url_in_allowed_domains(): 25 | code = load_sample_file('url_in_allowed_domains.py') 26 | issues = run_checker(code) 27 | 28 | assert len(issues) == 1 29 | assert issues[0][0] == 10 # line 30 | assert issues[0][1] == 8 # col 31 | assert UrlInAllowedDomainsIssueFinder.msg_code in issues[0][2] 32 | assert UrlInAllowedDomainsIssueFinder.msg_info in issues[0][2] 33 | -------------------------------------------------------------------------------- /tests/test_oldstyle.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from . 
import run_checker 4 | from finders.oldstyle import UrlJoinIssueFinder 5 | 6 | 7 | @pytest.mark.parametrize('code', [ 8 | ('urljoin(response.url, "/foo")'), 9 | ('url = urljoin(response.url, "/foo")'), 10 | ]) 11 | def test_finds_old_style_urljoin(code): 12 | issues = run_checker(code) 13 | assert len(issues) == 1 14 | assert UrlJoinIssueFinder.msg_code in issues[0][2] 15 | 16 | 17 | @pytest.mark.parametrize('code', [ 18 | ('response.urljoin("/foo")'), 19 | ('url = urljoin()'), 20 | ('urljoin(x, "/foo")'), 21 | ('urljoin(x.y.z, "/foo")'), 22 | ]) 23 | def test_dont_find_old_style_urljoin(code): 24 | issues = run_checker(code) 25 | assert len(issues) == 0 26 | 27 | 28 | @pytest.mark.parametrize('code,expected', [ 29 | ('sel = Selector(response)', 1), 30 | ('sel = Selector(response, type="html")', 1), 31 | ('sel = Selector(response=response, type="html")', 1), 32 | ('sel = Selector(response=response)', 1), 33 | ('sel = Selector(text=response.text)', 1), 34 | ('sel = Selector(text=response.body)', 1), 35 | ('sel = Selector(text=response.body_as_unicode())', 1), 36 | ('sel = Selector(text=response.text, type="html")', 1), 37 | ('sel = Selector(get_text())', 0), 38 | ('sel = Selector(self.get_text())', 0), 39 | ]) 40 | def test_find_old_style_selector(code, expected): 41 | issues = run_checker(code) 42 | assert len(issues) == expected 43 | --------------------------------------------------------------------------------