├── data
│   └── .keepme
├── crawlpy
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── crawlpy_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── middlewares.py
│   └── settings.py
├── scrapy.cfg
├── crawlpy.config.json-sample
├── run.sh
├── LICENSE.md
├── .gitignore
├── contrib
│   ├── README.md
│   └── crawlpy-login.py
└── README.md
/data/.keepme:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/crawlpy/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/crawlpy/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = crawlpy.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = crawlpy
12 |
--------------------------------------------------------------------------------
/crawlpy/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class CrawlpyPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/crawlpy.config.json-sample:
--------------------------------------------------------------------------------
1 | {
2 | "proto": "http",
3 | "domain": "localhost",
4 | "depth": 3,
5 | "ignores": [
6 | "/user/logout.php",
7 | "/user/delete.php?id=1",
 8 |         "/user/disable.php?id=1"
9 | ],
10 | "httpstatus_list": [],
11 | "login": {
12 | "enabled": false,
13 | "method": "post",
14 | "action": "/login.php",
15 | "failure": "Password is incorrect",
16 | "fields": {
17 | "username": "john",
18 | "password": "doe"
19 | },
20 | "csrf": {
21 | "enabled": false,
22 | "field": "csrf"
23 | }
24 |
25 | },
26 | "store": {
27 | "enabled": false,
28 | "path": "./data"
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/crawlpy/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Python web spider/crawler based on scrapy with support for POST/GET login,
 4 | a variable level of recursion/depth and the option to save pages to disk.
5 |
6 | Defines my custom model items.
7 | See documentation in:
8 | http://doc.scrapy.org/en/latest/topics/items.html
9 | """
10 |
11 | from scrapy.item import Item, Field
12 |
13 |
14 | __author__ = "cytopia"
15 | __license__ = "MIT"
16 | __email__ = "cytopia@everythingcli.org"
17 |
18 |
19 | class CrawlpyItem(Item):
20 | """Data Model Class"""
21 |
22 | url = Field()
23 | text = Field()
24 | status = Field()
25 | depth = Field()
26 | referer = Field()
27 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 |
4 | echo
5 | echo "Do not save to file:"
6 | echo "$ scrapy crawl crawlpy -a config=./crawlpy.config.json"
7 |
8 | echo
9 | echo "Save results to json:"
10 | echo "$ scrapy crawl crawlpy --loglevel=INFO -a config=./crawlpy.config.json -o urls.json -t json"
11 | echo "$ scrapy crawl crawlpy --loglevel=WARNING -a config=./crawlpy.config.json -o urls.json -t json"
12 |
13 | echo
14 | echo "Save results to csv:"
15 | echo "$ scrapy crawl crawlpy --loglevel=INFO -a config=./crawlpy.config.json -o urls.csv -t csv"
16 | echo "$ scrapy crawl crawlpy --loglevel=WARNING -a config=./crawlpy.config.json -o urls.csv -t csv"
17 |
18 | echo
19 | echo "Clear cache for re-coding:"
20 | echo "$ find . -name \*.pyc -exec rm '{}' \;"
21 |
22 | exit 0
23 |
--------------------------------------------------------------------------------
/crawlpy/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Python web spider/crawler based on scrapy with support for POST/GET login,
 4 | a variable level of recursion/depth and the option to save pages to disk.
5 |
 6 | This file provides a middleware that overrides the crawling depth behavior.
7 | """
8 |
9 | from scrapy.spidermiddlewares.depth import DepthMiddleware
10 |
11 |
12 | __author__ = "cytopia"
13 | __license__ = "MIT"
14 | __email__ = "cytopia@everythingcli.org"
15 |
16 |
17 | """Custom DepthMiddleWare"""
18 | class MyDepthMiddleware(DepthMiddleware):
19 |
20 | #----------------------------------------------------------------------
21 | def process_spider_output(self, response, result, spider):
22 | """Overwrite parent DepthMiddleware and set MAX_DEPTH"""
23 |
24 | if hasattr(spider, 'max_depth'):
25 | self.maxdepth = getattr(spider, 'max_depth')
26 | return super(MyDepthMiddleware, self).process_spider_output(response, result, spider)
27 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 cytopia
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #
2 | # Custom stuff
3 | #
4 | crawlpy.config.json
5 | data/*
6 |
7 | !.keepme
8 |
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | *$py.class
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Distribution / packaging
18 | .Python
19 | env/
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *,cover
54 | .hypothesis/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # IPython Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # dotenv
87 | .env
88 |
89 | # virtualenv
90 | venv/
91 | ENV/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 |
96 | # Rope project settings
97 | .ropeproject
98 |
--------------------------------------------------------------------------------
/contrib/README.md:
--------------------------------------------------------------------------------
1 | # Contributed scripts
2 |
3 | ## crawlpy-login.py
4 |
5 | * Test the login prior to crawling.
6 | * Create a wget-compatible login session cookie (usable by [sqlmap](http://sqlmap.org/))
7 | * Dump the login page (after login)
8 |
9 | ```shell
10 | Usage: crawlpy-login.py -C conf.json [-c cookie.txt] [-o output.html] [-y] [-v]
11 | crawlpy-login.py -h
12 | crawlpy-login.py -V
13 |
14 | crawlpy-login.py will test whether or not the specified crawlpy config
15 | is valid and can successfully login.
16 |
17 | You can optionally save a login session cookie (-c/--cookie) in wget format
18 | which can be used by tools such as sqlmap.
19 |
20 | You can also store the html output from a successful/unsuccessful login
21 | to file (-o/--output).
22 |
23 |
24 | Required arguments:
25 | -C, --config= Path to crawlpy json config.
26 | -C /path/to/conf.json
27 | --config=/path/to/conf.json
28 |
29 | Optional arguments:
30 | -c, --cookie= Path where to store the session cookie.
31 | -c /path/to/cookie.txt
32 | --cookie=/path/to/cookie.txt
33 |
34 | -o, --output= Path where to store the html source after logging in.
35 | -o /path/to/login.html
36 |                      --output=/path/to/login.html
37 |
38 | -v, --verbose Be more verbose.
39 |
40 | -y, --yes Answer 'yes' to all questions.
41 |
42 | System options:
43 | -h, --help Show help.
44 | -V, --version Show version information.
45 | ```
--------------------------------------------------------------------------------
/crawlpy/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for crawlpy project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'crawlpy'
13 |
14 | SPIDER_MODULES = ['crawlpy.spiders']
15 | NEWSPIDER_MODULE = 'crawlpy.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'crawlpy (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | COOKIES_ENABLED = True
37 |
38 | # Disable Telnet Console (enabled by default)
39 | TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Note: assigning None disables the stock DepthMiddleware, so depth limiting is
# handled solely by crawlpy.middlewares.MyDepthMiddleware, which reads the
# spider's max_depth attribute.
49 | SPIDER_MIDDLEWARES = {
50 | 'scrapy.spidermiddlewares.depth.DepthMiddleware': None,
51 | 'crawlpy.middlewares.MyDepthMiddleware': 543,
52 | }
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | #DOWNLOADER_MIDDLEWARES = {
57 | # 'crawlpy.middlewares.MyCustomDownloaderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | #ITEM_PIPELINES = {
69 | # 'crawlpy.pipelines.SomePipeline': 300,
70 | #}
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Crawlpy
2 |
 3 | [Latest release](https://github.com/cytopia/crawlpy/releases)
 4 | [License: MIT](https://opensource.org/licenses/MIT)
5 |
 6 | Python web spider/crawler based on [scrapy](http://scrapy.org/) with support for POST/GET login, a variable level of recursion/depth, and the option to save crawled pages to disk.
7 |
8 |
9 | ## Requirements
10 |
11 | * [python 2.7](https://www.python.org/)
12 | * [lxml](http://lxml.de/)
13 | * [pip](https://pip.pypa.io/en/latest/installing/)
14 |
15 | ```shell
16 | pip install Scrapy
17 | ```
18 |
19 |
20 | ## Features
21 |
22 | * POST/GET login prior to crawling
23 | * Can handle logins that require a dynamic CSRF token
24 | * Variable level of crawling depth
25 | * Optionally save webpages to disk
26 |
27 | ## Roadmap
28 |
29 | Find all planned features and their status here: https://github.com/cytopia/crawlpy/issues/1
30 |
31 |
32 | ## Usage
33 |
34 | ```bash
35 | # stdout output
36 | scrapy crawl crawlpy -a config=/path/to/crawlpy.config.json
37 |
38 | # save as json (url:, status:, depth:, referer:) to 'urls.json'
39 | scrapy crawl crawlpy --loglevel=INFO -a config=/path/to/crawlpy.config.json -o urls.json -t json
40 |
41 | # save as csv (url, status, depth, referer) to 'urls.csv'
42 | scrapy crawl crawlpy --loglevel=INFO -a config=/path/to/crawlpy.config.json -o urls.csv -t csv
43 | ```
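
The spider's source also notes (in a TODO comment) the idea of running it from a script rather than via the `scrapy` CLI. A minimal, untested sketch of that approach using Scrapy's `CrawlerProcess` could look like this (the config path below is just an example):

```python
# Hypothetical run-from-a-script sketch (not part of crawlpy itself).
# Run it from the project root so that scrapy.cfg / crawlpy.settings are found.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from crawlpy.spiders.crawlpy_spider import CrawlpySpider

process = CrawlerProcess(get_project_settings())
process.crawl(CrawlpySpider, config='./example.com-config.json')
process.start()  # blocks until the crawl is finished
```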
44 |
45 | ## Configuration
46 |
47 | Make a copy of [crawlpy.config.json-sample](crawlpy.config.json-sample) (e.g.: `example.com-config.json`) and adjust the values accordingly.
48 |
49 | **Note:**
50 | It must be a valid json file (without comments), otherwise `crawlpy` will fail to parse it. (Use http://jsonlint.com/ to validate your config file, or see the quick check after the annotated example below.)
51 |
52 | ```javascript
53 | {
54 | "proto": "http", // 'http' or 'https'
55 | "domain": "localhost", // Only the domain. e.g.: 'example.com' or 'www.example.com'
56 | "depth": 3, // Nesting depth to crawl
57 | "ignores": [], // Array of substrings to deny/ignore when found in URL
58 | "httpstatus_list": [], // Array of http status codes to handle (default is 2xx)
59 | "login": { // Login section
60 | "enabled": false, // Do we actually need to do a login?
61 | "method": "post", // 'post' or 'get'
62 | "action": "/login.php", // Where the post or get will be submitted to
63 | "failure": "Password is incorrect", // The string you will see on login failure
64 | "fields": { // POST/GET Fields to submit to login page
65 | "username": "john",
66 | "password": "doe"
67 | },
68 | "csrf": {
69 | "enabled": false, // Login requires a CSRF token?
70 | "field": "csrf" // Input field name that holds dynamic CSRF token
71 | }
72 | },
73 | "store": { // Store section
74 | "enabled": false, // save to disk?
75 | "path": "./data" // path for saving (rel or abs)
76 | }
77 | }
78 | ```
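
A quick way to verify that a copied config really is valid JSON (the filename below is just an example) is to load it with Python's `json` module, which is what crawlpy itself does internally:

```python
# Minimal validity check for a copied config file.
# 'example.com-config.json' is a hypothetical filename.
import json

with open('example.com-config.json') as fp:
    config = json.load(fp)  # raises ValueError on malformed JSON

print(config['domain'])
```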
79 |
80 | ### Detailed description
81 |
82 | |Key|Type|Default Value|Possible Values|Description|
83 | |---|----|-------------|---------------|-----------|
84 | |proto|string|`http`|`http` or `https`|Is the site you want to crawl running on `http` or `https`?|
85 | |domain|string|`localhost`|Domain or subdomain|The domain or subdomain you want to spider. Nothing outside this domain/subdomain will be touched.|
86 | |depth|integer|`3`|`0`,`1`,`2`,`3`,...|`0`: Crawl indefinitely until every subpage has been reached.<br/>`1`: Only crawl links on the initial page.<br/>`2`: Crawl links on the initial page and everything found on the links of that page.<br/>**Note:** When you do a login, the login page already counts as one level of depth by scrapy itself, but this is rewritten internally to subtract that depth again, so your output will not show that extra depth.|
87 | |ignores|array|\[ \]|\['/logout.php', 'delete.php?id=1'\]|Each array string element is treated as a substring (no regex) and is checked against every full URL. If any of the specified substrings is found in a URL, that URL will not be crawled.<br/>**Note:** When you log in somewhere, it makes sense to ignore the logout page, as well as other pages that might delete/disable your current user, so you will not be kicked out of your login session during crawl time.|
88 | |httpstatus_list|array|\[ \]|\[403, 404, 500\]|By default scrapy ignores pages with a status code other than 2xx, so if you know that e.g. a 403 page contains actual content with links, just add that status code here.<br/>**Note:** There is no need to specify `200`, as scrapy crawls those by default.|
89 | |**login**||||Login section|
90 | |enabled|boolean|`false`|`true` or `false`|`true`: Do a login prior to crawling.<br/>`false`: Do not log in.<br/>**Note:** When login is set to `false`, you do not need to fill in the rest of the variables inside the `login` section.|
91 | |method|string|`post`|`post` or `get`|Method required to execute the login|
92 | |action|string|`/login.php`|login page|Relative login page (from the base domain, including leading slash) where the `post` or `get` will go to.|
93 | |failure|string|`Password is incorrect`|login failed string|A string that is found on the login page, when the login fails.|
94 | |fields|key-value|`{`<br/>`"username": "john",`<br/>`"password": "doe"`<br/>`}`|`post` or `get` params|POST or GET params required to login.<br/>**Examples:** username, password, hidden-field-name|
95 | |**csrf**||||Login CSRF section|
96 | |enabled|boolean|`false`|`true` or `false`|`true`: The login page has a dynamic CSRF token that should be read out and submitted along with the normal form data.<br/>`false`: Login does not require a CSRF token to be submitted.<br/>**Note:** If the login has a static (never-changing) CSRF field, just add the data to the `fields` section.<br/>**Note:** Read below about the built-in automatic CSRF detection and leave this off at first.|
97 | |field|string|`csrf`|Field name|The name of the input field which holds the CSRF token|
98 | |**store**||||Store section|
99 | |enabled|boolean|`false`|`true` or `false`|`true`: Save webpages to disk.<br/>`false`: Do not save webpages to disk.|
100 | |path|string|`./data`|Path|Absolute or relative path to store webpages to disk|
101 |
102 |
103 | ### Note about CSRF
104 |
105 | Scrapy will most likely handle this automatically, so it's best to leave the custom `csrf` handling turned off in the config. If the built-in CSRF recognition does not work for your site, try the user-defined one; if neither works, open an issue.
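
For reference, the user-defined handling boils down to reading the token out of the login form and merging it into the configured fields before submitting. Condensed from `CrawlpySpider.login()` in `crawlpy/spiders/crawlpy_spider.py` (the standalone function name and `login_cfg` parameter are illustrative only):

```python
from scrapy.http import FormRequest

def build_login_request(response, login_cfg, callback):
    """Optionally add the CSRF token read from the login form,
    then submit the configured POST/GET fields."""
    formdata = dict(login_cfg['fields'])
    if login_cfg['csrf']['enabled']:
        field = login_cfg['csrf']['field']
        formdata[field] = response.xpath(
            '//input[@name="%s"]/@value' % field).extract()[0]
    return FormRequest.from_response(
        response,
        formdata=formdata,
        method=login_cfg['method'],
        dont_filter=True,
        callback=callback,
    )
```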
106 |
107 |
108 | ## Reference
109 |
110 | * https://stackoverflow.com/questions/5851213/crawling-with-an-authenticated-session-in-scrapy
111 | * https://stackoverflow.com/questions/38619807/scrapy-recursive-link-crawler-with-login-help-me-improve/
112 | * http://thuongnh.com/building-a-web-crawler-with-scrapy/
113 |
114 |
115 | ## License
116 |
117 | [MIT License](LICENSE.md)
118 |
119 | Copyright (c) 2016 [cytopia](https://github.com/cytopia)
120 |
--------------------------------------------------------------------------------
/crawlpy/spiders/crawlpy_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Python web spider/crawler based on scrapy with support for POST/GET login,
 4 | a variable level of recursion/depth and the option to save pages to disk.
5 |
6 | This file provides the crawling spider class.
7 | """
8 |
9 | import sys # Encoding
10 | import os # file path checks
11 | import logging # logger
12 | import json # json extract
13 |
14 | from scrapy.http import Request, FormRequest
15 | from scrapy.linkextractors import LinkExtractor
16 | from scrapy.spiders import Rule
17 | from scrapy.spiders.init import InitSpider
18 |
19 | from crawlpy.items import CrawlpyItem
20 |
21 |
22 | __author__ = "cytopia"
23 | __license__ = "MIT"
24 | __email__ = "cytopia@everythingcli.org"
25 |
26 |
27 |
28 | # Fix UTF-8 problems inside dict()
29 | reload(sys)
30 | sys.setdefaultencoding('utf8')
31 |
32 |
33 | # TODO:
34 | # * Self-contained spider: http://snipplr.com/view/67012/selfcontained-script-to-crawl-a-site-updated-scrapy-130dev/
35 | # * From a script spider: http://snipplr.com/view/67006/using-scrapy-from-a-script/
36 |
37 |
38 |
39 | ################################################################################
40 | # Spider Class
41 | ################################################################################
42 | class CrawlpySpider(InitSpider):
43 | """
44 | Crawlpy Class
45 | """
46 |
47 | ########################################
48 | # Scrapy Variables
49 | ########################################
50 | name = "crawlpy"
51 |
52 | # Link extraction rules
53 | # To be initialized
54 | rules = ()
55 |
56 | # Store all urls in order to
57 | # filter duplicates
58 | duplicates = []
59 |
60 |
61 |
62 | # scrapy domain/url vars
63 | # To be initialized
64 | allowed_domains = []
65 | start_urls = []
66 |
67 |
68 | ########################################
69 | # Configuration
70 | ########################################
71 |
72 | # Main JSON Configuration dict
73 | config = None
74 | config_defaults = dict({
75 | 'proto': 'http',
76 | 'domain': 'localhost',
77 | 'depth': 3,
78 | 'ignores': [],
79 | 'httpstatus_list': [],
80 | 'login': {
81 | 'enabled': False,
82 | 'method': 'post',
83 | 'action': '/login.php',
84 | 'failure': 'Password is incorrect',
85 | 'fields': {
86 | 'username': 'john',
87 | 'password': 'doe'
88 | },
89 | 'csrf': {
90 | 'enabled': False,
91 | 'field': 'csrf'
92 | }
93 | },
94 | 'store': {
95 | 'enabled': False,
96 | 'path': './data'
97 | }
98 | })
99 |
100 |
101 | ########################################
102 | # Helper variables
103 | ########################################
104 |
105 | base_url = '' # (http|https)://domain.tld
106 | login_url = '' # (http|https)://domain.tld/path/to/login
107 |
108 | # Abort flag
109 | abort = False
110 |
111 |
112 |
113 | ########################################
114 | # Methods
115 | ########################################
116 |
117 | #----------------------------------------------------------------------
118 | def __init__(self, *args, **kwargs):
119 | """Constructor: overwrite parent __init__ function"""
120 |
121 | # Call parent init
122 | super(CrawlpySpider, self).__init__(*args, **kwargs)
123 |
124 | # Get command line arg provided configuration param
125 | config_file = kwargs.get('config')
126 |
127 | # Validate configuration file parameter
128 | if not config_file:
129 | logging.error('Missing argument "-a config"')
130 | logging.error('Usage: scrapy crawl crawlpy -a config=/path/to/config.json')
131 | self.abort = True
132 |
133 | # Check if it is actually a file
134 | elif not os.path.isfile(config_file):
135 | logging.error('Specified config file does not exist')
136 | logging.error('Not found in: "' + config_file + '"')
137 | self.abort = True
138 |
139 | # All good, read config
140 | else:
141 | # Load json config
142 | fpointer = open(config_file)
143 | data = fpointer.read()
144 | fpointer.close()
145 |
146 | # convert JSON to dict
147 | config = json.loads(data)
148 |
149 | # fill in default values for missing values
150 | self.config = dict()
151 | self.config['proto'] = str(config.get('proto', self.config_defaults['proto']))
152 | self.config['domain'] = str(config.get('domain', self.config_defaults['domain']))
153 | self.config['depth'] = int(config.get('depth', self.config_defaults['depth']))
154 | self.config['ignores'] = config.get('ignores', self.config_defaults['ignores'])
155 | self.config['httpstatus_list'] = config.get('httpstatus_list', self.config_defaults['httpstatus_list'])
156 | self.config['login'] = dict()
157 | self.config['login']['enabled'] = bool(config.get('login', dict()).get('enabled', self.config_defaults['login']['enabled']))
158 | self.config['login']['method'] = str(config.get('login', dict()).get('method', self.config_defaults['login']['method']))
159 |             self.config['login']['action'] = str(config.get('login', dict()).get('action', self.config_defaults['login']['action']))
160 | self.config['login']['failure'] = str(config.get('login', dict()).get('failure', self.config_defaults['login']['failure']))
161 | self.config['login']['fields'] = config.get('login', dict()).get('fields', self.config_defaults['login']['fields'])
162 | self.config['login']['csrf'] = dict()
163 | self.config['login']['csrf']['enabled'] = bool(config.get('login', dict()).get('csrf', dict()).get('enabled', self.config_defaults['login']['csrf']['enabled']))
164 | self.config['login']['csrf']['field'] = str(config.get('login', dict()).get('csrf', dict()).get('field', self.config_defaults['login']['csrf']['field']))
165 | self.config['store'] = dict()
166 | self.config['store']['enabled'] = bool(config.get('store', dict()).get('enabled', self.config_defaults['store']['enabled']))
167 | self.config['store']['path'] = str(config.get('store', dict()).get('path', self.config_defaults['store']['path']))
168 | logging.info('Merged configuration:')
169 | logging.info(self.config)
170 |
171 |
172 | # Set scrapy globals
173 | self.allowed_domains = [self.config['domain']]
174 | self.start_urls = [self.config['proto'] + '://' + self.config['domain'] + '/']
175 | self.rules = (
176 | Rule(
177 | LinkExtractor(
178 | allow_domains=(self.allowed_domains),
179 | unique=True,
180 | deny=tuple(self.config['ignores']),
181 | ),
182 | callback='parse',
183 | follow=True
184 | ),
185 | )
186 |
187 |
188 | # Handle more status codes
189 | self.handle_httpstatus_list = self.config['httpstatus_list']
190 |
191 | # Overwrite built-in crawling depth via own config file
192 | # Make sure to add +1 if we do a login (which counts as 1 level)
193 |         # The variable will be handled by a custom middleware (MyDepthMiddleware)
194 |         # and passed on to the standard middleware (DepthMiddleware)
195 | if self.config['login']['enabled'] and self.config['depth'] != 0:
196 | self.max_depth = self.config['depth'] + 1
197 | else:
198 | self.max_depth = self.config['depth']
199 |
200 |
201 | # Set misc globals
202 | self.base_url = self.config['proto'] + '://' + self.config['domain']
203 | self.login_url = self.config['proto'] + '://' + self.config['domain'] + \
204 | self.config['login']['action']
205 |
206 |
207 |
208 |
209 | #----------------------------------------------------------------------
210 | def init_request(self):
211 | """This function is called before crawling starts."""
212 |
213 | # Do not start a request on error,
214 | # simply return nothing and quit scrapy
215 | if self.abort:
216 | return
217 |
218 | logging.info('All set, start crawling with depth: ' + str(self.max_depth))
219 |
220 | # Do a login
221 | if self.config['login']['enabled']:
222 | # Start with login first
223 | logging.info('Login required')
224 | return Request(url=self.login_url, callback=self.login)
225 | else:
226 |             # Start with the parse function
227 |             logging.info('No login required')
228 | return Request(url=self.base_url, callback=self.parse)
229 |
230 |
231 |
232 | #----------------------------------------------------------------------
233 | def login(self, response):
234 | """Generate a login request."""
235 |
236 | # Add CSRF data to login.
237 | # Note: scrapy already does this automatically, if it finds
238 | # pre-filled input fields. If everything works without having
239 | # to use this custom csrf feature, it could be removed in the future.
240 | if self.config['login']['csrf']['enabled']:
241 | field = self.config['login']['csrf']['field']
242 | csrf = response.xpath('//input[@name="' + field + '"]/@value')[0].extract()
243 | self.config['login']['fields'][field] = csrf
244 |             logging.info('Adding CSRF data to login. Field: "' + field + '" | value: "' + csrf + '"')
245 |
246 | return FormRequest.from_response(
247 | response,
248 | formdata=self.config['login']['fields'],
249 | method=self.config['login']['method'],
250 | dont_filter=True,
251 | callback=self.post_login
252 | )
253 |
254 |
255 | #----------------------------------------------------------------------
256 | def post_login(self, response):
257 | """
258 | Check the response returned by a login request to see if we are
259 | successfully logged in.
260 | """
261 |
262 | if self.config['login']['failure'] not in response.body:
263 | # Now the crawling can begin..
264 | logging.info('Login successful')
265 | return self.initialized()
266 | else:
267 | # Something went wrong, we couldn't log in, so nothing happens.
268 | logging.error('Unable to login')
269 |
270 |
271 | #----------------------------------------------------------------------
272 | def parse(self, response):
273 | """
274 | Scrapy parse callback
275 | """
276 |
277 | # Get current nesting level
278 | curr_depth = response.meta.get('depth', 1)
279 | if self.config['login']['enabled']:
280 | curr_depth = curr_depth - 1 # Do not count the login page as nesting depth
281 |
282 | # Store to disk?
283 | if self.config['store']['enabled']:
284 | path = response.url.replace(os.sep, '--') # Replace directory separator
285 | path = self.config['store']['path'] + os.sep + path
286 | with open(path, 'wb') as fpointer:
287 | fpointer.write(response.body)
288 |
289 | # Yield current url item
290 | item = CrawlpyItem()
291 | item['url'] = response.url
292 | item['status'] = response.status
293 | item['depth'] = curr_depth
294 | item['referer'] = response.meta.get('referer', '')
295 | yield item
296 |
297 |
298 |
299 | # Get all links from the current page
300 | links = LinkExtractor().extract_links(response)
301 |
302 | # Iterate all found links and crawl them
303 | for link in links:
304 | deny = False
305 |
306 | # Check requests to be ignored
307 | for ignore in self.config['ignores']:
308 | if (ignore in link.url) or (ignore.lower() in link.url.lower()):
309 | # Ignore pattern found, stop looking into other patterns
310 | deny = True
311 | break
312 |
313 |
314 |             # [NO] Max depth exceeded (max_depth == 0 means unlimited)
315 |             if self.max_depth and curr_depth >= self.max_depth:
316 |                 logging.info('[Not Crawling] Current depth (' + str(curr_depth) + ') exceeds max depth (' + str(self.max_depth) + ')')
317 | pass
318 | # [NO] Duplicate URL
319 | elif link.url in self.duplicates:
320 | logging.info('[Not Crawling] Url already crawled: ' + link.url)
321 | pass
322 | # [NO] URL denied
323 | elif deny:
324 | logging.info('[Not Crawling] Url denied (pattern: "' + ignore + '"): ' + link.url)
325 | pass
326 | # [OK] Crawl!
327 | else:
328 | self.duplicates.append(link.url)
329 | yield Request(link.url, meta={'depth': curr_depth+1, 'referer': response.url})
330 |
--------------------------------------------------------------------------------
/contrib/crawlpy-login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 |
4 | import sys
5 | import getopt
6 | import os.path
7 | import json
8 | import subprocess
9 | import re
10 |
11 | __author__ = "cytopia"
12 | __email__ = "cytopia@everythingcli.org"
13 | __license__ = "MIT"
14 | __version__ = '0.2'
15 | __date__ = '2016-08-15'
16 |
17 |
18 |
19 | # Fix UTF-8 problems inside dict()
20 | reload(sys)
21 | sys.setdefaultencoding('utf8')
22 |
23 |
24 | ################################################################################
25 | # File Class
26 | ################################################################################
27 | class MyFile(object):
28 |
29 | #----------------------------------------------------------------------
30 | @staticmethod
31 | def read(path):
32 | fp = open(path)
33 | data = fp.read()
34 | fp.close()
35 | return data
36 |
37 |
38 | ################################################################################
39 | # Json Class
40 | ################################################################################
41 | class MyJson(object):
42 |
43 | #----------------------------------------------------------------------
44 | @staticmethod
45 | def _toAscii(input):
46 | if isinstance(input, dict):
47 | return {MyJson._toAscii(key): MyJson._toAscii(value) for key, value in input.iteritems()}
48 | elif isinstance(input, list):
49 | return [MyJson._toAscii(element) for element in input]
50 | elif isinstance(input, unicode):
51 | return input.encode('utf-8')
52 | else:
53 | return input
54 |
55 | #----------------------------------------------------------------------
56 | @staticmethod
57 | def validateFile(path):
58 | json_string = MyFile.read(path)
59 | return MyJson.validateString(json_string)
60 |
61 | #----------------------------------------------------------------------
62 | @staticmethod
63 | def validateString(json_string):
64 | try:
65 | json_object = json.loads(json_string)
66 | except ValueError, e:
67 | return False
68 | return True
69 |
70 | #----------------------------------------------------------------------
71 | @staticmethod
72 | def convertFile2dict(path):
73 | json_string = MyFile.read(path)
74 | return MyJson.convertString2dict(json_string)
75 |
76 | #----------------------------------------------------------------------
77 | @staticmethod
78 | def convertString2dict(json_string):
 79 |         # Remove unicode (convert keys/values to str)
80 | ujdict = json.loads(json_string)
81 | jdict = MyJson._toAscii(ujdict)
82 | return jdict
83 |
84 |
85 |
86 | ################################################################################
87 | # Shell Class
88 | ################################################################################
89 | class MyShell(object):
90 |
91 | #----------------------------------------------------------------------
92 | @staticmethod
93 | def which(program):
94 | def is_exe(fpath):
95 | return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
96 |
97 | fpath, fname = os.path.split(program)
98 | if fpath:
99 | if is_exe(program):
100 | return program
101 | else:
102 | for path in os.environ["PATH"].split(os.pathsep):
103 | path = path.strip('"')
104 | exe_file = os.path.join(path, program)
105 | if is_exe(exe_file):
106 | return exe_file
107 |
108 | return None
109 |
110 | #----------------------------------------------------------------------
111 | @staticmethod
112 | def run(args, output, show_cmd=False, show_return=False, cmd_color='green'):
113 |
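        # 'output' is a one-element list used as an out-parameter:
        # the command's stdout (or its error output on failure) ends up in output[0].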
114 | if show_cmd:
115 | print MyShell.color(cmd_color) + '$ ' + ' '.join(args) + MyShell.color('reset')
116 |
117 | #retval = subprocess.call(args, shell=False, stdout=stdout)
118 | try:
119 | retval = 0
120 | output[0] = subprocess.check_output(args, shell=False)
121 | except subprocess.CalledProcessError as err:
122 | retval = err.returncode
123 | output[0] = err.output
124 |
125 |
126 | if show_return:
127 | print retval
128 |
129 | return retval
130 |
131 |
132 | #----------------------------------------------------------------------
133 | @staticmethod
134 | def color(color):
135 | if color == 'red':
136 | return '\033[0;31m'
137 | elif color == 'green':
138 | return '\033[0;32m'
139 | elif color == 'brown':
140 | return '\033[0;33m'
141 | elif color == 'blue':
142 | return '\033[0;34m'
143 | elif color == 'magenta':
144 | return '\033[0;35m'
145 | elif color == 'cyan':
146 | return '\033[0;36m'
147 | else:
148 | return '\033[0m'
149 |
150 |
151 |
152 |
153 | ################################################################################
154 | # Function
155 | ################################################################################
156 |
157 |
158 | #----------------------------------------------------------------------
159 | def usage():
160 | filename = os.path.basename(sys.argv[0])
161 |
162 | print 'Usage: ' + filename + ' -C conf.json [-c cookie.txt] [-o output.html] [-y] [-v]'
163 | print ' ' + filename + ' -h'
164 | print ' ' + filename + ' -V'
165 | print
166 | print filename + ' will test whether or not the specified crawlpy config'
167 | print 'is valid and can successfully login.'
168 | print
169 | print 'You can optionally save a login session cookie (-c/--cookie) in wget format'
170 | print 'which can be used by tools such as sqlmap.'
171 | print
172 |     print 'You can also store the html output from a successful/unsuccessful login'
173 | print 'to file (-o/--output).'
174 | print
175 | print
176 | print "Required arguments:"
177 | print " -C, --config= Path to crawlpy json config."
178 | print " -C /path/to/conf.json"
179 | print " --config=/path/to/conf.json"
180 | print
181 | print "Optional arguments:"
182 | print " -c, --cookie= Path where to store the session cookie."
183 | print " -c /path/to/cookie.txt"
184 | print " --cookie=/path/to/cookie.txt"
185 | print
186 | print " -o, --output= Path where to store the html source after logging in."
187 | print " -o /path/to/login.html"
188 |     print "                      --output=/path/to/login.html"
189 | print
190 | print " -v, --verbose Be more verbose."
191 | print
192 | print " -y, --yes Answer 'yes' to all questions."
193 | print
194 | print "System options:"
195 | print " -h, --help Show help."
196 | print " -V, --version Show version information."
197 |
198 |
199 | #----------------------------------------------------------------------
200 | def credits():
201 | filename = os.path.basename(sys.argv[0])
202 | print filename + ' v' + __version__ + ' (' + __date__ + ')'
203 | print __author__ + ' <' + __email__ + '>'
204 |
205 |
206 | #----------------------------------------------------------------------
207 | def check_requirements():
208 |
209 | if MyShell().which('wget') is None:
210 | print "wget is required, but not found."
211 | return False
212 |
213 | return True
214 |
215 |
216 | #----------------------------------------------------------------------
217 | def get_arguments(argv):
218 |
219 | # Parse command line arguments
220 | try:
221 | opts, args = getopt.getopt(argv, 'C:c:o:vyhV', ['config=', 'cookie=', 'output=', 'verbose', 'yes', 'help', 'version'])
222 | except getopt.GetoptError:
223 | print "Invalid argument(s)"
224 | usage()
225 | sys.exit(2)
226 |
227 | # Get values from command line arguments
228 | for opt, arg in opts:
229 | if opt in ("-C", "--config"):
230 | config = arg
231 | elif opt in ("-c", "--cookie"):
232 | cookie = arg
233 | elif opt in ("-o", "--output"):
234 | output = arg
235 | elif opt in ("-v", "--verbose"):
236 | verbose = True
237 | elif opt in ("-y", "--yes"):
238 | yes = True
239 | elif opt in ("-h", "--help"):
240 | usage()
241 | sys.exit()
242 | elif opt in ("-V", "--version"):
243 | credits()
244 | sys.exit()
245 | else:
246 | print "Invalid argument: " + opt
247 | usage()
248 | sys.exit(2)
249 |
250 |     # Check existence of command line arguments
251 | if 'config' not in locals():
252 | print "Missing -C, --config argument"
253 | usage()
254 | sys.exit(2)
255 |
256 | # Set default values
257 | if 'cookie' not in locals():
258 | cookie = False
259 | if 'output' not in locals():
260 | output = False
261 | if 'verbose' not in locals():
262 | verbose = False
263 | if 'yes' not in locals():
264 | yes = False
265 |
266 | # Return values
267 | return config, cookie, output, verbose, yes
268 |
269 |
270 |
271 |
272 |
273 |
274 | ################################################################################
275 | # Main Entry Point
276 | ################################################################################
277 |
278 |
279 | if __name__ == "__main__":
280 |
281 | # Retrieve cmd arguments
282 | config, cookie, output, verbose, yes = get_arguments(sys.argv[1:])
283 |
284 |
285 | # Check requirements
286 | if not check_requirements():
287 | sys.exit(2)
288 |
289 | # Check if config file exists
290 | if not os.path.isfile(config):
291 | print "Specified config file does not exist: " + config
292 | sys.exit(2)
293 |
294 | # Check valid json
295 | if not MyJson.validateFile(config):
296 | print "Invalid JSON data in: " + config
297 | sys.exit(2)
298 |
299 |
300 | # 4. Read JSON config into dict()
301 | jdict = MyJson.convertFile2dict(config)
302 |
303 |
304 | # 5. Set up base
305 | base_url = jdict['proto'] + '://' + jdict['domain']
306 | login_url = base_url + jdict['login']['action']
307 |
308 | post_data = []
309 | for key,val in jdict['login']['fields'].iteritems():
310 | post_data.append(key + '=' + val)
311 |
312 |
313 | # Cookie/Output files
314 | file_output = output if output else '/tmp/login.html'
315 | file_cookie = cookie if cookie else '/tmp/cookie.txt'
316 |
317 |
318 | # Ask what to do if file exists and not '--yes' was specified
319 | if os.path.isfile(file_output) and not yes:
320 | answer = None
321 | while answer != 'y' and answer != 'Y':
322 | answer = raw_input('Output file already exists. Overwrite? [y/n]? ')
323 |
324 | if answer == 'Y' or answer == 'y' or answer == 'Yes' or answer == 'yes':
325 | break
326 | elif answer == 'N' or answer == 'n':
327 | print "aborting..."
328 | sys.exit(0)
329 |
330 | # Ask what to do if file exists and not '--yes' was specified
331 | if os.path.isfile(file_cookie) and not yes:
332 | answer = None
333 | while answer != 'y' and answer != 'Y':
334 | answer = raw_input('Cookie file already exists. Overwrite? [y/n]? ')
335 |
336 | if answer == 'Y' or answer == 'y' or answer == 'Yes' or answer == 'yes':
337 | break
338 | elif answer == 'N' or answer == 'n':
339 | print "aborting..."
340 | sys.exit(0)
341 |
342 |
343 |
344 | wget_create_session = [
345 | 'wget',
346 | '--quiet',
347 | '--keep-session-cookies',
348 | '--save-cookies',
349 | file_cookie,
350 | '-O',
351 | '-',
352 | login_url
353 | ]
354 |
355 |
356 |
357 | # Initial wget
358 | if verbose:
359 | print MyShell().color('blue') + '[1] Creating initial session request' + MyShell().color('reset')
360 |
361 | output = ['']
362 | MyShell().run(wget_create_session, output, show_cmd=verbose, show_return=True)
363 |
364 | if jdict['login']['csrf']['enabled']:
365 | if verbose:
366 | print MyShell().color('blue') + '[2] Extracting CSRF key' + MyShell().color('reset')
367 |
368 | csrf_key = jdict['login']['csrf']['field']
369 | # Prepare regex
370 | re1 = "name=(\"|')%s(\"|').*value=(\"|')(.*)(\"|')" % (csrf_key)
371 | re2 = "value=(\"|')(.*)(\"|').*name=(\"|')%s(\"|')" % (csrf_key)
372 | # Search
373 | r1 = re.search(re1, output[0])
374 | r2 = re.search(re2, output[0])
375 |
376 | if r1:
377 | csrf_val = r1.group(4)
378 | elif r2:
379 | csrf_val = r2.group(2)
380 | else:
381 |             print "Error: CSRF input field not found in the page source"
382 | csrf_val = ''
383 |
384 | # Show extracted key
385 | if verbose:
386 | print "key: %s | val: %s" % (csrf_key, csrf_val)
387 |
388 | post_data.append(csrf_key + '=' + csrf_val)
389 | else:
390 | print MyShell().color('blue') + '[2] No CSRF key extraction' + MyShell().color('reset')
391 |
392 |
393 | wget_login = [
394 | 'wget',
395 | '--quiet',
396 | '--content-on-error',
397 | '--keep-session-cookies',
398 | '--load-cookies',
399 | file_cookie,
400 | '--save-cookies',
401 | file_cookie,
402 | '--post-data',
403 | '&'.join(post_data),
404 | '-O',
405 | file_output,
406 | login_url
407 | ]
408 |
409 | # Login wget
410 | if verbose:
411 | print MyShell().color('blue') + '[3] Submitting POST login' + MyShell().color('reset')
412 | MyShell().run(wget_login, output, show_cmd=verbose, show_return=True)
413 |
414 | # Inspect source code
415 | if verbose:
416 | print MyShell().color('blue') + '[4] Evaluating login page source' + MyShell().color('reset')
417 | source = MyFile.read(file_output)
418 |
419 |
420 | retval = 0
421 | if jdict['login']['failure'] in source:
422 | print "[FAILED] Login failed"
423 | retval = 2
424 | elif os.path.getsize(file_output) > 0:
425 | print "[OK] Login successful"
426 | retval = 0
427 | else:
428 | print "[FAILED] Result page has 0 Bytes"
429 | retval = 2
430 |
431 |
432 | if cookie:
433 | print "[OK] Session cookie created: " + file_cookie
434 | else:
435 | os.unlink(file_cookie)
436 |
437 | if output:
438 | print "[OK] Output file saved: " + file_output
439 | else:
440 | os.unlink(file_output)
441 |
442 | sys.exit(retval)
443 |
444 |
--------------------------------------------------------------------------------