├── data └── .keepme ├── crawlpy ├── __init__.py ├── spiders │ ├── __init__.py │ └── crawlpy_spider.py ├── pipelines.py ├── items.py ├── middlewares.py └── settings.py ├── scrapy.cfg ├── crawlpy.config.json-sample ├── run.sh ├── LICENSE.md ├── .gitignore ├── contrib ├── README.md └── crawlpy-login.py └── README.md /data/.keepme: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawlpy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawlpy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawlpy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawlpy 12 | -------------------------------------------------------------------------------- /crawlpy/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class CrawlpyPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /crawlpy.config.json-sample: -------------------------------------------------------------------------------- 1 | { 2 | "proto": "http", 3 | "domain": "localhost", 4 | "depth": 3, 5 | "ignores": [ 6 | "/user/logout.php", 7 | "/user/delete.php?id=1", 8 | "/user/disable.php?id=1" 9 | ], 10 | "httpstatus_list": [], 11 | "login": { 12 | "enabled": false, 13 | "method": "post", 14 | "action": "/login.php", 15 | "failure": "Password is incorrect", 16 | "fields": { 17 | "username": "john", 18 | "password": "doe" 19 | }, 20 | "csrf": { 21 | "enabled": false, 22 | "field": "csrf" 23 | } 24 | 25 | }, 26 | "store": { 27 | "enabled": false, 28 | "path": "./data" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /crawlpy/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Python web spider/crawler based on scrapy with support for POST/GET login, 4 | variable level of recursions/depth and optionally save to disk. 5 | 6 | Defines my custom model items.
7 | See documentation in: 8 | http://doc.scrapy.org/en/latest/topics/items.html 9 | """ 10 | 11 | from scrapy.item import Item, Field 12 | 13 | 14 | __author__ = "cytopia" 15 | __license__ = "MIT" 16 | __email__ = "cytopia@everythingcli.org" 17 | 18 | 19 | class CrawlpyItem(Item): 20 | """Data Model Class""" 21 | 22 | url = Field() 23 | text = Field() 24 | status = Field() 25 | depth = Field() 26 | referer = Field() 27 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | 4 | echo 5 | echo "Do not save to file:" 6 | echo "$ scrapy crawl crawlpy -a config=./crawlpy.config.json" 7 | 8 | echo 9 | echo "Save results to json:" 10 | echo "$ scrapy crawl crawlpy --loglevel=INFO -a config=./crawlpy.config.json -o urls.json -t json" 11 | echo "$ scrapy crawl crawlpy --loglevel=WARNING -a config=./crawlpy.config.json -o urls.json -t json" 12 | 13 | echo 14 | echo "Save results to csv:" 15 | echo "$ scrapy crawl crawlpy --loglevel=INFO -a config=./crawlpy.config.json -o urls.csv -t csv" 16 | echo "$ scrapy crawl crawlpy --loglevel=WARNING -a config=./crawlpy.config.json -o urls.csv -t csv" 17 | 18 | echo 19 | echo "Clear cache for re-coding:" 20 | echo "$ find . -name \*.pyc -exec rm '{}' \;" 21 | 22 | exit 0 23 | -------------------------------------------------------------------------------- /crawlpy/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Python web spider/crawler based on scrapy with support for POST/GET login, 4 | variable level of recursions/depth and optionally save to disk. 5 | 6 | This file provides the a middleware that overwrites the crawling depth behavior. 7 | """ 8 | 9 | from scrapy.spidermiddlewares.depth import DepthMiddleware 10 | 11 | 12 | __author__ = "cytopia" 13 | __license__ = "MIT" 14 | __email__ = "cytopia@everythingcli.org" 15 | 16 | 17 | """Custom DepthMiddleWare""" 18 | class MyDepthMiddleware(DepthMiddleware): 19 | 20 | #---------------------------------------------------------------------- 21 | def process_spider_output(self, response, result, spider): 22 | """Overwrite parent DepthMiddleware and set MAX_DEPTH""" 23 | 24 | if hasattr(spider, 'max_depth'): 25 | self.maxdepth = getattr(spider, 'max_depth') 26 | return super(MyDepthMiddleware, self).process_spider_output(response, result, spider) 27 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 cytopia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 2 | # Custom stuff 3 | # 4 | crawlpy.config.json 5 | data/* 6 | 7 | !.keepme 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | -------------------------------------------------------------------------------- /contrib/README.md: -------------------------------------------------------------------------------- 1 | # Contributed binaries 2 | 3 | ## crawlpy-login.py 4 | 5 | * Test the login prior crawling. 6 | * Create wget-like login session cookie (useable by [sqlmap](http://sqlmap.org/)) 7 | * Dump login page (after login) 8 | 9 | ```shell 10 | Usage: crawlpy-login.py -C conf.json [-c cookie.txt] [-o output.html] [-y] [-v] 11 | crawlpy-login.py -h 12 | crawlpy-login.py -V 13 | 14 | crawlpy-login.py will test whether or not the specified crawlpy config 15 | is valid and can successfully login. 16 | 17 | You can optionally save a login session cookie (-c/--cookie) in wget format 18 | which can be used by tools such as sqlmap. 19 | 20 | You can also store the html output from a successfull/unsuccessful login 21 | to file (-o/--output). 22 | 23 | 24 | Required arguments: 25 | -C, --config= Path to crawlpy json config. 26 | -C /path/to/conf.json 27 | --config=/path/to/conf.json 28 | 29 | Optional arguments: 30 | -c, --cookie= Path where to store the session cookie. 31 | -c /path/to/cookie.txt 32 | --cookie=/path/to/cookie.txt 33 | 34 | -o, --output= Path where to store the html source after logging in. 35 | -o /path/to/login.html 36 | --cookie=/path/to/login.html 37 | 38 | -v, --verbose Be more verbose. 39 | 40 | -y, --yes Answer 'yes' to all questions. 41 | 42 | System options: 43 | -h, --help Show help. 
44 | -V, --version Show version information. 45 | ``` -------------------------------------------------------------------------------- /crawlpy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawlpy project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawlpy' 13 | 14 | SPIDER_MODULES = ['crawlpy.spiders'] 15 | NEWSPIDER_MODULE = 'crawlpy.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'crawlpy (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = True 37 | 38 | # Disable Telnet Console (enabled by default) 39 | TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | SPIDER_MIDDLEWARES = { 50 | 'scrapy.spidermiddlewares.depth.DepthMiddleware': None, 51 | 'crawlpy.middlewares.MyDepthMiddleware': 543, 52 | } 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'crawlpy.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'crawlpy.pipelines.SomePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crawlpy 2 | 3 | [![Tag](https://img.shields.io/github/tag/cytopia/crawlpy.svg)](https://github.com/cytopia/crawlpy/releases) 4 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) 5 | 6 | Python web spider/crawler based on [scrapy](http://scrapy.org/) with support for POST/GET login, variable level of recursions/depth and optionally save to disk. 7 | 8 | 9 | ## Requirements 10 | 11 | * [python 2.7](https://www.python.org/) 12 | * [lxml](http://lxml.de/) 13 | * [pip](https://pip.pypa.io/en/latest/installing/) 14 | 15 | ```shell 16 | pip install Scrapy 17 | ``` 18 | 19 | 20 | ## Features 21 | 22 | * POST/GET Login prior crawling 23 | * Can handle logins that requires dynamic CSRF token 24 | * Variable level of crawling depth 25 | * Optionally save webpages to disk 26 | 27 | ## Roadmap 28 | 29 | Find all planned features and their stati here: https://github.com/cytopia/crawlpy/issues/1 30 | 31 | 32 | ## Usage 33 | 34 | ```bash 35 | # stdout output 36 | scrapy crawl crawlpy -a config=/path/to/crawlpy.config.json 37 | 38 | # save as json (url:, status:, depth:, referer:) to 'urls.json' 39 | scrapy crawl crawlpy --loglevel=INFO -a config=/path/to/crawlpy.config.json -o urls.json -t json 40 | 41 | # save as csv (url, status, depth, referer) to 'urls.csv' 42 | scrapy crawl crawlpy --loglevel=INFO -a config=/path/to/crawlpy.config.json -o urls.csv -t csv 43 | ``` 44 | 45 | ## Configuration 46 | 47 | Make a copy of [crawlpy.config.json-sample](crawlpy.config.json-sample) (e.g.: `example.com-config.json`) and adjust the values accordingly. 48 | 49 | **Note:** 50 | It must be a valid json file (without comments), otherwise `crawlpy` will throw errors parsing json. (Use http://jsonlint.com/ to validate your config file.) 51 | 52 | ```javascript 53 | { 54 | "proto": "http", // 'http' or 'https' 55 | "domain": "localhost", // Only the domain. e.g.: 'example.com' or 'www.example.com' 56 | "depth": 3, // Nesting depth to crawl 57 | "ignores": [], // Array of substrings to deny/ignore when found in URL 58 | "httpstatus_list": [], // Array of http status codes to handle (default is 2xx) 59 | "login": { // Login section 60 | "enabled": false, // Do we actually need to do a login? 61 | "method": "post", // 'post' or 'get' 62 | "action": "/login.php", // Where the post or get will be submitted to 63 | "failure": "Password is incorrect", // The string you will see on login failure 64 | "fields": { // POST/GET Fields to submit to login page 65 | "username": "john", 66 | "password": "doe" 67 | }, 68 | "csrf": { 69 | "enabled": false, // Login requires a CSRF token? 70 | "field": "csrf" // Input field name that holds dynamic CSRF token 71 | } 72 | }, 73 | "store": { // Store section 74 | "enabled": false, // save to disk? 
75 | "path": "./data" // path for saving (rel or abs) 76 | } 77 | } 78 | ``` 79 | 80 | ### Detailed description 81 | 82 | |Key|Type|Default Value|Possible Values|Description| 83 | |---|----|-------------|---------------|-----------| 84 | |proto|string|`http`|`http` or `https`|Is the site you want to crawl running on `http` or `https`?| 85 | |domain|string|`localhost`|Domain or subdomain|The domain or subdomain you want to spider. Nothing outside this domain/subdomain will be touched.| 86 | |depth|integer|`3`|`0`,`1`,`2`,`3`,...|`0`: Crawl indefinitely until every subpage has been reached.
`1`: Only crawl links on the initial page.
`2`: Crawl links on the initial page and everything found on the links of that page.

**Note:** When you do a login, the login page already counts as one level of depth in Scrapy itself, but crawlpy rewrites this internally to subtract that extra level again, so your output will not show it.| 87 | |ignores|array|\[ \]|\['/logout.php', 'delete.php?id=1'\]|Each array element is treated as a plain substring (no regex) and is checked against every full URL that is found. If any of the specified substrings occurs in a URL, that URL will not be crawled.

**Note:** When you log in somewhere, it makes sense to ignore the logout page, as well as other pages that might delete/disable your current user, so you will not be kicked out of your login session during crawl time.| 88 | |httpstatus_list|array|\[ \]|\[403, 404, 500\]|By default Scrapy ignores pages with a status code other than 2xx, so if you know that e.g. a 403 page contains actual content with links, add that status code here.

**Note:** There is no need to specify `200`, as Scrapy handles 2xx responses by default.| 89 | |**login**||||Login section| 90 | |enabled|boolean|`false`|`true` or `false`|`true`: Do a login prior to crawling
`false`: do not log in

**Note:** When login is set to `false`, you do not need to fill in the rest of the variables inside the `login` section.| 91 | |method|string|`post`|`post` or `get`|Method required to execute the login| 92 | |action|string|`/login.php`|login page|Relative login page (from the base domain, including a leading slash) to which the `post` or `get` request is submitted.| 93 | |failure|string|`Password is incorrect`|login failed string|A string that appears on the page when the login fails.| 94 | |fields|key-value|`{`
`"username": "john",`
`"password": "doe"`
`}`|`post` or `get` params|POST or GET params required to log in.

**Examples:** username, password, hidden-field-name| 95 | |**csrf**||||Login CSRF section| 96 | |enabled|boolean|`false`|`true` or `false`|`true`: The login page has a dynamic CSRF token that should be read out and submitted along with the normal form data.
`false`: Login does not require a CSRF token to be submitted.

**Note:** If the login has a static (never-changing) CSRF field, just add it to the `fields` section.
**Note:** Read below about built-in automatic CSRF detection and leave this off at first.| 97 | |field|string|`csrf`|Field name|The name of the input field which holds the CSRF token| 98 | |**store**||||Store section| 99 | |enabled|boolean|`false`|`true` or `false`|`true`: Save webpages to disk
`false`: Do not save webpages to disk.| 100 | |path|string|`./data`|Path|Absolute or relative path to store webpages to disk| 101 | 102 | 103 | ### Note about CSRF 104 | 105 | Scrapy will most likely handle this automatically, so its best to turn off custom `csrf` in the config. If there is however any situation where the built-in CSRF recognition does not work, try the user-defined one. If none of them work, drop me an issue. 106 | 107 | 108 | ## Reference 109 | 110 | * https://stackoverflow.com/questions/5851213/crawling-with-an-authenticated-session-in-scrapy 111 | * https://stackoverflow.com/questions/38619807/scrapy-recursive-link-crawler-with-login-help-me-improve/ 112 | * http://thuongnh.com/building-a-web-crawler-with-scrapy/ 113 | 114 | 115 | ## License 116 | 117 | [MIT License](LICENSE.md) 118 | 119 | Copyright (c) 2016 [cytopia](https://github.com/cytopia) 120 | -------------------------------------------------------------------------------- /crawlpy/spiders/crawlpy_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Python web spider/crawler based on scrapy with support for POST/GET login, 4 | variable level of recursions/depth and optionally save to disk. 5 | 6 | This file provides the crawling spider class. 7 | """ 8 | 9 | import sys # Encoding 10 | import os # file path checks 11 | import logging # logger 12 | import json # json extract 13 | 14 | from scrapy.http import Request, FormRequest 15 | from scrapy.linkextractors import LinkExtractor 16 | from scrapy.spiders import Rule 17 | from scrapy.spiders.init import InitSpider 18 | 19 | from crawlpy.items import CrawlpyItem 20 | 21 | 22 | __author__ = "cytopia" 23 | __license__ = "MIT" 24 | __email__ = "cytopia@everythingcli.org" 25 | 26 | 27 | 28 | # Fix UTF-8 problems inside dict() 29 | reload(sys) 30 | sys.setdefaultencoding('utf8') 31 | 32 | 33 | # TODO: 34 | # * Self-contained spider: http://snipplr.com/view/67012/selfcontained-script-to-crawl-a-site-updated-scrapy-130dev/ 35 | # * From a script spider: http://snipplr.com/view/67006/using-scrapy-from-a-script/ 36 | 37 | 38 | 39 | ################################################################################ 40 | # Spider Class 41 | ################################################################################ 42 | class CrawlpySpider(InitSpider): 43 | """ 44 | Crawlpy Class 45 | """ 46 | 47 | ######################################## 48 | # Scrapy Variables 49 | ######################################## 50 | name = "crawlpy" 51 | 52 | # Link extraction rules 53 | # To be initialized 54 | rules = () 55 | 56 | # Store all urls in order to 57 | # filter duplicates 58 | duplicates = [] 59 | 60 | 61 | 62 | # scrapy domain/url vars 63 | # To be initialized 64 | allowed_domains = [] 65 | start_urls = [] 66 | 67 | 68 | ######################################## 69 | # Configuration 70 | ######################################## 71 | 72 | # Main JSON Configuration dict 73 | config = None 74 | config_defaults = dict({ 75 | 'proto': 'http', 76 | 'domain': 'localhost', 77 | 'depth': 3, 78 | 'ignores': [], 79 | 'httpstatus_list': [], 80 | 'login': { 81 | 'enabled': False, 82 | 'method': 'post', 83 | 'action': '/login.php', 84 | 'failure': 'Password is incorrect', 85 | 'fields': { 86 | 'username': 'john', 87 | 'password': 'doe' 88 | }, 89 | 'csrf': { 90 | 'enabled': False, 91 | 'field': 'csrf' 92 | } 93 | }, 94 | 'store': { 95 | 'enabled': False, 96 | 'path': './data' 97 | } 98 | }) 99 | 100 | 101 | 
######################################## 102 | # Helper variables 103 | ######################################## 104 | 105 | base_url = '' # (http|https)://domain.tld 106 | login_url = '' # (http|https)://domain.tld/path/to/login 107 | 108 | # Abort flag 109 | abort = False 110 | 111 | 112 | 113 | ######################################## 114 | # Methods 115 | ######################################## 116 | 117 | #---------------------------------------------------------------------- 118 | def __init__(self, *args, **kwargs): 119 | """Constructor: overwrite parent __init__ function""" 120 | 121 | # Call parent init 122 | super(CrawlpySpider, self).__init__(*args, **kwargs) 123 | 124 | # Get command line arg provided configuration param 125 | config_file = kwargs.get('config') 126 | 127 | # Validate configuration file parameter 128 | if not config_file: 129 | logging.error('Missing argument "-a config"') 130 | logging.error('Usage: scrapy crawl crawlpy -a config=/path/to/config.json') 131 | self.abort = True 132 | 133 | # Check if it is actually a file 134 | elif not os.path.isfile(config_file): 135 | logging.error('Specified config file does not exist') 136 | logging.error('Not found in: "' + config_file + '"') 137 | self.abort = True 138 | 139 | # All good, read config 140 | else: 141 | # Load json config 142 | fpointer = open(config_file) 143 | data = fpointer.read() 144 | fpointer.close() 145 | 146 | # convert JSON to dict 147 | config = json.loads(data) 148 | 149 | # fill in default values for missing values 150 | self.config = dict() 151 | self.config['proto'] = str(config.get('proto', self.config_defaults['proto'])) 152 | self.config['domain'] = str(config.get('domain', self.config_defaults['domain'])) 153 | self.config['depth'] = int(config.get('depth', self.config_defaults['depth'])) 154 | self.config['ignores'] = config.get('ignores', self.config_defaults['ignores']) 155 | self.config['httpstatus_list'] = config.get('httpstatus_list', self.config_defaults['httpstatus_list']) 156 | self.config['login'] = dict() 157 | self.config['login']['enabled'] = bool(config.get('login', dict()).get('enabled', self.config_defaults['login']['enabled'])) 158 | self.config['login']['method'] = str(config.get('login', dict()).get('method', self.config_defaults['login']['method'])) 159 | self.config['login']['action'] = str(config.get('login', dict()).get('action', self.config_defaults['login']['enabled'])) 160 | self.config['login']['failure'] = str(config.get('login', dict()).get('failure', self.config_defaults['login']['failure'])) 161 | self.config['login']['fields'] = config.get('login', dict()).get('fields', self.config_defaults['login']['fields']) 162 | self.config['login']['csrf'] = dict() 163 | self.config['login']['csrf']['enabled'] = bool(config.get('login', dict()).get('csrf', dict()).get('enabled', self.config_defaults['login']['csrf']['enabled'])) 164 | self.config['login']['csrf']['field'] = str(config.get('login', dict()).get('csrf', dict()).get('field', self.config_defaults['login']['csrf']['field'])) 165 | self.config['store'] = dict() 166 | self.config['store']['enabled'] = bool(config.get('store', dict()).get('enabled', self.config_defaults['store']['enabled'])) 167 | self.config['store']['path'] = str(config.get('store', dict()).get('path', self.config_defaults['store']['path'])) 168 | logging.info('Merged configuration:') 169 | logging.info(self.config) 170 | 171 | 172 | # Set scrapy globals 173 | self.allowed_domains = [self.config['domain']] 174 | self.start_urls = 
[self.config['proto'] + '://' + self.config['domain'] + '/'] 175 | self.rules = ( 176 | Rule( 177 | LinkExtractor( 178 | allow_domains=(self.allowed_domains), 179 | unique=True, 180 | deny=tuple(self.config['ignores']), 181 | ), 182 | callback='parse', 183 | follow=True 184 | ), 185 | ) 186 | 187 | 188 | # Handle more status codes 189 | self.handle_httpstatus_list = self.config['httpstatus_list'] 190 | 191 | # Overwrite built-in crawling depth via own config file 192 | # Make sure to add +1 if we do a login (which counts as 1 level) 193 | # The variable will be handle by a custom middleware: MyDepthMiddleware 194 | # and parse it to the normal middleware: DepthMiddleware 195 | if self.config['login']['enabled'] and self.config['depth'] != 0: 196 | self.max_depth = self.config['depth'] + 1 197 | else: 198 | self.max_depth = self.config['depth'] 199 | 200 | 201 | # Set misc globals 202 | self.base_url = self.config['proto'] + '://' + self.config['domain'] 203 | self.login_url = self.config['proto'] + '://' + self.config['domain'] + \ 204 | self.config['login']['action'] 205 | 206 | 207 | 208 | 209 | #---------------------------------------------------------------------- 210 | def init_request(self): 211 | """This function is called before crawling starts.""" 212 | 213 | # Do not start a request on error, 214 | # simply return nothing and quit scrapy 215 | if self.abort: 216 | return 217 | 218 | logging.info('All set, start crawling with depth: ' + str(self.max_depth)) 219 | 220 | # Do a login 221 | if self.config['login']['enabled']: 222 | # Start with login first 223 | logging.info('Login required') 224 | return Request(url=self.login_url, callback=self.login) 225 | else: 226 | # Start with pase function 227 | logging.info('Not login required') 228 | return Request(url=self.base_url, callback=self.parse) 229 | 230 | 231 | 232 | #---------------------------------------------------------------------- 233 | def login(self, response): 234 | """Generate a login request.""" 235 | 236 | # Add CSRF data to login. 237 | # Note: scrapy already does this automatically, if it finds 238 | # pre-filled input fields. If everything works without having 239 | # to use this custom csrf feature, it could be removed in the future. 240 | if self.config['login']['csrf']['enabled']: 241 | field = self.config['login']['csrf']['field'] 242 | csrf = response.xpath('//input[@name="' + field + '"]/@value')[0].extract() 243 | self.config['login']['fields'][field] = csrf 244 | logging.info('Adding CSRF data to login. Field: "' + field + '" | value: "' + csrf + "'") 245 | 246 | return FormRequest.from_response( 247 | response, 248 | formdata=self.config['login']['fields'], 249 | method=self.config['login']['method'], 250 | dont_filter=True, 251 | callback=self.post_login 252 | ) 253 | 254 | 255 | #---------------------------------------------------------------------- 256 | def post_login(self, response): 257 | """ 258 | Check the response returned by a login request to see if we are 259 | successfully logged in. 260 | """ 261 | 262 | if self.config['login']['failure'] not in response.body: 263 | # Now the crawling can begin.. 264 | logging.info('Login successful') 265 | return self.initialized() 266 | else: 267 | # Something went wrong, we couldn't log in, so nothing happens. 
268 | logging.error('Unable to login') 269 | 270 | 271 | #---------------------------------------------------------------------- 272 | def parse(self, response): 273 | """ 274 | Scrapy parse callback 275 | """ 276 | 277 | # Get current nesting level 278 | curr_depth = response.meta.get('depth', 1) 279 | if self.config['login']['enabled']: 280 | curr_depth = curr_depth - 1 # Do not count the login page as nesting depth 281 | 282 | # Store to disk? 283 | if self.config['store']['enabled']: 284 | path = response.url.replace(os.sep, '--') # Replace directory separator 285 | path = self.config['store']['path'] + os.sep + path 286 | with open(path, 'wb') as fpointer: 287 | fpointer.write(response.body) 288 | 289 | # Yield current url item 290 | item = CrawlpyItem() 291 | item['url'] = response.url 292 | item['status'] = response.status 293 | item['depth'] = curr_depth 294 | item['referer'] = response.meta.get('referer', '') 295 | yield item 296 | 297 | 298 | 299 | # Get all links from the current page 300 | links = LinkExtractor().extract_links(response) 301 | 302 | # Iterate all found links and crawl them 303 | for link in links: 304 | deny = False 305 | 306 | # Check requests to be ignored 307 | for ignore in self.config['ignores']: 308 | if (ignore in link.url) or (ignore.lower() in link.url.lower()): 309 | # Ignore pattern found, stop looking into other patterns 310 | deny = True 311 | break 312 | 313 | 314 | # [NO] Max depth exceeded 315 | if curr_depth >= self.max_depth: 316 | logging.info('[Not Crawling] Current depth (' + str(curr_depth) + ') exceeds max depth (' + str(self.max_depth) + ')') 317 | pass 318 | # [NO] Duplicate URL 319 | elif link.url in self.duplicates: 320 | logging.info('[Not Crawling] Url already crawled: ' + link.url) 321 | pass 322 | # [NO] URL denied 323 | elif deny: 324 | logging.info('[Not Crawling] Url denied (pattern: "' + ignore + '"): ' + link.url) 325 | pass 326 | # [OK] Crawl!
327 | else: 328 | self.duplicates.append(link.url) 329 | yield Request(link.url, meta={'depth': curr_depth+1, 'referer': response.url}) 330 | -------------------------------------------------------------------------------- /contrib/crawlpy-login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | import sys 5 | import getopt 6 | import os.path 7 | import json 8 | import subprocess 9 | import re 10 | 11 | __author__ = "cytopia" 12 | __email__ = "cytopia@everythingcli.org" 13 | __license__ = "MIT" 14 | __version__ = '0.2' 15 | __date__ = '2016-08-15' 16 | 17 | 18 | 19 | # Fix UTF-8 problems inside dict() 20 | reload(sys) 21 | sys.setdefaultencoding('utf8') 22 | 23 | 24 | ################################################################################ 25 | # File Class 26 | ################################################################################ 27 | class MyFile(object): 28 | 29 | #---------------------------------------------------------------------- 30 | @staticmethod 31 | def read(path): 32 | fp = open(path) 33 | data = fp.read() 34 | fp.close() 35 | return data 36 | 37 | 38 | ################################################################################ 39 | # Json Class 40 | ################################################################################ 41 | class MyJson(object): 42 | 43 | #---------------------------------------------------------------------- 44 | @staticmethod 45 | def _toAscii(input): 46 | if isinstance(input, dict): 47 | return {MyJson._toAscii(key): MyJson._toAscii(value) for key, value in input.iteritems()} 48 | elif isinstance(input, list): 49 | return [MyJson._toAscii(element) for element in input] 50 | elif isinstance(input, unicode): 51 | return input.encode('utf-8') 52 | else: 53 | return input 54 | 55 | #---------------------------------------------------------------------- 56 | @staticmethod 57 | def validateFile(path): 58 | json_string = MyFile.read(path) 59 | return MyJson.validateString(json_string) 60 | 61 | #---------------------------------------------------------------------- 62 | @staticmethod 63 | def validateString(json_string): 64 | try: 65 | json_object = json.loads(json_string) 66 | except ValueError, e: 67 | return False 68 | return True 69 | 70 | #---------------------------------------------------------------------- 71 | @staticmethod 72 | def convertFile2dict(path): 73 | json_string = MyFile.read(path) 74 | return MyJson.convertString2dict(json_string) 75 | 76 | #---------------------------------------------------------------------- 77 | @staticmethod 78 | def convertString2dict(json_string): 79 | # Remove unicide 80 | ujdict = json.loads(json_string) 81 | jdict = MyJson._toAscii(ujdict) 82 | return jdict 83 | 84 | 85 | 86 | ################################################################################ 87 | # Shell Class 88 | ################################################################################ 89 | class MyShell(object): 90 | 91 | #---------------------------------------------------------------------- 92 | @staticmethod 93 | def which(program): 94 | def is_exe(fpath): 95 | return os.path.isfile(fpath) and os.access(fpath, os.X_OK) 96 | 97 | fpath, fname = os.path.split(program) 98 | if fpath: 99 | if is_exe(program): 100 | return program 101 | else: 102 | for path in os.environ["PATH"].split(os.pathsep): 103 | path = path.strip('"') 104 | exe_file = os.path.join(path, program) 105 | if is_exe(exe_file): 106 | return 
exe_file 107 | 108 | return None 109 | 110 | #---------------------------------------------------------------------- 111 | @staticmethod 112 | def run(args, output, show_cmd=False, show_return=False, cmd_color='green'): 113 | 114 | if show_cmd: 115 | print MyShell.color(cmd_color) + '$ ' + ' '.join(args) + MyShell.color('reset') 116 | 117 | #retval = subprocess.call(args, shell=False, stdout=stdout) 118 | try: 119 | retval = 0 120 | output[0] = subprocess.check_output(args, shell=False) 121 | except subprocess.CalledProcessError as err: 122 | retval = err.returncode 123 | output[0] = err.output 124 | 125 | 126 | if show_return: 127 | print retval 128 | 129 | return retval 130 | 131 | 132 | #---------------------------------------------------------------------- 133 | @staticmethod 134 | def color(color): 135 | if color == 'red': 136 | return '\033[0;31m' 137 | elif color == 'green': 138 | return '\033[0;32m' 139 | elif color == 'brown': 140 | return '\033[0;33m' 141 | elif color == 'blue': 142 | return '\033[0;34m' 143 | elif color == 'magenta': 144 | return '\033[0;35m' 145 | elif color == 'cyan': 146 | return '\033[0;36m' 147 | else: 148 | return '\033[0m' 149 | 150 | 151 | 152 | 153 | ################################################################################ 154 | # Function 155 | ################################################################################ 156 | 157 | 158 | #---------------------------------------------------------------------- 159 | def usage(): 160 | filename = os.path.basename(sys.argv[0]) 161 | 162 | print 'Usage: ' + filename + ' -C conf.json [-c cookie.txt] [-o output.html] [-y] [-v]' 163 | print ' ' + filename + ' -h' 164 | print ' ' + filename + ' -V' 165 | print 166 | print filename + ' will test whether or not the specified crawlpy config' 167 | print 'is valid and can successfully login.' 168 | print 169 | print 'You can optionally save a login session cookie (-c/--cookie) in wget format' 170 | print 'which can be used by tools such as sqlmap.' 171 | print 172 | print 'You can also store the html output from a successfull/unsuccessful login' 173 | print 'to file (-o/--output).' 174 | print 175 | print 176 | print "Required arguments:" 177 | print " -C, --config= Path to crawlpy json config." 178 | print " -C /path/to/conf.json" 179 | print " --config=/path/to/conf.json" 180 | print 181 | print "Optional arguments:" 182 | print " -c, --cookie= Path where to store the session cookie." 183 | print " -c /path/to/cookie.txt" 184 | print " --cookie=/path/to/cookie.txt" 185 | print 186 | print " -o, --output= Path where to store the html source after logging in." 187 | print " -o /path/to/login.html" 188 | print " --cookie=/path/to/login.html" 189 | print 190 | print " -v, --verbose Be more verbose." 191 | print 192 | print " -y, --yes Answer 'yes' to all questions." 193 | print 194 | print "System options:" 195 | print " -h, --help Show help." 196 | print " -V, --version Show version information." 197 | 198 | 199 | #---------------------------------------------------------------------- 200 | def credits(): 201 | filename = os.path.basename(sys.argv[0]) 202 | print filename + ' v' + __version__ + ' (' + __date__ + ')' 203 | print __author__ + ' <' + __email__ + '>' 204 | 205 | 206 | #---------------------------------------------------------------------- 207 | def check_requirements(): 208 | 209 | if MyShell().which('wget') is None: 210 | print "wget is required, but not found." 
211 | return False 212 | 213 | return True 214 | 215 | 216 | #---------------------------------------------------------------------- 217 | def get_arguments(argv): 218 | 219 | # Parse command line arguments 220 | try: 221 | opts, args = getopt.getopt(argv, 'C:c:o:vyhV', ['config=', 'cookie=', 'output=', 'verbose', 'yes', 'help', 'version']) 222 | except getopt.GetoptError: 223 | print "Invalid argument(s)" 224 | usage() 225 | sys.exit(2) 226 | 227 | # Get values from command line arguments 228 | for opt, arg in opts: 229 | if opt in ("-C", "--config"): 230 | config = arg 231 | elif opt in ("-c", "--cookie"): 232 | cookie = arg 233 | elif opt in ("-o", "--output"): 234 | output = arg 235 | elif opt in ("-v", "--verbose"): 236 | verbose = True 237 | elif opt in ("-y", "--yes"): 238 | yes = True 239 | elif opt in ("-h", "--help"): 240 | usage() 241 | sys.exit() 242 | elif opt in ("-V", "--version"): 243 | credits() 244 | sys.exit() 245 | else: 246 | print "Invalid argument: " + opt 247 | usage() 248 | sys.exit(2) 249 | 250 | # Check existance of command line arguments 251 | if 'config' not in locals(): 252 | print "Missing -C, --config argument" 253 | usage() 254 | sys.exit(2) 255 | 256 | # Set default values 257 | if 'cookie' not in locals(): 258 | cookie = False 259 | if 'output' not in locals(): 260 | output = False 261 | if 'verbose' not in locals(): 262 | verbose = False 263 | if 'yes' not in locals(): 264 | yes = False 265 | 266 | # Return values 267 | return config, cookie, output, verbose, yes 268 | 269 | 270 | 271 | 272 | 273 | 274 | ################################################################################ 275 | # Main Entry Point 276 | ################################################################################ 277 | 278 | 279 | if __name__ == "__main__": 280 | 281 | # Retrieve cmd arguments 282 | config, cookie, output, verbose, yes = get_arguments(sys.argv[1:]) 283 | 284 | 285 | # Check requirements 286 | if not check_requirements(): 287 | sys.exit(2) 288 | 289 | # Check if config file exists 290 | if not os.path.isfile(config): 291 | print "Specified config file does not exist: " + config 292 | sys.exit(2) 293 | 294 | # Check valid json 295 | if not MyJson.validateFile(config): 296 | print "Invalid JSON data in: " + config 297 | sys.exit(2) 298 | 299 | 300 | # 4. Read JSON config into dict() 301 | jdict = MyJson.convertFile2dict(config) 302 | 303 | 304 | # 5. Set up base 305 | base_url = jdict['proto'] + '://' + jdict['domain'] 306 | login_url = base_url + jdict['login']['action'] 307 | 308 | post_data = [] 309 | for key,val in jdict['login']['fields'].iteritems(): 310 | post_data.append(key + '=' + val) 311 | 312 | 313 | # Cookie/Output files 314 | file_output = output if output else '/tmp/login.html' 315 | file_cookie = cookie if cookie else '/tmp/cookie.txt' 316 | 317 | 318 | # Ask what to do if file exists and not '--yes' was specified 319 | if os.path.isfile(file_output) and not yes: 320 | answer = None 321 | while answer != 'y' and answer != 'Y': 322 | answer = raw_input('Output file already exists. Overwrite? [y/n]? ') 323 | 324 | if answer == 'Y' or answer == 'y' or answer == 'Yes' or answer == 'yes': 325 | break 326 | elif answer == 'N' or answer == 'n': 327 | print "aborting..." 328 | sys.exit(0) 329 | 330 | # Ask what to do if file exists and not '--yes' was specified 331 | if os.path.isfile(file_cookie) and not yes: 332 | answer = None 333 | while answer != 'y' and answer != 'Y': 334 | answer = raw_input('Cookie file already exists. Overwrite? [y/n]? 
') 335 | 336 | if answer == 'Y' or answer == 'y' or answer == 'Yes' or answer == 'yes': 337 | break 338 | elif answer == 'N' or answer == 'n': 339 | print "aborting..." 340 | sys.exit(0) 341 | 342 | 343 | 344 | wget_create_session = [ 345 | 'wget', 346 | '--quiet', 347 | '--keep-session-cookies', 348 | '--save-cookies', 349 | file_cookie, 350 | '-O', 351 | '-', 352 | login_url 353 | ] 354 | 355 | 356 | 357 | # Initial wget 358 | if verbose: 359 | print MyShell().color('blue') + '[1] Creating initial session request' + MyShell().color('reset') 360 | 361 | output = [''] 362 | MyShell().run(wget_create_session, output, show_cmd=verbose, show_return=True) 363 | 364 | if jdict['login']['csrf']['enabled']: 365 | if verbose: 366 | print MyShell().color('blue') + '[2] Extracting CSRF key' + MyShell().color('reset') 367 | 368 | csrf_key = jdict['login']['csrf']['field'] 369 | # Prepare regex 370 | re1 = "name=(\"|')%s(\"|').*value=(\"|')(.*)(\"|')" % (csrf_key) 371 | re2 = "value=(\"|')(.*)(\"|').*name=(\"|')%s(\"|')" % (csrf_key) 372 | # Search 373 | r1 = re.search(re1, output[0]) 374 | r2 = re.search(re2, output[0]) 375 | 376 | if r1: 377 | csrf_val = r1.group(4) 378 | elif r2: 379 | csrf_val = r2.group(2) 380 | else: 381 | print "Error, no such html attribute found" 382 | csrf_val = '' 383 | 384 | # Show extracted key 385 | if verbose: 386 | print "key: %s | val: %s" % (csrf_key, csrf_val) 387 | 388 | post_data.append(csrf_key + '=' + csrf_val) 389 | else: 390 | print MyShell().color('blue') + '[2] No CSRF key extraction' + MyShell().color('reset') 391 | 392 | 393 | wget_login = [ 394 | 'wget', 395 | '--quiet', 396 | '--content-on-error', 397 | '--keep-session-cookies', 398 | '--load-cookies', 399 | file_cookie, 400 | '--save-cookies', 401 | file_cookie, 402 | '--post-data', 403 | '&'.join(post_data), 404 | '-O', 405 | file_output, 406 | login_url 407 | ] 408 | 409 | # Login wget 410 | if verbose: 411 | print MyShell().color('blue') + '[3] Submitting POST login' + MyShell().color('reset') 412 | MyShell().run(wget_login, output, show_cmd=verbose, show_return=True) 413 | 414 | # Inspect source code 415 | if verbose: 416 | print MyShell().color('blue') + '[4] Evaluating login page source' + MyShell().color('reset') 417 | source = MyFile.read(file_output) 418 | 419 | 420 | retval = 0 421 | if jdict['login']['failure'] in source: 422 | print "[FAILED] Login failed" 423 | retval = 2 424 | elif os.path.getsize(file_output) > 0: 425 | print "[OK] Login successful" 426 | retval = 0 427 | else: 428 | print "[FAILED] Result page has 0 Bytes" 429 | retval = 2 430 | 431 | 432 | if cookie: 433 | print "[OK] Session cookie created: " + file_cookie 434 | else: 435 | os.unlink(file_cookie) 436 | 437 | if output: 438 | print "[OK] Output file saved: " + file_output 439 | else: 440 | os.unlink(file_output) 441 | 442 | sys.exit(retval) 443 | 444 | --------------------------------------------------------------------------------
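A note on reusing the login session produced by `contrib/crawlpy-login.py`: the cookie it saves is in wget/Netscape format, which `contrib/README.md` points out is usable by [sqlmap](http://sqlmap.org/). The sketch below is only an illustration under assumptions — `example.com-config.json` is the example config name from the README's Configuration section, the target URL is hypothetical, and sqlmap's `--load-cookies`/`--batch` options should be double-checked against your installed version:

```shell
# Test the login and keep the session cookie in wget format
python contrib/crawlpy-login.py -C example.com-config.json -c /tmp/cookie.txt -v

# Reuse the authenticated session in sqlmap
# (hypothetical target URL; --load-cookies reads a Netscape/wget-format cookie file)
sqlmap -u "http://localhost/item.php?id=1" --load-cookies=/tmp/cookie.txt --batch
```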