├── .github ├── CODE_OF_CONDUCT.rst ├── FUNDING.yml ├── ISSUE_TEMPLATE.rst └── PULL_REQUEST_TEMPLATE.rst ├── .gitignore ├── .semver ├── .travis.yml ├── CONTRIBUTING.rst ├── LICENSE.rst ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ ├── better.css │ ├── css │ │ ├── bootstrap-theme.css │ │ └── bootstrap.min.css │ ├── fonts │ │ ├── glyphicons-halflings-regular.eot │ │ ├── glyphicons-halflings-regular.svg │ │ ├── glyphicons-halflings-regular.ttf │ │ ├── glyphicons-halflings-regular.woff │ │ └── glyphicons-halflings-regular.woff2 │ ├── img │ │ ├── flow.png │ │ ├── flow.svg │ │ ├── header.jpg │ │ ├── logo.png │ │ └── logo.svg │ └── js │ │ ├── bootstrap-theme.js │ │ ├── bootstrap.min.js │ │ └── releases-dropdown.js │ ├── _templates │ └── layout.html │ ├── conf.py │ ├── getting_started.rst │ ├── index.rst │ ├── installation.rst │ ├── kitchen_sink.rst │ ├── migration.rst │ ├── modules.rst │ ├── nyawc.helpers.rst │ ├── nyawc.http.rst │ ├── nyawc.rst │ ├── nyawc.scrapers.rst │ ├── options_callbacks.rst │ ├── options_crawling_identity.rst │ ├── options_crawling_scope.rst │ ├── options_misc.rst │ ├── options_performance.rst │ └── options_routing.rst ├── example_extensive.py ├── example_minimal.py ├── nyawc ├── Crawler.py ├── CrawlerActions.py ├── CrawlerThread.py ├── Options.py ├── Queue.py ├── QueueItem.py ├── Routing.py ├── __init__.py ├── helpers │ ├── DebugHelper.py │ ├── HTTPRequestHelper.py │ ├── PackageHelper.py │ ├── RandomInputHelper.py │ ├── URLHelper.py │ └── __init__.py ├── http │ ├── Handler.py │ ├── Request.py │ ├── Response.py │ └── __init__.py └── scrapers │ ├── BaseScraper.py │ ├── CSSRegexLinkScraper.py │ ├── HTMLSoupFormScraper.py │ ├── HTMLSoupLinkScraper.py │ ├── JSONRegexLinkScraper.py │ ├── XMLRegexLinkScraper.py │ └── __init__.py ├── requirements.txt ├── setup.py └── test ├── __init__.py ├── site ├── fuzzing │ ├── empty.php │ └── sleep.php ├── http_statuses │ ├── status_100.php │ ├── status_200.php │ ├── status_300.php │ ├── status_400.php │ └── status_500.php ├── index.php ├── invalid_content_types │ ├── css.php │ ├── html.php │ ├── json.php │ ├── xhtml.php │ └── xml.php └── malformed_responses │ ├── css.php │ ├── html.php │ ├── json.php │ ├── xhtml.php │ └── xml.php ├── test_helpers_url_helper.py ├── test_queue.py ├── test_scrapers_css_regex_link_scraper.py ├── test_scrapers_html_soup_form_scraper.py ├── test_scrapers_html_soup_link_scraper.py ├── test_scrapers_json_regex_link_scraper.py ├── test_scrapers_xml_regex_link_scraper.py └── test_site.py /.github/CODE_OF_CONDUCT.rst: -------------------------------------------------------------------------------- 1 | Contributor Covenant Code of Conduct 2 | ==================================== 3 | 4 | Our Pledge 5 | ---------- 6 | 7 | In the interest of fostering an open and welcoming environment, we as 8 | contributors and maintainers pledge to making participation in our 9 | project and our community a harassment-free experience for everyone, 10 | regardless of age, body size, disability, ethnicity, gender identity and 11 | expression, level of experience, nationality, personal appearance, race, 12 | religion, or sexual identity and orientation. 
13 | 14 | Our Standards 15 | ------------- 16 | 17 | Examples of behavior that contributes to creating a positive environment 18 | include: 19 | 20 | - Using welcoming and inclusive language 21 | - Being respectful of differing viewpoints and experiences 22 | - Gracefully accepting constructive criticism 23 | - Focusing on what is best for the community 24 | - Showing empathy towards other community members 25 | 26 | Examples of unacceptable behavior by participants include: 27 | 28 | - The use of sexualized language or imagery and unwelcome sexual 29 | attention or advances 30 | - Trolling, insulting/derogatory comments, and personal or political 31 | attacks 32 | - Public or private harassment 33 | - Publishing others’ private information, such as a physical or 34 | electronic address, without explicit permission 35 | - Other conduct which could reasonably be considered inappropriate in a 36 | professional setting 37 | 38 | Our Responsibilities 39 | -------------------- 40 | 41 | Project maintainers are responsible for clarifying the standards of 42 | acceptable behavior and are expected to take appropriate and fair 43 | corrective action in response to any instances of unacceptable behavior. 44 | 45 | Project maintainers have the right and responsibility to remove, edit, 46 | or reject comments, commits, code, wiki edits, issues, and other 47 | contributions that are not aligned to this Code of Conduct, or to ban 48 | temporarily or permanently any contributor for other behaviors that they 49 | deem inappropriate, threatening, offensive, or harmful. 50 | 51 | Scope 52 | ----- 53 | 54 | This Code of Conduct applies both within project spaces and in public 55 | spaces when an individual is representing the project or its community. 56 | Examples of representing a project or community include using an 57 | official project e-mail address, posting via an official social media 58 | account, or acting as an appointed representative at an online or 59 | offline event. Representation of a project may be further defined and 60 | clarified by project maintainers. 61 | 62 | Enforcement 63 | ----------- 64 | 65 | Instances of abusive, harassing, or otherwise unacceptable behavior may 66 | be reported by contacting the project team at 67 | t{{dot}}gommers{{plus}}nyawc{{at}}outlook{{dot}}com. The project team 68 | will review and investigate all complaints, and will respond in a 69 | way that it deems appropriate to the circumstances. The project team is 70 | obligated to maintain confidentiality with regard to the reporter of an 71 | incident. Further details of specific enforcement policies may be posted 72 | separately. 73 | 74 | Project maintainers who do not follow or enforce the Code of Conduct in 75 | good faith may face temporary or permanent repercussions as determined 76 | by other members of the project’s leadership. 77 | 78 | Attribution 79 | ----------- 80 | 81 | This Code of Conduct is adapted from the `Contributor Covenant`_, 82 | version 1.4, available at `http://contributor-covenant.org/version/1/4`_ 83 | 84 | .. _Contributor Covenant: http://contributor-covenant.org 85 | .. 
_`http://contributor-covenant.org/version/1/4`: http://contributor-covenant.org/version/1/4/ 86 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: tijme 2 | custom: ['https://www.paypal.me/tijmegommers', 'https://bunq.me/tijme'] 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.rst: -------------------------------------------------------------------------------- 1 | One line summary of the issue here. 2 | 3 | Expected behavior 4 | ================= 5 | 6 | As concisely as possible, describe the expected behavior. 7 | 8 | Actual behavior 9 | ================= 10 | 11 | As concisely as possible, describe the observed behavior. 12 | 13 | Steps to reproduce the behavior 14 | ================= 15 | 16 | Please list all relevant steps to reproduce the observed behavior. -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.rst: -------------------------------------------------------------------------------- 1 | One line summary of the issue here. 2 | 3 | Problem 4 | ================= 5 | 6 | Explain the context and why you’re making that change. What is the problem you’re trying to solve? In some cases there is not a problem and this can be thought of being the motivation for your change. 7 | 8 | Solution 9 | ================= 10 | 11 | Describe the modifications you’ve done. 12 | 13 | Result 14 | ================= 15 | 16 | What will change as a result of your pull request? Note that sometimes this section is unnecessary because it is self-explanatory based on the solution. 17 | 18 | Checklist 19 | ================= 20 | 21 | - [ ] All tests pass and ``example.py`` runs successfully. 22 | - [ ] Code complies with the Google Python Style Guide. 23 | - [ ] Change complies with the contribution guidelines. 24 | - [ ] Mention ``Fixes #`` in the description *if relevant*. 25 | - [ ] Documentation/wiki is updated according to the change(s). 26 | 27 | Google Python Style Guide: . 28 | Contribution guidelines: . -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | nyawc/.semver 3 | example_test.py 4 | releases.js 5 | 6 | # OS 7 | Thumbs.db 8 | .DS_Store 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | /build 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *,cover 55 | .hypothesis/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | docs/build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # IPython Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | -------------------------------------------------------------------------------- /.semver: -------------------------------------------------------------------------------- 1 | 1.8.2 -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: linux 2 | dist: trusty 3 | sudo: required 4 | language: python 5 | 6 | env: 7 | - UNITTEST_NYAWC_SITE=1 8 | 9 | python: 10 | - 2.7 11 | - 3.5 12 | - 3.6 13 | - 3.7-dev 14 | 15 | install: 16 | - sudo apt-get install -y apache2 17 | - sudo apt-get install -y php5-common libapache2-mod-php5 18 | - sudo service apache2 restart 19 | - sudo rm -r /var/www/html/* 20 | - sudo mv -T test/site /var/www/html 21 | - sudo chown -R www-data:www-data /var/www 22 | - pip install --upgrade setuptools 23 | - pip install -r requirements.txt 24 | 25 | script: 26 | - python -m unittest discover 27 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | Getting Started 5 | --------------- 6 | 7 | - Submit a ticket for your issue, assuming one does not already exist. 8 | 9 | - Clearly describe the issue including steps to reproduce when it is 10 | a bug. 11 | - Make sure you fill in the earliest version that you know has the 12 | issue. 13 | 14 | - Fork the repository on GitHub. 15 | 16 | Making Changes 17 | -------------- 18 | 19 | - Create a topic branch from where you want to base your work. 20 | 21 | - This is usually the develop branch. 22 | - To quickly create a topic branch based on master; 23 | 24 | - ``git checkout -b bugfix-my-contribution``, 25 | - ``git checkout -b feature-my-contribution``. 26 | 27 | - Please avoid working directly on the ``master`` branch. 28 | 29 | - Make sure your code complies with the `Google Python Style Guide`_. 30 | - Make commits of logical units and make sure your commit messages are 31 | in the proper format. 32 | - Make sure you have added the necessary tests for your changes. 33 | - Run *all* the tests to assure nothing else was accidentally broken. 34 | 35 | Submitting Changes 36 | ------------------ 37 | 38 | - Push your changes to the topic branch in your fork of the repository. 39 | - Submit a pull request to the main repository 40 | (``tijme/not-your-average-web-crawler``). 41 | 42 | .. 
_Google Python Style Guide: https://google.github.io/styleguide/pyguide.html 43 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | ===================== 3 | 4 | Copyright (c) 2017 Tijme Gommers 5 | 6 | Permission is hereby granted, free of charge, to any person 7 | obtaining a copy of this software and associated documentation 8 | files (the “Software”), to deal in the Software without 9 | restriction, including without limitation the rights to use, 10 | copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the 12 | Software is furnished to do so, subject to the following 13 | conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | OTHER DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE.rst 3 | include requirements.txt 4 | include .semver -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. raw:: html 2 | 3 |

4 | 5 | .. image:: https://tijme.github.io/not-your-average-web-crawler/latest/_static/img/logo.svg?pypi=png.from.svg 6 | :width: 300px 7 | :height: 300px 8 | :alt: N.Y.A.W.C. logo 9 | :align: center 10 | 11 | .. raw:: html 12 | 13 |
14 | 15 | .. image:: https://raw.finnwea.com/shield/?firstText=Donate%20via&secondText=Bunq 16 | :target: https://bunq.me/tijme/0/A%20web%20crawler%20(for%20bug%20hunting)%20that%20gathers%20more%20than%20you%20can%20imagine 17 | :alt: Donate via Bunq 18 | 19 | .. image:: https://raw.finnwea.com/shield/?typeKey=TravisBuildStatus&typeValue1=tijme/not-your-average-web-crawler&typeValue2=master&cache=1 20 | :target: https://travis-ci.org/tijme/not-your-average-web-crawler 21 | :alt: Build Status 22 | 23 | .. image:: https://raw.finnwea.com/vector-shields-v1/?typeKey=SemverVersion&typeValue1=tijme&typeValue2=not-your-average-web-crawler 24 | :target: https://pypi.python.org/pypi/nyawc/ 25 | :alt: PyPi version 26 | 27 | .. image:: https://raw.finnwea.com/shield/?firstText=License&secondText=MIT 28 | :target: https://github.com/tijme/not-your-average-web-crawler/blob/master/LICENSE.rst 29 | :alt: License: MIT 30 | 31 | .. raw:: html 32 | 33 |

34 |    <!-- Title markup (stripped from this dump); visible text: "Not Your Average Web Crawler" -->

35 | 36 | N.Y.A.W.C is a Python library that enables you to test your payload against all requests of a certain domain. It crawls all requests (e.g. GET, POST or PUT) in the specified scope and keeps track of the request and response data. During the crawling process the callbacks enable you to insert your payload at specific places and test if they worked. 37 | 38 | Table of contents 39 | ----------------- 40 | 41 | - `Installation <#installation>`__ 42 | - `Crawling flow <#crawling-flow>`__ 43 | - `Documentation <#documentation>`__ 44 | - `Minimal implementation <#minimal-implementation>`__ 45 | - `Testing <#testing>`__ 46 | - `Issues <#issues>`__ 47 | - `License <#license>`__ 48 | 49 | Installation 50 | ------------ 51 | 52 | First make sure you're on `Python 2.7/3.3 `__ or higher. Then run the command below to install N.Y.A.W.C. 53 | 54 | ``$ pip install --upgrade nyawc`` 55 | 56 | Crawling flow 57 | ------------- 58 | 59 | 1. You can define your startpoint (a request) and the crawling scope and then start the crawler. 60 | 2. The crawler repeatedly starts the first request in the queue until ``max threads`` is reached. 61 | 3. The crawler adds all requests found in the response to the end of the queue (except duplicates). 62 | 4. The crawler goes back to step #2 to spawn new requests repeatedly until ``max threads`` is reached. 63 | 64 | .. image:: https://tijme.github.io/not-your-average-web-crawler/latest/_static/img/flow.svg 65 | :alt: N.Y.A.W.C crawling flow 66 | 67 | **Please note that if the queue is empty and all crawler threads are finished, the crawler will stop.** 68 | 69 | Documentation 70 | ------------- 71 | 72 | Please refer to the `documentation `__ or the `API `__ for all the information about N.Y.A.W.C. 73 | 74 | Minimal implementation 75 | ---------------------- 76 | 77 | You can use the callbacks in ``example_minimal.py`` to run your own exploit against the requests. If you want an example of automated exploit scanning, please take a look at `ACSTIS `__ (it uses N.Y.A.W.C to scan for AngularJS client-side template injection vulnerabilities). 78 | 79 | You can also use the `kitchen sink `__ (which contains all the functionalities from N.Y.A.W.C.) instead of the example below. The code below is a minimal implementation of N.Y.A.W.C. 80 | 81 | - ``$ python example_minimal.py`` 82 | - ``$ python -u example_minimal.py > output.log`` 83 | 84 | .. code:: python 85 | 86 | # example_minimal.py 87 | 88 | from nyawc.Options import Options 89 | from nyawc.QueueItem import QueueItem 90 | from nyawc.Crawler import Crawler 91 | from nyawc.CrawlerActions import CrawlerActions 92 | from nyawc.http.Request import Request 93 | 94 | def cb_crawler_before_start(): 95 | print("Crawler started.") 96 | 97 | def cb_crawler_after_finish(queue): 98 | print("Crawler finished.") 99 | print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.") 100 | 101 | def cb_request_before_start(queue, queue_item): 102 | print("Starting: {}".format(queue_item.request.url)) 103 | return CrawlerActions.DO_CONTINUE_CRAWLING 104 | 105 | def cb_request_after_finish(queue, queue_item, new_queue_items): 106 | print("Finished: {}".format(queue_item.request.url)) 107 | return CrawlerActions.DO_CONTINUE_CRAWLING 108 | 109 | options = Options() 110 | 111 | options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route. 
112 | options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route. 113 | options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route. 114 | options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route. 115 | 116 | crawler = Crawler(options) 117 | crawler.start_with(Request("https://finnwea.com/")) 118 | 119 | Testing 120 | ------- 121 | 122 | Testing is performed automatically by `Travis CI <https://travis-ci.org/tijme/not-your-average-web-crawler>`__ on every push to the master branch. If you want to run the unit tests manually, use the command below. 123 | 124 | ``$ python -m unittest discover`` 125 | 126 | Issues 127 | ------ 128 | 129 | Issues or new features can be reported via the GitHub issue tracker. Please make sure your issue or feature has not yet been reported by anyone else before submitting a new one. 130 | 131 | License 132 | ------- 133 | 134 | Not Your Average Web Crawler (N.Y.A.W.C) is open-sourced software licensed under the `MIT license <https://github.com/tijme/not-your-average-web-crawler/blob/master/LICENSE.rst>`__. 135 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = NYAWC 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -E -a 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -E -a -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=NYAWC 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -E -a 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -E -a 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/_static/better.css: -------------------------------------------------------------------------------- 1 | @import url("better_basic.css"); 2 | -------------------------------------------------------------------------------- /docs/source/_static/css/bootstrap-theme.css: -------------------------------------------------------------------------------- 1 | /* 2 | @media(max-width: 767px) {} 3 | @media(min-width: 768px) {} 4 | @media(min-width: 992px) {} 5 | @media(min-width: 1200px) {} 6 | */ 7 | 8 | /** 9 | * 10 | * General 11 | * 12 | */ 13 | html, body { 14 | width: 100%; 15 | height: 100%; 16 | 17 | margin: 0px 0px 0px 0px; 18 | padding: 0px 0px 0px 0px; 19 | } 20 | 21 | h1, h2, h3, h4, h5, h6 { 22 | font-family: Georgia, serif !important; 23 | } 24 | 25 | .vertical-center { 26 | width: 100%; 27 | min-height: 100%; 28 | display: flex; 29 | align-items: center; 30 | } 31 | 32 | /** 33 | * 34 | * Table of Contents 35 | * 36 | */ 37 | @media(max-width: 767px) { 38 | .nav-stacked { 39 | background: #f1f1f1; 40 | border-radius: 4px; 41 | border: 1px solid #f1f1f1; 42 | margin-bottom: 20px; 43 | } 44 | } 45 | 46 | .nav-stacked ul { 47 | padding-left: 35px; 48 | list-style: none; 49 | } 50 | 51 | .nav-stacked ul li { 52 | padding-top: 4px; 53 | padding-bottom: 4px; 54 | } 55 | 56 | 57 | .nav-stacked ul li a { 58 | font-size: 13px; 59 | } 60 | 61 | 62 | #tocscroll .affix { 63 | top: 20px; 64 | } 65 | 66 | /** 67 | * 68 | * Navbar 69 | * 70 | */ 71 | .navbar-inverse { 72 | height: 64px; 73 | 74 | background: transparent !important; 75 | border: none; 76 | 77 | margin: 0px; 78 | position: relative; 79 | z-index: 50; 80 | 81 | -webkit-box-shadow: 0 1px 2px rgba(0, 0, 0, 0.3); 82 | box-shadow: 0 1px 2px rgba(0, 0, 0, 0.3); 83 | } 84 | 85 | .navbar-inverse .navbar-nav > li > a { 86 | color: #fff; 87 | 88 | padding-top: 22px; 89 | padding-bottom: 22px; 90 | } 91 | 92 | .navbar-inverse .navbar-nav > li > a span.text { 93 | color: #fff; 94 | text-shadow: 0 0 5px #000; 95 | } 96 | 97 | .navbar-inverse .navbar-nav > li > a:hover span.text { 98 | border-bottom: 1px dotted #fff; 99 | } 100 | 101 | .navbar-inverse .navbar-nav > .open > a, 102 | .navbar-inverse .navbar-nav > .open > a:focus, 103 | .navbar-inverse .navbar-nav > .open > a:hover { 104 | background: rgba(0, 0, 0, 0.35); 105 | } 106 | 107 | @media(max-width: 767px) { 108 | .navbar-inverse .navbar-toggle { 109 | float: left; 110 | border: none; 111 | 112 | padding-top: 17px; 113 | padding-bottom: 17px; 114 | } 115 | 116 | .navbar-inverse .navbar-toggle:focus, 117 | .navbar-inverse .navbar-toggle:hover { 118 | background: none; 119 | } 120 | 121 | .navbar-inverse .navbar-collapse { 122 | margin-left: -15px; 123 | margin-right: -15px; 124 | padding-left: 30px; 125 | padding-right: 30px; 126 | 127 | border: none; 128 | background: rgba(0, 0, 0, 0.9); 129 | max-height: 500px; 130 | } 131 | 132 | 133 | .navbar-nav { 134 | margin-top: 0px; 135 | margin-bottom: 0px; 136 | } 137 | 138 | .navbar-inverse .navbar-nav > li > a { 139 | padding-top: 15px; 140 | padding-bottom: 15px; 141 | } 142 | } 143 | 144 | /** 145 | * 146 | * Jumbotron 147 | * 148 | */ 149 | 
.jumbotron { 150 | height: 250px; 151 | 152 | background-color: #222222; 153 | position: relative; 154 | padding: 64px 0px 0px 0px; 155 | margin: -64px 0px 20px 0px; 156 | } 157 | 158 | .home.jumbotron { 159 | height: 450px; 160 | } 161 | 162 | .jumbotron img { 163 | width: 100%; 164 | height: 100%; 165 | 166 | top: 0; 167 | left: 0; 168 | z-index: 5; 169 | 170 | position: absolute; 171 | object-fit: cover; 172 | opacity: 0.9; 173 | } 174 | 175 | .jumbotron .jumbotron-darken { 176 | width: 100%; 177 | height: 100%; 178 | 179 | z-index: 10; 180 | 181 | position: relative; 182 | background: rgba(0, 0, 0, 0.35); 183 | } 184 | 185 | .jumbotron .jumbotron-darken h1 { 186 | color: #ffffff; 187 | font-size: 40px; 188 | } 189 | 190 | @media(min-width: 768px) { 191 | .jumbotron .jumbotron-darken h1 { 192 | font-size: 45px; 193 | } 194 | } 195 | 196 | @media(min-width: 992px) { 197 | .jumbotron .jumbotron-darken h1 { 198 | font-size: 65px; 199 | } 200 | } 201 | 202 | .jumbotron .jumbotron-darken h1 small { 203 | display: block; 204 | margin-top: 15px; 205 | margin-bottom: 25px; 206 | font-size: 18px; 207 | color: #ffffff; 208 | } 209 | 210 | @media(min-width: 768px) { 211 | .jumbotron .jumbotron-darken h1 small { 212 | font-size: 22px; 213 | } 214 | } 215 | 216 | @media(min-width: 992px) { 217 | .jumbotron .jumbotron-darken h1 small { 218 | font-size: 26px; 219 | } 220 | } 221 | 222 | .jumbotron .jumbotron-darken div.pre { 223 | margin: 0px 0px 0px 0px; 224 | padding: 2px 4px 2px 4px; 225 | display: inline-block; 226 | 227 | border: none; 228 | background: rgba(0, 0, 0, 0.75); 229 | 230 | font-size: 13px; 231 | color: #ffffff; 232 | text-align: center; 233 | font-family: "Lucida Console", Monaco, monospace; 234 | } 235 | 236 | @media(min-width: 768px) { 237 | .jumbotron .jumbotron-darken div.pre { 238 | font-size: 14px; 239 | padding: 3px 5px 3px 5px; 240 | } 241 | } 242 | 243 | .jumbotron .jumbotron-darken .btn { 244 | border: none; 245 | background: rgba(0, 0, 0, 0.75); 246 | color: #ffffff; 247 | } 248 | 249 | /** 250 | * 251 | * Content 252 | * 253 | */ 254 | .content-container h1 { 255 | color: #000000; 256 | 257 | margin-top: 40px; 258 | font-size: 28px; 259 | } 260 | 261 | .content-container .section:first-child h1:first-child { 262 | margin-top: 0px; 263 | } 264 | 265 | .content-container h2 { 266 | color: #333333; 267 | 268 | margin-top: 20px; 269 | font-size: 22px; 270 | } 271 | 272 | .content-container h3 { 273 | color: #333333; 274 | 275 | margin-top: 15px; 276 | font-size: 18px; 277 | } 278 | 279 | .content-container .highlight pre { 280 | border: 1px solid #f1f1f1; 281 | 282 | font-family: "Lucida Console", Monaco, monospace; 283 | font-size: 12px; 284 | word-break: break-word; 285 | 286 | color: #666666; 287 | background: #f5f5f5; 288 | } 289 | 290 | /** 291 | * 292 | * Footer 293 | * 294 | */ 295 | .footer { 296 | font-size: 11px; 297 | color: #333333; 298 | text-align: center; 299 | 300 | margin-top: 20px; 301 | margin-bottom: 20px; 302 | } 303 | 304 | /** 305 | * 306 | * Home 307 | * 308 | */ 309 | img.flow { 310 | margin: 20px auto; 311 | } 312 | -------------------------------------------------------------------------------- /docs/source/_static/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.eot 
-------------------------------------------------------------------------------- /docs/source/_static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /docs/source/_static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /docs/source/_static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /docs/source/_static/img/flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/img/flow.png -------------------------------------------------------------------------------- /docs/source/_static/img/header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/img/header.jpg -------------------------------------------------------------------------------- /docs/source/_static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/img/logo.png -------------------------------------------------------------------------------- /docs/source/_static/js/bootstrap-theme.js: -------------------------------------------------------------------------------- 1 | function selectText(containerid) { 2 | if (document.selection) { 3 | var range = document.body.createTextRange(); 4 | range.moveToElementText(document.getElementById(containerid)); 5 | range.select(); 6 | } else if (window.getSelection) { 7 | var range = document.createRange(); 8 | range.selectNode(document.getElementById(containerid)); 9 | window.getSelection().removeAllRanges(); 10 | window.getSelection().addRange(range); 11 | } 12 | } 13 | 14 | function unwrapToc() { 15 | $('.nav-stacked').each(function() { 16 | $(this).html($(this).find('ul').html()); 17 | }); 18 | } 19 | 20 | function tocInitializeAffix() { 21 | $('#tocscroll .nav').affix({ 22 | offset: { 23 | top: function () { 24 | var alertHeight = 0; 25 | if ($('.alert-version').length) { 26 | alertHeight = $('.alert-version').outerHeight() 27 | } 28 | 29 | return (this.top = $('.jumbotron').outerHeight() + alertHeight) 30 | }, 31 | bottom: function () { 32 | return (this.bottom = $('.footer').outerHeight()) 33 | } 34 | } 35 | }); 36 | } 37 | 38 | function tocAffixSetWidth() { 39 | $('#tocscroll .nav').width($('#tocscroll').width()) 40 | } 41 | 42 | 
$(document).ready(function() { 43 | unwrapToc(); 44 | tocInitializeAffix(); 45 | tocAffixSetWidth(); 46 | }); 47 | 48 | $(window).resize(function () { 49 | tocAffixSetWidth(); 50 | }); 51 | -------------------------------------------------------------------------------- /docs/source/_static/js/releases-dropdown.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | var latestVersion = Object.keys(releases)[Object.keys(releases).length - 1]; 3 | var currentVersion = $('#releases').attr('data-selected'); 4 | 5 | /** 6 | * Generate dropdown 7 | */ 8 | var dropdownHtml = ""; 9 | 10 | Object.keys(releases).forEach(function(version, index) { 11 | var isLatest = version == latestVersion; 12 | var labelHtml = isLatest ? " latest" : ""; 13 | var labelLink = '../' + (isLatest ? 'latest' : version) + '/index.html'; 14 | 15 | dropdownHtml = "<li><a href='" + labelLink + "'>Version " + version + labelHtml + "</a></li>" + dropdownHtml; // list-item markup reconstructed; the original string literal was stripped from this dump 16 | }); 17 | 18 | $('#releases .dropdown-menu').html(dropdownHtml); 19 | 20 | /** 21 | * Show message if not viewing the latest version 22 | */ 23 | if (latestVersion != currentVersion) { 24 | var message = "Warning! Version " + latestVersion + " is available (you are currently viewing version " + currentVersion + ")."; 25 | var messageHtml = '<div class="alert alert-warning alert-version" role="alert">' + message + '</div>'; // alert markup reconstructed; the .alert-version class is referenced in bootstrap-theme.js 26 | $(messageHtml).insertAfter($('.jumbotron')) 27 | } 28 | }); 29 | -------------------------------------------------------------------------------- /docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends '!layout.html' %} 2 | 3 | {% block header %} 4 |    <!-- Navbar markup (stripped from this dump) --> 59 | {% endblock %} 60 | 61 | {% block content %} 62 | 63 | {%- if pagename == 'index' %} 64 |
   <!-- Home jumbotron markup (stripped from this dump); visible text: "Not Your Average Web Crawler" / "N.Y.A.W.C", tagline "Execute your exploit against every request in scope", and a "Getting Started" button. -->
80 | {%- endif %}
81 | 
82 | {%- if pagename != 'index' %}
   <!-- Page jumbotron markup (stripped); it renders {{title}}. -->
97 | {%- endif %}
98 | 
   <!-- Content-container markup (stripped). -->
102 | {%- if display_toc %}
   <!-- Table-of-contents sidebar markup (#tocscroll, stripped). -->
116 | {%- endif %}
117 | 
119 | {% block body %} {% endblock %}
120 | 
122 | {%- if display_toc %}
   <!-- Sidebar closing markup (stripped). -->
125 | {%- endif %}
126 | 
129 | {% endblock %}
130 | 
131 | {% block footer %}
   <!-- Footer markup (stripped); visible text: "N.Y.A.W.C v{{release}} is open-sourced software licensed under the MIT license." -->
140 | 
    141 | {% endblock %} 142 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # N.Y.A.W.C documentation build configuration file, created by 5 | # sphinx-quickstart on Fri May 12 17:22:14 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('../..')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinxcontrib.napoleon', 36 | 'sphinx.ext.linkcode', 37 | 'sphinx.ext.todo' 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ['_templates'] 42 | 43 | # The suffix(es) of source filenames. 44 | # You can specify multiple suffix as a list of string: 45 | # 46 | # source_suffix = ['.rst', '.md'] 47 | source_suffix = '.rst' 48 | 49 | # The master toctree document. 50 | master_doc = 'index' 51 | 52 | # General information about the project. 53 | project = 'N.Y.A.W.C' 54 | copyright = '2017, Tijme Gommers' 55 | author = 'Tijme Gommers' 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | 61 | with open("../../.semver") as file: 62 | semver = file.read().rstrip() 63 | 64 | # The short X.Y version. 65 | version = semver 66 | # The full version, including alpha/beta/rc tags. 67 | release = semver 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This patterns also effect to html_static_path and html_extra_path 79 | exclude_patterns = [] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'sphinx' 83 | 84 | # If true, `todo` and `todoList` produce output, else they produce nothing. 85 | todo_include_todos = False 86 | 87 | # -- Options for HTML output ---------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. See the documentation for 90 | # a list of builtin themes. 
91 | # 92 | from better import better_theme_path 93 | html_theme_path = [better_theme_path] 94 | html_theme = 'better' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # 100 | html_theme_options = { 101 | # show sidebar on the right instead of on the left 102 | 'rightsidebar': False, 103 | 104 | # CSS files to include after all other CSS files 105 | # (refer to by relative path from conf.py directory, or link to a 106 | # remote file) 107 | # 'cssfiles': ['_static/my_style.css'], # default is empty list 108 | 109 | # show a big text header with the value of html_title 110 | 'showheader': True, 111 | 112 | # show the breadcrumbs and index|next|previous links at the top of 113 | # the page 114 | 'showrelbartop': False, 115 | # same for bottom of the page 116 | 'showrelbarbottom': False, 117 | 118 | # show the self-serving link in the footer 119 | 'linktotheme': False, 120 | 121 | # width of the sidebar. page width is determined by a CSS rule. 122 | # I prefer to define things in rem because it scales with the 123 | # global font size rather than pixels or the local font size. 124 | 'sidebarwidth': '0px', 125 | 126 | # color of all body text 127 | 'textcolor': '#000000', 128 | 129 | # color of all headings (
h1, h2, etc.
    tags); defaults to the value of 130 | # textcolor, which is why it's defined here at all. 131 | 'headtextcolor': '', 132 | 133 | # color of text in the footer, including links; defaults to the 134 | # value of textcolor 135 | 'footertextcolor': '', 136 | 137 | # Custom CSS 138 | 'cssfiles': ['_static/css/bootstrap.min.css?' + version, '_static/css/bootstrap-theme.css?' + version], 139 | 140 | # Custom JS 141 | 'scriptfiles': ['../releases.js?' + version, '_static/js/bootstrap.min.js?' + version, '_static/js/releases-dropdown.js?' + version, '_static/js/bootstrap-theme.js?' + version] 142 | } 143 | 144 | # Add any paths that contain custom static files (such as style sheets) here, 145 | # relative to this directory. They are copied after the builtin static files, 146 | # so a file named "default.css" will overwrite the builtin "default.css". 147 | html_static_path = ['_static'] 148 | 149 | # -- Options for HTMLHelp output ------------------------------------------ 150 | 151 | # Output file base name for HTML help builder. 152 | htmlhelp_basename = 'NYAWCdoc' 153 | 154 | # -- Options for LaTeX output --------------------------------------------- 155 | 156 | latex_elements = { 157 | # The paper size ('letterpaper' or 'a4paper'). 158 | # 159 | # 'papersize': 'letterpaper', 160 | 161 | # The font size ('10pt', '11pt' or '12pt'). 162 | # 163 | # 'pointsize': '10pt', 164 | 165 | # Additional stuff for the LaTeX preamble. 166 | # 167 | # 'preamble': '', 168 | 169 | # Latex figure (float) alignment 170 | # 171 | # 'figure_align': 'htbp', 172 | } 173 | 174 | # Grouping the document tree into LaTeX files. List of tuples 175 | # (source start file, target name, title, 176 | # author, documentclass [howto, manual, or own class]). 177 | latex_documents = [ 178 | (master_doc, 'NYAWC.tex', 'N.Y.A.W.C Documentation', 179 | 'Tijme Gommers', 'manual'), 180 | ] 181 | 182 | # -- Options for manual page output --------------------------------------- 183 | 184 | # One entry per manual page. List of tuples 185 | # (source start file, name, description, authors, manual section). 186 | man_pages = [ 187 | (master_doc, 'nyawc', 'N.Y.A.W.C Documentation', 188 | [author], 1) 189 | ] 190 | 191 | # -- Options for Texinfo output ------------------------------------------- 192 | 193 | # Grouping the document tree into Texinfo files. 
List of tuples 194 | # (source start file, target name, title, author, 195 | # dir menu entry, description, category) 196 | texinfo_documents = [ 197 | (master_doc, 'NYAWC', 'N.Y.A.W.C Documentation', 198 | author, 'NYAWC', 'A web crawler that gathers more than you can imagine.', 199 | 'Miscellaneous'), 200 | ] 201 | 202 | # Title of the documentation 203 | html_title = "Not Your Average Web Crawler" 204 | 205 | # Home button title 206 | html_short_title = "Home" 207 | 208 | # Sidebar contents 209 | html_sidebars = { 210 | '**': [], 211 | } 212 | 213 | # Absolute link to the source code 214 | def linkcode_resolve(domain, info): 215 | if domain != 'py': 216 | return None 217 | 218 | if not info['module']: 219 | return None 220 | 221 | filename = info['module'].replace('.', '/') 222 | return "https://github.com/tijme/not-your-average-web-crawler/tree/{}/{}.py".format(semver, filename) 223 | 224 | # Napoleon 225 | napoleon_google_docstring = True 226 | napoleon_include_init_with_doc = True 227 | napoleon_include_private_with_doc = True 228 | 229 | # Always make sure current release is in releases.js 230 | import json 231 | from collections import OrderedDict 232 | 233 | releasesjs = open('../../releases.js').read().replace("var releases = ", "") 234 | releases = json.loads(releasesjs, object_pairs_hook=OrderedDict) 235 | 236 | releases[release] = True 237 | 238 | with open('../../releases.js', 'w') as outfile: 239 | outfile.write("var releases = " + json.dumps(releases)) 240 | -------------------------------------------------------------------------------- /docs/source/getting_started.rst: -------------------------------------------------------------------------------- 1 | .. title:: Getting Started 2 | 3 | Minimal example 4 | --------------- 5 | 6 | N.Y.A.W.C does not have a CLI entry point, so you need to create one yourself. Save the code below as ``example.py``. The example code prints all request URLs that were found by the crawler. 7 | 8 | .. code:: python 9 | 10 | # example.py 11 | 12 | from nyawc.Options import Options 13 | from nyawc.Crawler import Crawler 14 | from nyawc.QueueItem import QueueItem 15 | from nyawc.CrawlerActions import CrawlerActions 16 | from nyawc.http.Request import Request 17 | 18 | def cb_crawler_before_start(): 19 | print("Crawler started.") 20 | 21 | def cb_crawler_after_finish(queue): 22 | print("Crawler finished.") 23 | print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.") 24 | 25 | def cb_request_before_start(queue, queue_item): 26 | print("Starting: {}".format(queue_item.request.url)) 27 | return CrawlerActions.DO_CONTINUE_CRAWLING 28 | 29 | def cb_request_after_finish(queue, queue_item, new_queue_items): 30 | print("Finished: {}".format(queue_item.request.url)) 31 | return CrawlerActions.DO_CONTINUE_CRAWLING 32 | 33 | options = Options() 34 | 35 | options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route. 36 | options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route. 37 | options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route. 38 | options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route. 
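# The same options object also carries scope, identity, and routing settings;
# see "Adding extra options" below. A hedged sketch in comments only, with
# attribute names as used in the kitchen sink example:
#
#     options.scope.max_depth = 2  # only crawl two levels deep from the start request
#     options.identity.headers.update({"User-Agent": "MyCustomUserAgent"})
#     options.routing.routes = ["^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"]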
39 | 40 | crawler = Crawler(options) 41 | crawler.start_with(Request("https://finnwea.com/")) 42 | 43 | Testing example.py 44 | ------------------ 45 | 46 | In the foreground 47 | ~~~~~~~~~~~~~~~~~ 48 | 49 | Output all contents to the console. 50 | 51 | ``$ python example.py`` 52 | 53 | In the background 54 | ~~~~~~~~~~~~~~~~~ 55 | 56 | Output all contents to a file and run the process in the background. 57 | 58 | ``$ python -u example.py > output.log`` 59 | 60 | Adding extra options 61 | -------------------- 62 | 63 | Callbacks 64 | ~~~~~~~~~ 65 | 66 | All the available callbacks are documented `here <https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html>`_. 67 | 68 | Scope 69 | ~~~~~ 70 | 71 | You can set scope options to, for example, only crawl certain subdomains or certain request methods. See `this <https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html>`_ page for all the available scope options. 72 | 73 | Identity 74 | ~~~~~~~~ 75 | 76 | Do you want to use authentication, set headers or use a proxy? Check `these <https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html>`_ identity options for documentation. 77 | 78 | Routing 79 | ~~~~~~~ 80 | 81 | If you want to ignore similar requests (e.g. /news/1, /news/2, /news/3, etc.) you can specify routes via the `routing <https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html>`_ options. 82 | 83 | The kitchen sink 84 | ---------------- 85 | 86 | The kitchen sink is an example that implements all the features/options of N.Y.A.W.C. It is available to copy and paste. `Check it out <https://tijme.github.io/not-your-average-web-crawler/latest/kitchen_sink.html>`_! 87 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. title:: Home 2 | 3 | .. raw:: html 4 | 5 |
6 |    <!-- Landing-page markup was stripped from this dump; its visible text is preserved below. -->
7 | 
   Did you ever want to test your payload against all requests of a certain domain? N.Y.A.W.C can help you with that. It crawls all requests (e.g. GET, POST or PUT) on the specified domain and keeps track of the request and response data. During the crawling process, the callbacks enable you to insert your payload at specific places and test if they worked. And using the built-in options you can do even more. Get started!
8 | 
13 |    Step #1: You can define your startpoint (a request) and the crawling scope and then start the crawler.
17 |    Step #2: The crawler repeatedly starts the first request in the queue until max threads is reached.
21 |    Step #3: The crawler adds all requests found in the response to the end of the queue (except duplicates).
25 |    Step #4: The crawler goes back to step #2 to spawn new requests repeatedly until max threads is reached.
36 | 
   Several callbacks can be used throughout the crawling process to, for example, modify requests on the go.
38 | 
    39 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. title:: Installation 2 | 3 | .. raw:: html 4 | 5 | 6 | 7 | Install using PIP 8 | ----------------- 9 | 10 | All releases of N.Y.A.W.C are available on PyPi (`link `_). 11 | 12 | #. Make sure you are using Python 2.7/3.3 or higher. 13 | #. ``$ pip install --upgrade nyawc`` 14 | 15 | Install using EasyInstall 16 | ------------------------- 17 | 18 | #. Make sure you are using Python 2.7/3.3 or higher. 19 | #. ``$ easy_install --upgrade nyawc`` 20 | 21 | Download as ZIP 22 | --------------- 23 | 24 | #. Make sure you are using Python 2.7/3.3 or higher. 25 | #. Download and extract the ZIP file (`link `__). 26 | #. Run the unit-tests to verify you have a working version. 27 | 28 | #. ``$ python -m unittest discover``. 29 | 30 | #. Install N.Y.A.W.C. 31 | 32 | #. ``$ python setup.py install`` 33 | 34 | Clone using GIT 35 | --------------- 36 | 37 | #. Make sure you are using Python 2.7/3.3 or higher. 38 | #. Clone the project 39 | 40 | #. ``$ git clone https://github.com/tijme/not-your-average-web-crawler.git``. 41 | 42 | #. Run the unit-tests to verify you have a working version. 43 | 44 | #. ``$ python -m unittest discover``. 45 | 46 | #. Install N.Y.A.W.C. 47 | 48 | #. ``$ python setup.py install`` 49 | -------------------------------------------------------------------------------- /docs/source/kitchen_sink.rst: -------------------------------------------------------------------------------- 1 | .. title:: Kitchen Sink 2 | 3 | The English phrase "Everything but the kitchen sink" means "almost anything one can think of". The example below contains all the functionalities from N.Y.A.W.C. 4 | 5 | .. 
code:: python 6 | 7 | # example.py 8 | 9 | from nyawc.Options import Options 10 | from nyawc.QueueItem import QueueItem 11 | from nyawc.Crawler import Crawler 12 | from nyawc.CrawlerActions import CrawlerActions 13 | from nyawc.http.Request import Request 14 | from requests.auth import HTTPBasicAuth 15 | 16 | def cb_crawler_before_start(): 17 | print("Crawler started.") 18 | 19 | def cb_crawler_after_finish(queue): 20 | print("Crawler finished.") 21 | print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.") 22 | 23 | for queue_item in queue.get_all(QueueItem.STATUS_FINISHED).values(): 24 | print("[" + queue_item.request.method + "] " + queue_item.request.url + " (PostData: " + str(queue_item.request.data) + ")") 25 | 26 | def cb_request_before_start(queue, queue_item): 27 | # return CrawlerActions.DO_SKIP_TO_NEXT 28 | # return CrawlerActions.DO_STOP_CRAWLING 29 | 30 | return CrawlerActions.DO_CONTINUE_CRAWLING 31 | 32 | def cb_request_after_finish(queue, queue_item, new_queue_items): 33 | percentage = str(int(queue.get_progress())) 34 | total_requests = str(queue.count_total) 35 | 36 | print("At " + percentage + "% of " + total_requests + " requests ([" + str(queue_item.response.status_code) + "] " + queue_item.request.url + ").") 37 | 38 | # return CrawlerActions.DO_STOP_CRAWLING 39 | return CrawlerActions.DO_CONTINUE_CRAWLING 40 | 41 | def cb_request_in_thread_before_start(queue_item): 42 | pass 43 | 44 | def cb_request_in_thread_after_finish(queue_item): 45 | pass 46 | 47 | def cb_request_on_error(queue_item, message): 48 | print("[error] " + message) 49 | 50 | def cb_form_before_autofill(queue_item, elements, form_data): 51 | # return CrawlerActions.DO_NOT_AUTOFILL_FORM 52 | 53 | return CrawlerActions.DO_AUTOFILL_FORM 54 | 55 | def cb_form_after_autofill(queue_item, elements, form_data): 56 | pass 57 | 58 | # Declare the options 59 | options = Options() 60 | 61 | # Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html) 62 | options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route. 63 | options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route. 64 | options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route. 65 | options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route. 66 | options.callbacks.request_in_thread_before_start = cb_request_in_thread_before_start # Called in the crawling thread (when it started). Default is a null route. 67 | options.callbacks.request_in_thread_after_finish = cb_request_in_thread_after_finish # Called in the crawling thread (when it finished). Default is a null route. 68 | options.callbacks.request_on_error = cb_request_on_error # Called if a request failed. Default is a null route. 69 | options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route. 70 | options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route. 
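# A callback can also steer the crawl dynamically. A hedged sketch in comments
# only, stopping the crawl once 100 requests have finished (it uses only names
# shown above):
#
#     def cb_request_after_finish(queue, queue_item, new_queue_items):
#         if len(queue.get_all(QueueItem.STATUS_FINISHED)) >= 100:
#             return CrawlerActions.DO_STOP_CRAWLING
#         return CrawlerActions.DO_CONTINUE_CRAWLING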
71 | 72 | # Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html) 73 | options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False. 74 | options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True. 75 | options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True. 76 | options.scope.tld_must_match = True # Only crawl pages with the same tld as the startpoint (e.g. only `.com`). Default is True. 77 | options.scope.max_depth = None # The maximum search depth. 0 only crawls the start request. 1 will also crawl all the requests found on the start request. 2 goes one level deeper, and so on. Default is None (unlimited). 78 | options.scope.request_methods = [ 79 | # The request methods to crawl. Default is all request methods 80 | Request.METHOD_GET, 81 | Request.METHOD_POST, 82 | Request.METHOD_PUT, 83 | Request.METHOD_DELETE, 84 | Request.METHOD_OPTIONS, 85 | Request.METHOD_HEAD 86 | ] 87 | 88 | # Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html) 89 | options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None. 90 | options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies') 91 | options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere') 92 | options.identity.proxies = { 93 | # No authentication 94 | # 'http': 'http://host:port', 95 | # 'https': 'http://host:port', 96 | 97 | # Basic authentication 98 | # 'http': 'http://user:pass@host:port', 99 | # 'https': 'https://user:pass@host:port', 100 | 101 | # SOCKS 102 | # 'http': 'socks5://user:pass@host:port', 103 | # 'https': 'socks5://user:pass@host:port' 104 | } 105 | options.identity.headers.update({ 106 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" 107 | }) 108 | 109 | # Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html) 110 | options.performance.max_threads = 20 # The maximum number of simultaneous threads to use for crawling. Default is 40. 111 | options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30. 112 | 113 | # Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html) 114 | options.routing.minimum_threshold = 4 # The minimum number of requests to crawl (matching a certain route) before ignoring the rest. Default is 20. 115 | options.routing.routes = [ 116 | # The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array. 117 | "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times. 118 | ] 119 | 120 | # Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html) 121 | options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False. 
122 | options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True. 123 | options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None. 124 | 125 | crawler = Crawler(options) 126 | crawler.start_with(Request("https://finnwea.com/")) 127 | -------------------------------------------------------------------------------- /docs/source/migration.rst: -------------------------------------------------------------------------------- 1 | .. title:: Migration 2 | 3 | From 1.6 to 1.7 4 | --------------- 5 | 6 | .. raw:: html 7 | 8 |

    pip install --upgrade nyawc


9 | 10 | **Default request timeout is now 30 seconds** 11 | 12 | From now on there is a default request timeout of 30 seconds. In previous versions it was always infinite and you couldn't specify it. 13 | 14 | If you want to keep the request timeout infinite, set the request timeout option to ``None``. 15 | 16 | .. code:: python 17 | 18 | options.performance.request_timeout = 30 19 | 20 | **Count attributes removed from queue** 21 | 22 | The count attributes (e.g. ``queue.count_in_progress``) have been removed, since the time complexity of Python's native ``len()`` method is already O(1). 23 | 24 | .. code:: python 25 | 26 | # Old 27 | print("In progress count: " + str(queue.count_in_progress)) 28 | 29 | # New 30 | print("In progress count: " + str(len(queue.get_all(QueueItem.STATUS_IN_PROGRESS)))) 31 | 32 | From 1.5 to 1.6 33 | --------------- 34 | 35 | .. raw:: html 36 | 37 |

    pip install --upgrade nyawc


38 | 39 | **Headers have default values and are case-insensitive** 40 | 41 | From now on the headers identity option has default values and is a case-insensitive dict. When changing headers, use the ``.update()`` method so the default headers remain intact. 42 | 43 | .. code:: python 44 | 45 | # Old 46 | options.identity.headers = { 47 | "User-Agent": "MyCustomUserAgent" 48 | } 49 | 50 | # New 51 | options.identity.headers.update({ 52 | "User-Agent": "MyCustomUserAgent" 53 | }) 54 | 55 | **New default user agent** 56 | 57 | The default user agent for the crawler has changed. In version 1.5 it was a fake Chrome user agent; from now on it is ``nyawc/1.6.0 CPython/3.6.1 Windows/10``, depending on the versions you use. 58 | 59 | The Chrome user agent from version 1.5 can still be faked by using the code below. 60 | 61 | .. code:: python 62 | 63 | options.identity.headers.update({ 64 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" 65 | }) 66 | 67 | From 1.4 to 1.5 68 | --------------- 69 | 70 | .. raw:: html 71 | 72 |

    pip install --upgrade nyawc


    73 | 74 | **Renamed the domain must match scope option** 75 | 76 | Since version 1.5 the domain_must_match option is now called hostname_must_match. 77 | 78 | .. code:: python 79 | 80 | # Old 81 | Options().scope.domain_must_match = True/False 82 | 83 | # New 84 | Options().scope.hostname_must_match = True/False 85 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | nyawc 2 | ===== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | nyawc 8 | -------------------------------------------------------------------------------- /docs/source/nyawc.helpers.rst: -------------------------------------------------------------------------------- 1 | nyawc\.helpers package 2 | ====================== 3 | 4 | .. automodule:: nyawc.helpers 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | nyawc\.helpers\.HTTPRequestHelper module 13 | ---------------------------------------- 14 | 15 | .. automodule:: nyawc.helpers.HTTPRequestHelper 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | nyawc\.helpers\.PackageHelper module 21 | ------------------------------------ 22 | 23 | .. automodule:: nyawc.helpers.PackageHelper 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | nyawc\.helpers\.RandomInputHelper module 29 | ---------------------------------------- 30 | 31 | .. automodule:: nyawc.helpers.RandomInputHelper 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | nyawc\.helpers\.URLHelper module 37 | -------------------------------- 38 | 39 | .. automodule:: nyawc.helpers.URLHelper 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | 45 | -------------------------------------------------------------------------------- /docs/source/nyawc.http.rst: -------------------------------------------------------------------------------- 1 | nyawc\.http package 2 | =================== 3 | 4 | .. automodule:: nyawc.http 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | nyawc\.http\.Handler module 13 | --------------------------- 14 | 15 | .. automodule:: nyawc.http.Handler 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | nyawc\.http\.Request module 21 | --------------------------- 22 | 23 | .. automodule:: nyawc.http.Request 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | nyawc\.http\.Response module 29 | ---------------------------- 30 | 31 | .. automodule:: nyawc.http.Response 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | 37 | -------------------------------------------------------------------------------- /docs/source/nyawc.rst: -------------------------------------------------------------------------------- 1 | nyawc package 2 | ============= 3 | 4 | .. automodule:: nyawc 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | nyawc.helpers 15 | nyawc.http 16 | nyawc.scrapers 17 | 18 | Submodules 19 | ---------- 20 | 21 | nyawc\.Crawler module 22 | --------------------- 23 | 24 | .. automodule:: nyawc.Crawler 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | nyawc\.CrawlerActions module 30 | ---------------------------- 31 | 32 | .. 
automodule:: nyawc.CrawlerActions 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | nyawc\.CrawlerThread module 38 | --------------------------- 39 | 40 | .. automodule:: nyawc.CrawlerThread 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | nyawc\.Options module 46 | --------------------- 47 | 48 | .. automodule:: nyawc.Options 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | nyawc\.Queue module 54 | ------------------- 55 | 56 | .. automodule:: nyawc.Queue 57 | :members: 58 | :undoc-members: 59 | :show-inheritance: 60 | 61 | nyawc\.QueueItem module 62 | ----------------------- 63 | 64 | .. automodule:: nyawc.QueueItem 65 | :members: 66 | :undoc-members: 67 | :show-inheritance: 68 | 69 | 70 | -------------------------------------------------------------------------------- /docs/source/nyawc.scrapers.rst: -------------------------------------------------------------------------------- 1 | nyawc\.scrapers package 2 | ======================= 3 | 4 | .. automodule:: nyawc.scrapers 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | nyawc\.scrapers\.BaseScraper module 13 | ----------------------------------- 14 | 15 | .. automodule:: nyawc.scrapers.BaseScraper 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | nyawc\.scrapers\.CSSRegexLinkScraper module 21 | ------------------------------------------- 22 | 23 | .. automodule:: nyawc.scrapers.CSSRegexLinkScraper 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | nyawc\.scrapers\.HTMLSoupFormScraper module 29 | ------------------------------------------- 30 | 31 | .. automodule:: nyawc.scrapers.HTMLSoupFormScraper 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | nyawc\.scrapers\.HTMLSoupLinkScraper module 37 | ------------------------------------------- 38 | 39 | .. automodule:: nyawc.scrapers.HTMLSoupLinkScraper 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | nyawc\.scrapers\.JSONRegexLinkScraper module 45 | -------------------------------------------- 46 | 47 | .. automodule:: nyawc.scrapers.JSONRegexLinkScraper 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | nyawc\.scrapers\.XMLRegexLinkScraper module 53 | ------------------------------------------- 54 | 55 | .. automodule:: nyawc.scrapers.XMLRegexLinkScraper 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | 61 | -------------------------------------------------------------------------------- /docs/source/options_crawling_identity.rst: -------------------------------------------------------------------------------- 1 | .. title:: Crawling identity 2 | 3 | How to use identity options 4 | --------------------------- 5 | 6 | .. 
code:: python 7 | 8 | # identity_example.py 9 | 10 | from requests.auth import HTTPBasicAuth 11 | from nyawc.Options import Options 12 | from nyawc.Crawler import Crawler 13 | from nyawc.http.Request import Request 14 | 15 | options = Options() 16 | 17 | options.identity.auth = HTTPBasicAuth('user', 'pass') 18 | options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies') 19 | options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere') 20 | options.identity.proxies = { 21 | # No authentication 22 | # 'http': 'http://host:port', 23 | # 'https': 'http://host:port', 24 | 25 | # Basic authentication 26 | # 'http': 'http://user:pass@host:port', 27 | # 'https': 'https://user:pass@host:port', 28 | 29 | # SOCKS 30 | 'http': 'socks5://user:pass@host:port', 31 | 'https': 'socks5://user:pass@host:port' 32 | } 33 | options.identity.headers.update({ 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" 35 | }) 36 | 37 | crawler = Crawler(options) 38 | crawler.start_with(Request("https://finnwea.com/")) 39 | 40 | Available identity options 41 | -------------------------- 42 | 43 | Authentication 44 | ~~~~~~~~~~~~~~ 45 | 46 | Set the authentication for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/authentication/>`__ authentication for all the options. Default is None (no authentication). 47 | 48 | You can find examples of different types of authentication below. 49 | 50 | .. code:: python 51 | 52 | from requests.auth import HTTPBasicAuth 53 | options.identity.auth = HTTPBasicAuth('user', 'pass') 54 | 55 | from requests.auth import HTTPDigestAuth 56 | options.identity.auth = HTTPDigestAuth('user', 'pass') 57 | 58 | from requests_oauthlib import OAuth1 59 | options.identity.auth = OAuth1('YOUR_APP_KEY', 'YOUR_APP_SECRET', 'USER_OAUTH_TOKEN', 'USER_OAUTH_TOKEN_SECRET') 60 | 61 | Cookies 62 | ~~~~~~~ 63 | 64 | Set custom cookies for the crawler. Please check the python-requests cookie jar documentation for all the cookie options. 65 | 66 | .. code:: python 67 | 68 | options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies') 69 | 70 | Proxy 71 | ~~~~~ 72 | 73 | Set a proxy for the crawler. Please check the python-requests proxies documentation for all the proxy options. Default is None (no proxy). 74 | 75 | You can find examples of different types of proxies below. 76 | 77 | .. code:: python 78 | 79 | # Without authentication 80 | options.identity.proxies = { 81 | 'http': 'http://host:port', 82 | 'https': 'http://host:port' 83 | } 84 | 85 | # With basic authentication 86 | options.identity.proxies = { 87 | 'http': 'http://user:pass@host:port', 88 | 'https': 'https://user:pass@host:port' 89 | } 90 | 91 | # With SOCKS 92 | options.identity.proxies = { 93 | 'http': 'socks5://user:pass@host:port', 94 | 'https': 'socks5://user:pass@host:port' 95 | } 96 | 97 | Headers 98 | ~~~~~~~ 99 | 100 | Set custom headers for the crawler (as {key: value} CaseInsensitiveDict). For example, you can set a new user agent by using ``User-Agent`` as key, as shown below. 101 | 102 | Please note that you should use the ``.update()`` method so the default headers remain the same. 103 | 104 | .. code:: python 105 | 106 | options.identity.headers.update({ 107 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with.
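        # Illustration (hypothetical header value, not in the original docs):
        # other defaults can be overridden in the same update() call, e.g.
        # "Accept-Language": "en-US,en;q=0.8",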
108 | }) 109 | -------------------------------------------------------------------------------- /docs/source/options_crawling_scope.rst: -------------------------------------------------------------------------------- 1 | .. title:: Crawling scope 2 | 3 | How to use scope options 4 | ------------------------ 5 | 6 | .. code:: python 7 | 8 | # scope_example.py 9 | 10 | from nyawc.Options import Options 11 | from nyawc.Crawler import Crawler 12 | from nyawc.http.Request import Request 13 | 14 | options = Options() 15 | 16 | options.scope.protocol_must_match = False 17 | options.scope.subdomain_must_match = True 18 | options.scope.hostname_must_match = True 19 | options.scope.tld_must_match = True 20 | options.scope.max_depth = None 21 | options.scope.request_methods = [ 22 | Request.METHOD_GET, 23 | Request.METHOD_POST, 24 | Request.METHOD_PUT, 25 | Request.METHOD_DELETE, 26 | Request.METHOD_OPTIONS, 27 | Request.METHOD_HEAD 28 | ] 29 | 30 | crawler = Crawler(options) 31 | crawler.start_with(Request("https://finnwea.com/")) 32 | 33 | Available scope options 34 | ----------------------- 35 | 36 | Protocol must match 37 | ~~~~~~~~~~~~~~~~~~~ 38 | 39 | Only crawl pages with the same protocol as the startpoint (e.g. only https) if True. Default is False. 40 | 41 | .. code:: python 42 | 43 | options.scope.protocol_must_match = False 44 | 45 | Subdomain must match 46 | ~~~~~~~~~~~~~~~~~~~~ 47 | 48 | Only crawl pages with the same subdomain as the startpoint if True. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True. 49 | 50 | Please note that the `www` subdomain will be treated the same as no subdomain. 51 | 52 | .. code:: python 53 | 54 | options.scope.subdomain_must_match = True 55 | 56 | Hostname must match 57 | ~~~~~~~~~~~~~~~~~~~ 58 | 59 | Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`) if True. Default is True. 60 | 61 | Please note that if you set this to false, chances are that it never stops crawling. 62 | 63 | .. code:: python 64 | 65 | options.scope.hostname_must_match = True 66 | 67 | TLD must match 68 | ~~~~~~~~~~~~~~ 69 | 70 | Only crawl pages with the same tld as the startpoint (e.g. only `.com`) if True. Default is True. 71 | 72 | .. code:: python 73 | 74 | options.scope.tld_must_match = True 75 | 76 | Maximum crawling depth 77 | ~~~~~~~~~~~~~~~~~~~~~~ 78 | 79 | The maximum search depth. Default is None (unlimited). 80 | 81 | - 0 will only crawl the start request. 82 | - 1 will also crawl all requests found on the start request. 83 | - 2 will go one level deeper. 84 | - And so on... 85 | 86 | .. code:: python 87 | 88 | options.scope.max_depth = None 89 | 90 | Allowed request methods 91 | ~~~~~~~~~~~~~~~~~~~~~~~ 92 | 93 | Only crawl these request methods. If empty or ``None`` all request methods will be crawled. Default is all. 94 | 95 | .. code:: python 96 | 97 | options.scope.request_methods = [ 98 | Request.METHOD_GET, 99 | Request.METHOD_POST, 100 | Request.METHOD_PUT, 101 | Request.METHOD_DELETE, 102 | Request.METHOD_OPTIONS, 103 | Request.METHOD_HEAD 104 | ] 105 | -------------------------------------------------------------------------------- /docs/source/options_misc.rst: -------------------------------------------------------------------------------- 1 | .. title:: Misc 2 | 3 | How to use misc options 4 | ----------------------- 5 | 6 | .. 
code:: python 7 | 8 | # misc_example.py 9 | 10 | from nyawc.Options import Options 11 | from nyawc.Crawler import Crawler 12 | from nyawc.http.Request import Request 13 | 14 | options = Options() 15 | 16 | options.misc.debug = False 17 | options.misc.verify_ssl_certificates = True 18 | options.misc.trusted_certificates = None 19 | 20 | crawler = Crawler(options) 21 | crawler.start_with(Request("https://finnwea.com/")) 22 | 23 | Available misc options 24 | ---------------------- 25 | 26 | Debug 27 | ~~~~~ 28 | 29 | If debug is enabled extra information will be logged to the console. Default is False. 30 | 31 | ``options.misc.debug = True`` 32 | 33 | 34 | Verify SSL certificates 35 | ~~~~~~~~~~~~~~~~~~~~~~~ 36 | 37 | If verification is enabled all SSL certificates will be checked for validity. Default is True. 38 | 39 | ``options.misc.verify_ssl_certificates = True`` 40 | 41 | 42 | Trusted certificates 43 | ~~~~~~~~~~~~~~~~~~~~ 44 | 45 | To trust certain certificates (e.g. if you are using a proxy), you can pass the path to a CA_BUNDLE file or directory with certificates of additional trusted CAs. Default is None (which means only domains with valid SSL certificates can be crawled). 46 | 47 | **If verify is set to a directory, the directory must have been processed using the c_rehash utility supplied with OpenSSL.** 48 | 49 | ``options.misc.trusted_certificates = '/path/to/certificate.pem'`` 50 | -------------------------------------------------------------------------------- /docs/source/options_performance.rst: -------------------------------------------------------------------------------- 1 | .. title:: Performance 2 | 3 | How to use performance options 4 | ------------------------------ 5 | 6 | .. code:: python 7 | 8 | # performance_example.py 9 | 10 | from nyawc.Options import Options 11 | from nyawc.Crawler import Crawler 12 | from nyawc.http.Request import Request 13 | 14 | options = Options() 15 | 16 | options.performance.max_threads = 20 17 | options.performance.request_timeout = 15 18 | 19 | crawler = Crawler(options) 20 | crawler.start_with(Request("https://finnwea.com/")) 21 | 22 | Available performance options 23 | ----------------------------- 24 | 25 | Maximum threads 26 | ~~~~~~~~~~~~~~~ 27 | 28 | The maximum amount of simultaneous threads to use for crawling. Default is 40. 29 | 30 | ``options.performance.max_threads = 40`` 31 | 32 | Request timeout 33 | ~~~~~~~~~~~~~~~ 34 | 35 | The request timeout in seconds (throws an exception if exceeded). Default is 30. 36 | 37 | ``options.performance.request_timeout = 30`` 38 | -------------------------------------------------------------------------------- /docs/source/options_routing.rst: -------------------------------------------------------------------------------- 1 | .. title:: Routing 2 | 3 | How to use routing options 4 | -------------------------- 5 | 6 | .. code:: python 7 | 8 | # routing_example.py 9 | 10 | from nyawc.Options import Options 11 | from nyawc.Crawler import Crawler 12 | from nyawc.http.Request import Request 13 | 14 | options = Options() 15 | 16 | options.routing.minimum_threshold = 4 17 | options.routing.routes = [ 18 | "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" 19 | ] 20 | 21 | crawler = Crawler(options) 22 | crawler.start_with(Request("https://finnwea.com/")) 23 | 24 | Available routing options 25 | ------------------------- 26 | 27 | Minimum threshold 28 | ~~~~~~~~~~~~~~~~~ 29 | 30 | The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20. 
31 | 32 | For example, let's say we have these requests: 33 | 34 | .. code:: 35 | 36 | https://finnwea.com/blog/1 37 | https://finnwea.com/blog/2 38 | https://finnwea.com/blog/3 39 | ... 40 | https://finnwea.com/blog/54 41 | 42 | It will only crawl the first 20 requests. After that it ignores the rest of the blog posts. 43 | 44 | **Please note that it will probably crawl a bit more than the minimum threshold depending on the maximum amount of threads to use.** 45 | 46 | ``options.routing.minimum_threshold = 20`` 47 | 48 | Routes 49 | ~~~~~~ 50 | 51 | The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array. 52 | 53 | For example, the route below represents ``http://finnwea.com/blog/{a-variable-blog-alias}/``. 54 | 55 | ``options.routing.routes = ["^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"]`` -------------------------------------------------------------------------------- /example_extensive.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE.
24 | 25 | from nyawc.Options import Options 26 | from nyawc.QueueItem import QueueItem 27 | from nyawc.Crawler import Crawler 28 | from nyawc.CrawlerActions import CrawlerActions 29 | from nyawc.http.Request import Request 30 | from requests.auth import HTTPBasicAuth 31 | 32 | def cb_crawler_before_start(): 33 | print("Crawler started.") 34 | 35 | def cb_crawler_after_finish(queue): 36 | print("Crawler finished.") 37 | print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.") 38 | 39 | for queue_item in queue.get_all(QueueItem.STATUS_FINISHED).values(): 40 | print("[" + queue_item.request.method + "] " + queue_item.request.url + " (PostData: " + str(queue_item.request.data) + ")") 41 | 42 | def cb_request_before_start(queue, queue_item): 43 | # return CrawlerActions.DO_SKIP_TO_NEXT 44 | # return CrawlerActions.DO_STOP_CRAWLING 45 | 46 | return CrawlerActions.DO_CONTINUE_CRAWLING 47 | 48 | def cb_request_after_finish(queue, queue_item, new_queue_items): 49 | percentage = str(int(queue.get_progress())) 50 | total_requests = str(queue.count_total) 51 | 52 | print("At " + percentage + "% of " + total_requests + " requests ([" + str(queue_item.response.status_code) + "] " + queue_item.request.url + ").") 53 | 54 | # return CrawlerActions.DO_STOP_CRAWLING 55 | return CrawlerActions.DO_CONTINUE_CRAWLING 56 | 57 | def cb_request_in_thread_before_start(queue_item): 58 | pass 59 | 60 | def cb_request_in_thread_after_finish(queue_item): 61 | pass 62 | 63 | def cb_request_on_error(queue_item, message): 64 | print("[error] " + message) 65 | 66 | def cb_form_before_autofill(queue_item, elements, form_data): 67 | # return CrawlerActions.DO_NOT_AUTOFILL_FORM 68 | 69 | return CrawlerActions.DO_AUTOFILL_FORM 70 | 71 | def cb_form_after_autofill(queue_item, elements, form_data): 72 | pass 73 | 74 | # Declare the options 75 | options = Options() 76 | 77 | # Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html) 78 | options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route. 79 | options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route. 80 | options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route. 81 | options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route. 82 | options.callbacks.request_in_thread_before_start = cb_request_in_thread_before_start # Called in the crawling thread (when it started). Default is a null route. 83 | options.callbacks.request_in_thread_after_finish = cb_request_in_thread_after_finish # Called in the crawling thread (when it finished). Default is a null route. 84 | options.callbacks.request_on_error = cb_request_on_error # Called if a request failed. Default is a null route. 85 | options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route. 86 | options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route. 87 | 88 | # Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html) 89 | options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False. 
90 | options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True. 91 | options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True. 92 | options.scope.tld_must_match = True # Only crawl pages with the same tld as the startpoint (e.g. only `.com`). Default is True. 93 | options.scope.max_depth = None # The maximum search depth. 0 only crawls the start request. 1 will also crawl all the requests found on the start request. 2 goes one level deeper, and so on. Default is None (unlimited). 94 | options.scope.request_methods = [ 95 | # The request methods to crawl. Default is all request methods. 96 | Request.METHOD_GET, 97 | Request.METHOD_POST, 98 | Request.METHOD_PUT, 99 | Request.METHOD_DELETE, 100 | Request.METHOD_OPTIONS, 101 | Request.METHOD_HEAD 102 | ] 103 | 104 | # Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html) 105 | options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None. 106 | options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies') 107 | options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere') 108 | options.identity.proxies = { 109 | # No authentication 110 | # 'http': 'http://host:port', 111 | # 'https': 'http://host:port', 112 | 113 | # Basic authentication 114 | # 'http': 'http://user:pass@host:port', 115 | # 'https': 'https://user:pass@host:port', 116 | 117 | # SOCKS 118 | # 'http': 'socks5://user:pass@host:port', 119 | # 'https': 'socks5://user:pass@host:port' 120 | } 121 | options.identity.headers.update({ 122 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" 123 | }) 124 | 125 | # Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html) 126 | options.performance.max_threads = 20 # The maximum amount of simultaneous threads to use for crawling. Default is 40. 127 | options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30. 128 | 129 | # Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html) 130 | options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20. 131 | options.routing.routes = [ 132 | # The regular expressions that represent routes that should not be crawled more times than the minimum threshold. Default is an empty array. 133 | "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times. 134 | ] 135 | 136 | # Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html) 137 | options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False. 138 | options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True. 139 | options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.
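# Illustration (not in the original file): the request timeout can also be
# disabled entirely, as described in the migration guide:
#
#     options.performance.request_timeout = None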
140 | 141 | crawler = Crawler(options) 142 | crawler.start_with(Request("https://finnwea.com/")) 143 | -------------------------------------------------------------------------------- /example_minimal.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from nyawc.Options import Options 26 | from nyawc.Crawler import Crawler 27 | from nyawc.QueueItem import QueueItem 28 | from nyawc.CrawlerActions import CrawlerActions 29 | from nyawc.http.Request import Request 30 | 31 | def cb_crawler_before_start(): 32 | print("Crawler started.") 33 | 34 | def cb_crawler_after_finish(queue): 35 | print("Crawler finished.") 36 | print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.") 37 | 38 | def cb_request_before_start(queue, queue_item): 39 | print("Starting: {}".format(queue_item.request.url)) 40 | return CrawlerActions.DO_CONTINUE_CRAWLING 41 | 42 | def cb_request_after_finish(queue, queue_item, new_queue_items): 43 | print("Finished: {}".format(queue_item.request.url)) 44 | return CrawlerActions.DO_CONTINUE_CRAWLING 45 | 46 | options = Options() 47 | 48 | options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route. 49 | options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route. 50 | options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route. 51 | options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route. 
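# Illustration (not in the original file): other option groups can be tuned
# in the same way before the crawler starts, for example:
#
#     options.performance.max_threads = 10
#     options.scope.max_depth = 2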
52 | 53 | crawler = Crawler(options) 54 | crawler.start_with(Request("https://finnwea.com/")) 55 | -------------------------------------------------------------------------------- /nyawc/CrawlerActions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | class CrawlerActions(object): 26 | """The actions that crawler callbacks can return. 27 | 28 | Attributes: 29 | DO_CONTINUE_CRAWLING (int): Continue by crawling the request. 30 | DO_SKIP_TO_NEXT (int): Skip the current request and continue with the next one in line. 31 | DO_STOP_CRAWLING (int): Stop crawling and quit ongoing requests. 32 | DO_AUTOFILL_FORM (int): Autofill this form with random values. 33 | DO_NOT_AUTOFILL_FORM (int): Do not autofill this form with random values. 34 | 35 | """ 36 | 37 | DO_CONTINUE_CRAWLING = 1 38 | 39 | DO_SKIP_TO_NEXT = 2 40 | 41 | DO_STOP_CRAWLING = 3 42 | 43 | DO_AUTOFILL_FORM = 4 44 | 45 | DO_NOT_AUTOFILL_FORM = 5 46 | -------------------------------------------------------------------------------- /nyawc/CrawlerThread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import threading 26 | 27 | from nyawc.helpers.DebugHelper import DebugHelper 28 | from nyawc.http.Handler import Handler 29 | from nyawc.QueueItem import QueueItem 30 | 31 | class CrawlerThread(threading.Thread): 32 | """The crawler thread executes the HTTP request using the HTTP handler. 33 | 34 | Attributes: 35 | __callback (obj): The method to call when finished. 36 | __callback_lock (obj): The callback lock that prevents race conditions. 37 | __options (:class:`nyawc.Options`): The settings/options object. 38 | __queue_item (:class:`nyawc.QueueItem`): The queue item containing a request to execute. 39 | 40 | """ 41 | 42 | def __init__(self, callback, callback_lock, options, queue_item): 43 | """Constructs a crawler thread instance. 44 | 45 | Args: 46 | callback (obj): The method to call when finished. 47 | callback_lock (obj): The callback lock that prevents race conditions. 48 | options (:class:`nyawc.Options`): The settings/options object. 49 | queue_item (:class:`nyawc.QueueItem`): The queue item containing a request to execute. 50 | 51 | """ 52 | 53 | threading.Thread.__init__(self) 54 | 55 | self.__callback = callback 56 | self.__callback_lock = callback_lock 57 | self.__options = options 58 | self.__queue_item = queue_item 59 | 60 | def run(self): 61 | """Executes the HTTP call. 62 | 63 | Note: 64 | If this and the parent handler raised an error, the queue item status 65 | will be set to errored instead of finished. This is to prevent e.g. 404 66 | recursion.
67 | 68 | """ 69 | 70 | try: 71 | self.__options.callbacks.request_in_thread_before_start(self.__queue_item) 72 | except Exception as e: 73 | print(e) 74 | 75 | new_requests = [] 76 | failed = False 77 | 78 | try: 79 | handler = Handler(self.__options, self.__queue_item) 80 | new_requests = handler.get_new_requests() 81 | 82 | try: 83 | self.__queue_item.response.raise_for_status() 84 | except Exception: 85 | if self.__queue_item.request.parent_raised_error: 86 | failed = True 87 | else: 88 | for new_request in new_requests: 89 | new_request.parent_raised_error = True 90 | 91 | except Exception as e: 92 | failed = True 93 | 94 | error_message = "Setting status of '{}' to '{}' because of an HTTP error.".format( 95 | self.__queue_item.request.url, 96 | QueueItem.STATUS_ERRORED 97 | ) 98 | 99 | DebugHelper.output(self.__options, error_message) 100 | DebugHelper.output(self.__options, e) 101 | 102 | try: 103 | self.__options.callbacks.request_on_error(self.__queue_item, str(e)) 104 | except Exception as e: 105 | print(e) 106 | 107 | for new_request in new_requests: 108 | new_request.parent_url = self.__queue_item.request.url 109 | 110 | try: 111 | self.__options.callbacks.request_in_thread_after_finish(self.__queue_item) 112 | except Exception as e: 113 | print(e) 114 | 115 | with self.__callback_lock: 116 | self.__callback(self.__queue_item, new_requests, failed) 117 | -------------------------------------------------------------------------------- /nyawc/Queue.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from collections import OrderedDict 26 | from nyawc.http.Response import Response 27 | from nyawc.QueueItem import QueueItem 28 | 29 | class Queue(object): 30 | """A 'hash' queue containing all the requests of the crawler. 31 | 32 | Note: 33 | This queue uses a certain hash to prevent duplicate entries and improve 34 | the time complexity by checking if the hash exists instead of iterating 35 | over all items. 36 | 37 | Attributes: 38 | __options (:class:`nyawc.Options`): The options to use (used when generating queue item hashes). 39 | count_total (int): The total count of requests in the queue. 40 | items_queued list(:class:`nyawc.QueueItem`): The queued items (yet to be executed). 
41 | items_in_progress list(:class:`nyawc.QueueItem`): The items currently being executed. 42 | items_finished list(:class:`nyawc.QueueItem`): The finished items. 43 | items_cancelled list(:class:`nyawc.QueueItem`): Items that were cancelled. 44 | items_errored list(:class:`nyawc.QueueItem`): Items that generated an error. 45 | 46 | """ 47 | 48 | def __init__(self, options): 49 | """Constructs a Queue instance. 50 | 51 | Args: 52 | options (:class:`nyawc.Options`): The options to use. 53 | 54 | """ 55 | 56 | self.__options = options 57 | self.count_total = 0 58 | self.items_queued = OrderedDict() 59 | self.items_in_progress = OrderedDict() 60 | self.items_finished = OrderedDict() 61 | self.items_cancelled = OrderedDict() 62 | self.items_errored = OrderedDict() 63 | 64 | def add_request(self, request): 65 | """Add a request to the queue. 66 | 67 | Args: 68 | request (:class:`nyawc.http.Request`): The request to add. 69 | 70 | Returns: 71 | :class:`nyawc.QueueItem`: The created queue item. 72 | 73 | """ 74 | 75 | queue_item = QueueItem(request, Response(request.url)) 76 | self.add(queue_item) 77 | return queue_item 78 | 79 | def has_request(self, request): 80 | """Check if the given request already exists in the queue. 81 | 82 | Args: 83 | request (:class:`nyawc.http.Request`): The request to check. 84 | 85 | Returns: 86 | bool: True if already exists, False otherwise. 87 | 88 | """ 89 | 90 | queue_item = QueueItem(request, Response(request.url)) 91 | key = queue_item.get_hash() 92 | 93 | for status in QueueItem.STATUSES: 94 | if key in self.__get_var("items_" + status).keys(): 95 | return True 96 | 97 | return False 98 | 99 | def add(self, queue_item): 100 | """Add a request/response pair to the queue. 101 | 102 | Args: 103 | queue_item (:class:`nyawc.QueueItem`): The queue item to add. 104 | 105 | """ 106 | 107 | hash_key = queue_item.get_hash() 108 | items = self.__get_var("items_" + queue_item.status) 109 | 110 | if hash_key in items.keys(): 111 | return 112 | 113 | items[queue_item.get_hash()] = queue_item 114 | 115 | self.count_total += 1 116 | 117 | def move(self, queue_item, status): 118 | """Move a request/response pair to another status. 119 | 120 | Args: 121 | queue_item (:class:`nyawc.QueueItem`): The queue item to move 122 | status (str): The new status of the queue item. 123 | 124 | """ 125 | 126 | items = self.__get_var("items_" + queue_item.status) 127 | 128 | del items[queue_item.get_hash()] 129 | self.count_total -= 1 130 | 131 | queue_item.status = status 132 | self.add(queue_item) 133 | 134 | def move_bulk(self, from_statuses, to_status): 135 | """Move a bulk of request/response pairs to another status 136 | 137 | Args: 138 | from_statuses list(str): The statuses to move from 139 | to_status (str): The status to move to 140 | 141 | """ 142 | 143 | for status in from_statuses: 144 | from_status_items = self.__get_var("items_" + status) 145 | self.__set_var("items_" + status, OrderedDict()) 146 | 147 | to_status_items = self.__get_var("items_" + to_status) 148 | to_status_items.update(from_status_items) 149 | 150 | def get_first(self, status): 151 | """Get the first item in the queue that has the given status. 152 | 153 | Args: 154 | status (str): return the first item with this status. 155 | 156 | Returns: 157 | :class:`nyawc.QueueItem`: The first queue item with the given status. 
158 | 159 | """ 160 | 161 | items = self.get_all(status) 162 | 163 | if items: 164 | return list(items.items())[0][1] 165 | 166 | return None 167 | 168 | def get_all(self, status): 169 | """Get all the items in the queue that have the given status. 170 | 171 | Args: 172 | status (str): return the items with this status. 173 | 174 | Returns: 175 | list(:class:`nyawc.QueueItem`): All the queue items with the given status. 176 | 177 | """ 178 | 179 | return self.__get_var("items_" + status) 180 | 181 | def get_progress(self): 182 | """Get the progress of the queue in percentage (float). 183 | 184 | Returns: 185 | float: The 'finished' progress in percentage. 186 | 187 | """ 188 | 189 | count_remaining = len(self.items_queued) + len(self.items_in_progress) 190 | percentage_remaining = 100 / self.count_total * count_remaining 191 | 192 | return 100 - percentage_remaining 193 | 194 | def __set_var(self, name, value): 195 | """Set an instance/class var by name. 196 | 197 | Args: 198 | name (str): The name of the variable. 199 | value (obj): I'ts new value. 200 | 201 | """ 202 | 203 | setattr(self, name, value) 204 | 205 | def __get_var(self, name): 206 | """Get an instance/class var by name. 207 | 208 | Args: 209 | name (str): The name of the variable. 210 | 211 | Returns: 212 | obj: I'ts value. 213 | 214 | """ 215 | 216 | return getattr(self, name) 217 | -------------------------------------------------------------------------------- /nyawc/QueueItem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from nyawc.helpers.URLHelper import URLHelper 26 | from bs4 import BeautifulSoup 27 | 28 | class QueueItem(object): 29 | """The QueueItem class keeps track of the request and response and the crawling status. 30 | 31 | Attributes: 32 | STATUS_QUEUED (str): Status for when the crawler did not yet start the request. 33 | STATUS_IN_PROGRESS (str): Status for when the crawler is currently crawling the request. 34 | STATUS_FINISHED (str): Status for when the crawler has finished crawling the request. 35 | STATUS_CANCELLED (str): Status for when the crawler has cancelled the request. 36 | STATUS_ERRORED (str): Status for when the crawler could not execute the request. 37 | STATUSES (arr): All statuses. 
38 | status (str): The current crawling status. 39 | decomposed (bool): Whether this queue item is decomposed. 40 | request (:class:`nyawc.http.Request`): The Request object. 41 | response (:class:`nyawc.http.Response`): The Response object. 42 | __response_soup (obj): The BeautifulSoup container for the response text. 43 | __index_hash (str): The index of the queue (if cached), otherwise None. 44 | 45 | Note: 46 | A queue item will be decomposed (cached objects are deleted to free up memory) when it is 47 | not likely to be used again. After decomposition variables will not be cached anymore. 48 | 49 | """ 50 | 51 | STATUS_QUEUED = "queued" 52 | 53 | STATUS_IN_PROGRESS = "in_progress" 54 | 55 | STATUS_FINISHED = "finished" 56 | 57 | STATUS_CANCELLED = "cancelled" 58 | 59 | STATUS_ERRORED = "errored" 60 | 61 | STATUSES = [ 62 | STATUS_QUEUED, 63 | STATUS_IN_PROGRESS, 64 | STATUS_FINISHED, 65 | STATUS_CANCELLED, 66 | STATUS_ERRORED 67 | ] 68 | 69 | def __init__(self, request, response): 70 | """Constructs a QueueItem instance. 71 | 72 | Args: 73 | request (:class:`nyawc.http.Request`): The Request object. 74 | response (:class:`nyawc.http.Response`): The Response object (empty object when initialized). 75 | 76 | """ 77 | 78 | self.status = QueueItem.STATUS_QUEUED 79 | self.decomposed = False 80 | self.__response_soup = None 81 | self.__index_hash = None 82 | 83 | self.request = request 84 | self.response = response 85 | 86 | def get_soup_response(self): 87 | """Get the response as a cached BeautifulSoup container. 88 | 89 | Returns: 90 | obj: The BeautifulSoup container. 91 | 92 | """ 93 | 94 | if self.response is not None: 95 | if self.__response_soup is None: 96 | result = BeautifulSoup(self.response.text, "lxml") 97 | 98 | if self.decomposed: 99 | return result 100 | else: 101 | self.__response_soup = result # Cache the parsed soup instead of parsing the response text a second time. 102 | 103 | return self.__response_soup 104 | 105 | def decompose(self): 106 | """Decompose this queue item (set cached variables to None) to free up memory. 107 | 108 | Note: 109 | When setting cached variables to None memory will be released after the garbage 110 | collector has run. 111 | 112 | """ 113 | 114 | self.__response_soup = None 115 | 116 | self.decomposed = True 117 | 118 | def get_hash(self): 119 | """Generate and return the dict index hash of the given queue item. 120 | 121 | Note: 122 | Cookies should not be included in the hash calculation because 123 | otherwise requests are crawled multiple times with e.g. different 124 | session keys, causing infinite crawling recursion. 125 | 126 | Note: 127 | At this moment the keys do not actually get hashed since it works perfectly without and 128 | since hashing the keys requires us to build hash collision management. 129 | 130 | Returns: 131 | str: The hash of the given queue item.
132 | 133 | """ 134 | 135 | if self.__index_hash: 136 | return self.__index_hash 137 | 138 | key = self.request.method 139 | 140 | key += URLHelper.get_protocol(self.request.url) 141 | key += URLHelper.get_subdomain(self.request.url) 142 | key += URLHelper.get_hostname(self.request.url) 143 | key += URLHelper.get_tld(self.request.url) 144 | key += URLHelper.get_path(self.request.url) 145 | 146 | key += str(URLHelper.get_ordered_params(self.request.url)) 147 | 148 | if self.request.data is not None: 149 | key += str(self.request.data.keys()) 150 | 151 | self.__index_hash = key 152 | return self.__index_hash 153 | -------------------------------------------------------------------------------- /nyawc/Routing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import re 26 | 27 | class Routing(object): 28 | """The Routing class counts requests that match certain routes. 29 | 30 | Attributes: 31 | __routing_options (:class:`nyawc.OptionsRouting`): The options containing routing information. 32 | __routing_count (obj): The {key: value} dict that contains the amount of requests for certain routes. 33 | 34 | """ 35 | 36 | def __init__(self, options): 37 | """Constructs a Crawler instance. 38 | 39 | Args: 40 | options (:class:`nyawc.Options`): The options to use for the current crawling runtime. 41 | 42 | """ 43 | 44 | self.__routing_options = options.routing 45 | self.__routing_count = {} 46 | 47 | def increase_route_count(self, crawled_request): 48 | """Increase the count that determines how many times a URL of a certain route has been crawled. 49 | 50 | Args: 51 | crawled_request (:class:`nyawc.http.Request`): The request that possibly matches a route. 52 | 53 | """ 54 | 55 | for route in self.__routing_options.routes: 56 | if re.compile(route).match(crawled_request.url): 57 | count_key = str(route) + crawled_request.method 58 | 59 | if count_key in self.__routing_count.keys(): 60 | self.__routing_count[count_key] += 1 61 | else: 62 | self.__routing_count[count_key] = 1 63 | 64 | break 65 | 66 | def is_treshold_reached(self, scraped_request): 67 | """Check if similar requests to the given requests have already been crawled X times. Where X is the 68 | minimum treshold amount from the options. 
69 | 70 | Args: 71 | scraped_request (:class:`nyawc.http.Request`): The request that possibly reached the minimum threshold. 72 | 73 | Returns: 74 | bool: True if the threshold is reached, False otherwise. 75 | 76 | """ 77 | 78 | for route in self.__routing_options.routes: 79 | if re.compile(route).match(scraped_request.url): 80 | count_key = str(route) + scraped_request.method 81 | 82 | if count_key in self.__routing_count.keys(): 83 | return self.__routing_count[count_key] >= self.__routing_options.minimum_threshold 84 | 85 | return False 86 | -------------------------------------------------------------------------------- /nyawc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software.
16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import requests 26 | 27 | class DebugHelper: 28 | """A helper for printing debug messages.""" 29 | 30 | @staticmethod 31 | def setup(options): 32 | """Initialize debug/logging in third party libraries correctly. 33 | 34 | Args: 35 | options (:class:`nyawc.Options`): The options to use for the current crawling runtime. 36 | 37 | """ 38 | 39 | if not options.misc.debug: 40 | requests.packages.urllib3.disable_warnings( 41 | requests.packages.urllib3.exceptions.InsecureRequestWarning 42 | ) 43 | 44 | 45 | @staticmethod 46 | def output(options, message): 47 | """Print the given message if the debug option in the given options is on. 48 | 49 | Args: 50 | options (:class:`nyawc.Options`): The options to use for the current crawling runtime. 51 | message (str): The message to print. 52 | 53 | """ 54 | 55 | if options.misc.debug: 56 | print("[DEBUG] " + str(message)) -------------------------------------------------------------------------------- /nyawc/helpers/HTTPRequestHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import copy 26 | 27 | from nyawc.helpers.URLHelper import URLHelper 28 | 29 | class HTTPRequestHelper: 30 | """A helper for the :class:`nyawc.http.Request` module. 31 | 32 | @staticmethod 33 | def patch_with_options(request, options, parent_queue_item=None): 34 | """Patch the given request with the given options (e.g. user agent). 35 | 36 | Args: 37 | request (:class:`nyawc.http.Request`): The request to patch. 38 | options (:class:`nyawc.Options`): The options to patch the request with. 39 | parent_queue_item (:class:`nyawc.QueueItem`): The parent queue item object (request/response pair) if it exists.
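        Note:
            A minimal usage sketch (the URL is hypothetical, the defaults come
            from :class:`nyawc.Options`)::

                from nyawc.Options import Options
                from nyawc.http.Request import Request

                request = Request("https://example.ltd/")  # hypothetical URL
                HTTPRequestHelper.patch_with_options(request, Options())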
40 | 41 | """ 42 | 43 | request.auth = copy.deepcopy(options.identity.auth) 44 | request.cookies = copy.deepcopy(options.identity.cookies) 45 | request.headers = copy.deepcopy(options.identity.headers) 46 | request.proxies = copy.deepcopy(options.identity.proxies) 47 | request.timeout = copy.copy(options.performance.request_timeout) 48 | 49 | if parent_queue_item is not None: 50 | for cookie in parent_queue_item.request.cookies: 51 | request.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) 52 | 53 | for cookie in parent_queue_item.response.cookies: 54 | request.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) 55 | 56 | if options.misc.verify_ssl_certificates and options.misc.trusted_certificates: 57 | request.verify = options.misc.trusted_certificates 58 | else: 59 | request.verify = options.misc.verify_ssl_certificates 60 | 61 | @staticmethod 62 | def complies_with_scope(queue_item, new_request, scope): 63 | """Check if the new request complies with the crawling scope. 64 | 65 | Args: 66 | queue_item (:class:`nyawc.QueueItem`): The parent queue item of the new request. 67 | new_request (:class:`nyawc.http.Request`): The request to check. 68 | scope (:class:`nyawc.Options.OptionsScope`): The scope to check. 69 | 70 | Returns: 71 | bool: True if it complies, False otherwise. 72 | 73 | """ 74 | 75 | if not URLHelper.is_parsable(queue_item.request.url): 76 | return False 77 | 78 | if not URLHelper.is_parsable(new_request.url): 79 | return False 80 | 81 | if scope.request_methods: 82 | if queue_item.request.method not in scope.request_methods: 83 | return False 84 | 85 | if scope.protocol_must_match: 86 | if URLHelper.get_protocol(queue_item.request.url) != URLHelper.get_protocol(new_request.url): 87 | return False 88 | 89 | if scope.subdomain_must_match: 90 | current_subdomain = URLHelper.get_subdomain(queue_item.request.url) 91 | new_subdomain = URLHelper.get_subdomain(new_request.url) 92 | 93 | www_matches = False 94 | 95 | if current_subdomain == "www" and new_subdomain == "": 96 | www_matches = True 97 | 98 | if new_subdomain == "www" and current_subdomain == "": 99 | www_matches = True 100 | 101 | if not www_matches and current_subdomain != new_subdomain: 102 | return False 103 | 104 | if scope.hostname_must_match: 105 | if URLHelper.get_hostname(queue_item.request.url) != URLHelper.get_hostname(new_request.url): 106 | return False 107 | 108 | if scope.tld_must_match: 109 | if URLHelper.get_tld(queue_item.request.url) != URLHelper.get_tld(new_request.url): 110 | return False 111 | 112 | return True 113 | 114 | @staticmethod 115 | def get_cookie_header(queue_item): 116 | """Convert a requests cookie jar to an HTTP request cookie header value. 117 | 118 | Args: 119 | queue_item (:class:`nyawc.QueueItem`): The parent queue item of the new request. 120 | 121 | Returns: 122 | str: The HTTP cookie header value.
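        Note:
            For example, two matching cookies ``a=1`` and ``b=2`` would yield
            the header value ``a=1; b=2`` (assuming the standard ``; `` cookie
            separator used below).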
123 | 124 | """ 125 | 126 | header = [] 127 | path = URLHelper.get_path(queue_item.request.url) 128 | 129 | for cookie in queue_item.request.cookies: 130 | root_path = cookie.path == "" or cookie.path == "/" 131 | if path.startswith(cookie.path) or root_path: 132 | header.append(cookie.name + "=" + cookie.value) 133 | 134 | return "; ".join(header) 135 | -------------------------------------------------------------------------------- /nyawc/helpers/PackageHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import os 26 | import re 27 | import pkg_resources 28 | 29 | class PackageHelper: 30 | """The PackageHelper class contains all the package related information (like the version number). 31 | 32 | Attributes: 33 | __name (str): Cached package name. 34 | __description (str): Cached package description. 35 | __alias (str): Cached package alias. 36 | __version (str): Cached package version number (if initialized). 37 | 38 | """ 39 | 40 | __name = "Not Your Average Web Crawler" 41 | 42 | __description = "A web crawler that gathers more than you can imagine." 43 | 44 | __alias = "nyawc" 45 | 46 | __version = None 47 | 48 | @staticmethod 49 | def get_name(): 50 | """Get the name of this package. 51 | 52 | Returns: 53 | str: The name of this package. 54 | 55 | """ 56 | 57 | return PackageHelper.__name 58 | 59 | @staticmethod 60 | def get_description(): 61 | """Get the description of this package. 62 | 63 | Returns: 64 | str: The description of this package. 65 | 66 | """ 67 | 68 | return PackageHelper.__description 69 | 70 | @staticmethod 71 | def get_alias(): 72 | """Get the alias of this package. 73 | 74 | Returns: 75 | str: The alias of this package. 76 | 77 | """ 78 | 79 | return PackageHelper.__alias 80 | 81 | @staticmethod 82 | def get_version(): 83 | """Get the version number of this package. 84 | 85 | Returns: 86 | str: The version number (major.minor.patch). 87 | 88 | Note: 89 | When this package is installed, the version number will be available through the 90 | package resource details. Otherwise this method will look for a ``.semver`` file. 91 | 92 | Note: 93 | In rare cases corrupt installs can cause the version number to be unknown. In this case 94 | the version number will be set to the string "Unknown".
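        Note:
            A usage sketch (the value shown is hypothetical)::

                PackageHelper.get_version()  # e.g. "1.9.2" or "Unknown"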
95 | 96 | """ 97 | 98 | if PackageHelper.__version: 99 | return PackageHelper.__version 100 | 101 | PackageHelper.__version = "Unknown" 102 | 103 | # If this is a Git clone without install, use the ``.semver`` file. 104 | file = os.path.realpath(__file__) 105 | folder = os.path.dirname(file) 106 | 107 | try: 108 | semver = open(folder + "/../../.semver", "r") 109 | PackageHelper.__version = semver.read().rstrip() 110 | semver.close() 111 | return PackageHelper.__version 112 | except Exception: 113 | pass 114 | 115 | # If the package was installed, get the version number via Python's distribution details. 116 | try: 117 | distribution = pkg_resources.get_distribution(PackageHelper.get_alias()) 118 | if distribution.version: 119 | PackageHelper.__version = distribution.version 120 | return PackageHelper.__version 121 | except Exception: 122 | pass 123 | 124 | return PackageHelper.__version 125 | 126 | @staticmethod 127 | def rst_to_pypi(contents): 128 | """Convert the given GitHub RST contents to PyPi RST contents (since some RST directives are not available in PyPi). 129 | 130 | Args: 131 | contents (str): The GitHub compatible RST contents. 132 | 133 | Returns: 134 | str: The PyPi compatible RST contents. 135 | 136 | """ 137 | 138 | # The PyPi description does not support the SVG file type. 139 | contents = contents.replace(".svg?pypi=png.from.svg", ".png") 140 | 141 | # Convert ``<h1>`` to a H1 title 142 | asterisks_length = len(PackageHelper.get_name()) 143 | asterisks = "*" * asterisks_length 144 | title = asterisks + "\n" + PackageHelper.get_name() + "\n" + asterisks 145 | 146 | contents = re.sub(r"(\.\. raw\:\: html\n)(\n {2,4})(\<h1.*h1>)", title, contents) 147 | 148 | # The PyPi description does not support raw HTML 149 | contents = re.sub(r"(\.\. raw\:\: html\n)((\n {2,4})([A-Za-z0-9<>\ =\"\/])*)*", "", contents) 150 | 151 | return contents 152 | -------------------------------------------------------------------------------- /nyawc/helpers/RandomInputHelper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import random 26 | import string 27 | 28 | class RandomInputHelper: 29 | """A helper for generating random user input. 30 | 31 | Note: 32 | We need to cache the generated values to prevent infinite crawling 33 | loops. For example, if two responses contain the same ?search= form, 34 | the randomly generated value must be the same both times, because 35 | otherwise the crawler would treat the new requests as two different 36 | requests. 37 | 38 | Attributes: 39 | cache (obj): Cached values of the generated data. 40 | 41 | """ 42 | 43 | cache = {} 44 | 45 | @staticmethod 46 | def get_for_type(input_type="text"): 47 | """Get a random string for the given HTML input type. 48 | 49 | Args: 50 | input_type (str): The input type (e.g. email). 51 | 52 | Returns: 53 | str: The (cached) random value.
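        Note:
            A usage sketch; the exact values are random, but stable per input
            type within one crawling run (hypothetical outputs)::

                RandomInputHelper.get_for_type("email")   # e.g. "ztgkma@pqlrhz.com"
                RandomInputHelper.get_for_type("number")  # e.g. "8304"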
54 | 55 | """ 56 | 57 | if input_type in RandomInputHelper.cache: 58 | return RandomInputHelper.cache[input_type] 59 | 60 | types = { 61 | "text": RandomInputHelper.get_random_value, 62 | "hidden": RandomInputHelper.get_random_value, 63 | "search": RandomInputHelper.get_random_value, 64 | "color": RandomInputHelper.get_random_color, 65 | "week": {"function": RandomInputHelper.get_random_value, "params": [2, ["1234"]]}, 66 | "password": RandomInputHelper.get_random_password, 67 | "number": RandomInputHelper.get_random_number, 68 | "tel": RandomInputHelper.get_random_telephonenumber, 69 | "url": RandomInputHelper.get_random_url, 70 | "textarea": RandomInputHelper.get_random_text, 71 | "email": RandomInputHelper.get_random_email 72 | } 73 | 74 | if types.get(input_type) is None: 75 | return "" 76 | 77 | if isinstance(types.get(input_type), dict): 78 | generator = types.get(input_type) 79 | value = generator.get("function")(*generator.get("params")) 80 | else: 81 | value = types.get(input_type)() 82 | 83 | RandomInputHelper.cache[input_type] = value 84 | 85 | return value 86 | 87 | @staticmethod 88 | def get_random_value(length=10, character_sets=[string.ascii_uppercase, string.ascii_lowercase]): 89 | """Get a random string with the given length. 90 | 91 | Args: 92 | length (int): The length of the string to return. 93 | character_sets list(str): The character sets to use. 94 | 95 | Returns: 96 | str: The random string. 97 | 98 | """ 99 | 100 | return "".join(random.choice("".join(character_sets)) for i in range(length)) 101 | 102 | @staticmethod 103 | def get_random_number(length=4): 104 | """Get a random number with the given length. 105 | 106 | Args: 107 | length (int): The length of the number to return. 108 | 109 | Returns: 110 | str: The random number. 111 | 112 | """ 113 | 114 | return RandomInputHelper.get_random_value(length, [string.digits]) 115 | 116 | @staticmethod 117 | def get_random_color(): 118 | """Get a random color in HEX format (including hash character). 119 | 120 | Returns: 121 | str: The random HEX color. 122 | 123 | """ 124 | 125 | return '#{:06x}'.format(random.randint(0, 0xffffff)) 126 | 127 | @staticmethod 128 | def get_random_text(): 129 | """Get a random text consisting of multiple random words. 130 | 131 | Note: 132 | The text always consists of 10 random words of 10 characters each. 133 | 134 | Returns: 135 | str: The random text. 136 | 137 | """ 138 | 139 | return " ".join(RandomInputHelper.get_random_value() for i in range(20, 30)) 140 | 141 | @staticmethod 142 | def get_random_email(tld="com"): 143 | """Get a random email address with the given TLD. 144 | 145 | Args: 146 | tld (str): The TLD to use (e.g. com). 147 | 148 | Returns: 149 | str: The random email. 150 | 151 | """ 152 | 153 | email = [ 154 | RandomInputHelper.get_random_value(6, [string.ascii_lowercase]), 155 | "@", 156 | RandomInputHelper.get_random_value(6, [string.ascii_lowercase]), 157 | ".", 158 | tld 159 | ] 160 | 161 | return "".join(email) 162 | 163 | @staticmethod 164 | def get_random_password(): 165 | """Get a random password that complies with most of the requirements. 166 | 167 | Note: 168 | This random password is not strong and not "really" random, and should only be 169 | used for testing purposes. 170 | 171 | Returns: 172 | str: The random password.
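        Note:
            The structure is always four lowercase letters, two digits, two
            special characters and four uppercase letters, e.g. ``snqa73$@LYUO``
            (hypothetical output).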
173 | 174 | """ 175 | 176 | password = [] 177 | 178 | password.append(RandomInputHelper.get_random_value(4, [string.ascii_lowercase])) 179 | password.append(RandomInputHelper.get_random_value(2, [string.digits])) 180 | password.append(RandomInputHelper.get_random_value(2, ["$&*@!"])) 181 | password.append(RandomInputHelper.get_random_value(4, [string.ascii_uppercase])) 182 | 183 | return "".join(password) 184 | 185 | @staticmethod 186 | def get_random_url(tld="com"): 187 | """Get a random URL with the given TLD. 188 | 189 | Args: 190 | tld (str): The TLD to use (e.g. com). 191 | 192 | Returns: 193 | str: The random URL. 194 | 195 | """ 196 | 197 | url = [ 198 | "https://", 199 | RandomInputHelper.get_random_value(8, [string.ascii_lowercase]), 200 | ".", 201 | tld 202 | ] 203 | 204 | return "".join(url) 205 | 206 | @staticmethod 207 | def get_random_telephonenumber(): 208 | """Get a random 10 digit phone number that complies with most of the requirements. 209 | 210 | Returns: 211 | str: The random telephone number. 212 | 213 | """ 214 | 215 | phone = [ 216 | RandomInputHelper.get_random_value(3, ["123456789"]), 217 | RandomInputHelper.get_random_value(3, ["12345678"]), 218 | "".join(map(str, random.sample(range(10), 4))) 219 | ] 220 | 221 | return "-".join(phone) 222 | -------------------------------------------------------------------------------- /nyawc/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE.
24 | -------------------------------------------------------------------------------- /nyawc/http/Handler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import os 26 | import importlib 27 | import requests 28 | 29 | class Handler(object): 30 | """The Handler class executes HTTP requests. 31 | 32 | Attributes: 33 | __options (obj): The settings/options object. 34 | __queue_item (obj): The queue item containing a request to execute. 35 | 36 | """ 37 | 38 | def __init__(self, options, queue_item): 39 | """Construct the HTTP handler. 40 | 41 | Args: 42 | options (:class:`nyawc.Options`): The settings/options object. 43 | queue_item (:class:`nyawc.QueueItem`): The queue item containing the request. 44 | 45 | """ 46 | 47 | self.__options = options 48 | self.__queue_item = queue_item 49 | 50 | self.__queue_item.response = self.__make_request( 51 | self.__queue_item.request.url, 52 | self.__queue_item.request.method, 53 | self.__queue_item.request.data, 54 | self.__queue_item.request.auth, 55 | self.__queue_item.request.cookies, 56 | self.__queue_item.request.headers, 57 | self.__queue_item.request.proxies, 58 | self.__queue_item.request.timeout, 59 | self.__queue_item.request.verify 60 | ) 61 | 62 | # In Python 2.x it could occur that the requests module returns a unicode URL. 63 | # See this issue for more info (https://github.com/tijme/not-your-average-web-crawler/issues/5) 64 | self.__queue_item.response.url = str(self.__queue_item.response.url) 65 | 66 | def get_new_requests(self): 67 | """Retrieve all the new requests that were found in the response. 68 | 69 | Returns: 70 | list(:class:`nyawc.http.Request`): A list of request objects. 71 | 72 | """ 73 | 74 | content_type = self.__queue_item.response.headers.get('content-type') 75 | scrapers = self.__get_all_scrapers() 76 | new_requests = [] 77 | 78 | for scraper in scrapers: 79 | instance = scraper(self.__options, self.__queue_item) 80 | if self.__content_type_matches(content_type, instance.content_types): 81 | new_requests.extend(instance.get_requests()) 82 | 83 | return new_requests 84 | 85 | def __make_request(self, url, method, data, auth, cookies, headers, proxies, timeout, verify): 86 | """Execute a request with the given data. 87 | 88 | Args: 89 | url (str): The URL to call.
90 | method (str): The method (e.g. `get` or `post`). 91 | data (obj): The post data to call the URL with. 92 | auth (obj): The authentication class. 93 | cookies (obj): The cookie dict. 94 | headers (obj): The header dict. 95 | proxies (obj): The proxies dict. 96 | timeout (int): The request timeout in seconds. 97 | verify (mixed): SSL verification. 98 | 99 | Returns: 100 | obj: The response object. 101 | 102 | """ 103 | 104 | request_by_method = getattr(requests, method) 105 | return request_by_method( 106 | url=url, 107 | data=data, 108 | auth=auth, 109 | cookies=cookies, 110 | headers=headers, 111 | proxies=proxies, 112 | timeout=timeout, 113 | verify=verify, 114 | allow_redirects=True, 115 | stream=False 116 | ) 117 | 118 | def __get_all_scrapers(self): 119 | """Find all available scraper references. 120 | 121 | Returns: 122 | list(obj): The scraper references. 123 | 124 | """ 125 | 126 | modules_strings = self.__get_all_scrapers_modules() 127 | modules = [] 128 | 129 | for module_string in modules_strings: 130 | module = importlib.import_module("nyawc.scrapers." + module_string) 131 | modules.append(getattr(module, module_string)) 132 | 133 | return modules 134 | 135 | def __get_all_scrapers_modules(self): 136 | """Find all available scraper modules. 137 | 138 | Returns: 139 | list(str): The scraper module names. 140 | 141 | """ 142 | 143 | modules = [] 144 | 145 | file = os.path.realpath(__file__) 146 | folder = os.path.dirname(file) 147 | 148 | for filename in os.listdir(folder + "/../scrapers"): 149 | if filename.endswith("Scraper.py") and not filename.startswith("Base"): 150 | modules.append(filename[:-3]) 151 | 152 | return modules 153 | 154 | def __content_type_matches(self, content_type, available_content_types): 155 | """Check if the given content type matches one of the available content types. 156 | 157 | Args: 158 | content_type (str): The given content type. 159 | available_content_types list(str): All the available content types. 160 | 161 | Returns: 162 | bool: True if a match was found, False otherwise. 163 | 164 | """ 165 | 166 | if content_type is None: 167 | return False 168 | 169 | if content_type in available_content_types: 170 | return True 171 | 172 | for available_content_type in available_content_types: 173 | if available_content_type in content_type: 174 | return True 175 | 176 | return False 177 | -------------------------------------------------------------------------------- /nyawc/http/Request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from nyawc.helpers.URLHelper import URLHelper 26 | 27 | class Request(object): 28 | """The Request class contains details that were used to request the specified URL. 29 | 30 | Attributes: 31 | METHOD_OPTIONS (str): A request method that can be used to request the URL. 32 | METHOD_GET (str): A request method that can be used to request the URL. 33 | METHOD_HEAD (str): A request method that can be used to request the URL. 34 | METHOD_POST (str): A request method that can be used to request the URL. 35 | METHOD_PUT (str): A request method that can be used to request the URL. 36 | METHOD_DELETE (str): A request method that can be used to request the URL. 37 | parent_raised_error (bool): If the parent request raised an error (e.g. 404). 38 | depth (int): The current crawling depth. 39 | url (str): The absolute URL to use when making the request. 40 | method (str): The request method to use for the request. 41 | data (obj): The post data {key: value} OrderedDict that will be sent. 42 | auth (obj): The (requests module) authentication class to use for the request. 43 | cookies (obj): The (requests module) cookie jar to use for the request. 44 | headers (obj): The headers {key: value} to use for the request. 45 | proxies (obj): The proxies {key: value} to use for the request. 46 | timeout (int): The number of seconds to wait before a timeout exception will be thrown. 47 | verify (mixed): Either True or False to enable or disable certificate verification, or the path to a trusted CA bundle. 48 | 49 | """ 50 | 51 | METHOD_OPTIONS = "options" 52 | 53 | METHOD_GET = "get" 54 | 55 | METHOD_HEAD = "head" 56 | 57 | METHOD_POST = "post" 58 | 59 | METHOD_PUT = "put" 60 | 61 | METHOD_DELETE = "delete" 62 | 63 | def __init__(self, url, method=METHOD_GET, data=None, auth=None, cookies=None, headers=None, proxies=None, timeout=30, verify=True): 64 | """Constructs a Request instance. 65 | 66 | Args: 67 | url (str): The absolute URL to use when making the request. 68 | method (str): The request method to use for the request. 69 | data (obj): The post data {key: value} OrderedDict that will be sent. 70 | auth (obj): The (requests module) authentication class to use for the request. 71 | cookies (obj): The (requests module) cookie jar to use for the request. 72 | headers (obj): The headers {key: value} to use for the request. 73 | proxies (obj): The proxies {key: value} to use for the request. 74 | timeout (int): The number of seconds to wait before a timeout exception will be thrown. 75 | verify (mixed): Either True or False to enable or disable certificate verification, or the path to a trusted CA bundle.
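        Note:
            For GET requests the data is merged into the query string, e.g.
            (hypothetical values)::

                from collections import OrderedDict

                request = Request("https://example.ltd/search", Request.METHOD_GET, OrderedDict([("q", "test")]))
                # request.url is now "https://example.ltd/search?q=test" and request.data is None.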
76 | 77 | """ 78 | 79 | self.parent_raised_error = False 80 | self.depth = 0 81 | 82 | self.url = url 83 | self.method = method 84 | self.auth = auth 85 | self.cookies = cookies 86 | self.headers = headers 87 | self.proxies = proxies 88 | self.timeout = timeout 89 | self.verify = verify 90 | 91 | if method == self.METHOD_GET: 92 | self.url = URLHelper.append_with_data(self.url, data) 93 | self.data = None 94 | else: 95 | self.data = data 96 | -------------------------------------------------------------------------------- /nyawc/http/Response.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | class Response(object): 26 | """A placeholder for the response that is used until the request has finished. 27 | 28 | Attributes: 29 | url (str): The absolute URL of the request/response. 30 | 31 | Note: 32 | This class will be replaced with the response class of Python's `requests` module when the 33 | request is finished. For more information check http://docs.python-requests.org/en/master/api/#requests.Response. 34 | 35 | """ 36 | 37 | def __init__(self, url): 38 | """Constructs a Response instance. 39 | 40 | Args: 41 | url (str): The absolute URL of the request/response. 42 | 43 | """ 44 | 45 | self.url = url 46 | -------------------------------------------------------------------------------- /nyawc/http/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software.
16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | -------------------------------------------------------------------------------- /nyawc/scrapers/BaseScraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from nyawc.http.Request import Request 26 | from nyawc.helpers.URLHelper import URLHelper 27 | 28 | class BaseScraper(object): 29 | """The BaseScraper can be used to create other scrapers. 30 | 31 | Attributes: 32 | options (:class:`nyawc.Options`): The settings/options object. 33 | queue_item (:class:`nyawc.QueueItem`): The queue item containing the response to scrape. 34 | 35 | """ 36 | 37 | def __init__(self, options, queue_item): 38 | """Construct the BaseScraper instance. 39 | 40 | Args: 41 | options (:class:`nyawc.Options`): The settings/options object. 42 | queue_item (:class:`nyawc.QueueItem`): The queue item containing the response to scrape. 43 | 44 | """ 45 | 46 | self.options = options 47 | self.queue_item = queue_item 48 | 49 | def get_requests(self): 50 | """Get all the new requests that were found in the response. 51 | 52 | Returns: 53 | list(:class:`nyawc.http.Request`): A list of new requests that were found.
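        Note:
            Subclasses are expected to implement ``derived_get_requests()``;
            this wrapper only strips ``#fragment`` parts from the URLs that
            were found, so e.g. ``https://example.ltd/a#top`` becomes
            ``https://example.ltd/a`` (hypothetical URL).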
54 | 55 | """ 56 | 57 | requests = self.derived_get_requests() 58 | 59 | for request in requests: 60 | request.url = URLHelper.remove_hash(request.url) 61 | 62 | return requests 63 | -------------------------------------------------------------------------------- /nyawc/scrapers/CSSRegexLinkScraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import re 26 | 27 | from nyawc.http.Request import Request 28 | from nyawc.helpers.URLHelper import URLHelper 29 | from nyawc.scrapers.BaseScraper import BaseScraper 30 | 31 | class CSSRegexLinkScraper(BaseScraper): 32 | """The CSSRegexLinkScraper finds absolute and relative URLs in Cascading Style Sheets. 33 | 34 | Attributes: 35 | content_types list(str): The supported content types. 36 | __expressions list(obj): The regular expressions to execute. 37 | 38 | """ 39 | 40 | content_types = [ 41 | "text/css" 42 | ] 43 | 44 | __expressions = [ 45 | # Match absolute/relative URLs between any type of CSS quote 46 | {"group": 1, "raw": r"\(([\"\'])?(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))(\1)?\)"} 47 | ] 48 | 49 | def derived_get_requests(self): 50 | """Get all the new requests that were found in the response. 51 | 52 | Returns: 53 | list(:class:`nyawc.http.Request`): A list of new requests that were found. 
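        Note:
            For example, ``background: url("/static/bg.png");`` in a
            ``text/css`` response from ``https://example.ltd/style.css``
            (hypothetical URL) would yield a request for
            ``https://example.ltd/static/bg.png``.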
54 | 55 | """ 56 | 57 | host = self.queue_item.response.url 58 | content = self.queue_item.response.text 59 | 60 | found_requests = [] 61 | 62 | for expression in self.__expressions: 63 | matches = re.findall(expression["raw"], content) 64 | 65 | for match in matches: 66 | found_url = match[expression["group"]] 67 | absolute_url = URLHelper.make_absolute(host, found_url) 68 | found_requests.append(Request(absolute_url)) 69 | 70 | return found_requests 71 | -------------------------------------------------------------------------------- /nyawc/scrapers/HTMLSoupFormScraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from nyawc.CrawlerActions import CrawlerActions 26 | from nyawc.http.Request import Request 27 | from nyawc.helpers.URLHelper import URLHelper 28 | from nyawc.helpers.RandomInputHelper import RandomInputHelper 29 | from nyawc.scrapers.BaseScraper import BaseScraper 30 | from collections import OrderedDict 31 | 32 | class HTMLSoupFormScraper(BaseScraper): 33 | """The HTMLSoupFormScraper finds requests from forms in HTML using BeautifulSoup. 34 | 35 | Attributes: 36 | content_types list(str): The supported content types. 37 | 38 | """ 39 | 40 | content_types = [ 41 | "text/html", 42 | "application/xhtml+xml" 43 | ] 44 | 45 | def derived_get_requests(self): 46 | """Get all the new requests that were found in the response. 47 | 48 | Returns: 49 | list(:class:`nyawc.http.Request`): A list of new requests that were found. 50 | 51 | """ 52 | 53 | host = self.queue_item.response.url 54 | soup = self.queue_item.get_soup_response() 55 | 56 | found_requests = [] 57 | 58 | for form in soup.find_all("form"): 59 | found_requests.append(self.__get_request(host, form)) 60 | 61 | return found_requests 62 | 63 | def __get_request(self, host, soup): 64 | """Build a request from the given soup form. 65 | 66 | Args: 67 | host (str): The URL of the current queue item. 68 | soup (obj): The BeautifulSoup form. 69 | 70 | Returns: 71 | :class:`nyawc.http.Request`: The new Request.
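        Note:
            For example, the hypothetical form ``<form action="/search"
            method="POST">`` becomes a POST request to
            ``https://example.ltd/search`` with the form fields as post data;
            when the ``action`` attribute is missing, the URL of the current
            queue item is used instead.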
72 | 73 | """ 74 | 75 | url = URLHelper.make_absolute(host, self.__trim_grave_accent(soup["action"])) if soup.has_attr("action") else host 76 | method_original = soup["method"] if soup.has_attr("method") else "get" 77 | method = "post" if method_original.lower() == "post" else "get" 78 | data = self.__get_form_data(soup) 79 | 80 | return Request(url, method, data) 81 | 82 | 83 | def __trim_grave_accent(self, href): 84 | """Trim grave accents manually (because BeautifulSoup doesn't support it). 85 | 86 | Args: 87 | href (str): The BeautifulSoup href value. 88 | 89 | Returns: 90 | str: The BeautifulSoup href value without grave accents. 91 | 92 | """ 93 | 94 | if href.startswith("`"): 95 | href = href[1:] 96 | 97 | if href.endswith("`"): 98 | href = href[:-1] 99 | 100 | return href 101 | 102 | def __get_form_data(self, soup): 103 | """Build a form data dict from the given form. 104 | 105 | Args: 106 | soup (obj): The BeautifulSoup form. 107 | 108 | Returns: 109 | obj: The form data (key/value). 110 | 111 | """ 112 | 113 | elements = self.__get_valid_form_data_elements(soup) 114 | form_data = self.__get_default_form_data_input(elements) 115 | callback = self.options.callbacks.form_before_autofill 116 | action = callback(self.queue_item, elements, form_data) 117 | 118 | if action == CrawlerActions.DO_AUTOFILL_FORM: 119 | self.__autofill_form_data(form_data, elements) 120 | 121 | return form_data 122 | 123 | def __get_valid_form_data_elements(self, soup): 124 | """Get all valid form input elements. 125 | 126 | Note: 127 | An element is valid when the value can be updated client-side 128 | and the element has a name attribute. 129 | 130 | Args: 131 | soup (obj): The BeautifulSoup form. 132 | 133 | Returns: 134 | list(obj): Soup elements. 135 | 136 | """ 137 | 138 | elements = [] 139 | 140 | for element in soup.find_all(["input", "button", "textarea", "select"]): 141 | if element.has_attr("name"): 142 | elements.append(element) 143 | 144 | return elements 145 | 146 | def __get_default_form_data_input(self, elements): 147 | """Get the default form data {key: value} for the given elements. 148 | 149 | Args: 150 | elements list(obj): Soup elements. 151 | 152 | Returns: 153 | obj: The {key: value} form data. 154 | 155 | """ 156 | 157 | form_data = OrderedDict() 158 | 159 | for element in elements: 160 | default_value = self.__get_default_value_from_element(element) 161 | 162 | if default_value is False: 163 | continue 164 | 165 | form_data[element["name"]] = default_value 166 | 167 | return form_data 168 | 169 | def __autofill_form_data(self, form_data, elements): 170 | """Autofill empty form data with random data. 171 | 172 | Args: 173 | form_data (obj): The {key: value} form data. 174 | elements list(obj): Soup elements. 175 | 176 | Note: 177 | The given form data is updated in place. 178 | 179 | """ 180 | 181 | for element in elements: 182 | if element["name"] not in form_data: 183 | continue 184 | 185 | if len(form_data[element["name"]]) != 0: 186 | continue 187 | 188 | if element.name == "textarea": 189 | form_data[element["name"]] = RandomInputHelper.get_for_type("textarea") 190 | continue 191 | 192 | if element.has_attr("type"): 193 | form_data[element["name"]] = RandomInputHelper.get_for_type(element["type"]) 194 | 195 | def __get_default_value_from_element(self, element): 196 | """Get the default value of a form element. 197 | 198 | Args: 199 | element (obj): The soup element.
200 | 201 | Returns: 202 | mixed: The default value (str, list or False). 203 | 204 | """ 205 | 206 | if element.name == "select": 207 | options = element.find_all("option") 208 | is_multiple = element.has_attr("multiple") 209 | 210 | selected_options = [ 211 | option for option in options 212 | if option.has_attr("selected") 213 | ] 214 | 215 | if not selected_options and options: 216 | selected_options = [options[0]] 217 | 218 | selected_values = [] 219 | 220 | if is_multiple: 221 | for option in selected_options: 222 | value = option["value"] if option.has_attr("value") else option.string 223 | selected_values.append(value) 224 | 225 | return selected_values 226 | elif len(selected_options) >= 1: 227 | if selected_options[0].has_attr("value"): 228 | return selected_options[0]["value"] 229 | else: 230 | return selected_options[0].string 231 | 232 | return "" 233 | 234 | if element.name == "textarea": 235 | return element.string if element.string is not None else "" 236 | 237 | if element.name == "input" and element.has_attr("type"): 238 | if element["type"] in ("checkbox", "radio"): 239 | if not element.has_attr("checked"): 240 | return False 241 | 242 | if element.has_attr("value"): 243 | return element["value"] 244 | else: 245 | return "on" 246 | 247 | if element.has_attr("value"): 248 | return element["value"] 249 | 250 | return "" 251 | -------------------------------------------------------------------------------- /nyawc/scrapers/HTMLSoupLinkScraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from nyawc.http.Request import Request 26 | from nyawc.helpers.URLHelper import URLHelper 27 | from nyawc.scrapers.BaseScraper import BaseScraper 28 | 29 | class HTMLSoupLinkScraper(BaseScraper): 30 | """The HTMLSoupLinkScraper finds URLs in link-like attributes (e.g. ``href`` and ``src``) in HTML using BeautifulSoup. 31 | 32 | Attributes: 33 | content_types list(str): The supported content types. 34 | 35 | """ 36 | 37 | content_types = [ 38 | "text/html", 39 | "application/xhtml+xml" 40 | ] 41 | 42 | def derived_get_requests(self): 43 | """Get all the new requests that were found in the response. 44 | 45 | Returns: 46 | list(:class:`nyawc.http.Request`): A list of new requests that were found.
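        Note:
            For example, the hypothetical elements ``<a href="/about">`` and
            ``<script src="app.js">`` both yield requests with absolute URLs,
            while ``mailto:`` links are skipped. A ``<base href="...">``
            element, when present, overrides the host that relative URLs are
            resolved against.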
47 | 48 | """ 49 | 50 | attributes = { 51 | "src": True, 52 | "href": True, 53 | "link": True, 54 | "script": True, 55 | "url": True 56 | } 57 | 58 | host = self.queue_item.response.url 59 | soup = self.queue_item.get_soup_response() 60 | base_element = soup.find("base", href=True) 61 | elements = soup.select("[{}]".format("],[".join(attributes.keys()))) 62 | 63 | # Always use the URL from the base element if it exists. 64 | # https://www.w3schools.com/tags/tag_base.asp 65 | if base_element: 66 | host = URLHelper.make_absolute(host, base_element["href"]) 67 | 68 | found_requests = [] 69 | 70 | for element in elements: 71 | for attribute in attributes.keys(): 72 | if not element.has_attr(attribute): 73 | continue 74 | 75 | found_url = self.__trim_grave_accent(element[attribute]) 76 | 77 | if URLHelper.is_mailto(found_url): 78 | continue 79 | 80 | absolute_url = URLHelper.make_absolute(host, found_url) 81 | found_requests.append(Request(absolute_url)) 82 | 83 | return found_requests 84 | 85 | def __trim_grave_accent(self, href): 86 | """Trim grave accents manually (because BeautifulSoup doesn't support it). 87 | 88 | Args: 89 | href (str): The BeautifulSoup href value. 90 | 91 | Returns: 92 | str: The BeautifulSoup href value without grave accents. 93 | 94 | """ 95 | 96 | if href.startswith("`"): 97 | href = href[1:] 98 | 99 | if href.endswith("`"): 100 | href = href[:-1] 101 | 102 | return href 103 | -------------------------------------------------------------------------------- /nyawc/scrapers/JSONRegexLinkScraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import re 26 | 27 | from nyawc.http.Request import Request 28 | from nyawc.helpers.URLHelper import URLHelper 29 | from nyawc.scrapers.BaseScraper import BaseScraper 30 | 31 | class JSONRegexLinkScraper(BaseScraper): 32 | """The JSONRegexLinkScraper finds absolute and relative URLs in JSON keys and values. 33 | 34 | Attributes: 35 | content_types list(str): The supported content types. 36 | __expressions list(obj): The regular expressions to execute. 
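    Note:
        For example, the hypothetical ``application/json`` response body
        ``{"icon": "/static/icon.png"}`` would yield one new request, because
        the value is a quoted relative URL.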
37 | 38 | """ 39 | 40 | content_types = [ 41 | "application/json" 42 | ] 43 | 44 | __expressions = [ 45 | # Match absolute/relative URLs between any type of JSON quote 46 | {"group": 1, "raw": r"([\"\'\`])(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))\1"} 47 | ] 48 | 49 | def derived_get_requests(self): 50 | """Get all the new requests that were found in the response. 51 | 52 | Returns: 53 | list(:class:`nyawc.http.Request`): A list of new requests that were found. 54 | 55 | """ 56 | 57 | host = self.queue_item.response.url 58 | content = self.queue_item.response.text 59 | 60 | found_requests = [] 61 | 62 | for expression in self.__expressions: 63 | matches = re.findall(expression["raw"], content) 64 | 65 | for match in matches: 66 | found_url = match[expression["group"]] 67 | absolute_url = URLHelper.make_absolute(host, found_url) 68 | found_requests.append(Request(absolute_url)) 69 | 70 | return found_requests 71 | -------------------------------------------------------------------------------- /nyawc/scrapers/XMLRegexLinkScraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import re 26 | 27 | from nyawc.http.Request import Request 28 | from nyawc.helpers.URLHelper import URLHelper 29 | from nyawc.scrapers.BaseScraper import BaseScraper 30 | 31 | class XMLRegexLinkScraper(BaseScraper): 32 | """The XMLRegexLinkScraper finds absolute and relative URLs in XML values. 33 | 34 | Attributes: 35 | content_types list(str): The supported content types. 36 | __expressions list(obj): The regular expressions to execute. 37 | 38 | """ 39 | 40 | content_types = [ 41 | "text/xml", 42 | "application/xml", 43 | "image/svg+xml" 44 | ] 45 | 46 | __expressions = [ 47 | # Match absolute/relative URLs between any type of XML tag 48 | {"group": 0, "raw": r">(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))<\/"}, 49 | 50 | # Match absolute/relative URLs between any type of XML quote 51 | {"group": 1, "raw": r"=([\"\'\`])(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))\1"} 52 | ] 53 | 54 | def derived_get_requests(self): 55 | """Get all the new requests that were found in the response. 56 | 57 | Returns: 58 | list(:class:`nyawc.http.Request`): A list of new requests that were found. 
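        Note:
            Both tag contents and quoted attribute values are matched, so a
            hypothetical sitemap entry like
            ``<loc>https://example.ltd/page</loc>`` and an attribute like
            ``href="/relative/path"`` would each yield a new request.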
59 | 60 | """ 61 | 62 | host = self.queue_item.response.url 63 | content = self.queue_item.response.text 64 | 65 | found_requests = [] 66 | 67 | for expression in self.__expressions: 68 | matches = re.findall(expression["raw"], content) 69 | 70 | for match in matches: 71 | found_url = match[expression["group"]] 72 | absolute_url = URLHelper.make_absolute(host, found_url) 73 | found_requests.append(Request(absolute_url)) 74 | 75 | return found_requests 76 | -------------------------------------------------------------------------------- /nyawc/scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.7.1 2 | lxml==4.9.1 3 | requests==2.21.0 4 | requests_toolbelt==0.9.1 5 | sphinx==1.8.3 6 | sphinx-better-theme==0.1.5 7 | sphinxcontrib-napoleon==0.7 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | from setuptools import find_packages, setup 26 | from nyawc.helpers.PackageHelper import PackageHelper 27 | 28 | with open("requirements.txt") as file: 29 | requirements = file.read().splitlines() 30 | 31 | with open("README.rst") as file: 32 | readme = PackageHelper.rst_to_pypi(file.read()) 33 | 34 | setup( 35 | name=PackageHelper.get_alias(), 36 | version=PackageHelper.get_version(), 37 | description=PackageHelper.get_description(), 38 | long_description=readme, 39 | keywords = ["vulnerability", "bug-bounty", "security", "post", "get", "request", "crawler", "scraper", "scanner"], 40 | classifiers=[ 41 | "Development Status :: 5 - Production/Stable", 42 | "Environment :: Console", 43 | "Intended Audience :: Developers", 44 | "Intended Audience :: Education", 45 | "Intended Audience :: Information Technology", 46 | "Intended Audience :: System Administrators", 47 | "License :: OSI Approved :: MIT License", 48 | "Natural Language :: English", 49 | "Operating System :: MacOS", 50 | "Operating System :: Microsoft :: Windows", 51 | "Operating System :: POSIX :: Linux", 52 | "Programming Language :: Python :: 3.6", 53 | "Programming Language :: Python :: 3.5", 54 | "Programming Language :: Python :: 2.7", 55 | "Topic :: Security" 56 | ], 57 | packages=find_packages(), 58 | platforms=["any"], 59 | author="Tijme Gommers", 60 | license="MIT", 61 | url="https://tijme.github.io/not-your-average-web-crawler/", 62 | install_requires=requirements 63 | ) 64 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/test/__init__.py -------------------------------------------------------------------------------- /test/site/fuzzing/empty.php: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/test/site/fuzzing/empty.php -------------------------------------------------------------------------------- /test/site/fuzzing/sleep.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/site/http_statuses/status_100.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | Response -------------------------------------------------------------------------------- /test/site/http_statuses/status_200.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | Response -------------------------------------------------------------------------------- /test/site/http_statuses/status_300.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | Response -------------------------------------------------------------------------------- /test/site/http_statuses/status_400.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | Response 
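A note on the four fixtures above (and status_500.php below): their PHP bodies were lost to extraction, but each one presumably just forces the HTTP status code named in its filename before printing "Response". A quick way to poke them by hand, assuming a local webserver serves test/site/ at its document root the way the Travis CI job does (the localhost URL is an assumption, not part of the repo):

import requests

# Each fixture under test/site/http_statuses/ answers with the status code
# embedded in its filename; the crawler has to survive every status class.
# Note: status_100.php may behave differently depending on the webserver.
for status in (100, 200, 300, 400, 500):
    url = "http://localhost/http_statuses/status_{}.php".format(status)
    print(url, requests.get(url).status_code)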
-------------------------------------------------------------------------------- /test/site/http_statuses/status_500.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | Response -------------------------------------------------------------------------------- /test/site/index.php: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <body> 4 | <?php 5 | 6 | $files = new RecursiveIteratorIterator(new RecursiveDirectoryIterator('.')); 7 | 8 | foreach ($files as $file) { 9 | if (!$file->isFile()) { 10 | continue; 11 | } 12 | 13 | $ext = pathinfo($file->getPathname(), PATHINFO_EXTENSION); 14 | 15 | if ($ext != 'php' || $file->getBasename() == 'index.php') { 16 | continue; 17 | } 18 | 19 | $href = substr($file->getPathname(), 2); 20 | $href = str_replace('\\', '/', $href); 21 | 22 | echo '<a href="' . $href . '">' . htmlentities($href) . '</a><br/>
    '; 23 | } 24 | ?> 25 | 26 | -------------------------------------------------------------------------------- /test/site/invalid_content_types/css.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/site/invalid_content_types/html.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/site/invalid_content_types/json.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/site/invalid_content_types/xhtml.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/site/invalid_content_types/xml.php: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/site/malformed_responses/css.php: -------------------------------------------------------------------------------- 1 | 2 | body { 3 | width: 4 | } 5 | 6 | p { 7 | height: 80px; 8 | 9 | span { 10 | margin-toppp: 23px; 11 | } -------------------------------------------------------------------------------- /test/site/malformed_responses/html.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | Scraped 9 | 10 | 11 |

    Hello, world!

    12 | 13 | -------------------------------------------------------------------------------- /test/site/malformed_responses/json.php: -------------------------------------------------------------------------------- 1 | 2 | {"menu": { 3 | "id": "file, 4 | "value": "File", 5 | "popup": { 6 | "menuitem": [ 7 | {"value": "New", "onclick": "CreateNewDoc()"}, 8 | {"value": "Open", "onclick": "OpenDoc()"}, 9 | {"value": "Close", "onclick": "CloseDoc()"} 10 | ] 11 | } 12 | }} -------------------------------------------------------------------------------- /test/site/malformed_responses/xhtml.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Scraped 9 | 10 | 11 |

    Hello, world!

12 | 13 | -------------------------------------------------------------------------------- /test/site/malformed_responses/xml.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /test/test_helpers_url_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import unittest 26 | 27 | from nyawc.helpers.URLHelper import URLHelper 28 | 29 | class TestUrlHelper(unittest.TestCase): 30 | """The TestUrlHelper class checks if the methods in the URLHelper work correctly.""" 31 | 32 | def test_make_absolute(self): 33 | """Check if the make absolute method works correctly.""" 34 | 35 | host = "https://example.ltd/current" 36 | 37 | tests = [ 38 | ("https://example.ltd/new.html", "new.html"), 39 | ("https://example.ltd/new", "new"), 40 | ("https://example.ltd/new1/new2", "new1/new2"), 41 | ("https://example.ltd/new1/new3", "/new1/new3"), 42 | ("https://example.ltd/current?a=a", "?a=a") 43 | ] 44 | 45 | for test in tests: 46 | self.assertEqual(URLHelper.make_absolute(host, test[1]), test[0]) 47 | 48 | def test_make_absolute_with_base(self): 49 | """Check if the make absolute method works correctly when interpreted with a base URL.""" 50 | 51 | host = "https://example.ltd/base/" 52 | 53 | tests = [ 54 | ("https://example.ltd/base/new.html", "new.html"), 55 | ("https://example.ltd/base/new", "new"), 56 | ("https://example.ltd/base/new1/new2", "new1/new2"), 57 | ("https://example.ltd/new1/new2", "/new1/new2"), 58 | ("https://example.ltd/base/?a=a", "?a=a") 59 | ] 60 | 61 | for test in tests: 62 | self.assertEqual(URLHelper.make_absolute(host, test[1]), test[0]) 63 | 64 | def test_make_absolute_folder_traversal(self): 65 | """Ensure folder traversal works correctly.""" 66 | 67 | host = "https://example.ltd/dir1/dir2/dir3" 68 | 69 | tests = [ 70 | ("https://example.ltd/dir1/dir2", "../"), 71 | ("https://example.ltd/dir1", "../../"), 72 | ("https://example.ltd", "../../../"), 73 | ("https://example.ltd", "../../../../"), 74 | ("https://example.ltd", "../../../../../") 75 | ] 76 | 77 | for test in tests: 78 | self.assertEqual(URLHelper.make_absolute(host, test[1]), test[0]) 79 | 80 | def
test_get_protocol(self): 81 | """Check if the get protocol method works correctly.""" 82 | 83 | tests = [ 84 | ("", "domain.tld"), 85 | ("http", "http://domain.tld"), 86 | ("arbitrary", "arbitrary://omain.tld") 87 | ] 88 | 89 | for test in tests: 90 | self.assertEqual(URLHelper.get_protocol(test[1]), test[0]) 91 | 92 | def test_get_subdomain(self): 93 | """Check if the get subdomain method works correctly.""" 94 | 95 | tests = [ 96 | ("", ""), 97 | ("", "http://"), 98 | ("", "http://domain"), 99 | ("", "http://domain.tld"), 100 | ("sub1", "http://sub1.domain.tld"), 101 | ("sub2.sub1", "http://sub2.sub1.domain.tld"), 102 | ("sub3.sub2.sub1", "http://sub3.sub2.sub1.domain.tld") 103 | ] 104 | 105 | for test in tests: 106 | self.assertEqual(URLHelper.get_subdomain(test[1]), test[0]) 107 | 108 | def test_get_hostname(self): 109 | """Check if the get hostname method works correctly.""" 110 | 111 | tests = [ 112 | ("", ""), 113 | ("", "http://"), 114 | ("domain", "http://domain"), 115 | ("domain", "http://domain.tld"), 116 | ("domain", "http://sub1.domain.tld"), 117 | ("domain", "http://sub2.sub1.domain.tld") 118 | ] 119 | 120 | for test in tests: 121 | self.assertEqual(URLHelper.get_hostname(test[1]), test[0]) 122 | 123 | def test_get_tld(self): 124 | """Check if the get tld method works correctly.""" 125 | 126 | tests = [ 127 | ("", ""), 128 | ("", "http://"), 129 | ("", "http://domain"), 130 | ("tld", "http://domain.tld"), 131 | ("tld", "http://sub1.domain.tld"), 132 | ("tld", "http://sub2.sub1.domain.tld") 133 | ] 134 | 135 | for test in tests: 136 | self.assertEqual(URLHelper.get_tld(test[1]), test[0]) 137 | 138 | def test_get_ordered_params(self): 139 | """Check if the get ordered params method works correctly.""" 140 | 141 | val1 = URLHelper.get_ordered_params("http://example.tld?a=a&c=c&b=b&d=d") 142 | val2 = URLHelper.get_ordered_params("http://sub.domain.ltd?c=c&b=b&a=a&d=d") 143 | 144 | self.assertEqual(val1, val2) 145 | 146 | def test_append_with_data_encoded_and_decoded(self): 147 | """Make sure values do not get decoded or encoded.""" 148 | 149 | val1 = URLHelper.append_with_data("http://example.tld/", {"val": "{{aaaa}}"}) 150 | val2 = URLHelper.append_with_data("http://example.tld/", {"val": "%7B%7Baaaa%7D%7D"}) 151 | 152 | self.assertEqual(val1, "http://example.tld/?val={{aaaa}}") 153 | self.assertEqual(val2, "http://example.tld/?val=%7B%7Baaaa%7D%7D") 154 | -------------------------------------------------------------------------------- /test/test_queue.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import unittest 26 | 27 | from nyawc.helpers.HTTPRequestHelper import HTTPRequestHelper 28 | from nyawc.Queue import Queue 29 | from nyawc.http.Request import Request 30 | from nyawc.Options import Options 31 | 32 | class TestQueue(unittest.TestCase): 33 | """The TestQueue class tests if the hashes and counters of the queue work correctly.""" 34 | 35 | def test_hash_is_always_the_same(self): 36 | """Ensure the hashes are calculated correctly by checking for duplicates in the queue.""" 37 | 38 | options = Options() 39 | queue = Queue(options) 40 | 41 | for index in range(0, 100): 42 | request = Request("https://example.ltd?1=1#2=2") 43 | HTTPRequestHelper.patch_with_options(request, options) 44 | request.cookies.set(name='tasty_cookie{}'.format(index), value='yum', domain='example.ltd') 45 | queue.add_request(request) 46 | 47 | self.assertEqual(queue.count_total, 1) 48 | 49 | def test_hash_different_query_order(self): 50 | """Ensure query parameters in different orders are treated as one queue item.""" 51 | 52 | queue = Queue(Options()) 53 | 54 | queue.add_request(Request("https://www.example.ltd?b=b&c=c&a=a")) 55 | queue.add_request(Request("https://www.example.ltd?b=b&a=a&c=c")) 56 | queue.add_request(Request("https://www.example.ltd?a=a&b=b&c=c")) 57 | 58 | self.assertEqual(queue.count_total, 1) 59 | 60 | 61 | def test_hash_different_encoded_and_decoded_values(self): 62 | """Ensure encoded and decoded values have a different hash.""" 63 | 64 | queue = Queue(Options()) 65 | 66 | queue.add_request(Request("http://example.ltd?val={{aaaa}}")) 67 | queue.add_request(Request("http://example.ltd?val=%7B%7Baaaa%7D%7D")) 68 | 69 | self.assertEqual(queue.count_total, 2) 70 | -------------------------------------------------------------------------------- /test/test_scrapers_css_regex_link_scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 
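The queue tests above pin down nyawc's de-duplication contract: cookies are ignored, query-parameter order is ignored, but percent-encoding is significant. A minimal sketch of a request fingerprint with those three properties (illustrative only; the real hashing lives in nyawc/Queue.py and may differ in detail):

from hashlib import sha256

def fingerprint(method, url, data=None):
    # Sort the raw key=value pairs without decoding them, so parameter
    # order is irrelevant but "%7B%7Baaaa%7D%7D" != "{{aaaa}}".
    base, _, query = url.partition("?")
    ordered_query = "&".join(sorted(query.split("&"))) if query else ""
    key = "{} {}?{} {}".format(method, base, ordered_query, sorted((data or {}).items()))
    return sha256(key.encode("utf-8")).hexdigest()

# Mirrors test_hash_different_query_order: one queue item, not three.
assert fingerprint("get", "https://www.example.ltd?b=b&c=c&a=a") == \
       fingerprint("get", "https://www.example.ltd?a=a&b=b&c=c")

# Mirrors test_hash_different_encoded_and_decoded_values: two queue items.
assert fingerprint("get", "http://example.ltd?val={{aaaa}}") != \
       fingerprint("get", "http://example.ltd?val=%7B%7Baaaa%7D%7D")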
24 | 25 | import unittest 26 | 27 | from nyawc.scrapers.CSSRegexLinkScraper import CSSRegexLinkScraper 28 | from nyawc.QueueItem import QueueItem 29 | from nyawc.http.Request import Request 30 | from nyawc.http.Response import Response 31 | from nyawc.Options import Options 32 | 33 | class TestScrapersCSSRegexLinkScraper(unittest.TestCase): 34 | """The TestScrapersCSSRegexLinkScraper class tests if the CSSRegexLinkScraper is working correctly. 35 | 36 | Attributes: 37 | __host (str): The host where the new URLs were found 38 | __urls list(obj): The URLs that were found 39 | 40 | """ 41 | 42 | __host = "https://example.ltd/" 43 | 44 | __urls = [ 45 | {"url": """https://example.ltd/?unique=1""", "must_pass": True, "test": """(https://example.ltd/?unique=1)"""}, 46 | {"url": """http://example.ltd/?unique=2""", "must_pass": True, "test": """(\"http://example.ltd/?unique=2\")"""}, 47 | {"url": """https://example.ltd/?unique=3""", "must_pass": True, "test": """('//example.ltd/?unique=3')"""}, 48 | 49 | {"url": None, "must_pass": False, "test": """@import url(this-should-not-pass)"""}, 50 | {"url": None, "must_pass": False, "test": """@import url(`https://example.ltd/`)"""} 51 | ] 52 | 53 | def test_css_url_count(self): 54 | """Test if the amount of URLs found complies with the expected amount.""" 55 | 56 | content = "" 57 | for url in self.__urls: 58 | content += "\n" + url["test"] 59 | 60 | request = Request(self.__host) 61 | response = Response(self.__host) 62 | response.text = content 63 | 64 | finder = CSSRegexLinkScraper(Options(), QueueItem(request, response)) 65 | matches = finder.get_requests() 66 | 67 | self.assertEqual(len(matches), 3) 68 | 69 | def test_css_url_matches(self): 70 | """Test if all the URLs match the found URLs.""" 71 | 72 | for url in self.__urls: 73 | request = Request(self.__host) 74 | response = Response(self.__host) 75 | response.text = url["test"] 76 | 77 | finder = CSSRegexLinkScraper(Options(), QueueItem(request, response)) 78 | requests = finder.get_requests() 79 | 80 | if url["must_pass"]: 81 | self.assertEqual(requests[0].url, url["url"]) 82 | self.assertEqual(len(requests), 1) 83 | else: 84 | self.assertEqual(len(requests), 0) 85 | -------------------------------------------------------------------------------- /test/test_scrapers_html_soup_form_scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import unittest 26 | 27 | from nyawc.scrapers.HTMLSoupFormScraper import HTMLSoupFormScraper 28 | from nyawc.QueueItem import QueueItem 29 | from nyawc.http.Request import Request 30 | from nyawc.http.Response import Response 31 | from nyawc.Options import Options 32 | 33 | class TestScrapersHTMLSoupFormScraper(unittest.TestCase): 34 | """The TestScrapersHTMLSoupFormScraper class tests if the HTMLSoupFormScraper is working correctly. 35 | 36 | Attributes: 37 | __host (str): The host where the new URLs were found 38 | __urls list(obj): The URLs that were found 39 | 40 | """ 41 | 42 | __host = "https://example.ltd/" 43 | 44 | __urls = [ 45 | { 46 | "url": """https://example.ltd/action_page1.php""", 47 | "method": Request.METHOD_POST, 48 | "data": { 49 | "lastname": "Mouse", 50 | "name": "TestContent" 51 | }, 52 | "must_pass": True, 53 | "test": """ 54 | <form action="action_page1.php" method="post">
55 | First name:<br>
56 | <input type="text" value="Mickey"><br>
57 | Last name:<br>
58 | <input type="text" name="lastname" value="Mouse"><br><br>
59 | <input type="submit" value="Submit">
60 | <input type="submit" name="name" value="TestContent">
61 | </form>
    62 | """ 63 | }, 64 | { 65 | "url": """https://example.ltd/action_page2.php""", 66 | "method": Request.METHOD_POST, 67 | "data": { 68 | "lastname": "Mouse" 69 | }, 70 | "must_pass": True, 71 | "test": """ 72 |
<form action="action_page2.php" method="post">
73 | First name:<br>
74 | <input type="text" value="Mickey"><br>
75 | Last name:<br>
76 | <input type="text" name="lastname" value="Mouse"><br><br>
77 | <input type="submit" value="Submit">
78 | </form>
    79 | """ 80 | }, 81 | { 82 | "url": """https://example.ltd/?lastname=Mouse""", 83 | "method": Request.METHOD_GET, 84 | "data": None, 85 | "must_pass": True, 86 | "test": """ 87 |
<form method="get">
88 | First name:<br>
89 | <input type="text" value="Mickey"><br>
90 | Last name:<br>
91 | <input type="text" name="lastname" value="Mouse"><br><br>
92 | <input type="submit" value="Submit">
93 | </form>
    94 | """ 95 | }, 96 | { 97 | "url": """https://example.ltd/?lastname=Mouse&test=TestContent""", 98 | "method": Request.METHOD_GET, 99 | "data": None, 100 | "must_pass": True, 101 | "test": """ 102 |
<form method="get">
103 | First name:<br>
104 | <input type="text" value="Mickey"><br>
105 | Last name:<br>
106 | <input type="text" name="lastname" value="Mouse"><br><br>
107 | <input type="submit" value="Submit">
108 | <input type="submit" name="test" value="TestContent">
109 | </form>
    110 | """ 111 | }, 112 | ] 113 | 114 | def test_soup_url_count(self): 115 | """Test if the amount of URLs found complies with the expected amount.""" 116 | 117 | html = "" 118 | for url in self.__urls: 119 | html += "\n" + url["test"] 120 | 121 | request = Request(self.__host) 122 | response = Response(self.__host) 123 | response.text = html 124 | 125 | finder = HTMLSoupFormScraper(Options(), QueueItem(request, response)) 126 | matches = finder.get_requests() 127 | 128 | self.assertEqual(len(matches), 4) 129 | 130 | def test_soup_url_matches(self): 131 | """Test if all the URLs match the found URLs.""" 132 | 133 | for url in self.__urls: 134 | request = Request(self.__host) 135 | response = Response(self.__host) 136 | response.text = url["test"] 137 | 138 | finder = HTMLSoupFormScraper(Options(), QueueItem(request, response)) 139 | requests = finder.get_requests() 140 | 141 | if url["must_pass"]: 142 | self.assertEqual(requests[0].url, url["url"]) 143 | self.assertEqual(len(requests), 1) 144 | else: 145 | self.assertEqual(len(requests), 0) 146 | -------------------------------------------------------------------------------- /test/test_scrapers_json_regex_link_scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import unittest 26 | 27 | from nyawc.scrapers.JSONRegexLinkScraper import JSONRegexLinkScraper 28 | from nyawc.QueueItem import QueueItem 29 | from nyawc.http.Request import Request 30 | from nyawc.http.Response import Response 31 | from nyawc.Options import Options 32 | 33 | class TestScrapersJSONRegexLinkScraper(unittest.TestCase): 34 | """The TestScrapersJSONRegexLinkScraper class tests if the JSONRegexLinkScraper is working correctly. 
35 | 36 | Attributes: 37 | __host (str): The host where the new URLs were found 38 | __urls list(obj): The URLs that were found 39 | 40 | """ 41 | 42 | __host = "https://example.ltd/" 43 | 44 | __urls = [ 45 | {"url": """https://example.ltd/?unique=1""", "must_pass": True, "test": """[\"https://example.ltd/?unique=1\"]"""}, 46 | {"url": """http://example.ltd/?unique=2""", "must_pass": True, "test": """{\"http://example.ltd/?unique=2\":\"\"}"""}, 47 | {"url": """https://example.ltd/?unique=3""", "must_pass": True, "test": """{\"//example.ltd/?unique=3\":\"\"}"""}, 48 | {"url": """https://example.ltd/aa/bb/?unique=4""", "must_pass": True, "test": """{\"/aa/bb/?unique=4\":\"\"}"""}, 49 | {"url": """https://example.ltd/aa/bb/?unique=5""", "must_pass": True, "test": """{\"\":\"/aa/bb/?unique=5\"}"""}, 50 | 51 | {"url": None, "must_pass": False, "test": """{\"\":\"asdfasdf/asdfasdf\"}"""}, 52 | ] 53 | 54 | def test_json_url_count(self): 55 | """Test if the amount of URLs found complies with the expected amount.""" 56 | 57 | content = "" 58 | for url in self.__urls: 59 | content += "\n" + url["test"] 60 | 61 | request = Request(self.__host) 62 | response = Response(self.__host) 63 | response.text = content 64 | 65 | finder = JSONRegexLinkScraper(Options(), QueueItem(request, response)) 66 | matches = finder.get_requests() 67 | 68 | self.assertEqual(len(matches), 5) 69 | 70 | def test_json_url_matches(self): 71 | """Test if all the URLs match the found URLs.""" 72 | 73 | for url in self.__urls: 74 | request = Request(self.__host) 75 | response = Response(self.__host) 76 | response.text = url["test"] 77 | 78 | finder = JSONRegexLinkScraper(Options(), QueueItem(request, response)) 79 | requests = finder.get_requests() 80 | 81 | if url["must_pass"]: 82 | self.assertEqual(requests[0].url, url["url"]) 83 | self.assertEqual(len(requests), 1) 84 | else: 85 | self.assertEqual(len(requests), 0) 86 | -------------------------------------------------------------------------------- /test/test_scrapers_xml_regex_link_scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE.
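All of the JSON cases above reduce to the single expression defined in nyawc/scrapers/JSONRegexLinkScraper.py earlier in this tree; it can be exercised standalone to see which tuple index carries the URL (self-contained sketch, no nyawc import needed):

import re

# Expression copied verbatim from JSONRegexLinkScraper: index 1 of each
# findall() tuple holds the URL found between JSON quotes.
expression = {"group": 1, "raw": r"([\"\'\`])(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))\1"}

content = '{"http://example.ltd/?unique=2": "", "": "/aa/bb/?unique=5"}'

for match in re.findall(expression["raw"], content):
    print(match[expression["group"]])

# Prints: http://example.ltd/?unique=2
#         /aa/bb/?unique=5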
24 | 25 | import unittest 26 | 27 | from nyawc.scrapers.XMLRegexLinkScraper import XMLRegexLinkScraper 28 | from nyawc.QueueItem import QueueItem 29 | from nyawc.http.Request import Request 30 | from nyawc.http.Response import Response 31 | from nyawc.Options import Options 32 | 33 | class TestScrapersXMLRegexLinkScraper(unittest.TestCase): 34 | """The TestScrapersXMLRegexLinkScraper class tests if the XMLRegexLinkScraper is working correctly. 35 | 36 | Attributes: 37 | __host (str): The host where the new URLs were found 38 | __urls list(obj): The URLs that were found 39 | 40 | """ 41 | 42 | __host = "https://example.ltd/" 43 | 44 | __urls = [ 45 | {"url": """https://example.ltd/?unique=1""", "must_pass": True, "test": """<a>https://example.ltd/?unique=1</a>"""}, 46 | {"url": """http://example.ltd/?unique=2""", "must_pass": True, "test": """<a>http://example.ltd/?unique=2</a>"""}, 47 | {"url": """https://example.ltd/?unique=3""", "must_pass": True, "test": """<a>//example.ltd/?unique=3</a>"""}, 48 | {"url": """https://example.ltd/aa/bb/?unique=4""", "must_pass": True, "test": """<a>/aa/bb/?unique=4</a>"""}, 49 | {"url": """https://example.ltd/aa/bb/?unique=5""", "must_pass": True, "test": """<a>/aa/bb/?unique=5</a>"""}, 50 | 51 | {"url": None, "must_pass": False, "test": """<a>asdfasdf/asdfasdf</a>"""}, 52 | ] 53 | 54 | def test_xml_url_count(self): 55 | """Test if the amount of URLs found complies with the expected amount.""" 56 | 57 | content = "" 58 | for url in self.__urls: 59 | content += "\n" + url["test"] 60 | 61 | request = Request(self.__host) 62 | response = Response(self.__host) 63 | response.text = content 64 | 65 | finder = XMLRegexLinkScraper(Options(), QueueItem(request, response)) 66 | matches = finder.get_requests() 67 | 68 | self.assertEqual(len(matches), 5) 69 | 70 | def test_xml_url_matches(self): 71 | """Test if all the URLs match the found URLs.""" 72 | 73 | for url in self.__urls: 74 | request = Request(self.__host) 75 | response = Response(self.__host) 76 | response.text = url["test"] 77 | 78 | finder = XMLRegexLinkScraper(Options(), QueueItem(request, response)) 79 | requests = finder.get_requests() 80 | 81 | if url["must_pass"]: 82 | self.assertEqual(requests[0].url, url["url"]) 83 | self.assertEqual(len(requests), 1) 84 | else: 85 | self.assertEqual(len(requests), 0) 86 | -------------------------------------------------------------------------------- /test/test_site.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # MIT License 4 | # 5 | # Copyright (c) 2017 Tijme Gommers 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | import os 26 | import unittest 27 | 28 | from nyawc.Options import Options 29 | from nyawc.Crawler import Crawler 30 | from nyawc.http.Request import Request 31 | from nyawc.CrawlerActions import CrawlerActions 32 | 33 | class TestSite(unittest.TestCase): 34 | """The TestSite class checks if the crawler handles invalid responses correctly. 35 | 36 | Attributes: 37 | travis (bool): If the current environment is in Travis CI. 38 | 39 | """ 40 | 41 | def __init__(self, *args, **kwargs): 42 | """Initialize the unit test and mark if the current environment is Travis CI. 43 | 44 | Args: 45 | args list(str): The command line arguments. 46 | kwargs **: Extra arguments 47 | 48 | """ 49 | 50 | super(TestSite, self).__init__(*args, **kwargs) 51 | self.travis = "UNITTEST_NYAWC_SITE" in os.environ 52 | 53 | def cb_request_after_finish(self, queue, queue_item, new_queue_items): 54 | """Crawler callback for when a request is finished crawling. 55 | 56 | Args: 57 | queue (:class:`nyawc.Queue`): The current crawling queue. 58 | queue_item (:class:`nyawc.QueueItem`): The queue item that was finished. 59 | new_queue_items list(:class:`nyawc.QueueItem`): The new queue items that were found in the one that finished. 60 | 61 | Returns: 62 | str: A crawler action (either DO_STOP_CRAWLING or DO_CONTINUE_CRAWLING). 63 | 64 | """ 65 | 66 | print("Finished: {}".format(queue_item.request.url)) 67 | return CrawlerActions.DO_CONTINUE_CRAWLING 68 | 69 | def test_crawl_website(self): 70 | """Crawl the website in `test/` and check if the count is correct.""" 71 | 72 | if not self.travis: 73 | print("\n\nPlease note that the 'TestSite' unit test did not run.") 74 | print("It will only run in Travis CI since it requires a webserver.\n") 75 | return 76 | 77 | options = Options() 78 | options.callbacks.request_after_finish = self.cb_request_after_finish 79 | crawler = Crawler(options) 80 | crawler.start_with(Request("http://localhost/")) 81 | 82 | self.assertEqual(crawler.queue.count_total, 18) 83 | --------------------------------------------------------------------------------
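Taken together, test_site.py doubles as the shortest end-to-end recipe for the crawler's public API. Trimmed to its essentials (the localhost start URL is a placeholder; point it at a host you are allowed to crawl):

from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.CrawlerActions import CrawlerActions
from nyawc.http.Request import Request

def cb_request_after_finish(queue, queue_item, new_queue_items):
    # Log every finished request and keep going until the queue drains.
    print("Finished: {}".format(queue_item.request.url))
    return CrawlerActions.DO_CONTINUE_CRAWLING

options = Options()
options.callbacks.request_after_finish = cb_request_after_finish

crawler = Crawler(options)
crawler.start_with(Request("http://localhost/"))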