├── .github
    ├── CODE_OF_CONDUCT.rst
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE.rst
    └── PULL_REQUEST_TEMPLATE.rst
├── .gitignore
├── .semver
├── .travis.yml
├── CONTRIBUTING.rst
├── LICENSE.rst
├── MANIFEST.in
├── README.rst
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── _static
    │       ├── better.css
    │       ├── css
    │       │   ├── bootstrap-theme.css
    │       │   └── bootstrap.min.css
    │       ├── fonts
    │       │   ├── glyphicons-halflings-regular.eot
    │       │   ├── glyphicons-halflings-regular.svg
    │       │   ├── glyphicons-halflings-regular.ttf
    │       │   ├── glyphicons-halflings-regular.woff
    │       │   └── glyphicons-halflings-regular.woff2
    │       ├── img
    │       │   ├── flow.png
    │       │   ├── flow.svg
    │       │   ├── header.jpg
    │       │   ├── logo.png
    │       │   └── logo.svg
    │       └── js
    │       │   ├── bootstrap-theme.js
    │       │   ├── bootstrap.min.js
    │       │   └── releases-dropdown.js
    │   ├── _templates
    │       └── layout.html
    │   ├── conf.py
    │   ├── getting_started.rst
    │   ├── index.rst
    │   ├── installation.rst
    │   ├── kitchen_sink.rst
    │   ├── migration.rst
    │   ├── modules.rst
    │   ├── nyawc.helpers.rst
    │   ├── nyawc.http.rst
    │   ├── nyawc.rst
    │   ├── nyawc.scrapers.rst
    │   ├── options_callbacks.rst
    │   ├── options_crawling_identity.rst
    │   ├── options_crawling_scope.rst
    │   ├── options_misc.rst
    │   ├── options_performance.rst
    │   └── options_routing.rst
├── example_extensive.py
├── example_minimal.py
├── nyawc
    ├── Crawler.py
    ├── CrawlerActions.py
    ├── CrawlerThread.py
    ├── Options.py
    ├── Queue.py
    ├── QueueItem.py
    ├── Routing.py
    ├── __init__.py
    ├── helpers
    │   ├── DebugHelper.py
    │   ├── HTTPRequestHelper.py
    │   ├── PackageHelper.py
    │   ├── RandomInputHelper.py
    │   ├── URLHelper.py
    │   └── __init__.py
    ├── http
    │   ├── Handler.py
    │   ├── Request.py
    │   ├── Response.py
    │   └── __init__.py
    └── scrapers
    │   ├── BaseScraper.py
    │   ├── CSSRegexLinkScraper.py
    │   ├── HTMLSoupFormScraper.py
    │   ├── HTMLSoupLinkScraper.py
    │   ├── JSONRegexLinkScraper.py
    │   ├── XMLRegexLinkScraper.py
    │   └── __init__.py
├── requirements.txt
├── setup.py
└── test
    ├── __init__.py
    ├── site
        ├── fuzzing
        │   ├── empty.php
        │   └── sleep.php
        ├── http_statuses
        │   ├── status_100.php
        │   ├── status_200.php
        │   ├── status_300.php
        │   ├── status_400.php
        │   └── status_500.php
        ├── index.php
        ├── invalid_content_types
        │   ├── css.php
        │   ├── html.php
        │   ├── json.php
        │   ├── xhtml.php
        │   └── xml.php
        └── malformed_responses
        │   ├── css.php
        │   ├── html.php
        │   ├── json.php
        │   ├── xhtml.php
        │   └── xml.php
    ├── test_helpers_url_helper.py
    ├── test_queue.py
    ├── test_scrapers_css_regex_link_scraper.py
    ├── test_scrapers_html_soup_form_scraper.py
    ├── test_scrapers_html_soup_link_scraper.py
    ├── test_scrapers_json_regex_link_scraper.py
    ├── test_scrapers_xml_regex_link_scraper.py
    └── test_site.py


/.github/CODE_OF_CONDUCT.rst:
--------------------------------------------------------------------------------
 1 | Contributor Covenant Code of Conduct
 2 | ====================================
 3 | 
 4 | Our Pledge
 5 | ----------
 6 | 
 7 | In the interest of fostering an open and welcoming environment, we as
 8 | contributors and maintainers pledge to making participation in our
 9 | project and our community a harassment-free experience for everyone,
10 | regardless of age, body size, disability, ethnicity, gender identity and
11 | expression, level of experience, nationality, personal appearance, race,
12 | religion, or sexual identity and orientation.
13 | 
14 | Our Standards
15 | -------------
16 | 
17 | Examples of behavior that contributes to creating a positive environment
18 | include:
19 | 
20 | -  Using welcoming and inclusive language
21 | -  Being respectful of differing viewpoints and experiences
22 | -  Gracefully accepting constructive criticism
23 | -  Focusing on what is best for the community
24 | -  Showing empathy towards other community members
25 | 
26 | Examples of unacceptable behavior by participants include:
27 | 
28 | -  The use of sexualized language or imagery and unwelcome sexual
29 |    attention or advances
30 | -  Trolling, insulting/derogatory comments, and personal or political
31 |    attacks
32 | -  Public or private harassment
33 | -  Publishing others’ private information, such as a physical or
34 |    electronic address, without explicit permission
35 | -  Other conduct which could reasonably be considered inappropriate in a
36 |    professional setting
37 | 
38 | Our Responsibilities
39 | --------------------
40 | 
41 | Project maintainers are responsible for clarifying the standards of
42 | acceptable behavior and are expected to take appropriate and fair
43 | corrective action in response to any instances of unacceptable behavior.
44 | 
45 | Project maintainers have the right and responsibility to remove, edit,
46 | or reject comments, commits, code, wiki edits, issues, and other
47 | contributions that are not aligned to this Code of Conduct, or to ban
48 | temporarily or permanently any contributor for other behaviors that they
49 | deem inappropriate, threatening, offensive, or harmful.
50 | 
51 | Scope
52 | -----
53 | 
54 | This Code of Conduct applies both within project spaces and in public
55 | spaces when an individual is representing the project or its community.
56 | Examples of representing a project or community include using an
57 | official project e-mail address, posting via an official social media
58 | account, or acting as an appointed representative at an online or
59 | offline event. Representation of a project may be further defined and
60 | clarified by project maintainers.
61 | 
62 | Enforcement
63 | -----------
64 | 
65 | Instances of abusive, harassing, or otherwise unacceptable behavior may
66 | be reported by contacting the project team at 
67 | t{{dot}}gommers{{plus}}nyawc{{at}}outlook{{dot}}com. The project team 
68 | will review and investigate all complaints, and will respond in a 
69 | way that it deems appropriate to the circumstances. The project team is 
70 | obligated to maintain confidentiality with regard to the reporter of an 
71 | incident. Further details of specific enforcement policies may be posted 
72 | separately.
73 | 
74 | Project maintainers who do not follow or enforce the Code of Conduct in
75 | good faith may face temporary or permanent repercussions as determined
76 | by other members of the project’s leadership.
77 | 
78 | Attribution
79 | -----------
80 | 
81 | This Code of Conduct is adapted from the `Contributor Covenant`_,
82 | version 1.4, available at `http://contributor-covenant.org/version/1/4`_
83 | 
84 | .. _Contributor Covenant: http://contributor-covenant.org
85 | .. _`http://contributor-covenant.org/version/1/4`: http://contributor-covenant.org/version/1/4/
86 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: tijme
2 | custom: ['https://www.paypal.me/tijmegommers', 'https://bunq.me/tijme']
3 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.rst:
--------------------------------------------------------------------------------
 1 | One line summary of the issue here.
 2 | 
 3 | Expected behavior
 4 | =================
 5 | 
 6 | As concisely as possible, describe the expected behavior.
 7 | 
 8 | Actual behavior
 9 | =================
10 | 
11 | As concisely as possible, describe the observed behavior.
12 | 
13 | Steps to reproduce the behavior
14 | =================
15 | 
16 | Please list all relevant steps to reproduce the observed behavior.


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.rst:
--------------------------------------------------------------------------------
 1 | One line summary of the issue here.
 2 | 
 3 | Problem
 4 | =================
 5 | 
 6 | Explain the context and why you’re making that change. What is the problem you’re trying to solve? In some cases there is not a problem and this can be thought of being the motivation for your change.
 7 | 
 8 | Solution
 9 | =================
10 | 
11 | Describe the modifications you’ve done.
12 | 
13 | Result
14 | =================
15 | 
16 | What will change as a result of your pull request? Note that sometimes this section is unnecessary because it is self-explanatory based on the solution.
17 | 
18 | Checklist
19 | =================
20 | 
21 | -  [ ] All tests pass and ``example.py`` runs successfully.
22 | -  [ ] Code complies with the Google Python Style Guide.
23 | -  [ ] Change complies with the contribution guidelines.
24 | -  [ ] Mention ``Fixes #<issue number>`` in the description *if relevant*.
25 | -  [ ] Documentation/wiki is updated according to the change(s).
26 | 
27 | Google Python Style Guide: <https://google.github.io/styleguide/pyguide.html>.
28 | Contribution guidelines: <https://github.com/tijme/not-your-average-web-crawler/blob/master/CONTRIBUTING.rst>.


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Custom
  2 | nyawc/.semver
  3 | example_test.py
  4 | releases.js
  5 | 
  6 | # OS
  7 | Thumbs.db
  8 | .DS_Store
  9 | 
 10 | # Byte-compiled / optimized / DLL files
 11 | __pycache__/
 12 | *.py[cod]
 13 | *$py.class
 14 | 
 15 | # C extensions
 16 | *.so
 17 | 
 18 | # Distribution / packaging
 19 | .Python
 20 | env/
 21 | /build
 22 | develop-eggs/
 23 | dist/
 24 | downloads/
 25 | eggs/
 26 | .eggs/
 27 | lib/
 28 | lib64/
 29 | parts/
 30 | sdist/
 31 | var/
 32 | *.egg-info/
 33 | .installed.cfg
 34 | *.egg
 35 | 
 36 | # PyInstaller
 37 | #  Usually these files are written by a python script from a template
 38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 39 | *.manifest
 40 | *.spec
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .coverage
 50 | .coverage.*
 51 | .cache
 52 | nosetests.xml
 53 | coverage.xml
 54 | *,cover
 55 | .hypothesis/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | docs/build/
 75 | 
 76 | # PyBuilder
 77 | target/
 78 | 
 79 | # IPython Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # pyenv
 83 | .python-version
 84 | 
 85 | # celery beat schedule file
 86 | celerybeat-schedule
 87 | 
 88 | # dotenv
 89 | .env
 90 | 
 91 | # virtualenv
 92 | venv/
 93 | ENV/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | 
 98 | # Rope project settings
 99 | .ropeproject
100 | 


--------------------------------------------------------------------------------
/.semver:
--------------------------------------------------------------------------------
1 | 1.8.2


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | os: linux
 2 | dist: trusty
 3 | sudo: required
 4 | language: python
 5 | 
 6 | env:
 7 |   - UNITTEST_NYAWC_SITE=1
 8 | 
 9 | python:
10 |   - 2.7
11 |   - 3.5
12 |   - 3.6
13 |   - 3.7-dev
14 | 
15 | install:
16 |   - sudo apt-get install -y apache2
17 |   - sudo apt-get install -y php5-common libapache2-mod-php5
18 |   - sudo service apache2 restart
19 |   - sudo rm -r /var/www/html/*
20 |   - sudo mv -T test/site /var/www/html
21 |   - sudo chown -R www-data:www-data /var/www
22 |   - pip install --upgrade setuptools
23 |   - pip install -r requirements.txt
24 | 
25 | script:
26 |   - python -m unittest discover
27 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
 1 | Contributing
 2 | ============
 3 | 
 4 | Getting Started
 5 | ---------------
 6 | 
 7 | -  Submit a ticket for your issue, assuming one does not already exist.
 8 | 
 9 |    -  Clearly describe the issue including steps to reproduce when it is
10 |       a bug.
11 |    -  Make sure you fill in the earliest version that you know has the
12 |       issue.
13 | 
14 | -  Fork the repository on GitHub.
15 | 
16 | Making Changes
17 | --------------
18 | 
19 | -  Create a topic branch from where you want to base your work.
20 | 
21 |    -  This is usually the develop branch.
22 |    -  To quickly create a topic branch based on master;
23 | 
24 |       -  ``git checkout -b bugfix-my-contribution``,
25 |       -  ``git checkout -b feature-my-contribution``.
26 | 
27 |    -  Please avoid working directly on the ``master`` branch.
28 | 
29 | -  Make sure your code complies with the `Google Python Style Guide`_.
30 | -  Make commits of logical units and make sure your commit messages are
31 |    in the proper format.
32 | -  Make sure you have added the necessary tests for your changes.
33 | -  Run *all* the tests to assure nothing else was accidentally broken.
34 | 
35 | Submitting Changes
36 | ------------------
37 | 
38 | -  Push your changes to the topic branch in your fork of the repository.
39 | -  Submit a pull request to the main repository
40 |    (``tijme/not-your-average-web-crawler``).
41 | 
42 | .. _Google Python Style Guide: https://google.github.io/styleguide/pyguide.html
43 | 


--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | =====================
 3 | 
 4 | Copyright (c) 2017 Tijme Gommers
 5 | 
 6 | Permission is hereby granted, free of charge, to any person
 7 | obtaining a copy of this software and associated documentation
 8 | files (the “Software”), to deal in the Software without
 9 | restriction, including without limitation the rights to use,
10 | copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the
12 | Software is furnished to do so, subject to the following
13 | conditions:
14 | 
15 | The above copyright notice and this permission notice shall be
16 | included in all copies or substantial portions of the Software.
17 | 
18 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 | OTHER DEALINGS IN THE SOFTWARE.
26 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE.rst
3 | include requirements.txt
4 | include .semver


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | .. raw:: html
  2 | 
  3 |     <p align="center">
  4 | 
  5 | .. image:: https://tijme.github.io/not-your-average-web-crawler/latest/_static/img/logo.svg?pypi=png.from.svg
  6 |     :width: 300px
  7 |     :height: 300px
  8 |     :alt: N.Y.A.W.C. logo
  9 |     :align: center
 10 | 
 11 | .. raw:: html
 12 | 
 13 |     <br class="title">
 14 | 
 15 | .. image:: https://raw.finnwea.com/shield/?firstText=Donate%20via&secondText=Bunq
 16 |    :target: https://bunq.me/tijme/0/A%20web%20crawler%20(for%20bug%20hunting)%20that%20gathers%20more%20than%20you%20can%20imagine
 17 |    :alt: Donate via Bunq
 18 |    
 19 | .. image:: https://raw.finnwea.com/shield/?typeKey=TravisBuildStatus&typeValue1=tijme/not-your-average-web-crawler&typeValue2=master&cache=1
 20 |     :target: https://travis-ci.org/tijme/not-your-average-web-crawler
 21 |     :alt: Build Status
 22 | 
 23 | .. image:: https://raw.finnwea.com/vector-shields-v1/?typeKey=SemverVersion&typeValue1=tijme&typeValue2=not-your-average-web-crawler
 24 |     :target: https://pypi.python.org/pypi/nyawc/
 25 |     :alt: PyPi version
 26 |    
 27 | .. image:: https://raw.finnwea.com/shield/?firstText=License&secondText=MIT
 28 |    :target: https://github.com/tijme/not-your-average-web-crawler/blob/master/LICENSE.rst
 29 |    :alt: License: MIT
 30 | 
 31 | .. raw:: html
 32 | 
 33 |    </p>
 34 |    <h1>Not Your Average Web Crawler</h1>
 35 | 
 36 | N.Y.A.W.C is a Python library that enables you to test your payload against all requests of a certain domain. It crawls all requests (e.g. GET, POST or PUT) in the specified scope and keeps track of the request and response data. During the crawling process the callbacks enable you to insert your payload at specific places and test if they worked.
 37 | 
 38 | Table of contents
 39 | -----------------
 40 | 
 41 | -  `Installation <#installation>`__
 42 | -  `Crawling flow <#crawling-flow>`__
 43 | -  `Documentation <#documentation>`__
 44 | -  `Minimal implementation <#minimal-implementation>`__
 45 | -  `Testing <#testing>`__
 46 | -  `Issues <#issues>`__
 47 | -  `License <#license>`__
 48 | 
 49 | Installation
 50 | ------------
 51 | 
 52 | First make sure you're on `Python 2.7/3.3 <https://www.python.org/>`__ or higher. Then run the command below to install N.Y.A.W.C.
 53 | 
 54 | ``$ pip install --upgrade nyawc``
 55 | 
 56 | Crawling flow
 57 | -------------
 58 | 
 59 | 1. You can define your startpoint (a request) and the crawling scope and then start the crawler.
 60 | 2. The crawler repeatedly starts the first request in the queue until ``max threads`` is reached.
 61 | 3. The crawler adds all requests found in the response to the end of the queue (except duplicates).
 62 | 4. The crawler goes back to step #2 to spawn new requests repeatedly until ``max threads`` is reached.
 63 | 
 64 | .. image:: https://tijme.github.io/not-your-average-web-crawler/latest/_static/img/flow.svg
 65 |    :alt: N.Y.A.W.C crawling flow
 66 | 
 67 | **Please note that if the queue is empty and all crawler threads are finished, the crawler will stop.**
 68 | 
 69 | Documentation
 70 | -------------
 71 | 
 72 | Please refer to the `documentation <https://tijme.github.io/not-your-average-web-crawler/>`__ or the `API <https://tijme.github.io/not-your-average-web-crawler/latest/py-modindex.html>`__ for all the information about N.Y.A.W.C.
 73 | 
 74 | Minimal implementation
 75 | ----------------------
 76 | 
 77 | You can use the callbacks in ``example_minimal.py`` to run your own exploit against the requests. If you want an example of automated exploit scanning, please take a look at `ACSTIS <https://github.com/tijme/angularjs-csti-scanner>`__ (it uses N.Y.A.W.C to scan for AngularJS client-side template injection vulnerabilities).
 78 | 
 79 | You can also use the `kitchen sink <https://tijme.github.io/not-your-average-web-crawler/latest/kitchen_sink.html>`__ (which contains all the functionalities from N.Y.A.W.C.) instead of the example below. The code below is a minimal implementation of N.Y.A.W.C.
 80 | 
 81 | -  ``$ python example_minimal.py``
 82 | -  ``$ python -u example_minimal.py > output.log``
 83 | 
 84 | .. code:: python
 85 | 
 86 |     # example_minimal.py
 87 | 
 88 |     from nyawc.Options import Options
 89 |     from nyawc.QueueItem import QueueItem
 90 |     from nyawc.Crawler import Crawler
 91 |     from nyawc.CrawlerActions import CrawlerActions
 92 |     from nyawc.http.Request import Request
 93 | 
 94 |     def cb_crawler_before_start():
 95 |         print("Crawler started.")
 96 | 
 97 |     def cb_crawler_after_finish(queue):
 98 |         print("Crawler finished.")
 99 |         print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.")
100 | 
101 |     def cb_request_before_start(queue, queue_item):
102 |         print("Starting: {}".format(queue_item.request.url))
103 |         return CrawlerActions.DO_CONTINUE_CRAWLING
104 | 
105 |     def cb_request_after_finish(queue, queue_item, new_queue_items):
106 |         print("Finished: {}".format(queue_item.request.url))
107 |         return CrawlerActions.DO_CONTINUE_CRAWLING
108 | 
109 |     options = Options()
110 | 
111 |     options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
112 |     options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
113 |     options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
114 |     options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route.
115 | 
116 |     crawler = Crawler(options)
117 |     crawler.start_with(Request("https://finnwea.com/"))
118 | 
119 | Testing
120 | -------
121 | 
122 | The testing can and will automatically be done by `Travis CI <https://travis-ci.org/tijme/not-your-average-web-crawler>`__ on every push to the master branch. If you want to manually run the unit tests, use the command below.
123 | 
124 | ``$ python -m unittest discover``
125 | 
126 | Issues
127 | ------
128 | 
129 | Issues or new features can be reported via the GitHub issue tracker. Please make sure your issue or feature has not yet been reported by anyone else before submitting a new one.
130 | 
131 | License
132 | -------
133 | 
134 | Not Your Average Web Crawler (N.Y.A.W.C) is open-sourced software licensed under the `MIT license <https://github.com/tijme/not-your-average-web-crawler/blob/master/LICENSE.rst>`__.
135 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = NYAWC
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -E -a
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -E -a


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=NYAWC
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | 	echo.installed, then set the SPHINXBUILD environment variable to point
21 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | 	echo.may add the Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -E -a
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -E -a
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/source/_static/better.css:
--------------------------------------------------------------------------------
1 | @import url("better_basic.css");
2 | 


--------------------------------------------------------------------------------
/docs/source/_static/css/bootstrap-theme.css:
--------------------------------------------------------------------------------
  1 | /*
  2 | @media(max-width: 767px) {}
  3 | @media(min-width: 768px) {}
  4 | @media(min-width: 992px) {}
  5 | @media(min-width: 1200px) {}
  6 | */
  7 | 
  8 | /**
  9 |  *
 10 |  * General
 11 |  *
 12 |  */
 13 | html, body {
 14 |     width: 100%;
 15 |     height: 100%;
 16 | 
 17 |     margin: 0px 0px 0px 0px;
 18 |     padding: 0px 0px 0px 0px;
 19 | }
 20 | 
 21 | h1, h2, h3, h4, h5, h6 {
 22 |     font-family: Georgia, serif !important;
 23 | }
 24 | 
 25 | .vertical-center {
 26 |     width: 100%;
 27 |     min-height: 100%;
 28 |     display: flex;
 29 |     align-items: center;
 30 | }
 31 | 
 32 | /**
 33 |  *
 34 |  * Table of Contents
 35 |  *
 36 |  */
 37 | @media(max-width: 767px) {
 38 |     .nav-stacked {
 39 |         background: #f1f1f1;
 40 |         border-radius: 4px;
 41 |         border: 1px solid #f1f1f1;
 42 |         margin-bottom: 20px;
 43 |     }
 44 | }
 45 | 
 46 | .nav-stacked ul {
 47 |     padding-left: 35px;
 48 |     list-style: none;
 49 | }
 50 | 
 51 | .nav-stacked ul li {
 52 |     padding-top: 4px;
 53 |     padding-bottom: 4px;
 54 | }
 55 | 
 56 | 
 57 | .nav-stacked ul li a {
 58 |     font-size: 13px;
 59 | }
 60 | 
 61 | 
 62 | #tocscroll .affix {
 63 |     top: 20px;
 64 | }
 65 | 
 66 | /**
 67 |  *
 68 |  * Navbar
 69 |  *
 70 |  */
 71 | .navbar-inverse {
 72 |     height: 64px;
 73 | 
 74 |     background: transparent !important;
 75 |     border: none;
 76 | 
 77 |     margin: 0px;
 78 |     position: relative;
 79 |     z-index: 50;
 80 | 
 81 |     -webkit-box-shadow: 0 1px 2px rgba(0, 0, 0, 0.3);
 82 |     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.3);
 83 | }
 84 | 
 85 | .navbar-inverse .navbar-nav > li > a {
 86 |     color: #fff;
 87 | 
 88 |     padding-top: 22px;
 89 |     padding-bottom: 22px;
 90 | }
 91 | 
 92 | .navbar-inverse .navbar-nav > li > a span.text {
 93 |     color: #fff;
 94 |     text-shadow: 0 0 5px #000;
 95 | }
 96 | 
 97 | .navbar-inverse .navbar-nav > li > a:hover span.text {
 98 |     border-bottom: 1px dotted #fff;
 99 | }
100 | 
101 | .navbar-inverse .navbar-nav > .open > a,
102 | .navbar-inverse .navbar-nav > .open > a:focus,
103 | .navbar-inverse .navbar-nav > .open > a:hover {
104 |     background: rgba(0, 0, 0, 0.35);
105 | }
106 | 
107 | @media(max-width: 767px) {
108 |     .navbar-inverse .navbar-toggle {
109 |         float: left;
110 |         border: none;
111 | 
112 |         padding-top: 17px;
113 |         padding-bottom: 17px;
114 |     }
115 | 
116 |     .navbar-inverse .navbar-toggle:focus,
117 |     .navbar-inverse .navbar-toggle:hover {
118 |         background: none;
119 |     }
120 | 
121 |     .navbar-inverse .navbar-collapse {
122 |         margin-left: -15px;
123 |         margin-right: -15px;
124 |         padding-left: 30px;
125 |         padding-right: 30px;
126 | 
127 |         border: none;
128 |         background: rgba(0, 0, 0, 0.9);
129 |         max-height: 500px;
130 |     }
131 | 
132 | 
133 |     .navbar-nav {
134 |         margin-top: 0px;
135 |         margin-bottom: 0px;
136 |     }
137 | 
138 |     .navbar-inverse .navbar-nav > li > a {
139 |         padding-top: 15px;
140 |         padding-bottom: 15px;
141 |     }
142 | }
143 | 
144 | /**
145 |  *
146 |  * Jumbotron
147 |  *
148 |  */
149 | .jumbotron {
150 |     height: 250px;
151 | 
152 |     background-color: #222222;
153 |     position: relative;
154 |     padding: 64px 0px 0px 0px;
155 |     margin: -64px 0px 20px 0px;
156 | }
157 | 
158 | .home.jumbotron {
159 |     height: 450px;
160 | }
161 | 
162 | .jumbotron img {
163 |     width: 100%;
164 |     height: 100%;
165 | 
166 |     top: 0;
167 |     left: 0;
168 |     z-index: 5;
169 | 
170 |     position: absolute;
171 |     object-fit: cover;
172 |     opacity: 0.9;
173 | }
174 | 
175 | .jumbotron .jumbotron-darken {
176 |     width: 100%;
177 |     height: 100%;
178 | 
179 |     z-index: 10;
180 | 
181 |     position: relative;
182 |     background: rgba(0, 0, 0, 0.35);
183 | }
184 | 
185 | .jumbotron .jumbotron-darken h1 {
186 |     color: #ffffff;
187 |     font-size: 40px;
188 | }
189 | 
190 |     @media(min-width: 768px) {
191 |         .jumbotron .jumbotron-darken h1 {
192 |             font-size: 45px;
193 |         }
194 |     }
195 | 
196 |     @media(min-width: 992px) {
197 |         .jumbotron .jumbotron-darken h1 {
198 |             font-size: 65px;
199 |         }
200 |     }
201 | 
202 | .jumbotron .jumbotron-darken h1 small {
203 |     display: block;
204 |     margin-top: 15px;
205 |     margin-bottom: 25px;
206 |     font-size: 18px;
207 |     color: #ffffff;
208 | }
209 | 
210 |     @media(min-width: 768px) {
211 |         .jumbotron .jumbotron-darken h1 small {
212 |             font-size: 22px;
213 |         }
214 |     }
215 | 
216 |     @media(min-width: 992px) {
217 |         .jumbotron .jumbotron-darken h1 small {
218 |             font-size: 26px;
219 |         }
220 |     }
221 | 
222 | .jumbotron .jumbotron-darken div.pre {
223 |     margin: 0px 0px 0px 0px;
224 |     padding: 2px 4px 2px 4px;
225 |     display: inline-block;
226 | 
227 |     border: none;
228 |     background: rgba(0, 0, 0, 0.75);
229 | 
230 |     font-size: 13px;
231 |     color: #ffffff;
232 |     text-align: center;
233 |     font-family: "Lucida Console", Monaco, monospace;
234 | }
235 | 
236 |     @media(min-width: 768px) {
237 |         .jumbotron .jumbotron-darken div.pre {
238 |             font-size: 14px;
239 |             padding: 3px 5px 3px 5px;
240 |         }
241 |     }
242 | 
243 | .jumbotron .jumbotron-darken .btn {
244 |     border: none;
245 |     background: rgba(0, 0, 0, 0.75);
246 |     color: #ffffff;
247 | }
248 | 
249 | /**
250 |  *
251 |  * Content
252 |  *
253 |  */
254 | .content-container h1 {
255 |     color: #000000;
256 | 
257 |     margin-top: 40px;
258 |     font-size: 28px;
259 | }
260 | 
261 | .content-container .section:first-child h1:first-child {
262 |     margin-top: 0px;
263 | }
264 | 
265 | .content-container h2 {
266 |     color: #333333;
267 | 
268 |     margin-top: 20px;
269 |     font-size: 22px;
270 | }
271 | 
272 | .content-container h3 {
273 |     color: #333333;
274 | 
275 |     margin-top: 15px;
276 |     font-size: 18px;
277 | }
278 | 
279 | .content-container .highlight pre {
280 |     border: 1px solid #f1f1f1;
281 | 
282 |     font-family: "Lucida Console", Monaco, monospace;
283 |     font-size: 12px;
284 |     word-break: break-word;
285 | 
286 |     color: #666666;
287 |     background: #f5f5f5;
288 | }
289 | 
290 | /**
291 |  *
292 |  * Footer
293 |  *
294 |  */
295 | .footer {
296 |     font-size: 11px;
297 |     color: #333333;
298 |     text-align: center;
299 | 
300 |     margin-top: 20px;
301 |     margin-bottom: 20px;
302 | }
303 | 
304 | /**
305 |  *
306 |  * Home
307 |  *
308 |  */
309 | img.flow {
310 |     margin: 20px auto;
311 | }
312 | 


--------------------------------------------------------------------------------
/docs/source/_static/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.eot


--------------------------------------------------------------------------------
/docs/source/_static/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.ttf


--------------------------------------------------------------------------------
/docs/source/_static/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.woff


--------------------------------------------------------------------------------
/docs/source/_static/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/fonts/glyphicons-halflings-regular.woff2


--------------------------------------------------------------------------------
/docs/source/_static/img/flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/img/flow.png


--------------------------------------------------------------------------------
/docs/source/_static/img/header.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/img/header.jpg


--------------------------------------------------------------------------------
/docs/source/_static/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/docs/source/_static/img/logo.png


--------------------------------------------------------------------------------
/docs/source/_static/js/bootstrap-theme.js:
--------------------------------------------------------------------------------
 1 | function selectText(containerid) {
 2 |     if (document.selection) {
 3 |         var range = document.body.createTextRange();
 4 |         range.moveToElementText(document.getElementById(containerid));
 5 |         range.select();
 6 |     } else if (window.getSelection) {
 7 |         var range = document.createRange();
 8 |         range.selectNode(document.getElementById(containerid));
 9 |         window.getSelection().removeAllRanges();
10 |         window.getSelection().addRange(range);
11 |     }
12 | }
13 | 
14 | function unwrapToc() {
15 |     $('.nav-stacked').each(function() {
16 |         $(this).html($(this).find('ul').html());
17 |     });
18 | }
19 | 
20 | function tocInitializeAffix() {
21 |     $('#tocscroll .nav').affix({
22 |         offset: {
23 |             top: function () {
24 |                 var alertHeight = 0;
25 |                 if ($('.alert-version').lenght) {
26 |                     alertHeight = $('.alert-version').outerHeight()
27 |                 }
28 | 
29 |                 return (this.top = $('.jumbotron').outerHeight() + alertHeight)
30 |             },
31 |             bottom: function () {
32 |                 return (this.bottom = $('.footer').outerHeight())
33 |             }
34 |         }
35 |     });
36 | }
37 | 
38 | function tocAffixSetWidth() {
39 |     $('#tocscroll .nav').width($('#tocscroll').width())
40 | }
41 | 
42 | $(document).ready(function() {
43 |     unwrapToc();
44 |     tocInitializeAffix();
45 |     tocAffixSetWidth();
46 | });
47 | 
48 | $(window).resize(function () {
49 |     tocAffixSetWidth();
50 | });
51 | 


--------------------------------------------------------------------------------
/docs/source/_static/js/releases-dropdown.js:
--------------------------------------------------------------------------------
 1 | $(document).ready(function() {
 2 |     var latestVersion = Object.keys(releases)[Object.keys(releases).length - 1];
 3 |     var currentVersion = $('#releases').attr('data-selected');
 4 | 
 5 |     /**
 6 |      * Generate dropdown
 7 |      */
 8 |     var dropdownHtml = "";
 9 | 
10 |     Object.keys(releases).forEach(function(version, index) {
11 |         var isLatest = version == latestVersion
12 |         var labelHtml = isLatest ? " <span class=\"badge\">latest</span>" : "";
13 |         var labelLink = '../' + (isLatest ? 'latest' : version) + '/index.html';
14 | 
15 |         dropdownHtml = "<li><a href=\"" + labelLink + "\">Version " + version + labelHtml + "</a></li>" + dropdownHtml;
16 |     });
17 | 
18 |     $('#releases .dropdown-menu').html(dropdownHtml);
19 | 
20 |     /**
21 |      * Show message if not viewing the latest version
22 |      */
23 |     if (latestVersion != currentVersion) {
24 |         var message = "<strong>Warning!</strong> Version <a class=\"alert-link\" href=\"../latest/index.html\">" + latestVersion + "</a> is available (you are currently viewing version " + currentVersion + ").";
25 |         var messageHtml = '<div class="container"><div class="row"><div class="col-lg-10 col-lg-offset-1"><div class="alert alert-danger alert-version" role="alert">' + message + '</div></div></div></div>';
26 |         $(messageHtml).insertAfter($('.jumbotron'))
27 |     }
28 | });
29 | 


--------------------------------------------------------------------------------
/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
  1 | {% extends '!layout.html' %}
  2 | 
  3 | {% block header %}
  4 |     <nav class="navbar navbar-inverse navbar-fixed-top">
  5 |         <div class="container">
  6 |             <div class="row">
  7 |                 <div class="col-lg-10 col-lg-offset-1">
  8 |                     <div class="navbar-header">
  9 |                         <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar-collapse" aria-expanded="false">
 10 |                             <span class="sr-only">Toggle navigation</span>
 11 |                             <span class="icon-bar"></span>
 12 |                             <span class="icon-bar"></span>
 13 |                             <span class="icon-bar"></span>
 14 |                         </button>
 15 |                     </div>
 16 |                     <div class="collapse navbar-collapse" id="navbar-collapse">
 17 |                         <ul class="nav navbar-nav">
 18 |                             <li><a href="index.html"><span class="text">Home</span></a></li>
 19 |                             <li class="dropdown">
 20 |                                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false"><span class="text">Introduction</span> <span class="caret"></span></a>
 21 |                                 <ul class="dropdown-menu">
 22 |                                     <li><a href="installation.html">Installation</a></li>
 23 |                                     <li><a href="migration.html">Migration</a></li>
 24 |                                     <li><a href="getting_started.html">Getting Started</a></li>
 25 |                                     <li><a href="kitchen_sink.html">Kitchen Sink</a></li>
 26 |                                 </ul>
 27 |                             </li>
 28 |                             <li class="dropdown">
 29 |                                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false"><span class="text">Options</span> <span class="caret"></span></a>
 30 |                                 <ul class="dropdown-menu">
 31 |                                     <li><a href="options_callbacks.html">Callbacks</a></li>
 32 |                                     <li><a href="options_crawling_scope.html">Crawling scope</a></li>
 33 |                                     <li><a href="options_crawling_identity.html">Crawling identity</a></li>
 34 |                                     <li><a href="options_performance.html">Performance</a></li>
 35 |                                     <li><a href="options_routing.html">Routing</a></li>
 36 |                                     <li><a href="options_misc.html">Misc</a></li>
 37 |                                 </ul>
 38 |                             </li>
 39 |                             <li class="dropdown">
 40 |                                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false"><span class="text">API</span> <span class="caret"></span></a>
 41 |                                 <ul class="dropdown-menu">
 42 |                                     <li><a href="py-modindex.html">Code API</a></li>
 43 |                                     <li><a href="https://github.com/tijme/not-your-average-web-crawler" target="_blank">Source (GitHub)</a></li>
 44 |                                 </ul>
 45 |                             </li>
 46 |                         </ul>
 47 |                         <ul class="nav navbar-nav navbar-right">
 48 |                             <li id="releases" class="dropdown" data-selected="{{release}}">
 49 |                                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false"><span class="text">Version {{release}}</span> <span class="caret"></span></a>
 50 |                                 <ul class="dropdown-menu">
 51 |                                 </ul>
 52 |                             </li>
 53 |                         </ul>
 54 |                     </div>
 55 |                 </div>
 56 |             </div>
 57 |         </div>
 58 |     </nav>
 59 | {% endblock %}
 60 | 
 61 | {% block content %}
 62 | 
 63 |     {%- if pagename == 'index' %}
 64 |         <div class="home jumbotron">
 65 |             <div class="jumbotron-darken">
 66 |                 <div class="vertical-center">
 67 |                     <div class="container">
 68 |                         <div class="row">
 69 |                             <div class="col-lg-10 col-lg-offset-1 text-center">
 70 |                                 <h1 class="headline hidden-xs">Not Your Average Web Crawler <br><small>Execute your exploit against every request in scope</small></h1>
 71 |                                 <h1 class="headline visible-xs">N.Y.A.W.C <br><small>Execute your exploit against every request in scope</small></h1>
 72 |                                 <a href="getting_started.html" class="btn btn-black btn-lg">Getting Started</a>
 73 |                             </div>
 74 |                         </div>
 75 |                     </div>
 76 |                 </div>
 77 |             </div>
 78 |             <img class="background" src="_static/img/header.jpg">
 79 |         </div>
 80 |     {%- endif %}
 81 | 
 82 |     {%- if pagename != 'index' %}
 83 |         <div class="default jumbotron">
 84 |             <div class="jumbotron-darken">
 85 |                 <div class="vertical-center">
 86 |                     <div class="container">
 87 |                         <div class="row">
 88 |                             <div class="col-lg-10 col-lg-offset-1">
 89 |                                 <h1 class="headline">{{title}}</h1>
 90 |                             </div>
 91 |                         </div>
 92 |                     </div>
 93 |                 </div>
 94 |             </div>
 95 |             <img class="background" src="_static/img/header.jpg">
 96 |         </div>
 97 |     {%- endif %}
 98 | 
 99 |     <div class="container">
100 |         <div class="row">
101 |             <div class="col-lg-10 col-lg-offset-1">
102 |                 {%- if display_toc %}
103 |                     <div class="row">
104 |                         <nav class="col-sm-3 col-sm-push-9 visible-xs">
105 |                             <strong>Table of Contents</strong>
106 |                             <ul class="nav nav-pills nav-stacked">
107 |                                 {{ toc }}
108 |                             </ul>
109 |                         </nav>
110 |                         <nav class="col-sm-3 col-sm-push-9 hidden-xs" id="tocscroll">
111 |                             <ul class="nav nav-pills nav-stacked">
112 |                                 {{ toc }}
113 |                             </ul>
114 |                         </nav>
115 |                         <div class="col-sm-9 col-sm-pull-3">
116 |                 {%- endif %}
117 | 
118 |                 <div class="{%- if pagename != 'index' %}content-container{%- endif %}">
119 |                     {% block body %} {% endblock %}
120 |                 </div>
121 | 
122 |                 {%- if display_toc %}
123 |                         </div>
124 |                     </div>
125 |                 {%- endif %}
126 |             </div>
127 |         </div>
128 |     </div>
129 | {% endblock %}
130 | 
131 | {% block footer %}
132 |     <footer class="footer">
133 |         <div class="container">
134 |             <div class="row">
135 |                 <div class="col-md-10 col-md-offset-1">
136 |                     <p>N.Y.A.W.C v{{release}} is open-sourced software licensed under the <a href="https://github.com/tijme/not-your-average-web-crawler/blob/master/LICENSE.rst" target="_blank">MIT license</a>.</p>
137 |                 </div>
138 |             </div>
139 |         </div>
140 |     </footer>
141 | {% endblock %}
142 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # N.Y.A.W.C documentation build configuration file, created by
  5 | # sphinx-quickstart on Fri May 12 17:22:14 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | import os
 21 | import sys
 22 | sys.path.insert(0, os.path.abspath('../..'))
 23 | 
 24 | # -- General configuration ------------------------------------------------
 25 | 
 26 | # If your documentation needs a minimal Sphinx version, state it here.
 27 | #
 28 | # needs_sphinx = '1.0'
 29 | 
 30 | # Add any Sphinx extension module names here, as strings. They can be
 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 32 | # ones.
 33 | extensions = [
 34 |     'sphinx.ext.autodoc',
 35 |     'sphinxcontrib.napoleon',
 36 |     'sphinx.ext.linkcode',
 37 |     'sphinx.ext.todo'
 38 | ]
 39 | 
 40 | # Add any paths that contain templates here, relative to this directory.
 41 | templates_path = ['_templates']
 42 | 
 43 | # The suffix(es) of source filenames.
 44 | # You can specify multiple suffix as a list of string:
 45 | #
 46 | # source_suffix = ['.rst', '.md']
 47 | source_suffix = '.rst'
 48 | 
 49 | # The master toctree document.
 50 | master_doc = 'index'
 51 | 
 52 | # General information about the project.
 53 | project = 'N.Y.A.W.C'
 54 | copyright = '2017, Tijme Gommers'
 55 | author = 'Tijme Gommers'
 56 | 
 57 | # The version info for the project you're documenting, acts as replacement for
 58 | # |version| and |release|, also used in various other places throughout the
 59 | # built documents.
 60 | 
 61 | with open("../../.semver") as file:
 62 |     semver = file.read().rstrip()
 63 | 
 64 | # The short X.Y version.
 65 | version = semver
 66 | # The full version, including alpha/beta/rc tags.
 67 | release = semver
 68 | 
 69 | # The language for content autogenerated by Sphinx. Refer to documentation
 70 | # for a list of supported languages.
 71 | #
 72 | # This is also used if you do content translation via gettext catalogs.
 73 | # Usually you set "language" from the command line for these cases.
 74 | language = None
 75 | 
 76 | # List of patterns, relative to source directory, that match files and
 77 | # directories to ignore when looking for source files.
 78 | # This patterns also effect to html_static_path and html_extra_path
 79 | exclude_patterns = []
 80 | 
 81 | # The name of the Pygments (syntax highlighting) style to use.
 82 | pygments_style = 'sphinx'
 83 | 
 84 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 85 | todo_include_todos = False
 86 | 
 87 | # -- Options for HTML output ----------------------------------------------
 88 | 
 89 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 90 | # a list of builtin themes.
 91 | #
 92 | from better import better_theme_path
 93 | html_theme_path = [better_theme_path]
 94 | html_theme = 'better'
 95 | 
 96 | # Theme options are theme-specific and customize the look and feel of a theme
 97 | # further.  For a list of options available for each theme, see the
 98 | # documentation.
 99 | #
100 | html_theme_options = {
101 |   # show sidebar on the right instead of on the left
102 |   'rightsidebar': False,
103 | 
104 |   # CSS files to include after all other CSS files
105 |   # (refer to by relative path from conf.py directory, or link to a
106 |   # remote file)
107 |   # 'cssfiles': ['_static/my_style.css'],  # default is empty list
108 | 
109 |   # show a big text header with the value of html_title
110 |   'showheader': True,
111 | 
112 |   # show the breadcrumbs and index|next|previous links at the top of
113 |   # the page
114 |   'showrelbartop': False,
115 |   # same for bottom of the page
116 |   'showrelbarbottom': False,
117 | 
118 |   # show the self-serving link in the footer
119 |   'linktotheme': False,
120 | 
121 |   # width of the sidebar. page width is determined by a CSS rule.
122 |   # I prefer to define things in rem because it scales with the
123 |   # global font size rather than pixels or the local font size.
124 |   'sidebarwidth': '0px',
125 | 
126 |   # color of all body text
127 |   'textcolor': '#000000',
128 | 
129 |   # color of all headings (<h1> tags); defaults to the value of
130 |   # textcolor, which is why it's defined here at all.
131 |   'headtextcolor': '',
132 | 
133 |   # color of text in the footer, including links; defaults to the
134 |   # value of textcolor
135 |   'footertextcolor': '',
136 | 
137 |   # Custom CSS
138 |   'cssfiles': ['_static/css/bootstrap.min.css?' + version, '_static/css/bootstrap-theme.css?' + version],
139 | 
140 |   # Custom JS
141 |   'scriptfiles': ['../releases.js?' + version, '_static/js/bootstrap.min.js?' + version, '_static/js/releases-dropdown.js?' + version, '_static/js/bootstrap-theme.js?' + version]
142 | }
143 | 
144 | # Add any paths that contain custom static files (such as style sheets) here,
145 | # relative to this directory. They are copied after the builtin static files,
146 | # so a file named "default.css" will overwrite the builtin "default.css".
147 | html_static_path = ['_static']
148 | 
149 | # -- Options for HTMLHelp output ------------------------------------------
150 | 
151 | # Output file base name for HTML help builder.
152 | htmlhelp_basename = 'NYAWCdoc'
153 | 
154 | # -- Options for LaTeX output ---------------------------------------------
155 | 
156 | latex_elements = {
157 |     # The paper size ('letterpaper' or 'a4paper').
158 |     #
159 |     # 'papersize': 'letterpaper',
160 | 
161 |     # The font size ('10pt', '11pt' or '12pt').
162 |     #
163 |     # 'pointsize': '10pt',
164 | 
165 |     # Additional stuff for the LaTeX preamble.
166 |     #
167 |     # 'preamble': '',
168 | 
169 |     # Latex figure (float) alignment
170 |     #
171 |     # 'figure_align': 'htbp',
172 | }
173 | 
174 | # Grouping the document tree into LaTeX files. List of tuples
175 | # (source start file, target name, title,
176 | #  author, documentclass [howto, manual, or own class]).
177 | latex_documents = [
178 |     (master_doc, 'NYAWC.tex', 'N.Y.A.W.C Documentation',
179 |      'Tijme Gommers', 'manual'),
180 | ]
181 | 
182 | # -- Options for manual page output ---------------------------------------
183 | 
184 | # One entry per manual page. List of tuples
185 | # (source start file, name, description, authors, manual section).
186 | man_pages = [
187 |     (master_doc, 'nyawc', 'N.Y.A.W.C Documentation',
188 |      [author], 1)
189 | ]
190 | 
191 | # -- Options for Texinfo output -------------------------------------------
192 | 
193 | # Grouping the document tree into Texinfo files. List of tuples
194 | # (source start file, target name, title, author,
195 | #  dir menu entry, description, category)
196 | texinfo_documents = [
197 |     (master_doc, 'NYAWC', 'N.Y.A.W.C Documentation',
198 |      author, 'NYAWC', 'A web crawler that gathers more than you can imagine.',
199 |      'Miscellaneous'),
200 | ]
201 | 
202 | # Title of the documentation
203 | html_title = "Not Your Average Web Crawler"
204 | 
205 | # Home button title
206 | html_short_title = "Home"
207 | 
208 | # Sidebar contents
209 | html_sidebars = {
210 |   '**': [],
211 | }
212 | 
213 | # Absolute link the the source code
214 | def linkcode_resolve(domain, info):
215 |     if domain != 'py':
216 |         return None
217 | 
218 |     if not info['module']:
219 |         return None
220 | 
221 |     filename = info['module'].replace('.', '/')
222 |     return "https://github.com/tijme/not-your-average-web-crawler/tree/{}/{}.py".format(semver, filename)
223 | 
224 | # Napoleon
225 | napoleon_google_docstring = True
226 | napoleon_include_init_with_doc = True
227 | napoleon_include_private_with_doc  = True
228 | 
229 | # Always make sure current release is in releases.js
230 | import json
231 | from collections import OrderedDict
232 | 
233 | releasesjs = open('../../releases.js').read().replace("var releases = ", "")
234 | releases = json.loads(releasesjs, object_pairs_hook=OrderedDict);
235 | 
236 | releases[release] = True
237 | 
238 | with open('../../releases.js', 'w') as outfile:
239 |     outfile.write("var releases = " + json.dumps(releases))
240 | 


--------------------------------------------------------------------------------
/docs/source/getting_started.rst:
--------------------------------------------------------------------------------
 1 | .. title:: Getting Started
 2 | 
 3 | Minimal example
 4 | ---------------
 5 | 
 6 | N.Y.A.W.C does not have a CLI entry point, so you need to create one yourself. Save the code below as ``example.py``. The example code prints all request URLs that were found by the crawler.
 7 | 
 8 | .. code:: python
 9 | 
10 |     # example.py
11 | 
12 |     from nyawc.Options import Options
13 |     from nyawc.Crawler import Crawler
14 |     from nyawc.QueueItem import QueueItem
15 |     from nyawc.CrawlerActions import CrawlerActions
16 |     from nyawc.http.Request import Request
17 | 
18 |     def cb_crawler_before_start():
19 |         print("Crawler started.")
20 | 
21 |     def cb_crawler_after_finish(queue):
22 |         print("Crawler finished.")
23 |         print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.")
24 | 
25 |     def cb_request_before_start(queue, queue_item):
26 |         print("Starting: {}".format(queue_item.request.url))
27 |         return CrawlerActions.DO_CONTINUE_CRAWLING
28 | 
29 |     def cb_request_after_finish(queue, queue_item, new_queue_items):
30 |         print("Finished: {}".format(queue_item.request.url))
31 |         return CrawlerActions.DO_CONTINUE_CRAWLING
32 | 
33 |     options = Options()
34 | 
35 |     options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
36 |     options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
37 |     options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
38 |     options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route.
39 | 
40 |     crawler = Crawler(options)
41 |     crawler.start_with(Request("https://finnwea.com/"))
42 | 
43 | Testing example.py
44 | ------------------
45 | 
46 | In the foreground
47 | ~~~~~~~~~~~~~~~~~
48 | 
49 | Output all contents to the console.
50 | 
51 | ``$ python example.py``
52 | 
53 | In the background
54 | ~~~~~~~~~~~~~~~~~
55 | 
56 | Output all contents to a file and run the process in the background.
57 | 
58 | ``$ python -u example.py > output.log``
59 | 
60 | Adding extra options
61 | --------------------
62 | 
63 | Callbacks
64 | ~~~~~~~~~
65 | 
66 | All the available callbacks are documented `here <options_callbacks.html>`_.
67 | 
68 | Scope
69 | ~~~~~
70 | 
71 | You can set scope options to, for example, only crawl certain subdomains or certain request methods. See `this <options_crawling_scope.html>`_ page for all the available scope options.
72 | 
73 | Identity
74 | ~~~~~~~~
75 | 
76 | Do you want to use authentication, set headers or use a proxy? Check `these <options_crawling_identity.html>`_ identity options for documentation.
77 | 
78 | Routing
79 | ~~~~~~~
80 | 
81 | If you want to ignore similar requests (e.g. /news/1, /news/2, /news/3, etc) you can specify routes via the `routing <options_routing.html>`_ options.
82 | 
83 | The kitchen sink
84 | ----------------
85 | 
86 | The kitchen sink is an example that implements all the features/options of N.Y.A.W.C. The kitchen sink is available for copy paste. `Check it out <kitchen_sink.html>`_!
87 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. title:: Home
 2 | 
 3 | .. raw:: html
 4 | 
 5 |     <div class="row">
 6 |         <div class="col-md-12">
 7 |             <p>Did you ever want to test your payload against all requests of a certain domain? N.Y.A.W.C can help you with that. It crawls all requests (e.g. GET, POST or PUT) on the specified domain and keeps track of the request and response data. During the crawling process, the callbacks enable you to insert your payload at specific places and test if they worked. And using the built in options you can do even more. <a href="getting_started.html">Get started</a>!</p>
 8 |         </div>
 9 |     </div>
10 |     <hr>
11 |     <div class="row">
12 |         <div class="col-md-3">
13 |             <strong>Step #1</strong>
14 |             <p>You can define your startpoint (a request) and the crawling scope and then start the crawler.</p>
15 |         </div>
16 |         <div class="col-md-3">
17 |             <strong>Step #2</strong>
18 |             <p>The crawler repeatedly starts the first request in the queue until <code>max threads</code> is reached.</p>
19 |         </div>
20 |         <div class="col-md-3">
21 |             <strong>Step #3</strong>
22 |             <p>The crawler adds all requests found in the response to the end of the queue (except duplicates).</p>
23 |         </div>
24 |         <div class="col-md-3">
25 |             <strong>Step #4</strong>
26 |             <p>The crawler goes back to step #2 to spawn new requests repeatedly until <code>max threads</code> is reached.</p>
27 |         </div>
28 |     </div>
29 |     <div class="row">
30 |         <div class="col-md-12 text-center">
31 |             <img src="_static/img/flow.svg" class="flow img-responsive" />
32 |         </div>
33 |     </div>
34 |     <div class="row">
35 |         <div class="col-md-12 text-center">
36 |             <p>Several <a href="options_callbacks.html">callbacks</a> can be used throughout the crawling process to, for example, modify requests on the go.</p>
37 |         </div>
38 |     </div>
39 | 


--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
 1 | .. title:: Installation
 2 | 
 3 | .. raw:: html
 4 | 
 5 |     <div class="alert alert-warning" role="alert"><strong>Hold on!</strong> Are you upgrading to a newer version? Please check our <a class="alert-link" href="migration.html">migration guide</a> for the instructions.</div>
 6 | 
 7 | Install using PIP
 8 | -----------------
 9 | 
10 | All releases of N.Y.A.W.C are available on PyPi (`link <https://pypi.python.org/pypi/nyawc/>`_).
11 | 
12 | #. Make sure you are using Python 2.7/3.3 or higher.
13 | #. ``$ pip install --upgrade nyawc``
14 | 
15 | Install using EasyInstall
16 | -------------------------
17 | 
18 | #. Make sure you are using Python 2.7/3.3 or higher.
19 | #. ``$ easy_install --upgrade nyawc``
20 | 
21 | Download as ZIP
22 | ---------------
23 | 
24 | #. Make sure you are using Python 2.7/3.3 or higher.
25 | #. Download and extract the ZIP file (`link <https://github.com/tijme/not-your-average-web-crawler/archive/master.zip>`__).
26 | #. Run the unit-tests to verify you have a working version.
27 | 
28 |    #. ``$ python -m unittest discover``.
29 | 
30 | #. Install N.Y.A.W.C.
31 | 
32 |    #. ``$ python setup.py install``
33 | 
34 | Clone using GIT
35 | ---------------
36 | 
37 | #. Make sure you are using Python 2.7/3.3 or higher.
38 | #. Clone the project
39 | 
40 |    #. ``$ git clone https://github.com/tijme/not-your-average-web-crawler.git``.
41 | 
42 | #. Run the unit-tests to verify you have a working version.
43 | 
44 |    #. ``$ python -m unittest discover``.
45 | 
46 | #. Install N.Y.A.W.C.
47 | 
48 |    #. ``$ python setup.py install``
49 | 


--------------------------------------------------------------------------------
/docs/source/kitchen_sink.rst:
--------------------------------------------------------------------------------
  1 | .. title:: Kitchen Sink
  2 | 
  3 | The English phrase "Everything but the kitchen sink" means "almost anything one can think of". The example below contains all the functionalities from N.Y.A.W.C.
  4 | 
  5 | .. code:: python
  6 | 
  7 |     # example.py
  8 | 
  9 |     from nyawc.Options import Options
 10 |     from nyawc.QueueItem import QueueItem
 11 |     from nyawc.Crawler import Crawler
 12 |     from nyawc.CrawlerActions import CrawlerActions
 13 |     from nyawc.http.Request import Request
 14 |     from requests.auth import HTTPBasicAuth
 15 | 
 16 |     def cb_crawler_before_start():
 17 |         print("Crawler started.")
 18 | 
 19 |     def cb_crawler_after_finish(queue):
 20 |         print("Crawler finished.")
 21 |         print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.")
 22 | 
 23 |         for queue_item in queue.get_all(QueueItem.STATUS_FINISHED).values():
 24 |             print("[" + queue_item.request.method + "] " + queue_item.request.url + " (PostData: " + str(queue_item.request.data) + ")")
 25 | 
 26 |     def cb_request_before_start(queue, queue_item):
 27 |         # return CrawlerActions.DO_SKIP_TO_NEXT
 28 |         # return CrawlerActions.DO_STOP_CRAWLING
 29 | 
 30 |         return CrawlerActions.DO_CONTINUE_CRAWLING
 31 | 
 32 |     def cb_request_after_finish(queue, queue_item, new_queue_items):
 33 |         percentage = str(int(queue.get_progress()))
 34 |         total_requests = str(queue.count_total)
 35 | 
 36 |         print("At " + percentage + "% of " + total_requests + " requests ([" + str(queue_item.response.status_code) + "] " + queue_item.request.url + ").")
 37 | 
 38 |         # return CrawlerActions.DO_STOP_CRAWLING
 39 |         return CrawlerActions.DO_CONTINUE_CRAWLING
 40 | 
 41 |     def cb_request_in_thread_before_start(queue_item):
 42 |         pass
 43 | 
 44 |     def cb_request_in_thread_after_finish(queue_item):
 45 |         pass
 46 | 
 47 |     def cb_request_on_error(queue_item, message):
 48 |         print("[error] " + message)
 49 | 
 50 |     def cb_form_before_autofill(queue_item, elements, form_data):
 51 |         # return CrawlerActions.DO_NOT_AUTOFILL_FORM
 52 | 
 53 |         return CrawlerActions.DO_AUTOFILL_FORM
 54 | 
 55 |     def cb_form_after_autofill(queue_item, elements, form_data):
 56 |         pass
 57 | 
 58 |     # Declare the options
 59 |     options = Options()
 60 | 
 61 |     # Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html)
 62 |     options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
 63 |     options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
 64 |     options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
 65 |     options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route.
 66 |     options.callbacks.request_in_thread_before_start = cb_request_in_thread_before_start # Called in the crawling thread (when it started). Default is a null route.
 67 |     options.callbacks.request_in_thread_after_finish = cb_request_in_thread_after_finish # Called in the crawling thread (when it finished). Default is a null route.
 68 |     options.callbacks.request_on_error = cb_request_on_error # Called if a request failed. Default is a null route.
 69 |     options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route.
 70 |     options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route.
 71 | 
 72 |     # Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html)
 73 |     options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
 74 |     options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
 75 |     options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
 76 |     options.scope.tld_must_match = True # Only crawl pages with the same tld as the startpoint (e.g. only `.com`). Default is True.
 77 |     options.scope.max_depth = None # The maximum search depth. 0 only crawls the start request. 1 will also crawl all the requests found on the start request. 2 goes one level deeper, and so on. Default is None (unlimited).
 78 |     options.scope.request_methods = [
 79 |         # The request methods to crawl. Default is all request methods
 80 |         Request.METHOD_GET,
 81 |         Request.METHOD_POST,
 82 |         Request.METHOD_PUT,
 83 |         Request.METHOD_DELETE,
 84 |         Request.METHOD_OPTIONS,
 85 |         Request.METHOD_HEAD
 86 |     ]
 87 | 
 88 |     # Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html)
 89 |     options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
 90 |     options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
 91 |     options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
 92 |     options.identity.proxies = {
 93 |         # No authentication
 94 |         # 'http': 'http://host:port',
 95 |         # 'https': 'http://host:port',
 96 | 
 97 |         # Basic authentication
 98 |         # 'http': 'http://user:pass@host:port',
 99 |         # 'https': 'https://user:pass@host:port',
100 | 
101 |         # SOCKS
102 |         # 'http': 'socks5://user:pass@host:port',
103 |         # 'https': 'socks5://user:pass@host:port'
104 |     }
105 |     options.identity.headers.update({
106 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
107 |     })
108 | 
109 |     # Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
110 |     options.performance.max_threads = 20 # The maximum amount of simultaneous threads to use for crawling. Default is 40.
111 |     options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
112 | 
113 |     # Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
114 |     options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
115 |     options.routing.routes = [ 
116 |         # The regular expressions that represent routes that should not be cralwed more times than the minimum treshold. Default is an empty array.
117 |         "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times.
118 |     ]
119 | 
120 |     # Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
121 |     options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
122 |     options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True.
123 |     options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.
124 | 
125 |     crawler = Crawler(options)
126 |     crawler.start_with(Request("https://finnwea.com/"))
127 | 


--------------------------------------------------------------------------------
/docs/source/migration.rst:
--------------------------------------------------------------------------------
 1 | .. title:: Migration
 2 | 
 3 | From 1.6 to 1.7
 4 | ---------------
 5 | 
 6 | .. raw:: html
 7 | 
 8 |     <p><code>pip install --upgrade nyawc</code></p><br>
 9 | 
10 | **Default request timeout is now 30 seconds**
11 | 
12 | From now on there is a default request timeout of 30 seconds. In previous versions it was always infinite and you couldn't specify it.
13 | 
14 | If you want to keep the request timeout on infinite set the request timeout option to ``None``.
15 | 
16 | .. code:: python
17 | 
18 |     options.performance.request_timeout = 30
19 | 
20 | **Count attributes removed from queue**
21 | 
22 | The count attributes (e.g. ``queue.count_in_progress``) are removed since the time complexity of Python's native ``len()`` method is already O(1).
23 | 
24 | .. code:: python
25 | 
26 |     # Old
27 |     print("In progress count: " + str(queue.count_in_progress))
28 | 
29 |     # New
30 |     print("In progress count: " + str(len(queue.get_all(QueueItem.STATUS_IN_PROGRESS))))
31 | 
32 | From 1.5 to 1.6
33 | ---------------
34 | 
35 | .. raw:: html
36 | 
37 |     <p><code>pip install --upgrade nyawc</code></p><br>
38 | 
39 | **Headers have default values and are case insensitive**
40 | 
41 | From now on the headers identity option has default values and is a case insensitive dict. When changing headers the ``.update()`` method should be used so the default headers remain the same.
42 | 
43 | .. code:: python
44 | 
45 |     # Old
46 |     options.identity.headers = {
47 |         "User-Agent": "MyCustomUserAgent"
48 |     }
49 | 
50 |     # New
51 |     options.identity.headers.update({
52 |         "User-Agent": "MyCustomUserAgent"
53 |     })
54 | 
55 | **New default user agent**
56 | 
57 | The default user agent for the crawler has changed. In version 1.5 it was a fake Chrome user agent and from now on it is ``nyawc/1.6.0 CPython/3.6.1 Windows/10`` based on the versions you use.
58 | 
59 | The Chrome user agent from version 1.5 can still be faked by using the code below.
60 | 
61 | .. code:: python
62 | 
63 |     options.identity.headers.update({
64 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
65 |     })
66 | 
67 | From 1.4 to 1.5
68 | ---------------
69 | 
70 | .. raw:: html
71 | 
72 |     <p><code>pip install --upgrade nyawc</code></p><br>
73 | 
74 | **Renamed the domain must match scope option**
75 | 
76 | Since version 1.5 the domain_must_match option is now called hostname_must_match.
77 | 
78 | .. code:: python
79 | 
80 |     # Old
81 |     Options().scope.domain_must_match = True/False
82 | 
83 |     # New
84 |     Options().scope.hostname_must_match = True/False
85 | 


--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | nyawc
2 | =====
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    nyawc
8 | 


--------------------------------------------------------------------------------
/docs/source/nyawc.helpers.rst:
--------------------------------------------------------------------------------
 1 | nyawc\.helpers package
 2 | ======================
 3 | 
 4 | .. automodule:: nyawc.helpers
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 | 
 9 | Submodules
10 | ----------
11 | 
12 | nyawc\.helpers\.HTTPRequestHelper module
13 | ----------------------------------------
14 | 
15 | .. automodule:: nyawc.helpers.HTTPRequestHelper
16 |     :members:
17 |     :undoc-members:
18 |     :show-inheritance:
19 | 
20 | nyawc\.helpers\.PackageHelper module
21 | ------------------------------------
22 | 
23 | .. automodule:: nyawc.helpers.PackageHelper
24 |     :members:
25 |     :undoc-members:
26 |     :show-inheritance:
27 | 
28 | nyawc\.helpers\.RandomInputHelper module
29 | ----------------------------------------
30 | 
31 | .. automodule:: nyawc.helpers.RandomInputHelper
32 |     :members:
33 |     :undoc-members:
34 |     :show-inheritance:
35 | 
36 | nyawc\.helpers\.URLHelper module
37 | --------------------------------
38 | 
39 | .. automodule:: nyawc.helpers.URLHelper
40 |     :members:
41 |     :undoc-members:
42 |     :show-inheritance:
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/docs/source/nyawc.http.rst:
--------------------------------------------------------------------------------
 1 | nyawc\.http package
 2 | ===================
 3 | 
 4 | .. automodule:: nyawc.http
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 | 
 9 | Submodules
10 | ----------
11 | 
12 | nyawc\.http\.Handler module
13 | ---------------------------
14 | 
15 | .. automodule:: nyawc.http.Handler
16 |     :members:
17 |     :undoc-members:
18 |     :show-inheritance:
19 | 
20 | nyawc\.http\.Request module
21 | ---------------------------
22 | 
23 | .. automodule:: nyawc.http.Request
24 |     :members:
25 |     :undoc-members:
26 |     :show-inheritance:
27 | 
28 | nyawc\.http\.Response module
29 | ----------------------------
30 | 
31 | .. automodule:: nyawc.http.Response
32 |     :members:
33 |     :undoc-members:
34 |     :show-inheritance:
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/docs/source/nyawc.rst:
--------------------------------------------------------------------------------
 1 | nyawc package
 2 | =============
 3 | 
 4 | .. automodule:: nyawc
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 | 
 9 | Subpackages
10 | -----------
11 | 
12 | .. toctree::
13 | 
14 |     nyawc.helpers
15 |     nyawc.http
16 |     nyawc.scrapers
17 | 
18 | Submodules
19 | ----------
20 | 
21 | nyawc\.Crawler module
22 | ---------------------
23 | 
24 | .. automodule:: nyawc.Crawler
25 |     :members:
26 |     :undoc-members:
27 |     :show-inheritance:
28 | 
29 | nyawc\.CrawlerActions module
30 | ----------------------------
31 | 
32 | .. automodule:: nyawc.CrawlerActions
33 |     :members:
34 |     :undoc-members:
35 |     :show-inheritance:
36 | 
37 | nyawc\.CrawlerThread module
38 | ---------------------------
39 | 
40 | .. automodule:: nyawc.CrawlerThread
41 |     :members:
42 |     :undoc-members:
43 |     :show-inheritance:
44 | 
45 | nyawc\.Options module
46 | ---------------------
47 | 
48 | .. automodule:: nyawc.Options
49 |     :members:
50 |     :undoc-members:
51 |     :show-inheritance:
52 | 
53 | nyawc\.Queue module
54 | -------------------
55 | 
56 | .. automodule:: nyawc.Queue
57 |     :members:
58 |     :undoc-members:
59 |     :show-inheritance:
60 | 
61 | nyawc\.QueueItem module
62 | -----------------------
63 | 
64 | .. automodule:: nyawc.QueueItem
65 |     :members:
66 |     :undoc-members:
67 |     :show-inheritance:
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/docs/source/nyawc.scrapers.rst:
--------------------------------------------------------------------------------
 1 | nyawc\.scrapers package
 2 | =======================
 3 | 
 4 | .. automodule:: nyawc.scrapers
 5 |     :members:
 6 |     :undoc-members:
 7 |     :show-inheritance:
 8 | 
 9 | Submodules
10 | ----------
11 | 
12 | nyawc\.scrapers\.BaseScraper module
13 | -----------------------------------
14 | 
15 | .. automodule:: nyawc.scrapers.BaseScraper
16 |     :members:
17 |     :undoc-members:
18 |     :show-inheritance:
19 | 
20 | nyawc\.scrapers\.CSSRegexLinkScraper module
21 | -------------------------------------------
22 | 
23 | .. automodule:: nyawc.scrapers.CSSRegexLinkScraper
24 |     :members:
25 |     :undoc-members:
26 |     :show-inheritance:
27 | 
28 | nyawc\.scrapers\.HTMLSoupFormScraper module
29 | -------------------------------------------
30 | 
31 | .. automodule:: nyawc.scrapers.HTMLSoupFormScraper
32 |     :members:
33 |     :undoc-members:
34 |     :show-inheritance:
35 | 
36 | nyawc\.scrapers\.HTMLSoupLinkScraper module
37 | -------------------------------------------
38 | 
39 | .. automodule:: nyawc.scrapers.HTMLSoupLinkScraper
40 |     :members:
41 |     :undoc-members:
42 |     :show-inheritance:
43 | 
44 | nyawc\.scrapers\.JSONRegexLinkScraper module
45 | --------------------------------------------
46 | 
47 | .. automodule:: nyawc.scrapers.JSONRegexLinkScraper
48 |     :members:
49 |     :undoc-members:
50 |     :show-inheritance:
51 | 
52 | nyawc\.scrapers\.XMLRegexLinkScraper module
53 | -------------------------------------------
54 | 
55 | .. automodule:: nyawc.scrapers.XMLRegexLinkScraper
56 |     :members:
57 |     :undoc-members:
58 |     :show-inheritance:
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/docs/source/options_crawling_identity.rst:
--------------------------------------------------------------------------------
  1 | .. title:: Crawling identity
  2 | 
  3 | How to use identity options
  4 | ---------------------------
  5 | 
  6 | .. code:: python
  7 | 
  8 |     # identity_example.py
  9 | 
 10 |     from requests.auth import HTTPBasicAuth
 11 |     from nyawc.Options import Options
 12 |     from nyawc.Crawler import Crawler
 13 |     from nyawc.http.Request import Request
 14 | 
 15 |     options = Options()
 16 | 
 17 |     options.identity.auth = HTTPBasicAuth('user', 'pass')
 18 |     options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
 19 |     options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
 20 |     options.identity.proxies = {
 21 |         # No authentication
 22 |         # 'http': 'http://host:port',
 23 |         # 'https': 'http://host:port',
 24 | 
 25 |         # Basic authentication
 26 |         # 'http': 'http://user:pass@host:port',
 27 |         # 'https': 'https://user:pass@host:port',
 28 | 
 29 |         # SOCKS
 30 |         'http': 'socks5://user:pass@host:port',
 31 |         'https': 'socks5://user:pass@host:port'
 32 |     }
 33 |     options.identity.headers.update({
 34 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
 35 |     })
 36 | 
 37 |     crawler = Crawler(options)
 38 |     crawler.start_with(Request("https://finnwea.com/"))
 39 | 
 40 | Available identity options
 41 | --------------------------
 42 | 
 43 | Authentication
 44 | ~~~~~~~~~~~~~~
 45 | 
 46 | Set the authentication for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/authentication/>`__ authentication for all the options. Default is None (no authentication).
 47 | 
 48 | You can find examples of different types of authentication below.
 49 | 
 50 | .. code:: python
 51 | 
 52 |     from requests.auth import HTTPBasicAuth
 53 |     options.identity.auth = HTTPBasicAuth('user', 'pass')
 54 | 
 55 |     from requests.auth import HTTPDigestAuth
 56 |     options.identity.auth = HTTPDigestAuth('user', 'pass')
 57 | 
 58 |     from requests_oauthlib import OAuth1
 59 |     options.identity.auth = OAuth1('YOUR_APP_KEY', 'YOUR_APP_SECRET', 'USER_OAUTH_TOKEN', 'USER_OAUTH_TOKEN_SECRET')
 60 | 
 61 | Cookies
 62 | ~~~~~~~
 63 | 
 64 | Set custom cookies for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/quickstart/#cookies>`__ cookie jar for all the cookie options.
 65 | 
 66 | .. code:: python
 67 | 
 68 |     options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
 69 | 
 70 | Proxy
 71 | ~~~~~
 72 | 
 73 | Set a proxy for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/advanced/#proxies>`__ proxies for all the proxy options. Default is None (no proxy).
 74 | 
 75 | You can find examples of different types of proxies below.
 76 | 
 77 | .. code:: python
 78 | 
 79 |     # Without authentication
 80 |     options.identity.proxies = {
 81 |         'http': 'http://host:port',
 82 |         'https': 'http://host:port'
 83 |     }
 84 | 
 85 |     # With basic authentication
 86 |     options.identity.proxies = {
 87 |         'http': 'http://user:pass@host:port',
 88 |         'https': 'https://user:pass@host:port'
 89 |     }
 90 | 
 91 |     # With SOCKS
 92 |     options.identity.proxies = {
 93 |         'http': 'socks5://user:pass@host:port',
 94 |         'https': 'socks5://user:pass@host:port'
 95 |     }
 96 | 
 97 | Headers
 98 | ~~~~~~~
 99 | 
100 | Set custom headers for the crawler (as {key: value} CaseInsensitiveDict). For example, you can set a new user agent by using ``User-Agent`` as key, as shown below.
101 | 
102 | Please note that you should use the ``.update()`` method so the default headers remain the same.
103 | 
104 | .. code:: python
105 | 
106 |     options.identity.headers.update({
107 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with.
108 |     })
109 | 


--------------------------------------------------------------------------------
/docs/source/options_crawling_scope.rst:
--------------------------------------------------------------------------------
  1 | .. title:: Crawling scope
  2 | 
  3 | How to use scope options
  4 | ------------------------
  5 | 
  6 | .. code:: python
  7 | 
  8 |     # scope_example.py
  9 | 
 10 |     from nyawc.Options import Options
 11 |     from nyawc.Crawler import Crawler
 12 |     from nyawc.http.Request import Request
 13 | 
 14 |     options = Options()
 15 | 
 16 |     options.scope.protocol_must_match = False
 17 |     options.scope.subdomain_must_match = True
 18 |     options.scope.hostname_must_match = True
 19 |     options.scope.tld_must_match = True
 20 |     options.scope.max_depth = None
 21 |     options.scope.request_methods = [
 22 |         Request.METHOD_GET,
 23 |         Request.METHOD_POST,
 24 |         Request.METHOD_PUT,
 25 |         Request.METHOD_DELETE,
 26 |         Request.METHOD_OPTIONS,
 27 |         Request.METHOD_HEAD
 28 |     ]
 29 | 
 30 |     crawler = Crawler(options)
 31 |     crawler.start_with(Request("https://finnwea.com/"))
 32 | 
 33 | Available scope options
 34 | -----------------------
 35 | 
 36 | Protocol must match
 37 | ~~~~~~~~~~~~~~~~~~~
 38 | 
 39 | Only crawl pages with the same protocol as the startpoint (e.g. only https) if True. Default is False.
 40 | 
 41 | .. code:: python
 42 | 
 43 |     options.scope.protocol_must_match = False
 44 | 
 45 | Subdomain must match
 46 | ~~~~~~~~~~~~~~~~~~~~
 47 | 
 48 | Only crawl pages with the same subdomain as the startpoint if True. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
 49 | 
 50 | Please note that the `www` subdomain will be treated the same as no subdomain.
 51 | 
 52 | .. code:: python
 53 | 
 54 |     options.scope.subdomain_must_match = True
 55 | 
 56 | Hostname must match
 57 | ~~~~~~~~~~~~~~~~~~~
 58 | 
 59 | Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`) if True. Default is True.
 60 | 
 61 | Please note that if you set this to false, chances are that it never stops crawling.
 62 | 
 63 | .. code:: python
 64 | 
 65 |     options.scope.hostname_must_match = True
 66 | 
 67 | TLD must match
 68 | ~~~~~~~~~~~~~~
 69 | 
 70 | Only crawl pages with the same tld as the startpoint (e.g. only `.com`) if True. Default is True.
 71 | 
 72 | .. code:: python
 73 | 
 74 |     options.scope.tld_must_match = True
 75 | 
 76 | Maximum crawling depth
 77 | ~~~~~~~~~~~~~~~~~~~~~~
 78 | 
 79 | The maximum search depth. Default is None (unlimited).
 80 | 
 81 | -  0 will only crawl the start request.
 82 | -  1 will also crawl all requests found on the start request.
 83 | -  2 will go one level deeper.
 84 | -  And so on...
 85 | 
 86 | .. code:: python
 87 | 
 88 |     options.scope.max_depth = None
 89 | 
 90 | Allowed request methods
 91 | ~~~~~~~~~~~~~~~~~~~~~~~
 92 | 
 93 | Only crawl these request methods. If empty or ``None`` all request methods will be crawled. Default is all.
 94 | 
 95 | .. code:: python
 96 | 
 97 |     options.scope.request_methods = [
 98 |         Request.METHOD_GET,
 99 |         Request.METHOD_POST,
100 |         Request.METHOD_PUT,
101 |         Request.METHOD_DELETE,
102 |         Request.METHOD_OPTIONS,
103 |         Request.METHOD_HEAD
104 |     ]
105 | 


--------------------------------------------------------------------------------
/docs/source/options_misc.rst:
--------------------------------------------------------------------------------
 1 | .. title:: Misc
 2 | 
 3 | How to use misc options
 4 | -----------------------
 5 | 
 6 | .. code:: python
 7 | 
 8 |     # misc_example.py
 9 | 
10 |     from nyawc.Options import Options
11 |     from nyawc.Crawler import Crawler
12 |     from nyawc.http.Request import Request
13 | 
14 |     options = Options()
15 | 
16 |     options.misc.debug = False
17 |     options.misc.verify_ssl_certificates = True
18 |     options.misc.trusted_certificates = None
19 | 
20 |     crawler = Crawler(options)
21 |     crawler.start_with(Request("https://finnwea.com/"))
22 | 
23 | Available misc options
24 | ----------------------
25 | 
26 | Debug
27 | ~~~~~
28 | 
29 | If debug is enabled extra information will be logged to the console. Default is False.
30 | 
31 | ``options.misc.debug = True``
32 | 
33 | 
34 | Verify SSL certificates
35 | ~~~~~~~~~~~~~~~~~~~~~~~
36 | 
37 | If verification is enabled all SSL certificates will be checked for validity. Default is True.
38 | 
39 | ``options.misc.verify_ssl_certificates = True``
40 | 
41 | 
42 | Trusted certificates
43 | ~~~~~~~~~~~~~~~~~~~~
44 | 
45 | To trust certain certificates (e.g. if you are using a proxy), you can pass the path to a CA_BUNDLE file or directory with certificates of additional trusted CAs. Default is None (which means only domains with valid SSL certificates can be crawled).
46 | 
47 | **If verify is set to a directory, the directory must have been processed using the c_rehash utility supplied with OpenSSL.**
48 | 
49 | ``options.misc.trusted_certificates = '/path/to/certificate.pem'``
50 | 


--------------------------------------------------------------------------------
/docs/source/options_performance.rst:
--------------------------------------------------------------------------------
 1 | .. title:: Performance
 2 | 
 3 | How to use performance options
 4 | ------------------------------
 5 | 
 6 | .. code:: python
 7 | 
 8 |     # performance_example.py
 9 | 
10 |     from nyawc.Options import Options
11 |     from nyawc.Crawler import Crawler
12 |     from nyawc.http.Request import Request
13 | 
14 |     options = Options()
15 | 
16 |     options.performance.max_threads = 20
17 |     options.performance.request_timeout = 15
18 | 
19 |     crawler = Crawler(options)
20 |     crawler.start_with(Request("https://finnwea.com/"))
21 | 
22 | Available performance options
23 | -----------------------------
24 | 
25 | Maximum threads
26 | ~~~~~~~~~~~~~~~
27 | 
28 | The maximum amount of simultaneous threads to use for crawling. Default is 40.
29 | 
30 | ``options.performance.max_threads = 40``
31 | 
32 | Request timeout
33 | ~~~~~~~~~~~~~~~
34 | 
35 | The request timeout in seconds (throws an exception if exceeded). Default is 30.
36 | 
37 | ``options.performance.request_timeout = 30``
38 | 


--------------------------------------------------------------------------------
/docs/source/options_routing.rst:
--------------------------------------------------------------------------------
 1 | .. title:: Routing
 2 | 
 3 | How to use routing options
 4 | --------------------------
 5 | 
 6 | .. code:: python
 7 | 
 8 |     # routing_example.py
 9 | 
10 |     from nyawc.Options import Options
11 |     from nyawc.Crawler import Crawler
12 |     from nyawc.http.Request import Request
13 | 
14 |     options = Options()
15 | 
16 |     options.routing.minimum_threshold = 4
17 |     options.routing.routes = [ 
18 |         "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"
19 |     ]
20 | 
21 |     crawler = Crawler(options)
22 |     crawler.start_with(Request("https://finnwea.com/"))
23 | 
24 | Available routing options
25 | -------------------------
26 | 
27 | Minimum threshold
28 | ~~~~~~~~~~~~~~~~~
29 | 
30 | The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
31 | 
32 | For example, lets say we have these rquests;
33 | 
34 | .. code::
35 | 
36 |     https://finnwea.com/blog/1
37 |     https://finnwea.com/blog/2
38 |     https://finnwea.com/blog/3
39 |     ...
40 |     https://finnwea.com/blog/54
41 | 
42 | It will only crawl the first 20 requests. After that it ignores the rest of the blog posts.
43 | 
44 | **Please note that it will probably crawl a bit more than the minimum threshold depending on the maximum amount of threads to use.**
45 | 
46 | ``options.routing.minimum_threshold = 20``
47 | 
48 | Routes
49 | ~~~~~~
50 | 
51 | The regular expressions that represent routes that should not be cralwed more times than the minimum treshold. Default is an empty array.
52 | 
53 | For example the route below represents ``http://finnwea.com/blog/{a-variable-blog-alias}/``.
54 | 
55 | ``options.routing.routes = ["^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$"]``


--------------------------------------------------------------------------------
/example_extensive.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from nyawc.Options import Options
 26 | from nyawc.QueueItem import QueueItem
 27 | from nyawc.Crawler import Crawler
 28 | from nyawc.CrawlerActions import CrawlerActions
 29 | from nyawc.http.Request import Request
 30 | from requests.auth import HTTPBasicAuth
 31 | 
 32 | def cb_crawler_before_start():
 33 |     print("Crawler started.")
 34 | 
 35 | def cb_crawler_after_finish(queue):
 36 |     print("Crawler finished.")
 37 |     print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.")
 38 | 
 39 |     for queue_item in queue.get_all(QueueItem.STATUS_FINISHED).values():
 40 |         print("[" + queue_item.request.method + "] " + queue_item.request.url + " (PostData: " + str(queue_item.request.data) + ")")
 41 | 
 42 | def cb_request_before_start(queue, queue_item):
 43 |     # return CrawlerActions.DO_SKIP_TO_NEXT
 44 |     # return CrawlerActions.DO_STOP_CRAWLING
 45 | 
 46 |     return CrawlerActions.DO_CONTINUE_CRAWLING
 47 | 
 48 | def cb_request_after_finish(queue, queue_item, new_queue_items):
 49 |     percentage = str(int(queue.get_progress()))
 50 |     total_requests = str(queue.count_total)
 51 | 
 52 |     print("At " + percentage + "% of " + total_requests + " requests ([" + str(queue_item.response.status_code) + "] " + queue_item.request.url + ").")
 53 | 
 54 |     # return CrawlerActions.DO_STOP_CRAWLING
 55 |     return CrawlerActions.DO_CONTINUE_CRAWLING
 56 | 
 57 | def cb_request_in_thread_before_start(queue_item):
 58 |     pass
 59 | 
 60 | def cb_request_in_thread_after_finish(queue_item):
 61 |     pass
 62 | 
 63 | def cb_request_on_error(queue_item, message):
 64 |     print("[error] " + message)
 65 | 
 66 | def cb_form_before_autofill(queue_item, elements, form_data):
 67 |     # return CrawlerActions.DO_NOT_AUTOFILL_FORM
 68 | 
 69 |     return CrawlerActions.DO_AUTOFILL_FORM
 70 | 
 71 | def cb_form_after_autofill(queue_item, elements, form_data):
 72 |     pass
 73 | 
 74 | # Declare the options
 75 | options = Options()
 76 | 
 77 | # Callback options (https://tijme.github.io/not-your-average-web-crawler/latest/options_callbacks.html)
 78 | options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
 79 | options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
 80 | options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
 81 | options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route.
 82 | options.callbacks.request_in_thread_before_start = cb_request_in_thread_before_start # Called in the crawling thread (when it started). Default is a null route.
 83 | options.callbacks.request_in_thread_after_finish = cb_request_in_thread_after_finish # Called in the crawling thread (when it finished). Default is a null route.
 84 | options.callbacks.request_on_error = cb_request_on_error # Called if a request failed. Default is a null route.
 85 | options.callbacks.form_before_autofill = cb_form_before_autofill # Called before the crawler autofills a form. Default is a null route.
 86 | options.callbacks.form_after_autofill = cb_form_after_autofill # Called after the crawler autofills a form. Default is a null route.
 87 | 
 88 | # Scope options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_scope.html)
 89 | options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
 90 | options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
 91 | options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
 92 | options.scope.tld_must_match = True # Only crawl pages with the same tld as the startpoint (e.g. only `.com`). Default is True.
 93 | options.scope.max_depth = None # The maximum search depth. 0 only crawls the start request. 1 will also crawl all the requests found on the start request. 2 goes one level deeper, and so on. Default is None (unlimited).
 94 | options.scope.request_methods = [
 95 |     # The request methods to crawl. Default is all request methods
 96 |     Request.METHOD_GET,
 97 |     Request.METHOD_POST,
 98 |     Request.METHOD_PUT,
 99 |     Request.METHOD_DELETE,
100 |     Request.METHOD_OPTIONS,
101 |     Request.METHOD_HEAD
102 | ]
103 | 
104 | # Identity options (https://tijme.github.io/not-your-average-web-crawler/latest/options_crawling_identity.html)
105 | options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
106 | options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
107 | options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
108 | options.identity.proxies = {
109 |     # No authentication
110 |     # 'http': 'http://host:port',
111 |     # 'https': 'http://host:port',
112 | 
113 |     # Basic authentication
114 |     # 'http': 'http://user:pass@host:port',
115 |     # 'https': 'https://user:pass@host:port',
116 | 
117 |     # SOCKS
118 |     # 'http': 'socks5://user:pass@host:port',
119 |     # 'https': 'socks5://user:pass@host:port'
120 | }
121 | options.identity.headers.update({
122 |     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
123 | })
124 | 
125 | # Performance options (https://tijme.github.io/not-your-average-web-crawler/latest/options_performance.html)
126 | options.performance.max_threads = 20 # The maximum amount of simultaneous threads to use for crawling. Default is 40.
127 | options.performance.request_timeout = 15 # The request timeout in seconds (throws an exception if exceeded). Default is 30.
128 | 
129 | # Routing options (https://tijme.github.io/not-your-average-web-crawler/latest/options_routing.html)
130 | options.routing.minimum_threshold = 4 # The minimum amount of requests to crawl (matching a certain route) before ignoring the rest. Default is 20.
131 | options.routing.routes = [ 
132 |     # The regular expressions that represent routes that should not be cralwed more times than the minimum treshold. Default is an empty array.
133 |     "^(https?:\/\/)?(www\.)?finnwea\.com\/blog\/[^\n \/]+\/$" # Only crawl /blog/{some-blog-alias} 4 times.
134 | ]
135 | 
136 | # Misc options (https://tijme.github.io/not-your-average-web-crawler/latest/options_misc.html)
137 | options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
138 | options.misc.verify_ssl_certificates = True # If verification is enabled all SSL certificates will be checked for validity. Default is True.
139 | options.misc.trusted_certificates = None # You can pass the path to a CA_BUNDLE file (.pem) or directory with certificates of trusted CAs. Default is None.
140 | 
141 | crawler = Crawler(options)
142 | crawler.start_with(Request("https://finnwea.com/"))
143 | 


--------------------------------------------------------------------------------
/example_minimal.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | from nyawc.Options import Options
26 | from nyawc.Crawler import Crawler
27 | from nyawc.QueueItem import QueueItem
28 | from nyawc.CrawlerActions import CrawlerActions
29 | from nyawc.http.Request import Request
30 | 
31 | def cb_crawler_before_start():
32 |     print("Crawler started.")
33 | 
34 | def cb_crawler_after_finish(queue):
35 |     print("Crawler finished.")
36 |     print("Found " + str(len(queue.get_all(QueueItem.STATUS_FINISHED))) + " requests.")
37 | 
38 | def cb_request_before_start(queue, queue_item):
39 |     print("Starting: {}".format(queue_item.request.url))
40 |     return CrawlerActions.DO_CONTINUE_CRAWLING
41 | 
42 | def cb_request_after_finish(queue, queue_item, new_queue_items):
43 |     print("Finished: {}".format(queue_item.request.url))
44 |     return CrawlerActions.DO_CONTINUE_CRAWLING
45 | 
46 | options = Options()
47 | 
48 | options.callbacks.crawler_before_start = cb_crawler_before_start # Called before the crawler starts crawling. Default is a null route.
49 | options.callbacks.crawler_after_finish = cb_crawler_after_finish # Called after the crawler finished crawling. Default is a null route.
50 | options.callbacks.request_before_start = cb_request_before_start # Called before the crawler starts a new request. Default is a null route.
51 | options.callbacks.request_after_finish = cb_request_after_finish # Called after the crawler finishes a request. Default is a null route.
52 | 
53 | crawler = Crawler(options)
54 | crawler.start_with(Request("https://finnwea.com/"))
55 | 


--------------------------------------------------------------------------------
/nyawc/CrawlerActions.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | class CrawlerActions(object):
26 |     """The actions that crawler callbacks can return.
27 | 
28 |     Attributes:
29 |         DO_CONTINUE_CRAWLING (int): Continue by crawling the request.
30 |         DO_SKIP_TO_NEXT (int): Skip the current request and continue with the next one in line.
31 |         DO_STOP_CRAWLING (int): Stop crawling and quit ongoing requests.
32 |         DO_AUTOFILL_FORM (int): Autofill this form with random values.
33 |         DO_NOT_AUTOFILL_FORM (int): Do not autofill this form with random values.
34 | 
35 |     """
36 | 
37 |     DO_CONTINUE_CRAWLING = 1
38 | 
39 |     DO_SKIP_TO_NEXT = 2
40 | 
41 |     DO_STOP_CRAWLING = 3
42 | 
43 |     DO_AUTOFILL_FORM = 4
44 | 
45 |     DO_NOT_AUTOFILL_FORM = 5
46 | 


--------------------------------------------------------------------------------
/nyawc/CrawlerThread.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import threading
 26 | 
 27 | from nyawc.helpers.DebugHelper import DebugHelper
 28 | from nyawc.http.Handler import Handler
 29 | from nyawc.QueueItem import QueueItem
 30 | 
 31 | class CrawlerThread(threading.Thread):
 32 |     """The crawler thread executes the HTTP request using the HTTP handler.
 33 | 
 34 |     Attributes:
 35 |         __callback (obj): The method to call when finished
 36 |         __callback_lock (bool): The callback lock that prevents race conditions.
 37 |         __options (:class:`nyawc.Options`): The settins/options object.
 38 |         __queue_item (:class:`nyawc.QueueItem`): The queue item containing a request to execute.
 39 | 
 40 |     """
 41 | 
 42 |     def __init__(self, callback, callback_lock, options, queue_item):
 43 |         """Constructs a crawler thread instance
 44 | 
 45 |         Args:
 46 |             callback (obj): The method to call when finished
 47 |             callback_lock (bool): The callback lock that prevents race conditions.
 48 |             options (:class:`nyawc.Options`): The settins/options object.
 49 |             queue_item (:class:`nyawc.QueueItem`): The queue item containing a request to execute.
 50 | 
 51 |         """
 52 | 
 53 |         threading.Thread.__init__(self)
 54 | 
 55 |         self.__callback = callback
 56 |         self.__callback_lock = callback_lock
 57 |         self.__options = options
 58 |         self.__queue_item = queue_item
 59 | 
 60 |     def run(self):
 61 |         """Executes the HTTP call.
 62 | 
 63 |         Note:
 64 |             If this and the parent handler raised an error, the queue item status
 65 |             will be set to errored instead of finished. This is to prevent e.g. 404
 66 |             recursion.
 67 | 
 68 |         """
 69 | 
 70 |         try:
 71 |             self.__options.callbacks.request_in_thread_before_start(self.__queue_item)
 72 |         except Exception as e:
 73 |             print(e)
 74 | 
 75 |         new_requests = []
 76 |         failed = False
 77 | 
 78 |         try:
 79 |             handler = Handler(self.__options, self.__queue_item)
 80 |             new_requests = handler.get_new_requests()
 81 | 
 82 |             try:
 83 |                 self.__queue_item.response.raise_for_status()
 84 |             except Exception:
 85 |                 if self.__queue_item.request.parent_raised_error:
 86 |                     failed = True
 87 |                 else:
 88 |                     for new_request in new_requests:
 89 |                         new_request.parent_raised_error = True
 90 | 
 91 |         except Exception as e:
 92 |             failed = True
 93 | 
 94 |             error_message = "Setting status of '{}' to '{}' because of an HTTP error.".format(
 95 |                 self.__queue_item.request.url,
 96 |                 QueueItem.STATUS_ERRORED
 97 |             )
 98 | 
 99 |             DebugHelper.output(self.__options, error_message)
100 |             DebugHelper.output(self.__options, e)
101 | 
102 |             try:
103 |                 self.__options.callbacks.request_on_error(self.__queue_item, str(e))
104 |             except Exception as e:
105 |                 print(e)
106 | 
107 |         for new_request in new_requests:
108 |             new_request.parent_url = self.__queue_item.request.url
109 | 
110 |         try:
111 |             self.__options.callbacks.request_in_thread_after_finish(self.__queue_item)
112 |         except Exception as e:
113 |             print(e)
114 | 
115 |         with self.__callback_lock:
116 |             self.__callback(self.__queue_item, new_requests, failed)
117 | 


--------------------------------------------------------------------------------
/nyawc/Queue.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from collections import OrderedDict
 26 | from nyawc.http.Response import Response
 27 | from nyawc.QueueItem import QueueItem
 28 | 
 29 | class Queue(object):
 30 |     """A 'hash' queue containing all the requests of the crawler.
 31 | 
 32 |     Note:
 33 |         This queue uses a certain hash to prevent duplicate entries and improve
 34 |         the time complexity by checking if the hash exists instead of iterating
 35 |         over all items.
 36 | 
 37 |     Attributes:
 38 |         __options (:class:`nyawc.Options`): The options to use (used when generating queue item hashes).
 39 |         count_total (int): The total count of requests in the queue.
 40 |         items_queued list(:class:`nyawc.QueueItem`): The queued items (yet to be executed).
 41 |         items_in_progress list(:class:`nyawc.QueueItem`): The items currently being executed.
 42 |         items_finished list(:class:`nyawc.QueueItem`): The finished items.
 43 |         items_cancelled list(:class:`nyawc.QueueItem`): Items that were cancelled.
 44 |         items_errored list(:class:`nyawc.QueueItem`): Items that generated an error.
 45 | 
 46 |     """
 47 | 
 48 |     def __init__(self, options):
 49 |         """Constructs a Queue instance.
 50 | 
 51 |         Args:
 52 |             options (:class:`nyawc.Options`): The options to use.
 53 | 
 54 |         """
 55 | 
 56 |         self.__options = options
 57 |         self.count_total = 0
 58 |         self.items_queued = OrderedDict()
 59 |         self.items_in_progress = OrderedDict()
 60 |         self.items_finished = OrderedDict()
 61 |         self.items_cancelled = OrderedDict()
 62 |         self.items_errored = OrderedDict()
 63 | 
 64 |     def add_request(self, request):
 65 |         """Add a request to the queue.
 66 | 
 67 |         Args:
 68 |             request (:class:`nyawc.http.Request`): The request to add.
 69 | 
 70 |         Returns:
 71 |             :class:`nyawc.QueueItem`: The created queue item.
 72 | 
 73 |         """
 74 | 
 75 |         queue_item = QueueItem(request, Response(request.url))
 76 |         self.add(queue_item)
 77 |         return queue_item
 78 | 
 79 |     def has_request(self, request):
 80 |         """Check if the given request already exists in the queue.
 81 | 
 82 |         Args:
 83 |             request (:class:`nyawc.http.Request`): The request to check.
 84 | 
 85 |         Returns:
 86 |             bool: True if already exists, False otherwise.
 87 | 
 88 |         """
 89 | 
 90 |         queue_item = QueueItem(request, Response(request.url))
 91 |         key = queue_item.get_hash()
 92 | 
 93 |         for status in QueueItem.STATUSES:
 94 |             if key in self.__get_var("items_" + status).keys():
 95 |                 return True
 96 | 
 97 |         return False
 98 | 
 99 |     def add(self, queue_item):
100 |         """Add a request/response pair to the queue.
101 | 
102 |         Args:
103 |             queue_item (:class:`nyawc.QueueItem`): The queue item to add.
104 | 
105 |         """
106 | 
107 |         hash_key = queue_item.get_hash()
108 |         items = self.__get_var("items_" + queue_item.status)
109 | 
110 |         if hash_key in items.keys():
111 |             return
112 | 
113 |         items[queue_item.get_hash()] = queue_item
114 | 
115 |         self.count_total += 1
116 | 
117 |     def move(self, queue_item, status):
118 |         """Move a request/response pair to another status.
119 | 
120 |         Args:
121 |             queue_item (:class:`nyawc.QueueItem`): The queue item to move
122 |             status (str): The new status of the queue item.
123 | 
124 |         """
125 | 
126 |         items = self.__get_var("items_" + queue_item.status)
127 | 
128 |         del items[queue_item.get_hash()]
129 |         self.count_total -= 1
130 | 
131 |         queue_item.status = status
132 |         self.add(queue_item)
133 | 
134 |     def move_bulk(self, from_statuses, to_status):
135 |         """Move a bulk of request/response pairs to another status
136 | 
137 |         Args:
138 |             from_statuses list(str): The statuses to move from
139 |             to_status (str): The status to move to
140 | 
141 |         """
142 | 
143 |         for status in from_statuses:
144 |             from_status_items = self.__get_var("items_" + status)
145 |             self.__set_var("items_" + status, OrderedDict())
146 | 
147 |             to_status_items = self.__get_var("items_" + to_status)
148 |             to_status_items.update(from_status_items)
149 | 
150 |     def get_first(self, status):
151 |         """Get the first item in the queue that has the given status.
152 | 
153 |         Args:
154 |             status (str): return the first item with this status.
155 | 
156 |         Returns:
157 |             :class:`nyawc.QueueItem`: The first queue item with the given status.
158 | 
159 |         """
160 | 
161 |         items = self.get_all(status)
162 | 
163 |         if items:
164 |             return list(items.items())[0][1]
165 | 
166 |         return None
167 | 
168 |     def get_all(self, status):
169 |         """Get all the items in the queue that have the given status.
170 | 
171 |         Args:
172 |             status (str): return the items with this status.
173 | 
174 |         Returns:
175 |             list(:class:`nyawc.QueueItem`): All the queue items with the given status.
176 | 
177 |         """
178 | 
179 |         return self.__get_var("items_" + status)
180 | 
181 |     def get_progress(self):
182 |         """Get the progress of the queue in percentage (float).
183 | 
184 |         Returns:
185 |             float: The 'finished' progress in percentage.
186 | 
187 |         """
188 | 
189 |         count_remaining = len(self.items_queued) + len(self.items_in_progress)
190 |         percentage_remaining = 100 / self.count_total * count_remaining
191 | 
192 |         return 100 - percentage_remaining
193 | 
194 |     def __set_var(self, name, value):
195 |         """Set an instance/class var by name.
196 | 
197 |         Args:
198 |             name (str): The name of the variable.
199 |             value (obj): I'ts new value.
200 | 
201 |         """
202 | 
203 |         setattr(self, name, value)
204 | 
205 |     def __get_var(self, name):
206 |         """Get an instance/class var by name.
207 | 
208 |         Args:
209 |             name (str): The name of the variable.
210 | 
211 |         Returns:
212 |             obj: I'ts value.
213 | 
214 |         """
215 | 
216 |         return getattr(self, name)
217 | 


--------------------------------------------------------------------------------
/nyawc/QueueItem.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from nyawc.helpers.URLHelper import URLHelper
 26 | from bs4 import BeautifulSoup
 27 | 
 28 | class QueueItem(object):
 29 |     """The QueueItem class keeps track of the request and response and the crawling status.
 30 | 
 31 |     Attributes:
 32 |         STATUS_QUEUED (str): Status for when the crawler did not yet start the request.
 33 |         STATUS_IN_PROGRESS (str): Status for when the crawler is currently crawling the request.
 34 |         STATUS_FINISHED (str): Status for when the crawler has finished crawling the request.
 35 |         STATUS_CANCELLED (str): Status for when the crawler has cancelled the request.
 36 |         STATUS_ERRORED (str): Status for when the crawler could not execute the request.
 37 |         STATUSES (arr): All statuses.
 38 |         status (str): The current crawling status.
 39 |         decomposed (bool): If the this queue item is decomposed.
 40 |         request (:class:`nyawc.http.Request`): The Request object.
 41 |         response (:class:`nyawc.http.Response`): The Response object.
 42 |         __response_soup (obj): The BeautifulSoup container for the response text.
 43 |         __index_hash (str): The index of the queue (if cached), otherwise None.
 44 | 
 45 |     Note:
 46 |         A queue item will be decomposed (cached objects are deleted to free up memory) when it is
 47 |         not likeley to be used again. After decompisition variables will not be cached anymore.
 48 | 
 49 |     """
 50 | 
 51 |     STATUS_QUEUED = "queued"
 52 | 
 53 |     STATUS_IN_PROGRESS = "in_progress"
 54 | 
 55 |     STATUS_FINISHED = "finished"
 56 | 
 57 |     STATUS_CANCELLED = "cancelled"
 58 | 
 59 |     STATUS_ERRORED = "errored"
 60 | 
 61 |     STATUSES = [
 62 |         STATUS_QUEUED,
 63 |         STATUS_IN_PROGRESS,
 64 |         STATUS_FINISHED,
 65 |         STATUS_CANCELLED,
 66 |         STATUS_ERRORED
 67 |     ]
 68 | 
 69 |     def __init__(self, request, response):
 70 |         """Constructs a QueueItem instance.
 71 | 
 72 |         Args:
 73 |             request (:class:`nyawc.http.Request`): The Request object.
 74 |             response (:class:`nyawc.http.Response`): The Response object (empty object when initialized).
 75 | 
 76 |         """
 77 | 
 78 |         self.status = QueueItem.STATUS_QUEUED
 79 |         self.decomposed = False
 80 |         self.__response_soup = None
 81 |         self.__index_hash = None
 82 | 
 83 |         self.request = request
 84 |         self.response = response
 85 | 
 86 |     def get_soup_response(self):
 87 |         """Get the response as a cached BeautifulSoup container.
 88 | 
 89 |         Returns:
 90 |             obj: The BeautifulSoup container.
 91 | 
 92 |         """
 93 | 
 94 |         if self.response is not None:
 95 |             if self.__response_soup is None:
 96 |                 result = BeautifulSoup(self.response.text, "lxml")
 97 | 
 98 |                 if self.decomposed:
 99 |                     return result
100 |                 else:
101 |                     self.__response_soup = BeautifulSoup(self.response.text, "lxml")
102 | 
103 |         return self.__response_soup
104 | 
105 |     def decompose(self):
106 |         """Decompose this queue item (set cached variables to None) to free up memory.
107 | 
108 |         Note:
109 |             When setting cached variables to None memory will be released after the garbage 
110 |             collector ran.
111 |         
112 |         """
113 | 
114 |         self.__response_soup = None
115 | 
116 |         self.decomposed = True
117 | 
118 |     def get_hash(self):
119 |         """Generate and return the dict index hash of the given queue item.
120 | 
121 |         Note:
122 |             Cookies should not be included in the hash calculation because
123 |             otherwise requests are crawled multiple times with e.g. different
124 |             session keys, causing infinite crawling recursion.
125 | 
126 |         Note:
127 |             At this moment the keys do not actually get hashed since it works perfectly without and
128 |             since hashing the keys requires us to built hash collision management.
129 | 
130 |         Returns:
131 |             str: The hash of the given queue item.
132 | 
133 |         """
134 | 
135 |         if self.__index_hash:
136 |             return self.__index_hash
137 | 
138 |         key = self.request.method
139 | 
140 |         key += URLHelper.get_protocol(self.request.url)
141 |         key += URLHelper.get_subdomain(self.request.url)
142 |         key += URLHelper.get_hostname(self.request.url)
143 |         key += URLHelper.get_tld(self.request.url)
144 |         key += URLHelper.get_path(self.request.url)
145 | 
146 |         key += str(URLHelper.get_ordered_params(self.request.url))
147 | 
148 |         if self.request.data is not None:
149 |             key += str(self.request.data.keys())
150 | 
151 |         self.__index_hash = key
152 |         return self.__index_hash
153 | 


--------------------------------------------------------------------------------
/nyawc/Routing.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import re
26 | 
27 | class Routing(object):
28 |     """The Routing class counts requests that match certain routes.
29 | 
30 |     Attributes:
31 |         __routing_options (:class:`nyawc.OptionsRouting`): The options containing routing information.
32 |         __routing_count (obj): The {key: value} dict that contains the amount of requests for certain routes.
33 | 
34 |     """
35 | 
36 |     def __init__(self, options):
37 |         """Constructs a Crawler instance.
38 | 
39 |         Args:
40 |             options (:class:`nyawc.Options`): The options to use for the current crawling runtime.
41 | 
42 |         """
43 | 
44 |         self.__routing_options = options.routing
45 |         self.__routing_count = {}
46 | 
47 |     def increase_route_count(self, crawled_request):
48 |         """Increase the count that determines how many times a URL of a certain route has been crawled.
49 | 
50 |         Args:
51 |             crawled_request (:class:`nyawc.http.Request`): The request that possibly matches a route.
52 | 
53 |         """
54 | 
55 |         for route in self.__routing_options.routes:
56 |             if re.compile(route).match(crawled_request.url):
57 |                 count_key = str(route) + crawled_request.method
58 |                 
59 |                 if count_key in self.__routing_count.keys():
60 |                     self.__routing_count[count_key] += 1
61 |                 else:
62 |                     self.__routing_count[count_key] = 1
63 | 
64 |                 break
65 | 
66 |     def is_treshold_reached(self, scraped_request):
67 |         """Check if similar requests to the given requests have already been crawled X times. Where X is the 
68 |         minimum treshold amount from the options.
69 | 
70 |         Args:
71 |             scraped_request (:class:`nyawc.http.Request`): The request that possibly reached the minimum treshold.
72 | 
73 |         Returns:
74 |             bool: True if treshold reached, false otherwise.
75 | 
76 |         """
77 | 
78 |         for route in self.__routing_options.routes:
79 |             if re.compile(route).match(scraped_request.url):
80 |                 count_key = str(route) + scraped_request.method
81 | 
82 |                 if count_key in self.__routing_count.keys():
83 |                     return self.__routing_count[count_key] >= self.__routing_options.minimum_threshold
84 |                 
85 |         return False
86 | 


--------------------------------------------------------------------------------
/nyawc/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import sys
26 | 
27 | python_version_compatible_with_nyawc = False
28 | 
29 | if sys.version_info.major == 3 and sys.version_info.minor >= 3:
30 |     python_version_compatible_with_nyawc = True
31 | 
32 | if sys.version_info.major == 2 and sys.version_info.minor >= 7:
33 |     python_version_compatible_with_nyawc = True
34 | 
35 | if not python_version_compatible_with_nyawc:
36 |     print("N.Y.A.W.C requires Python 2.7/3.3 or higher!")
37 |     print("You are currently using Python {}.{}.".format(sys.version_info.major, sys.version_info.minor))
38 |     sys.exit(1)
39 | 


--------------------------------------------------------------------------------
/nyawc/helpers/DebugHelper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import requests
26 | 
27 | class DebugHelper:
28 |     """A helper for printing debug messages."""
29 | 
30 |     @staticmethod
31 |     def setup(options):
32 |         """Initialize debug/logging in third party libraries correctly.
33 | 
34 |         Args:
35 |             options (:class:`nyawc.Options`): The options to use for the current crawling runtime.
36 | 
37 |         """
38 | 
39 |         if not options.misc.debug:
40 |             requests.packages.urllib3.disable_warnings(
41 |                 requests.packages.urllib3.exceptions.InsecureRequestWarning
42 |             )
43 | 
44 | 
45 |     @staticmethod
46 |     def output(options, message):
47 |         """Print the given message if the debug option in the given options is on.
48 | 
49 |         Args:
50 |             options (:class:`nyawc.Options`): The options to use for the current crawling runtime.
51 |             message (str): The message to print.
52 | 
53 |         """
54 | 
55 |         if options.misc.debug:
56 |             print("[DEBUG] " + str(message))


--------------------------------------------------------------------------------
/nyawc/helpers/HTTPRequestHelper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import copy
 26 | 
 27 | from nyawc.helpers.URLHelper import URLHelper
 28 | 
 29 | class HTTPRequestHelper:
 30 |     """A helper for the src.http.Request module."""
 31 | 
 32 |     @staticmethod
 33 |     def patch_with_options(request, options, parent_queue_item=None):
 34 |         """Patch the given request with the given options (e.g. user agent).
 35 | 
 36 |         Args:
 37 |             request (:class:`nyawc.http.Request`): The request to patch.
 38 |             options (:class:`nyawc.Options`): The options to patch the request with.
 39 |             parent_queue_item (:class:`nyawc.QueueItem`): The parent queue item object (request/response pair) if exists.
 40 | 
 41 |         """
 42 | 
 43 |         request.auth = copy.deepcopy(options.identity.auth)
 44 |         request.cookies = copy.deepcopy(options.identity.cookies)
 45 |         request.headers = copy.deepcopy(options.identity.headers)
 46 |         request.proxies = copy.deepcopy(options.identity.proxies)
 47 |         request.timeout = copy.copy(options.performance.request_timeout)
 48 | 
 49 |         if parent_queue_item != None:
 50 |             for cookie in parent_queue_item.request.cookies:
 51 |                 request.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
 52 | 
 53 |             for cookie in parent_queue_item.response.cookies:
 54 |                 request.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path)
 55 | 
 56 |         if options.misc.verify_ssl_certificates and options.misc.trusted_certificates:
 57 |             request.verify = options.misc.trusted_certificates
 58 |         else:
 59 |             request.verify = options.misc.verify_ssl_certificates
 60 | 
 61 |     @staticmethod
 62 |     def complies_with_scope(queue_item, new_request, scope):
 63 |         """Check if the new request complies with the crawling scope.
 64 | 
 65 |         Args:
 66 |             queue_item (:class:`nyawc.QueueItem`): The parent queue item of the new request.
 67 |             new_request (:class:`nyawc.http.Request`): The request to check.
 68 |             scope (:class:`nyawc.Options.OptionsScope`): The scope to check.
 69 | 
 70 |         Returns:
 71 |             bool: True if it complies, False otherwise.
 72 | 
 73 |         """
 74 | 
 75 |         if not URLHelper.is_parsable(queue_item.request.url):
 76 |             return False
 77 | 
 78 |         if not URLHelper.is_parsable(new_request.url):
 79 |             return False
 80 | 
 81 |         if scope.request_methods:
 82 |             if not queue_item.request.method in scope.request_methods:
 83 |                 return False
 84 | 
 85 |         if scope.protocol_must_match:
 86 |             if URLHelper.get_protocol(queue_item.request.url) != URLHelper.get_protocol(new_request.url):
 87 |                 return False
 88 | 
 89 |         if scope.subdomain_must_match:
 90 |             current_subdomain = URLHelper.get_subdomain(queue_item.request.url)
 91 |             new_subdomain = URLHelper.get_subdomain(new_request.url)
 92 | 
 93 |             www_matches = False
 94 | 
 95 |             if current_subdomain == "www" and new_subdomain == "":
 96 |                 www_matches = True
 97 | 
 98 |             if new_subdomain == "www" and current_subdomain == "":
 99 |                 www_matches = True
100 | 
101 |             if not www_matches and current_subdomain != new_subdomain:
102 |                 return False
103 | 
104 |         if scope.hostname_must_match:
105 |             if URLHelper.get_hostname(queue_item.request.url) != URLHelper.get_hostname(new_request.url):
106 |                 return False
107 | 
108 |         if scope.tld_must_match:
109 |             if URLHelper.get_tld(queue_item.request.url) != URLHelper.get_tld(new_request.url):
110 |                 return False
111 | 
112 |         return True
113 | 
114 |     @staticmethod
115 |     def get_cookie_header(queue_item):
116 |         """Convert a requests cookie jar to a HTTP request cookie header value.
117 | 
118 |         Args:
119 |             queue_item (:class:`nyawc.QueueItem`): The parent queue item of the new request.
120 | 
121 |         Returns:
122 |             str: The HTTP cookie header value.
123 | 
124 |         """
125 | 
126 |         header = []
127 |         path = URLHelper.get_path(queue_item.request.url)
128 | 
129 |         for cookie in queue_item.request.cookies:
130 |             root_path = cookie.path == "" or cookie.path == "/"
131 |             if path.startswith(cookie.path) or root_path:
132 |                 header.append(cookie.name + "=" + cookie.value)
133 | 
134 |         return "&".join(header)
135 | 


--------------------------------------------------------------------------------
/nyawc/helpers/PackageHelper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import os
 26 | import re
 27 | import pkg_resources
 28 | 
 29 | class PackageHelper:
 30 |     """The Package class contains all the package related information (like the version number).
 31 | 
 32 |     Attributes:
 33 |         __name (str): Cached package name.
 34 |         __description (str): Cached package description.
 35 |         __alias (str): Cached package alias.
 36 |         __version (str): Cached package version number (if initialized).
 37 | 
 38 |     """
 39 | 
 40 |     __name = "Not Your Average Web Crawler"
 41 | 
 42 |     __description = "A web crawler that gathers more than you can imagine."
 43 | 
 44 |     __alias = "nyawc"
 45 | 
 46 |     __version = None
 47 | 
 48 |     @staticmethod
 49 |     def get_name():
 50 |         """Get the name of this package.
 51 | 
 52 |         Returns:
 53 |             str: The name of this package.
 54 | 
 55 |         """
 56 | 
 57 |         return PackageHelper.__name
 58 | 
 59 |     @staticmethod
 60 |     def get_description():
 61 |         """Get the description of this package.
 62 | 
 63 |         Returns:
 64 |             str: The description of this package.
 65 | 
 66 |         """
 67 | 
 68 |         return PackageHelper.__description
 69 | 
 70 |     @staticmethod
 71 |     def get_alias():
 72 |         """Get the alias of this package.
 73 | 
 74 |         Returns:
 75 |             str: The alias of this package.
 76 | 
 77 |         """
 78 | 
 79 |         return PackageHelper.__alias
 80 | 
 81 |     @staticmethod
 82 |     def get_version():
 83 |         """Get the version number of this package.
 84 | 
 85 |         Returns:
 86 |             str: The version number (marjor.minor.patch).
 87 | 
 88 |         Note:
 89 |             When this package is installed, the version number will be available through the
 90 |             package resource details. Otherwise this method will look for a ``.semver`` file.
 91 | 
 92 |         Note:
 93 |             In rare cases corrupt installs can cause the version number to be unknown. In this case
 94 |             the version number will be set to the string "Unknown".
 95 | 
 96 |         """
 97 | 
 98 |         if PackageHelper.__version:
 99 |             return PackageHelper.__version
100 | 
101 |         PackageHelper.__version = "Unknown"
102 | 
103 |         # If this is a GIT clone without install, use the ``.semver`` file.
104 |         file = os.path.realpath(__file__)
105 |         folder = os.path.dirname(file)
106 | 
107 |         try:
108 |             semver = open(folder + "/../../.semver", "r")
109 |             PackageHelper.__version = semver.read().rstrip()
110 |             semver.close()
111 |             return PackageHelper.__version
112 |         except:
113 |             pass
114 | 
115 |         # If the package was installed, get the version number via Python's distribution details.
116 |         try:
117 |             distribution = pkg_resources.get_distribution(PackageHelper.get_alias())
118 |             if distribution.version:
119 |                 PackageHelper.__version = distribution.version
120 |             return PackageHelper.__version
121 |         except:
122 |             pass
123 | 
124 |         return PackageHelper.__version
125 | 
126 |     @staticmethod
127 |     def rst_to_pypi(contents):
128 |         """Convert the given GitHub RST contents to PyPi RST contents (since some RST directives are not available in PyPi).
129 | 
130 |         Args:
131 |             contents (str): The GitHub compatible RST contents.
132 | 
133 |         Returns:
134 |             str: The PyPi compatible RST contents.
135 | 
136 |         """
137 | 
138 |         # The PyPi description does not support the SVG file type.
139 |         contents = contents.replace(".svg?pypi=png.from.svg", ".png")
140 | 
141 |         # Convert ``<br class="title">`` to a H1 title
142 |         asterisks_length = len(PackageHelper.get_name())
143 |         asterisks = "*" * asterisks_length
144 |         title = asterisks + "\n" + PackageHelper.get_name() + "\n" + asterisks;
145 | 
146 |         contents = re.sub(r"(\.\. raw\:\: html\n)(\n {2,4})(\<br class=\"title\"\>)", title, contents)
147 | 
148 |         # The PyPi description does not support raw HTML
149 |         contents = re.sub(r"(\.\. raw\:\: html\n)((\n {2,4})([A-Za-z0-9<>\ =\"\/])*)*", "", contents)
150 | 
151 |         return contents
152 | 


--------------------------------------------------------------------------------
/nyawc/helpers/RandomInputHelper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import random
 26 | import string
 27 | 
 28 | class RandomInputHelper:
 29 |     """A helper for generating random user input.
 30 | 
 31 |     Note:
 32 |         We need to cache the generated values to prevent infinite crawling
 33 |         loops. For example, if two responses contain the same ?search= form,
 34 |         the random generated value must be the same both of the times because
 35 |         otherwise the crawling would treat the new requests as two different
 36 |         requests.
 37 | 
 38 |     Attributes:
 39 |         cache (obj): Cached values of the generated data.
 40 | 
 41 |     """
 42 | 
 43 |     cache = {}
 44 | 
 45 |     @staticmethod
 46 |     def get_for_type(input_type="text"):
 47 |         """Get a random string for the given html input type
 48 | 
 49 |         Args:
 50 |             input_type (str): The input type (e.g. email).
 51 | 
 52 |         Returns:
 53 |             str: The (cached) random value.
 54 | 
 55 |         """
 56 | 
 57 |         if input_type in RandomInputHelper.cache:
 58 |             return RandomInputHelper.cache[input_type]
 59 | 
 60 |         types = {
 61 |             "text": RandomInputHelper.get_random_value,
 62 |             "hidden": RandomInputHelper.get_random_value,
 63 |             "search": RandomInputHelper.get_random_value,
 64 |             "color": RandomInputHelper.get_random_color,
 65 |             "week": {"function": RandomInputHelper.get_random_value, "params": [2, ["1234"]]},
 66 |             "password": RandomInputHelper.get_random_password,
 67 |             "number": RandomInputHelper.get_random_number,
 68 |             "tel": RandomInputHelper.get_random_telephonenumber,
 69 |             "url": RandomInputHelper.get_random_url,
 70 |             "textarea": RandomInputHelper.get_random_text,
 71 |             "email": RandomInputHelper.get_random_email
 72 |         }
 73 | 
 74 |         if types.get(input_type) is None:
 75 |             return ""
 76 | 
 77 |         if type(types.get(input_type)) is dict:
 78 |             generator = types.get(input_type)
 79 |             value = generator.get("function")(*generator.get("params"))
 80 |         else:
 81 |             value = types.get(input_type)()
 82 | 
 83 |         RandomInputHelper.cache[input_type] = value
 84 | 
 85 |         return value
 86 | 
 87 |     @staticmethod
 88 |     def get_random_value(length=10, character_sets=[string.ascii_uppercase, string.ascii_lowercase]):
 89 |         """Get a random string with the given length.
 90 | 
 91 |         Args:
 92 |             length (int): The length of the string to return.
 93 |             character_sets list(str): The caracter sets to use.
 94 | 
 95 |         Returns:
 96 |             str: The random string.
 97 | 
 98 |         """
 99 | 
100 |         return "".join(random.choice("".join(character_sets)) for i in range(length))
101 | 
102 |     @staticmethod
103 |     def get_random_number(length=4):
104 |         """Get a random number with the given length.
105 | 
106 |         Args:
107 |             length (int): The length of the number to return.
108 | 
109 |         Returns:
110 |             str: The random number.
111 | 
112 |         """
113 | 
114 |         return RandomInputHelper.get_random_value(length, [string.digits])
115 | 
116 |     @staticmethod
117 |     def get_random_color():
118 |         """Get a random color in HEX format (including hash character).
119 | 
120 |         Returns:
121 |             str: The random HEX color.
122 | 
123 |         """
124 | 
125 |         return '#{:06x}'.format(random.randint(0, 0x00ffff))
126 | 
127 |     @staticmethod
128 |     def get_random_text():
129 |         """Get a random string with the given length.
130 | 
131 |         Args:
132 |             length (int): The length of the string to return.
133 | 
134 |         Returns:
135 |             str: The random string.
136 | 
137 |         """
138 | 
139 |         return " ".join(RandomInputHelper.get_random_value()for i in range(20, 30))
140 | 
141 |     @staticmethod
142 |     def get_random_email(ltd="com"):
143 |         """Get a random email address with the given ltd.
144 | 
145 |         Args:
146 |             ltd (str): The ltd to use (e.g. com).
147 | 
148 |         Returns:
149 |             str: The random email.
150 | 
151 |         """
152 | 
153 |         email = [
154 |             RandomInputHelper.get_random_value(6, [string.ascii_lowercase]),
155 |             "@",
156 |             RandomInputHelper.get_random_value(6, [string.ascii_lowercase]),
157 |             ".",
158 |             ltd
159 |         ]
160 | 
161 |         return "".join(email)
162 | 
163 |     @staticmethod
164 |     def get_random_password():
165 |         """Get a random password that complies with most of the requirements.
166 | 
167 |         Note:
168 |             This random password is not strong and not "really" random, and should only be
169 |             used for testing purposes.
170 | 
171 |         Returns:
172 |             str: The random password.
173 | 
174 |         """
175 | 
176 |         password = []
177 | 
178 |         password.append(RandomInputHelper.get_random_value(4, [string.ascii_lowercase]))
179 |         password.append(RandomInputHelper.get_random_value(2, [string.digits]))
180 |         password.append(RandomInputHelper.get_random_value(2, ["$&*@!"]))
181 |         password.append(RandomInputHelper.get_random_value(4, [string.ascii_uppercase]))
182 | 
183 |         return "".join(password)
184 | 
185 |     @staticmethod
186 |     def get_random_url(ltd="com"):
187 |         """Get a random url with the given ltd.
188 | 
189 |         Args:
190 |             ltd (str): The ltd to use (e.g. com).
191 | 
192 |         Returns:
193 |             str: The random url.
194 | 
195 |         """
196 | 
197 |         url = [
198 |             "https://",
199 |             RandomInputHelper.get_random_value(8, [string.ascii_lowercase]),
200 |             ".",
201 |             ltd
202 |         ]
203 | 
204 |         return "".join(url)
205 | 
206 |     @staticmethod
207 |     def get_random_telephonenumber():
208 |         """Get a random 10 digit phone number that complies with most of the requirements.
209 | 
210 |         Returns:
211 |             str: The random telephone number.
212 | 
213 |         """
214 | 
215 |         phone = [
216 |             RandomInputHelper.get_random_value(3, "123456789"),
217 |             RandomInputHelper.get_random_value(3, "12345678"),
218 |             "".join(map(str, random.sample(range(10), 4)))
219 |         ]
220 | 
221 |         return "-".join(phone)
222 | 


--------------------------------------------------------------------------------
/nyawc/helpers/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/nyawc/http/Handler.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import os
 26 | import importlib
 27 | import requests
 28 | 
 29 | class Handler(object):
 30 |     """The Handler class executes HTTP requests.
 31 | 
 32 |     Attributes:
 33 |         __options (obj): The settins/options object.
 34 |         __queue_item (obj): The queue item containing a request to execute.
 35 | 
 36 |     """
 37 | 
 38 |     def __init__(self, options, queue_item):
 39 |         """Construct the HTTP handler.
 40 | 
 41 |         Args:
 42 |             options (:class:`nyawc.Options`): The settins/options object.
 43 |             queue_item (:class:`nyawc.QueueItem`): The queue item containing the request.
 44 | 
 45 |         """
 46 | 
 47 |         self.__options = options
 48 |         self.__queue_item = queue_item
 49 | 
 50 |         self.__queue_item.response = self.__make_request(
 51 |             self.__queue_item.request.url,
 52 |             self.__queue_item.request.method,
 53 |             self.__queue_item.request.data,
 54 |             self.__queue_item.request.auth,
 55 |             self.__queue_item.request.cookies,
 56 |             self.__queue_item.request.headers,
 57 |             self.__queue_item.request.proxies,
 58 |             self.__queue_item.request.timeout,
 59 |             self.__queue_item.request.verify
 60 |         )
 61 | 
 62 |         # In Python 2.x it could occur that the requests module returns a unicode URL.
 63 |         # See this issue for more info (https://github.com/tijme/not-your-average-web-crawler/issues/5)
 64 |         self.__queue_item.response.url = str(self.__queue_item.response.url)
 65 | 
 66 |     def get_new_requests(self):
 67 |         """Retrieve all the new request that were found in this request.
 68 | 
 69 |         Returns:
 70 |             list(:class:`nyawc.http.Request`): A list of request objects.
 71 | 
 72 |         """
 73 | 
 74 |         content_type = self.__queue_item.response.headers.get('content-type')
 75 |         scrapers = self.__get_all_scrapers()
 76 |         new_requests = []
 77 | 
 78 |         for scraper in scrapers:
 79 |             instance = scraper(self.__options, self.__queue_item)
 80 |             if self.__content_type_matches(content_type, instance.content_types):
 81 |                 new_requests.extend(instance.get_requests())
 82 | 
 83 |         return new_requests
 84 | 
 85 |     def __make_request(self, url, method, data, auth, cookies, headers, proxies, timeout, verify):
 86 |         """Execute a request with the given data.
 87 | 
 88 |         Args:
 89 |             url (str): The URL to call.
 90 |             method (str): The method (e.g. `get` or `post`).
 91 |             data (str): The data to call the URL with.
 92 |             auth (obj): The authentication class.
 93 |             cookies (obj): The cookie dict.
 94 |             headers (obj): The header dict.
 95 |             proxies (obj): The proxies dict.
 96 |             timeout (int): The request timeout in seconds.
 97 |             verify (mixed): SSL verification.
 98 | 
 99 |         Returns:
100 |             obj: The response object.
101 | 
102 |         """
103 | 
104 |         request_by_method = getattr(requests, method)
105 |         return request_by_method(
106 |             url=url,
107 |             data=data,
108 |             auth=auth,
109 |             cookies=cookies,
110 |             headers=headers,
111 |             proxies=proxies,
112 |             timeout=timeout,
113 |             verify=verify,
114 |             allow_redirects=True,
115 |             stream=False
116 |         )
117 | 
118 |     def __get_all_scrapers(self):
119 |         """Find all available scraper references.
120 | 
121 |         Returns:
122 |             list(obj): The scraper references.
123 | 
124 |         """
125 | 
126 |         modules_strings = self.__get_all_scrapers_modules()
127 |         modules = []
128 | 
129 |         for module_string in modules_strings:
130 |             module = importlib.import_module("nyawc.scrapers." + module_string)
131 |             modules.append(getattr(module, module_string))
132 | 
133 |         return modules
134 | 
135 |     def __get_all_scrapers_modules(self):
136 |         """Find all available scraper modules.
137 | 
138 |         Returns:
139 |             list(obj): The scraper modules.
140 | 
141 |         """
142 | 
143 |         modules = []
144 | 
145 |         file = os.path.realpath(__file__)
146 |         folder = os.path.dirname(file)
147 | 
148 |         for filename in os.listdir(folder + "/../scrapers"):
149 |             if filename.endswith("Scraper.py") and not filename.startswith("Base"):
150 |                 modules.append(filename[:-3])
151 | 
152 |         return modules
153 | 
154 |     def __content_type_matches(self, content_type, available_content_types):
155 |         """Check if the given content type matches one of the available content types.
156 | 
157 |         Args:
158 |             content_type (str): The given content type.
159 |             available_content_types list(str): All the available content types.
160 | 
161 |         Returns:
162 |             bool: True if a match was found, False otherwise.
163 | 
164 |         """
165 | 
166 |         if content_type is None:
167 |             return False
168 | 
169 |         if content_type in available_content_types:
170 |             return True
171 | 
172 |         for available_content_type in available_content_types:
173 |             if available_content_type in content_type:
174 |                 return True
175 | 
176 |         return False
177 | 


--------------------------------------------------------------------------------
/nyawc/http/Request.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | from nyawc.helpers.URLHelper import URLHelper
26 | 
27 | class Request(object):
28 |     """The Request class contains details that were used to request the specified URL.
29 | 
30 |     Attributes:
31 |         METHOD_OPTIONS (str): A request method that can be used to request the URL.
32 |         METHOD_GET (str): A request method that can be used to request the URL.
33 |         METHOD_HEAD (str): A request method that can be used to request the URL.
34 |         METHOD_POST (str): A request method that can be used to request the URL.
35 |         METHOD_PUT (str): A request method that can be used to request the URL.
36 |         METHOD_DELETE (str): A request method that can be used to request the URL.
37 |         parent_raised_error (bool): If the parent request raised an error (e.g. 404).
38 |         depth (int): The current crawling depth.
39 |         url (str): The absolute URL to use when making the request.
40 |         method (str): The request method to use for the request.
41 |         data (obj): The post data {key: value} OrderedDict that will be sent.
42 |         auth (obj): The (requests module) authentication class to use for the request.
43 |         cookies (obj): The (requests module) cookie jar to use for the request.
44 |         headers (obj): The headers {key: value} to use for the request.
45 |         proxies (obj): The proxies {key: value} to use for the request.
46 |         timeout (int): The amount of seconds to wait before a timeout exception will be thrown.
47 |         verify (mixed): True or False based on if certificates should be checked or else a path to a trusted bundle.
48 | 
49 |     """
50 | 
51 |     METHOD_OPTIONS = "options"
52 | 
53 |     METHOD_GET = "get"
54 | 
55 |     METHOD_HEAD = "head"
56 | 
57 |     METHOD_POST = "post"
58 | 
59 |     METHOD_PUT = "put"
60 | 
61 |     METHOD_DELETE = "delete"
62 | 
63 |     def __init__(self, url, method=METHOD_GET, data=None, auth=None, cookies=None, headers=None, proxies=None, timeout=30, verify=True):
64 |         """Constructs a Request instance.
65 | 
66 |         Args:
67 |             url (str): The absolute URL to use when making the request.
68 |             method (str): The request method to use for the request.
69 |             data (obj): The post data {key: value} OrderedDict that will be sent.
70 |             auth (obj): The (requests module) authentication class to use for the request.
71 |             cookies (obj): The (requests module) cookie jar to use for the request.
72 |             headers (obj): The headers {key: value} to use for the request.
73 |             proxies (obj): The proxies {key: value} to use for the request.
74 |             timeout (int): The amount of seconds to wait before a timeout exception will be thrown.
75 |             verify (mixed): True or False based on if certificates should be checked or else a path to a trusted bundle.
76 | 
77 |         """
78 | 
79 |         self.parent_raised_error = False
80 |         self.depth = 0
81 | 
82 |         self.url = url
83 |         self.method = method
84 |         self.auth = auth
85 |         self.cookies = cookies
86 |         self.headers = headers
87 |         self.proxies = proxies
88 |         self.timeout = timeout
89 |         self.verify = verify
90 | 
91 |         if method == self.METHOD_GET:
92 |             self.url = URLHelper.append_with_data(self.url, data)
93 |             self.data = None
94 |         else:
95 |             self.data = data
96 | 


--------------------------------------------------------------------------------
/nyawc/http/Response.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | class Response(object):
26 |     """Response placeholder class for before request is finished.
27 | 
28 |     Attributes:
29 |         url (str): The absolute URL of the request/response.
30 | 
31 |     Note:
32 |         This class will be replaced with the response class of Python's `requests` module when the
33 |         request is finished. For more information check http://docs.python-requests.org/en/master/api/#requests.Response.
34 | 
35 |     """
36 | 
37 |     def __init__(self, url):
38 |         """Constructs a Response instance.
39 | 
40 |         Args:
41 |             url (str): The absolute URL of the request/response.
42 | 
43 |         """
44 | 
45 |         self.url = url
46 | 


--------------------------------------------------------------------------------
/nyawc/http/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/nyawc/scrapers/BaseScraper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | from nyawc.http.Request import Request
26 | from nyawc.helpers.URLHelper import URLHelper
27 | 
28 | class BaseScraper(object):
29 |     """The BaseScraper can be used to create other scrapers.
30 | 
31 |     Attributes:
32 |         __options (:class:`nyawc.Options`): The settins/options object.
33 |         __queue_item (:class:`nyawc.QueueItem`): The queue item containing the response to scrape.
34 | 
35 |     """
36 | 
37 |     def __init__(self, options, queue_item):
38 |         """Construct the HTMLSoupLinkScraper instance.
39 | 
40 |         Args:
41 |             options (:class:`nyawc.Options`): The settins/options object.
42 |             queue_item (:class:`nyawc.QueueItem`): The queue item containing a response the scrape.
43 | 
44 |         """
45 | 
46 |         self.options = options
47 |         self.queue_item = queue_item
48 | 
49 |     def get_requests(self):
50 |         """Get all the new requests that were found in the response.
51 | 
52 |         Returns:
53 |             list(:class:`nyawc.http.Request`): A list of new requests that were found.
54 | 
55 |         """
56 | 
57 |         requests = self.derived_get_requests()
58 | 
59 |         for request in requests:
60 |             request.url = URLHelper.remove_hash(request.url)
61 | 
62 |         return requests
63 | 


--------------------------------------------------------------------------------
/nyawc/scrapers/CSSRegexLinkScraper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import re
26 | 
27 | from nyawc.http.Request import Request
28 | from nyawc.helpers.URLHelper import URLHelper
29 | from nyawc.scrapers.BaseScraper import BaseScraper
30 | 
31 | class CSSRegexLinkScraper(BaseScraper):
32 |     """The CSSRegexLinkScraper finds absolute and relative URLs in Cascading Style Sheets.
33 | 
34 |     Attributes:
35 |         content_types list(str): The supported content types.
36 |         __expressions list(obj): The regular expressions to execute.
37 | 
38 |     """
39 | 
40 |     content_types = [
41 |         "text/css"
42 |     ]
43 | 
44 |     __expressions = [
45 |         # Match absolute/relative URLs between any type of CSS quote
46 |         {"group": 1, "raw": r"\(([\"\'])?(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))(\1)?\)"}
47 |     ]
48 | 
49 |     def derived_get_requests(self):
50 |         """Get all the new requests that were found in the response.
51 | 
52 |         Returns:
53 |             list(:class:`nyawc.http.Request`): A list of new requests that were found.
54 | 
55 |         """
56 | 
57 |         host = self.queue_item.response.url
58 |         content = self.queue_item.response.text
59 | 
60 |         found_requests = []
61 | 
62 |         for expression in self.__expressions:
63 |             matches = re.findall(expression["raw"], content)
64 | 
65 |             for match in matches:
66 |                 found_url = match[expression["group"]]
67 |                 absolute_url = URLHelper.make_absolute(host, found_url)
68 |                 found_requests.append(Request(absolute_url))
69 | 
70 |         return found_requests
71 | 


--------------------------------------------------------------------------------
/nyawc/scrapers/HTMLSoupFormScraper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from nyawc.CrawlerActions import CrawlerActions
 26 | from nyawc.http.Request import Request
 27 | from nyawc.helpers.URLHelper import URLHelper
 28 | from nyawc.helpers.RandomInputHelper import RandomInputHelper
 29 | from nyawc.scrapers.BaseScraper import BaseScraper
 30 | from collections import OrderedDict
 31 | 
 32 | class HTMLSoupFormScraper(BaseScraper):
 33 |     """The HTMLSoupFormScraper finds requests from forms in HTML using BeautifulSoup.
 34 | 
 35 |     Attributes:
 36 |         content_types list(str): The supported content types.
 37 | 
 38 |     """
 39 | 
 40 |     content_types = [
 41 |         "text/html",
 42 |         "application/xhtml+xml"
 43 |     ]
 44 | 
 45 |     def derived_get_requests(self):
 46 |         """Get all the new requests that were found in the response.
 47 | 
 48 |         Returns:
 49 |             list(:class:`nyawc.http.Request`): A list of new requests that were found.
 50 | 
 51 |         """
 52 | 
 53 |         host = self.queue_item.response.url
 54 |         soup = self.queue_item.get_soup_response()
 55 | 
 56 |         found_requests = []
 57 | 
 58 |         for form in soup.find_all("form"):
 59 |             found_requests.append(self.__get_request(host, form))
 60 | 
 61 |         return found_requests
 62 | 
 63 |     def __get_request(self, host, soup):
 64 |         """Build a request from the given soup form.
 65 | 
 66 |         Args:
 67 |             host str: The URL of the current queue item.
 68 |             soup (obj): The BeautifulSoup form.
 69 | 
 70 |         Returns:
 71 |             :class:`nyawc.http.Request`: The new Request.
 72 | 
 73 |         """
 74 | 
 75 |         url = URLHelper.make_absolute(host, self.__trim_grave_accent(soup["action"])) if soup.has_attr("action") else host
 76 |         method_original = soup["method"] if soup.has_attr("method") else "get"
 77 |         method = "post" if method_original.lower() == "post" else "get"
 78 |         data = self.__get_form_data(soup)
 79 | 
 80 |         return Request(url, method, data)
 81 | 
 82 | 
 83 |     def __trim_grave_accent(self, href):
 84 |         """Trim grave accents manually (because BeautifulSoup doesn"t support it).
 85 | 
 86 |         Args:
 87 |             href (str): The BeautifulSoup href value.
 88 | 
 89 |         Returns:
 90 |             str: The BeautifulSoup href value without grave accents.
 91 | 
 92 |         """
 93 | 
 94 |         if href.startswith("`"):
 95 |             href = href[1:]
 96 | 
 97 |         if href.endswith("`"):
 98 |             href = href[:-1]
 99 | 
100 |         return href
101 | 
102 |     def __get_form_data(self, soup):
103 |         """Build a form data dict from the given form.
104 | 
105 |         Args:
106 |             soup (obj): The BeautifulSoup form.
107 | 
108 |         Returns:
109 |             obj: The form data (key/value).
110 | 
111 |         """
112 | 
113 |         elements = self.__get_valid_form_data_elements(soup)
114 |         form_data = self.__get_default_form_data_input(elements)
115 |         callback = self.options.callbacks.form_before_autofill
116 |         action = callback(self.queue_item, elements, form_data)
117 | 
118 |         if action == CrawlerActions.DO_AUTOFILL_FORM:
119 |             self.__autofill_form_data(form_data, elements)
120 | 
121 |         return form_data
122 | 
123 |     def __get_valid_form_data_elements(self, soup):
124 |         """Get all valid form input elements.
125 | 
126 |         Note:
127 |             An element is valid when the value can be updated client-side
128 |             and the element has a name attribute.
129 | 
130 |         Args:
131 |             soup (obj): The BeautifulSoup form.
132 | 
133 |         Returns:
134 |             list(obj): Soup elements.
135 | 
136 |         """
137 | 
138 |         elements = []
139 | 
140 |         for element in soup.find_all(["input", "button", "textarea", "select"]):
141 |             if element.has_attr("name"):
142 |                 elements.append(element)
143 | 
144 |         return elements
145 | 
146 |     def __get_default_form_data_input(self, elements):
147 |         """Get the default form data {key: value} for the given elements.
148 | 
149 |         Args:
150 |             elements list(obj): Soup elements.
151 | 
152 |         Returns:
153 |             obj: The {key: value} form data
154 | 
155 |         """
156 | 
157 |         form_data = OrderedDict()
158 | 
159 |         for element in elements:
160 |             default_value = self.__get_default_value_from_element(element)
161 | 
162 |             if default_value is False:
163 |                 continue
164 | 
165 |             form_data[element["name"]] = default_value
166 | 
167 |         return form_data
168 | 
169 |     def __autofill_form_data(self, form_data, elements):
170 |         """Autofill empty form data with random data.
171 | 
172 |         Args:
173 |             form_data (obj): The {key: value} form data
174 |             elements list(obj): Soup elements.
175 | 
176 |         Returns:
177 |             obj: The {key: value}
178 | 
179 |         """
180 | 
181 |         for element in elements:
182 |             if not element["name"] in form_data:
183 |                 continue
184 | 
185 |             if not len(form_data[element["name"]]) is 0:
186 |                 continue
187 | 
188 |             if element.name == "textarea":
189 |                 form_data[element["name"]] = RandomInputHelper.get_for_type("textarea")
190 |                 continue
191 | 
192 |             if element.has_attr("type"):
193 |                 form_data[element["name"]] = RandomInputHelper.get_for_type(element["type"])
194 | 
195 |     def __get_default_value_from_element(self, element):
196 |         """Get the default value of a form element
197 | 
198 |         Args:
199 |             elements (obj): The soup element.
200 | 
201 |         Returns:
202 |             str: The default value
203 | 
204 |         """
205 | 
206 |         if element.name == "select":
207 |             options = element.find_all("option")
208 |             is_multiple = element.has_attr("multiple")
209 | 
210 |             selected_options = [
211 |                 option for option in options
212 |                 if option.has_attr("selected")
213 |             ]
214 | 
215 |             if not selected_options and options:
216 |                 selected_options = [options[0]]
217 | 
218 |             selected_values = []
219 | 
220 |             if is_multiple:
221 |                 for option in selected_options:
222 |                     value = option["value"] if option.has_attr("value") else option.string
223 |                     selected_values.append(value)
224 | 
225 |                 return selected_values
226 |             elif len(selected_options) >= 1:
227 |                 if selected_options[0].has_attr("value"):
228 |                     return selected_options[0]["value"]
229 |                 else:
230 |                     return selected_options[0].string
231 | 
232 |             return ""
233 | 
234 |         if element.name == "textarea":
235 |             return element.string if element.string is not None else ""
236 | 
237 |         if element.name == "input" and element.has_attr("type"):
238 |             if element["type"] in ("checkbox", "radio"):
239 |                 if not element.has_attr("checked"):
240 |                     return False
241 | 
242 |                 if element.has_attr("value"):
243 |                     return element["value"]
244 |                 else:
245 |                     return "on"
246 | 
247 |         if element.has_attr("value"):
248 |             return element["value"]
249 | 
250 |         return ""
251 | 


--------------------------------------------------------------------------------
/nyawc/scrapers/HTMLSoupLinkScraper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | from nyawc.http.Request import Request
 26 | from nyawc.helpers.URLHelper import URLHelper
 27 | from nyawc.scrapers.BaseScraper import BaseScraper
 28 | 
 29 | class HTMLSoupLinkScraper(BaseScraper):
 30 |     """The HTMLSoupLinkScraper finds URLs from href attributes in HTML using BeautifulSoup.
 31 | 
 32 |     Attributes:
 33 |         content_types list(str): The supported content types.
 34 | 
 35 |     """
 36 | 
 37 |     content_types = [
 38 |         "text/html",
 39 |         "application/xhtml+xml"
 40 |     ]
 41 | 
 42 |     def derived_get_requests(self):
 43 |         """Get all the new requests that were found in the response.
 44 | 
 45 |         Returns:
 46 |             list(:class:`nyawc.http.Request`): A list of new requests that were found.
 47 | 
 48 |         """
 49 | 
 50 |         attributes = {
 51 |             "src": True,
 52 |             "href": True,
 53 |             "link": True,
 54 |             "script": True,
 55 |             "url": True
 56 |         }
 57 | 
 58 |         host = self.queue_item.response.url
 59 |         soup = self.queue_item.get_soup_response()
 60 |         base_element = soup.find("base", href=True)
 61 |         elements = soup.select("[{}]".format("],[".join(attributes.keys())))
 62 | 
 63 |         # Always use the URL from the base element if it exists.
 64 |         # https://www.w3schools.com/tags/tag_base.asp
 65 |         if base_element:
 66 |             host = URLHelper.make_absolute(host, base_element["href"])
 67 | 
 68 |         found_requests = []
 69 | 
 70 |         for element in elements:
 71 |             for attribute in attributes.keys():
 72 |                 if not element.has_attr(attribute):
 73 |                     continue
 74 | 
 75 |                 found_url = self.__trim_grave_accent(element[attribute])
 76 | 
 77 |                 if URLHelper.is_mailto(found_url):
 78 |                     continue
 79 | 
 80 |                 absolute_url = URLHelper.make_absolute(host, found_url)
 81 |                 found_requests.append(Request(absolute_url))
 82 | 
 83 |         return found_requests
 84 | 
 85 |     def __trim_grave_accent(self, href):
 86 |         """Trim grave accents manually (because BeautifulSoup doesn't support it).
 87 | 
 88 |         Args:
 89 |             href (str): The BeautifulSoup href value.
 90 | 
 91 |         Returns:
 92 |             str: The BeautifulSoup href value without grave accents.
 93 | 
 94 |         """
 95 | 
 96 |         if href.startswith("`"):
 97 |             href = href[1:]
 98 | 
 99 |         if href.endswith("`"):
100 |             href = href[:-1]
101 | 
102 |         return href
103 | 


--------------------------------------------------------------------------------
/nyawc/scrapers/JSONRegexLinkScraper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import re
26 | 
27 | from nyawc.http.Request import Request
28 | from nyawc.helpers.URLHelper import URLHelper
29 | from nyawc.scrapers.BaseScraper import BaseScraper
30 | 
31 | class JSONRegexLinkScraper(BaseScraper):
32 |     """The JSONRegexLinkScraper finds absolute and relative URLs in JSON keys and values.
33 | 
34 |     Attributes:
35 |         content_types list(str): The supported content types.
36 |         __expressions list(obj): The regular expressions to execute.
37 | 
38 |     """
39 | 
40 |     content_types = [
41 |         "application/json"
42 |     ]
43 | 
44 |     __expressions = [
45 |         # Match absolute/relative URLs between any type of JSON quote
46 |         {"group": 1, "raw": r"([\"\'\`])(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))\1"}
47 |     ]
48 | 
49 |     def derived_get_requests(self):
50 |         """Get all the new requests that were found in the response.
51 | 
52 |         Returns:
53 |             list(:class:`nyawc.http.Request`): A list of new requests that were found.
54 | 
55 |         """
56 | 
57 |         host = self.queue_item.response.url
58 |         content = self.queue_item.response.text
59 | 
60 |         found_requests = []
61 | 
62 |         for expression in self.__expressions:
63 |             matches = re.findall(expression["raw"], content)
64 | 
65 |             for match in matches:
66 |                 found_url = match[expression["group"]]
67 |                 absolute_url = URLHelper.make_absolute(host, found_url)
68 |                 found_requests.append(Request(absolute_url))
69 | 
70 |         return found_requests
71 | 


--------------------------------------------------------------------------------
/nyawc/scrapers/XMLRegexLinkScraper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import re
26 | 
27 | from nyawc.http.Request import Request
28 | from nyawc.helpers.URLHelper import URLHelper
29 | from nyawc.scrapers.BaseScraper import BaseScraper
30 | 
31 | class XMLRegexLinkScraper(BaseScraper):
32 |     """The XMLRegexLinkScraper finds absolute and relative URLs in XML values.
33 | 
34 |     Attributes:
35 |         content_types list(str): The supported content types.
36 |         __expressions list(obj): The regular expressions to execute.
37 | 
38 |     """
39 | 
40 |     content_types = [
41 |         "text/xml",
42 |         "application/xml",
43 |         "image/svg+xml"
44 |     ]
45 | 
46 |     __expressions = [
47 |         # Match absolute/relative URLs between any type of XML tag
48 |         {"group": 0, "raw": r">(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))<\/"},
49 | 
50 |         # Match absolute/relative URLs between any type of XML quote
51 |         {"group": 1, "raw": r"=([\"\'\`])(((((https?:)?\/)?\/)|(\.\.\/)+)([^\n ]*?))\1"}
52 |     ]
53 | 
54 |     def derived_get_requests(self):
55 |         """Get all the new requests that were found in the response.
56 | 
57 |         Returns:
58 |             list(:class:`nyawc.http.Request`): A list of new requests that were found.
59 | 
60 |         """
61 | 
62 |         host = self.queue_item.response.url
63 |         content = self.queue_item.response.text
64 | 
65 |         found_requests = []
66 | 
67 |         for expression in self.__expressions:
68 |             matches = re.findall(expression["raw"], content)
69 | 
70 |             for match in matches:
71 |                 found_url = match[expression["group"]]
72 |                 absolute_url = URLHelper.make_absolute(host, found_url)
73 |                 found_requests.append(Request(absolute_url))
74 | 
75 |         return found_requests
76 | 


--------------------------------------------------------------------------------
/nyawc/scrapers/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.7.1
2 | lxml==4.9.1
3 | requests==2.21.0
4 | requests_toolbelt==0.9.1
5 | sphinx==1.8.3
6 | sphinx-better-theme==0.1.5
7 | sphinxcontrib-napoleon==0.7
8 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | from setuptools import find_packages, setup
26 | from nyawc.helpers.PackageHelper import PackageHelper
27 | 
28 | with open("requirements.txt") as file:
29 |     requirements = file.read().splitlines()
30 | 
31 | with open("README.rst") as file:
32 |     readme = PackageHelper.rst_to_pypi(file.read())
33 | 
34 | setup(
35 |     name=PackageHelper.get_alias(),
36 |     version=PackageHelper.get_version(),
37 |     description=PackageHelper.get_description(),
38 |     long_description=readme,
39 |     keywords = ["vulnerability", "bug-bounty", "security", "post", "get", "request", "crawler", "scraper", "scanner"],
40 |     classifiers=[
41 |         "Development Status :: 5 - Production/Stable",
42 |         "Environment :: Console",
43 |         "Intended Audience :: Developers",
44 |         "Intended Audience :: Education",
45 |         "Intended Audience :: Information Technology",
46 |         "Intended Audience :: System Administrators",
47 |         "License :: OSI Approved :: MIT License",
48 |         "Natural Language :: English",
49 |         "Operating System :: MacOS",
50 |         "Operating System :: Microsoft :: Windows",
51 |         "Operating System :: POSIX :: Linux",
52 |         "Programming Language :: Python :: 3.6",
53 |         "Programming Language :: Python :: 3.5",
54 |         "Programming Language :: Python :: 2.7",
55 |         "Topic :: Security"
56 |     ],
57 |     packages=find_packages(),
58 |     platforms=["any"],
59 |     author="Tijme Gommers",
60 |     license="MIT",
61 |     url="https://tijme.github.io/not-your-average-web-crawler/",
62 |     install_requires=requirements
63 | )
64 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/test/__init__.py


--------------------------------------------------------------------------------
/test/site/fuzzing/empty.php:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tijme/not-your-average-web-crawler/1de35beb32373307f68b9b9a2ee1506bb3208b35/test/site/fuzzing/empty.php


--------------------------------------------------------------------------------
/test/site/fuzzing/sleep.php:
--------------------------------------------------------------------------------
1 | <?php sleep(25); ?>


--------------------------------------------------------------------------------
/test/site/http_statuses/status_100.php:
--------------------------------------------------------------------------------
1 | <?php http_response_code(100); ?>
2 | 
3 | Response


--------------------------------------------------------------------------------
/test/site/http_statuses/status_200.php:
--------------------------------------------------------------------------------
1 | <?php http_response_code(200); ?>
2 | 
3 | Response


--------------------------------------------------------------------------------
/test/site/http_statuses/status_300.php:
--------------------------------------------------------------------------------
1 | <?php http_response_code(300); ?>
2 | 
3 | Response


--------------------------------------------------------------------------------
/test/site/http_statuses/status_400.php:
--------------------------------------------------------------------------------
1 | <?php http_response_code(400); ?>
2 | 
3 | Response


--------------------------------------------------------------------------------
/test/site/http_statuses/status_500.php:
--------------------------------------------------------------------------------
1 | <?php http_response_code(500); ?>
2 | 
3 | Response


--------------------------------------------------------------------------------
/test/site/index.php:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |     <body>
 4 |         <?php
 5 |             $dirIterator = new RecursiveDirectoryIterator('.');
 6 |             $recursiveIterator = new RecursiveIteratorIterator($dirIterator);
 7 | 
 8 |             foreach ($recursiveIterator as $file) {
 9 |                 if (!$file->isFile()) { 
10 |                     continue;
11 |                 }
12 | 
13 |                 $ext = pathinfo($file->getPathname(), PATHINFO_EXTENSION);
14 | 
15 |                 if ($ext != 'php' || $file->getBasename() == 'index.php') {
16 |                     continue;
17 |                 }
18 | 
19 |                 $href = substr($file->getPathname(), 2);
20 |                 $href = str_replace('\\', '/', $href);
21 | 
22 |                 echo '<a href="' . htmlentities($href) . '">' . htmlentities($href) . '</a><br>';
23 |             }
24 |         ?>
25 |     </body>
26 | </html>


--------------------------------------------------------------------------------
/test/site/invalid_content_types/css.php:
--------------------------------------------------------------------------------
1 | <?php 
2 | header('Content-Type: text/css');
3 | 
4 | for($i = 0; $i < 10000; $i ++) {
5 | 	echo 'A';
6 | }
7 | ?>


--------------------------------------------------------------------------------
/test/site/invalid_content_types/html.php:
--------------------------------------------------------------------------------
1 | <?php 
2 | header('Content-Type: text/html');
3 | 
4 | for($i = 0; $i < 10000; $i ++) {
5 | 	echo 'A';
6 | }
7 | ?>


--------------------------------------------------------------------------------
/test/site/invalid_content_types/json.php:
--------------------------------------------------------------------------------
1 | <?php 
2 | header('Content-Type: application/json');
3 | 
4 | for($i = 0; $i < 10000; $i ++) {
5 | 	echo 'A';
6 | }
7 | ?>


--------------------------------------------------------------------------------
/test/site/invalid_content_types/xhtml.php:
--------------------------------------------------------------------------------
1 | <?php 
2 | header('Content-Type: application/xhtml+xml');
3 | 
4 | for($i = 0; $i < 10000; $i ++) {
5 | 	echo 'A';
6 | }
7 | ?>


--------------------------------------------------------------------------------
/test/site/invalid_content_types/xml.php:
--------------------------------------------------------------------------------
1 | <?php 
2 | header('Content-Type: application/xml');
3 | 
4 | for($i = 0; $i < 10000; $i ++) {
5 | 	echo 'A';
6 | }
7 | ?>


--------------------------------------------------------------------------------
/test/site/malformed_responses/css.php:
--------------------------------------------------------------------------------
 1 | <?php header('Content-Type: text/css'); ?>
 2 | body {
 3 | 	width:
 4 | }
 5 | 
 6 | p {
 7 | 	height: 80px;
 8 | 
 9 | span {
10 | 	margin-toppp: 23px;
11 | }


--------------------------------------------------------------------------------
/test/site/malformed_responses/html.php:
--------------------------------------------------------------------------------
 1 | <?php header('Content-Type: text/html'); ?>
 2 | <!DOCTYPE html>
 3 | <html lang="en">
 4 |     <head>
 5 |         <meta charset="utf-8"
 6 |         <meta http-equiv="X-UA-Compatible" content="IE=edge">
 7 |         <meta name="viewport" content="width=device-width, initial-scale=1">
 8 |         <title>Scraped</title>
 9 |     </head>
10 |     <body>
11 |         <h1>Hello, world!</h1>
12 |     </body>
13 | </html>


--------------------------------------------------------------------------------
/test/site/malformed_responses/json.php:
--------------------------------------------------------------------------------
 1 | <?php header('Content-Type: application/json'); ?>
 2 | {"menu": {
 3 |   "id": "file,
 4 |   "value": "File",
 5 |   "popup": {
 6 |     "menuitem": [
 7 |       {"value": "New", "onclick": "CreateNewDoc()"},
 8 |       {"value": "Open", "onclick": "OpenDoc()"},
 9 |       {"value": "Close", "onclick": "CloseDoc()"}
10 |     ]
11 |   }
12 | }}


--------------------------------------------------------------------------------
/test/site/malformed_responses/xhtml.php:
--------------------------------------------------------------------------------
 1 | <?php header('Content-Type: application/xhtml+xml'); ?>
 2 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 3 | <html xmlns="http://www.w3.org/1999/xhtml">
 4 |     <head>
 5 |         <meta charset="utf-8">
 6 |         <meta http-equiv="X-UA-Compatible" content="IE=edge">
 7 |         <meta name="viewport" content="width=device-width, initial-scale=1">
 8 |         <title>Scraped</title>
 9 |     </head>
10 |     <body>
11 |         <h1>Hello, world!</h2>
12 |     </body>
13 | </html>


--------------------------------------------------------------------------------
/test/site/malformed_responses/xml.php:
--------------------------------------------------------------------------------
1 | <?php header('Content-Type: application/xml'); ?>
2 | <menu id="file" value="File">
3 |   <popup>
4 |     <menuitem value="New" onclick="CreateNewDoc() />
5 |     <menuitem value="Open" onclick="OpenDoc()" />
6 |     <menuitem value="Close" onclick="CloseDoc()" />
7 |   </popup>
8 | </menu>


--------------------------------------------------------------------------------
/test/test_helpers_url_helper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import unittest
 26 | 
 27 | from nyawc.helpers.URLHelper import URLHelper
 28 | 
 29 | class TestUrlHelper(unittest.TestCase):
 30 |     """The TestUrlHelper class checks if the methods in the URLHelper work correctly."""
 31 | 
 32 |     def test_make_absolute(self):
 33 |         """Check if the make absolute method works correctly."""
 34 | 
 35 |         host = "https://example.ltd/current"
 36 | 
 37 |         tests = [
 38 |             ("https://example.ltd/new.html", "new.html"),
 39 |             ("https://example.ltd/new", "new"),
 40 |             ("https://example.ltd/new1/new2", "new1/new2"),
 41 |             ("https://example.ltd/new1/new3", "/new1/new3"),
 42 |             ("https://example.ltd/current?a=a", "?a=a")
 43 |         ]
 44 | 
 45 |         for test in tests:
 46 |             self.assertEqual(URLHelper.make_absolute(host, test[1]), test[0])
 47 | 
 48 |     def test_make_absolute_with_base(self):
 49 |         """Check if the make absolute method works correctly in interpreted with a base URL."""
 50 | 
 51 |         host = "https://example.ltd/base/"
 52 | 
 53 |         tests = [
 54 |             ("https://example.ltd/base/new.html", "new.html"),
 55 |             ("https://example.ltd/base/new", "new"),
 56 |             ("https://example.ltd/base/new1/new2", "new1/new2"),
 57 |             ("https://example.ltd/new1/new2", "/new1/new2"),
 58 |             ("https://example.ltd/base/?a=a", "?a=a")
 59 |         ]
 60 | 
 61 |         for test in tests:
 62 |             self.assertEqual(URLHelper.make_absolute(host, test[1]), test[0])
 63 | 
 64 |     def test_make_absolute_folder_traversal(self):
 65 |         """Ensure folder traversal works correclty."""
 66 | 
 67 |         host = "https://example.ltd/dir1/dir2/dir3"
 68 | 
 69 |         tests = [
 70 |             ("https://example.ltd/dir1/dir2", "../"),
 71 |             ("https://example.ltd/dir1", "../../"),
 72 |             ("https://example.ltd", "../../../"),
 73 |             ("https://example.ltd", "../../../../"),
 74 |             ("https://example.ltd", "../../../../../")
 75 |         ]
 76 | 
 77 |         for test in tests:
 78 |             self.assertEqual(URLHelper.make_absolute(host, test[1]), test[0])
 79 | 
 80 |     def test_get_protocol(self):
 81 |         """Check if the get protocol method works correctly."""
 82 | 
 83 |         tests = [
 84 |             ("", "domain.tld"),
 85 |             ("http", "http://domain.tld"),
 86 |             ("arbitrary", "arbitrary://omain.tld")
 87 |         ]
 88 | 
 89 |         for test in tests:
 90 |             self.assertEqual(URLHelper.get_protocol(test[1]), test[0])
 91 | 
 92 |     def test_get_subdomain(self):
 93 |         """Check if the get subdomain method works correctly."""
 94 | 
 95 |         tests = [
 96 |             ("", ""),
 97 |             ("", "http://"),
 98 |             ("", "http://domain"),
 99 |             ("", "http://domain.tld"),
100 |             ("sub1", "http://sub1.domain.tld"),
101 |             ("sub2.sub1", "http://sub2.sub1.domain.tld"),
102 |             ("sub3.sub2.sub1", "http://sub3.sub2.sub1.domain.tld")
103 |         ]
104 | 
105 |         for test in tests:
106 |             self.assertEqual(URLHelper.get_subdomain(test[1]), test[0])
107 | 
108 |     def test_get_hostname(self):
109 |         """Check if the get hostname method works correctly."""
110 | 
111 |         tests = [
112 |             ("", ""),
113 |             ("", "http://"),
114 |             ("domain", "http://domain"),
115 |             ("domain", "http://domain.tld"),
116 |             ("domain", "http://sub1.domain.tld"),
117 |             ("domain", "http://sub2.sub1.domain.tld")
118 |         ]
119 | 
120 |         for test in tests:
121 |             self.assertEqual(URLHelper.get_hostname(test[1]), test[0])
122 | 
123 |     def test_get_tld(self):
124 |         """Check if the get tld method works correctly."""
125 | 
126 |         tests = [
127 |             ("", ""),
128 |             ("", "http://"),
129 |             ("", "http://domain"),
130 |             ("tld", "http://domain.tld"),
131 |             ("tld", "http://sub1.domain.tld"),
132 |             ("tld", "http://sub2.sub1.domain.tld")
133 |         ]
134 | 
135 |         for test in tests:
136 |             self.assertEqual(URLHelper.get_tld(test[1]), test[0])
137 | 
138 |     def test_get_ordered_params(self):
139 |         """Check if the get ordered params method works correctly."""
140 | 
141 |         val1 = URLHelper.get_ordered_params("http://example.tld?a=a&c=c&b=b&d=d")
142 |         val2 = URLHelper.get_ordered_params("http://sub.domain.ltd?c=c&b=b&a=a&d=d")
143 | 
144 |         self.assertEqual(val1, val2)
145 | 
146 |     def test_append_with_data_encoded_and_decoded(self):
147 |         """Make sure values do not get decoded or encoded."""
148 | 
149 |         val1 = URLHelper.append_with_data("http://example.tld/", {"val": "{{aaaa}}"})
150 |         val2 = URLHelper.append_with_data("http://example.tld/", {"val": "%7B%7Baaaa%7D%7D"})
151 | 
152 |         self.assertEqual(val1, "http://example.tld/?val={{aaaa}}")
153 |         self.assertEqual(val2, "http://example.tld/?val=%7B%7Baaaa%7D%7D")
154 | 


--------------------------------------------------------------------------------
/test/test_queue.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import unittest
26 | 
27 | from nyawc.helpers.HTTPRequestHelper import HTTPRequestHelper
28 | from nyawc.Queue import Queue
29 | from nyawc.http.Request import Request
30 | from nyawc.Options import Options
31 | 
32 | class TestQueue(unittest.TestCase):
33 |     """The TestQueue class tests if the hashes and counters of the queue work correctly."""
34 | 
35 |     def test_hash_is_always_the_same(self):
36 |         """Ensure the hashes are calculated correctly by checking for duplicates in the queue."""
37 | 
38 |         options = Options()
39 |         queue = Queue(options)
40 | 
41 |         for index in range(0, 100):
42 |             request = Request("https://example.ltd?1=1#2=2")
43 |             HTTPRequestHelper.patch_with_options(request, options)
44 |             request.cookies.set(name='tasty_cookie{}'.format(index), value='yum', domain='example.ltd')
45 |             queue.add_request(request)
46 | 
47 |         self.assertEqual(queue.count_total, 1)
48 | 
49 |     def test_hash_different_query_order(self):
50 |         """Ensure query parameters in different orders are treated as one queue item."""
51 | 
52 |         queue = Queue(Options())
53 | 
54 |         queue.add_request(Request("https://www.example.ltd?b=b&c=c&a=a"))
55 |         queue.add_request(Request("https://www.example.ltd?b=b&a=a&c=c"))
56 |         queue.add_request(Request("https://www.example.ltd?a=a&b=b&c=c"))
57 | 
58 |         self.assertEqual(queue.count_total, 1)
59 | 
60 | 
61 |     def test_hash_different_encoded_and_decoded_values(self):
62 |         """Ensure encoded and decoded values have a different hash."""
63 | 
64 |         queue = Queue(Options())
65 | 
66 |         queue.add_request(Request("http://example.ltd?val={{aaaa}}"))
67 |         queue.add_request(Request("http://example.ltd?val=%7B%7Baaaa%7D%7D"))
68 | 
69 |         self.assertEqual(queue.count_total, 2)
70 | 


--------------------------------------------------------------------------------
/test/test_scrapers_css_regex_link_scraper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import unittest
26 | 
27 | from nyawc.scrapers.CSSRegexLinkScraper import CSSRegexLinkScraper
28 | from nyawc.QueueItem import QueueItem
29 | from nyawc.http.Request import Request
30 | from nyawc.http.Response import Response
31 | from nyawc.Options import Options
32 | 
33 | class TestScrapersCSSRegexLinkScraper(unittest.TestCase):
34 |     """The TestScrapersCSSRegexLinkScraper class tests if the CSSRegexLinkScraper is working correctly.
35 | 
36 |     Attributes:
37 |         __host (str): The host were the new URLs were found on
38 |         __urls list(obj): The URLs that were found
39 | 
40 |     """
41 | 
42 |     __host = "https://example.ltd/"
43 | 
44 |     __urls = [
45 |         {"url": """https://example.ltd/?unique=1""", "must_pass": True, "test": """(https://example.ltd/?unique=1)"""},
46 |         {"url": """http://example.ltd/?unique=2""", "must_pass": True, "test": """(\"http://example.ltd/?unique=2\")"""},
47 |         {"url": """https://example.ltd/?unique=3""", "must_pass": True, "test": """('//example.ltd/?unique=3')"""},
48 | 
49 |         {"url": None, "must_pass": False, "test": """@import url(this-should-not-pass)"""},
50 |         {"url": None, "must_pass": False, "test": """@import url(`https://example.ltd/`)"""}
51 |     ]
52 | 
53 |     def test_xml_url_count(self):
54 |         """Test if the amount of URLs found complies with the expected amount."""
55 | 
56 |         html = ""
57 |         for url in self.__urls:
58 |             html += "\n" + url["test"]
59 | 
60 |         request = Request(self.__host)
61 |         response = Response(self.__host)
62 |         response.text = html
63 | 
64 |         finder = CSSRegexLinkScraper(Options(), QueueItem(request, response))
65 |         matches = finder.get_requests()
66 | 
67 |         self.assertEqual(len(matches), 3)
68 | 
69 |     def test_xml_url_matches(self):
70 |         """Test if all the URLs match the found URLs."""
71 | 
72 |         for url in self.__urls:
73 |             request = Request(self.__host)
74 |             response = Response(self.__host)
75 |             response.text = url["test"]
76 | 
77 |             finder = CSSRegexLinkScraper(Options(), QueueItem(request, response))
78 |             requests = finder.get_requests()
79 | 
80 |             if url["must_pass"]:
81 |                 self.assertEqual(requests[0].url, url["url"])
82 |                 self.assertEqual(len(requests), 1)
83 |             else:
84 |                 self.assertEqual(len(requests), 0)
85 | 


--------------------------------------------------------------------------------
/test/test_scrapers_html_soup_form_scraper.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # MIT License
  4 | #
  5 | # Copyright (c) 2017 Tijme Gommers
  6 | #
  7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  8 | # of this software and associated documentation files (the "Software"), to deal
  9 | # in the Software without restriction, including without limitation the rights
 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 | # copies of the Software, and to permit persons to whom the Software is
 12 | # furnished to do so, subject to the following conditions:
 13 | #
 14 | # The above copyright notice and this permission notice shall be included in all
 15 | # copies or substantial portions of the Software.
 16 | #
 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 | # SOFTWARE.
 24 | 
 25 | import unittest
 26 | 
 27 | from nyawc.scrapers.HTMLSoupFormScraper import HTMLSoupFormScraper
 28 | from nyawc.QueueItem import QueueItem
 29 | from nyawc.http.Request import Request
 30 | from nyawc.http.Response import Response
 31 | from nyawc.Options import Options
 32 | 
 33 | class TestScrapersHTMLSoupFormScraper(unittest.TestCase):
 34 |     """The TestScrapersHTMLSoupFormScraper class tests if the HTMLSoupFormScraper is working correctly.
 35 | 
 36 |     Attributes:
 37 |         __host (str): The host were the new URLs were found on
 38 |         __urls list(obj): The URLs that were found
 39 | 
 40 |     """
 41 | 
 42 |     __host = "https://example.ltd/"
 43 | 
 44 |     __urls = [
 45 |         {
 46 |             "url": """https://example.ltd/action_page1.php""",
 47 |             "method": Request.METHOD_POST,
 48 |             "data": {
 49 |                 "lastname": "Mouse",
 50 |                 "name": "TestContent"
 51 |             },
 52 |             "must_pass": True,
 53 |             "test": """
 54 |                 <form action="/action_page1.php" method="post">
 55 |                     First name:<br>
 56 |                     <input type="text" value="Mickey"><br>
 57 |                     Last name:<br>
 58 |                     <input type="text" name="lastname" value="Mouse"><br><br>
 59 |                     <input type="submit" value="Submit">
 60 |                     <textarea name="test">TestContent</textarea>
 61 |                 </form>
 62 |             """
 63 |         },
 64 |         {
 65 |             "url": """https://example.ltd/action_page2.php""",
 66 |             "method": Request.METHOD_POST,
 67 |             "data": {
 68 |                 "lastname": "Mouse"
 69 |             },
 70 |             "must_pass": True,
 71 |             "test": """
 72 |                 <form action=`/action_page2.php` method="Post">
 73 |                     First name:<br>
 74 |                     <input type="text" value="Mickey"><br>
 75 |                     Last name:<br>
 76 |                     <input type="text" name="lastname" value="Mouse"><br><br>
 77 |                     <input type="submit" value="Submit">
 78 |                 </form>
 79 |             """
 80 |         },
 81 |         {
 82 |             "url": """https://example.ltd/?lastname=Mouse""",
 83 |             "method": Request.METHOD_GET,
 84 |             "data": None,
 85 |             "must_pass": True,
 86 |             "test": """
 87 |                 <form method="geT">
 88 |                     First name:<br>
 89 |                     <input type="text" value="Mickey"><br>
 90 |                     Last name:<br>
 91 |                     <input type="text" name="lastname" value="Mouse"><br><br>
 92 |                     <input type="submit" value="Submit">
 93 |                 </form>
 94 |             """
 95 |         },
 96 |         {
 97 |             "url": """https://example.ltd/?lastname=Mouse&test=TestContent""",
 98 |             "method": Request.METHOD_GET,
 99 |             "data": None,
100 |             "must_pass": True,
101 |             "test": """
102 |                 <form>
103 |                     First name:<br>
104 |                     <input type="text" value="Mickey"><br>
105 |                     Last name:<br>
106 |                     <input type="text" name="lastname" value="Mouse"><br><br>
107 |                     <textarea name="test">TestContent</textarea>
108 |                     <input type="submit" value="Submit">
109 |                 </form>
110 |             """
111 |         },
112 |     ]
113 | 
114 |     def test_soup_url_count(self):
115 |         """Test if the amount of URLs found complies with the expected amount."""
116 | 
117 |         html = ""
118 |         for url in self.__urls:
119 |             html += "\n" + url["test"]
120 | 
121 |         request = Request(self.__host)
122 |         response = Response(self.__host)
123 |         response.text = html
124 | 
125 |         finder = HTMLSoupFormScraper(Options(), QueueItem(request, response))
126 |         matches = finder.get_requests()
127 | 
128 |         self.assertEqual(len(matches), 4)
129 | 
130 |     def test_soup_url_matches(self):
131 |         """Test if all the URLs match the found URLs."""
132 | 
133 |         for url in self.__urls:
134 |             request = Request(self.__host)
135 |             response = Response(self.__host)
136 |             response.text = url["test"]
137 | 
138 |             finder = HTMLSoupFormScraper(Options(), QueueItem(request, response))
139 |             requests = finder.get_requests()
140 | 
141 |             if url["must_pass"]:
142 |                 self.assertEqual(requests[0].url, url["url"])
143 |                 self.assertEqual(len(requests), 1)
144 |             else:
145 |                 self.assertEqual(len(requests), 0)
146 | 


--------------------------------------------------------------------------------
/test/test_scrapers_json_regex_link_scraper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import unittest
26 | 
27 | from nyawc.scrapers.JSONRegexLinkScraper import JSONRegexLinkScraper
28 | from nyawc.QueueItem import QueueItem
29 | from nyawc.http.Request import Request
30 | from nyawc.http.Response import Response
31 | from nyawc.Options import Options
32 | 
33 | class TestScrapersJSONRegexLinkScraper(unittest.TestCase):
34 |     """The TestScrapersJSONRegexLinkScraper class tests if the JSONRegexLinkScraper is working correctly.
35 | 
36 |     Attributes:
37 |         __host (str): The host were the new URLs were found on
38 |         __urls list(obj): The URLs that were found
39 | 
40 |     """
41 | 
42 |     __host = "https://example.ltd/"
43 | 
44 |     __urls = [
45 |         {"url": """https://example.ltd/?unique=1""", "must_pass": True, "test": """[\"https://example.ltd/?unique=1\"]"""},
46 |         {"url": """http://example.ltd/?unique=2""", "must_pass": True, "test": """{\"http://example.ltd/?unique=2\":\"\"}"""},
47 |         {"url": """https://example.ltd/?unique=3""", "must_pass": True, "test": """{\"//example.ltd/?unique=3\":\"\"}"""},
48 |         {"url": """https://example.ltd/aa/bb/?unique=4""", "must_pass": True, "test": """{\"/aa/bb/?unique=4\":\"\"}"""},
49 |         {"url": """https://example.ltd/aa/bb/?unique=5""", "must_pass": True, "test": """{\"\":\"/aa/bb/?unique=5\"}"""},
50 | 
51 |         {"url": None, "must_pass": False, "test": """{\"\":\"asdfasdf/asdfasdf\"}"""},
52 |     ]
53 | 
54 |     def test_xml_url_count(self):
55 |         """Test if the amount of URLs found complies with the expected amount."""
56 | 
57 |         html = ""
58 |         for url in self.__urls:
59 |             html += "\n" + url["test"]
60 | 
61 |         request = Request(self.__host)
62 |         response = Response(self.__host)
63 |         response.text = html
64 | 
65 |         finder = JSONRegexLinkScraper(Options(), QueueItem(request, response))
66 |         matches = finder.get_requests()
67 | 
68 |         self.assertEqual(len(matches), 5)
69 | 
70 |     def test_xml_url_matches(self):
71 |         """Test if all the URLs match the found URLs."""
72 | 
73 |         for url in self.__urls:
74 |             request = Request(self.__host)
75 |             response = Response(self.__host)
76 |             response.text = url["test"]
77 | 
78 |             finder = JSONRegexLinkScraper(Options(), QueueItem(request, response))
79 |             requests = finder.get_requests()
80 | 
81 |             if url["must_pass"]:
82 |                 self.assertEqual(requests[0].url, url["url"])
83 |                 self.assertEqual(len(requests), 1)
84 |             else:
85 |                 self.assertEqual(len(requests), 0)
86 | 


--------------------------------------------------------------------------------
/test/test_scrapers_xml_regex_link_scraper.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import unittest
26 | 
27 | from nyawc.scrapers.XMLRegexLinkScraper import XMLRegexLinkScraper
28 | from nyawc.QueueItem import QueueItem
29 | from nyawc.http.Request import Request
30 | from nyawc.http.Response import Response
31 | from nyawc.Options import Options
32 | 
33 | class TestScrapersXMLRegexLinkScraper(unittest.TestCase):
34 |     """The TestScrapersXMLRegexLinkScraper class tests if the XMLRegexLinkScraper is working correctly.
35 | 
36 |     Attributes:
37 |         __host (str): The host were the new URLs were found on
38 |         __urls list(obj): The URLs that were found
39 | 
40 |     """
41 | 
42 |     __host = "https://example.ltd/"
43 | 
44 |     __urls = [
45 |         {"url": """https://example.ltd/?unique=1""", "must_pass": True, "test": """<link>https://example.ltd/?unique=1</link>"""},
46 |         {"url": """http://example.ltd/?unique=2""", "must_pass": True, "test": """<link>http://example.ltd/?unique=2</link>"""},
47 |         {"url": """https://example.ltd/?unique=3""", "must_pass": True, "test": """<link>//example.ltd/?unique=3</link>"""},
48 |         {"url": """https://example.ltd/aa/bb/?unique=4""", "must_pass": True, "test": """<link>/aa/bb/?unique=4</link>"""},
49 |         {"url": """https://example.ltd/aa/bb/?unique=5""", "must_pass": True, "test": """<abc>/aa/bb/?unique=5</def>"""},
50 | 
51 |         {"url": None, "must_pass": False, "test": """<link>asdfasdf/asdfasdf</link>"""},
52 |     ]
53 | 
54 |     def test_xml_url_count(self):
55 |         """Test if the amount of URLs found complies with the expected amount."""
56 | 
57 |         html = ""
58 |         for url in self.__urls:
59 |             html += "\n" + url["test"]
60 | 
61 |         request = Request(self.__host)
62 |         response = Response(self.__host)
63 |         response.text = html
64 | 
65 |         finder = XMLRegexLinkScraper(Options(), QueueItem(request, response))
66 |         matches = finder.get_requests()
67 | 
68 |         self.assertEqual(len(matches), 5)
69 | 
70 |     def test_xml_url_matches(self):
71 |         """Test if all the URLs match the found URLs."""
72 | 
73 |         for url in self.__urls:
74 |             request = Request(self.__host)
75 |             response = Response(self.__host)
76 |             response.text = url["test"]
77 | 
78 |             finder = XMLRegexLinkScraper(Options(), QueueItem(request, response))
79 |             requests = finder.get_requests()
80 | 
81 |             if url["must_pass"]:
82 |                 self.assertEqual(requests[0].url, url["url"])
83 |                 self.assertEqual(len(requests), 1)
84 |             else:
85 |                 self.assertEqual(len(requests), 0)
86 | 


--------------------------------------------------------------------------------
/test/test_site.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # MIT License
 4 | #
 5 | # Copyright (c) 2017 Tijme Gommers
 6 | #
 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | # of this software and associated documentation files (the "Software"), to deal
 9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in all
15 | # copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | # SOFTWARE.
24 | 
25 | import os
26 | import unittest
27 | 
28 | from nyawc.Options import Options
29 | from nyawc.Crawler import Crawler
30 | from nyawc.http.Request import Request
31 | from nyawc.CrawlerActions import CrawlerActions
32 | 
33 | class TestSite(unittest.TestCase):
34 |     """The TestSite class checks if the crawler handles invalid responses correctly.
35 | 
36 |     Attributes:
37 |         travis (bool): If the current environment is in Travis CI.
38 | 
39 |     """
40 | 
41 |     def __init__(self, *args, **kwargs):
42 |         """Initialize the unit test and mark if the current environment is Travis CI.
43 | 
44 |         Args:
45 |             args list(str): The command line arguments.
46 |             kwargs **: Extra arguments
47 | 
48 |         """
49 | 
50 |         super(TestSite, self).__init__(*args, **kwargs)
51 |         self.travis = "UNITTEST_NYAWC_SITE" in os.environ
52 | 
53 |     def cb_request_after_finish(self, queue, queue_item, new_queue_items):
54 |         """Crawler callback for when a request is finished crawling.
55 | 
56 |         Args:
57 |             queue (:class:`nyawc.Queue`): The current crawling queue.
58 |             queue_item (:class:`nyawc.QueueItem`): The queue item that was finished.
59 |             new_queue_items list(:class:`nyawc.QueueItem`): The new queue items that were found in the one that finished.
60 | 
61 |         Returns:
62 |             str: A crawler action (either DO_STOP_CRAWLING or DO_CONTINUE_CRAWLING).
63 | 
64 |         """
65 | 
66 |         print("Finished: {}".format(queue_item.request.url))
67 |         return CrawlerActions.DO_CONTINUE_CRAWLING
68 | 
69 |     def test_crawl_website(self):
70 |         """Crawl the website in `test/` and check if the count is correct."""
71 | 
72 |         if not self.travis:
73 |             print("\n\nPlease note that the 'TestSite' unit test did not run.")
74 |             print("It will only run in Travis CI since it requires a webserver.\n")
75 |             return
76 | 
77 |         options = Options()
78 |         options.callbacks.request_after_finish = self.cb_request_after_finish
79 |         crawler = Crawler(options)
80 |         crawler.start_with(Request("http://localhost/"))
81 | 
82 |         self.assertEqual(crawler.queue.count_total, 18)
83 | 


--------------------------------------------------------------------------------