├── .gitignore
├── .pylintrc
├── .readthedocs.yml
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── conf
│   ├── .gitignore
│   ├── local.ini.template
│   └── system.ini
├── dev
│   ├── .gitignore
│   ├── docker-compose.yml
│   ├── gencert.py
│   └── nginx.conf
├── docs
│   ├── .gitignore
│   ├── Makefile
│   ├── README.txt
│   ├── administration.rst
│   ├── changelog.rst
│   ├── client-side-logging.png
│   ├── conf.py
│   ├── configuration.rst
│   ├── dashboard.png
│   ├── dashboard_first_crawl.png
│   ├── development.rst
│   ├── first_crawl.rst
│   ├── first_crawl_results.png
│   ├── gui.png
│   ├── index.rst
│   ├── installation.rst
│   ├── internals.rst
│   ├── jupyter_new_notebook.png
│   ├── jupyter_notebook.png
│   ├── logo.png
│   ├── policy.rst
│   ├── protobuf.html
│   ├── protobuf.rst
│   ├── requirements.txt
│   ├── rethinkdb_gui.png
│   ├── start_crawl.png
│   ├── start_crawl_filled_in.png
│   ├── terminal.png
│   └── websocket_api.rst
├── integration
│   ├── __init__.py
│   ├── test_db.py
│   └── test_subscription.py
├── notebooks
│   ├── .gitignore
│   ├── Example.ipynb
│   └── Frontier Reloading.ipynb
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── starbelly
│   ├── __init__.py
│   ├── __main__.py
│   ├── backoff.py
│   ├── bootstrap.py
│   ├── captcha.py
│   ├── config.py
│   ├── db.py
│   ├── downloader.py
│   ├── extractor.py
│   ├── frontier.py
│   ├── job.py
│   ├── login.py
│   ├── policy.py
│   ├── rate_limiter.py
│   ├── resource_monitor.py
│   ├── robots.py
│   ├── schedule.py
│   ├── server
│   │   ├── __init__.py
│   │   ├── captcha.py
│   │   ├── job.py
│   │   ├── login.py
│   │   ├── policy.py
│   │   ├── rate_limit.py
│   │   ├── schedule.py
│   │   ├── subscription.py
│   │   └── system.py
│   ├── starbelly_pb2.py
│   ├── storage.py
│   ├── subscription.py
│   └── version.py
├── tests
│   ├── __init__.py
│   ├── test_async_mock.py
│   ├── test_backoff.py
│   ├── test_captcha.py
│   ├── test_config.py
│   ├── test_crawl.py
│   ├── test_downloader.py
│   ├── test_extractor.py
│   ├── test_frontier.py
│   ├── test_job.py
│   ├── test_login.py
│   ├── test_policy.py
│   ├── test_rate_limiter.py
│   ├── test_resource_monitor.py
│   ├── test_robots.py
│   ├── test_schedule.py
│   ├── test_server.py
│   ├── test_storage.py
│   └── test_subscription.py
└── tools
    ├── __init__.py
    ├── clear.py
    ├── container_init.py
    ├── delete_dangling_bodies.py
    └── shell.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | .coverage
3 | .coverage.*
4 | .ipynb_checkpoints
5 | .pytest_cache
6 | .vscode
7 | __pycache__
8 | README.html
9 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
2 | version: 2
3 |
4 | sphinx:
5 | configuration: docs/conf.py
6 |
7 | python:
8 | version: 3.7
9 | install:
10 | - requirements: docs/requirements.txt
11 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | git:
4 | depth: 1
5 |
6 | matrix:
7 | include:
8 | - python: 3.7
9 | dist: xenial
10 | sudo: yes
11 |
12 | before_install:
13 | - pip install poetry
14 |
15 | install:
16 | - poetry install
17 |
18 | script:
19 | - poetry run make test
20 |
21 | after_success:
22 | - poetry run make coverage
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Hyperion Gray LLC
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # The targets in this makefile should be executed inside Poetry, i.e. `poetry run make
2 | # docs`.
3 |
4 | .PHONY: docs
5 |
6 | docs:
7 | $(MAKE) -C docs html
8 |
9 | test:
10 | pytest tests/ --cov=starbelly --cov-report=term-missing
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Starbelly is a user-friendly web crawler that is easy to deploy and configure.
4 | Learn more at
5 | [starbelly.readthedocs.io](http://starbelly.readthedocs.io/en/latest/).
6 |
7 | [Build Status](https://travis-ci.org/HyperionGray/starbelly)
8 | [Coverage](https://coveralls.io/github/HyperionGray/starbelly)
9 | [Documentation](https://starbelly.readthedocs.io)
10 |
11 |
12 | # LICENSE
13 |
14 | Starbelly is under a proprietary license. Please contact Hyperion Gray at acaceres@hyperiongray.com
15 |
16 | ---
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/conf/.gitignore:
--------------------------------------------------------------------------------
1 | local.ini
2 |
--------------------------------------------------------------------------------
/conf/local.ini.template:
--------------------------------------------------------------------------------
1 | [database]
2 |
3 | host =
4 | db =
5 | user =
6 | password =
7 | super_user =
8 | super_password =
9 |
--------------------------------------------------------------------------------
/conf/system.ini:
--------------------------------------------------------------------------------
1 | [database]
2 |
3 | host =
4 | port = 28015
5 | db =
6 | user =
7 | password =
8 | super_user =
9 | super_password =
10 |
11 | [rate_limiter]
12 |
13 | capacity = 10000
14 |
--------------------------------------------------------------------------------
/dev/.gitignore:
--------------------------------------------------------------------------------
1 | ca.crt
2 | server.crt
3 |
--------------------------------------------------------------------------------
/dev/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | volumes:
4 | db_data: {}
5 |
6 | services:
7 | db:
8 | network_mode: host
9 | image: rethinkdb:2.4
10 | container_name: starbelly-dev-db
11 | volumes:
12 | - db_data:/data
13 |
14 | web:
15 | network_mode: host
16 | image: nginx:1.15
17 | container_name: starbelly-dev-web
18 | volumes:
19 | - "${PWD}/nginx.conf:/etc/nginx/conf.d/default.conf"
20 | - "${PWD}/server.crt:/etc/nginx/server.crt"
21 |
--------------------------------------------------------------------------------
/dev/gencert.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pathlib
3 | import sys
4 |
5 | import trustme
6 |
7 |
8 | logging.basicConfig(level=logging.INFO)
9 |
10 |
11 | def main():
12 | if len(sys.argv) != 2:
13 | logging.error('Usage: %s <hostname>', sys.argv[0])
14 | sys.exit(1)
15 | hostname = sys.argv[1]
16 | here = pathlib.Path(__file__).parent
17 | ca_path = here / 'ca.crt'
18 | server_path = here / 'server.crt'
19 | logging.info('Creating self-signed certificate for "%s"', hostname)
20 | ca_cert = trustme.CA()
21 | ca_cert.cert_pem.write_to_path(ca_path)
22 | logging.info(' * CA certificate: {}'.format(ca_path))
23 | server_cert = ca_cert.issue_server_cert(hostname)
24 | server_cert.private_key_and_cert_chain_pem.write_to_path(server_path)
25 | logging.info(' * Server certificate: {}'.format(server_path))
26 | logging.info('Done')
27 |
28 |
29 | if __name__ == '__main__':
30 | main()
31 |
--------------------------------------------------------------------------------
/dev/nginx.conf:
--------------------------------------------------------------------------------
1 | server {
2 | listen 80 default_server;
3 | return 301 https://$host$request_uri;
4 | }
5 |
6 | server {
7 | listen 443 ssl default_server;
8 | ssl_certificate /etc/nginx/server.crt;
9 | ssl_certificate_key /etc/nginx/server.crt;
10 | ssl_protocols TLSv1.2;
11 | ssl_ciphers HIGH:!aNULL:!MD5;
12 | ssl_session_cache shared:TLSCACHE:1m;
13 | add_header Strict-Transport-Security "max-age=31536000";
14 |
15 | location /ws/ {
16 | proxy_pass http://localhost:8000;
17 | proxy_http_version 1.1;
18 | proxy_set_header Upgrade $http_upgrade;
19 | proxy_set_header Connection "upgrade";
20 | }
21 |
22 | # Reverse proxy to `pub serve`
23 | location / {
24 | proxy_pass http://localhost:8081;
25 | proxy_http_version 1.1;
26 | proxy_intercept_errors on;
27 | error_page 404 =200 /index.html;
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 |
3 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = Starbelly
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/README.txt:
--------------------------------------------------------------------------------
1 | To build documentation in this directory, install `sphinx` and
2 | `sphinx_rtd_theme` packages from pip, then run `make html`.
3 |
--------------------------------------------------------------------------------
/docs/administration.rst:
--------------------------------------------------------------------------------
1 | *******************
2 | Administrator Guide
3 | *******************
4 |
5 | .. contents::
6 | :depth: 2
7 |
8 |
9 | Overview
10 | ========
11 |
12 | This section goes over some common tasks that you may need to perform as a
13 | Starbelly administrator. In the examples below, if a command prompt is prefixed
14 | with a container name, then that indicates that the command must be run inside
15 | a specific Docker container. For example, if you see this:
16 |
17 | .. code::
18 |
19 | starbelly-dev-app:/starbelly# ls /usr/local/etc
20 | jupyter
21 |
22 | Then that command should be run inside of the ``starbelly-dev-app`` container.
23 | To obtain a shell inside that container, run:
24 |
25 | .. code::
26 |
27 | $ docker exec -it starbelly-dev-app /bin/bash
28 | starbelly-dev-app#
29 |
30 | You can use the same technique to get a shell inside the ``starbelly-dev-db`` or
31 | ``starbelly-dev-web`` containers.
32 |
33 | Clear Database
34 | ==============
35 |
36 | To clear all data from the database, including crawl data, job data, and other
37 | state:
38 |
39 | .. code::
40 |
41 | starbelly-dev-app:/starbelly# python tools/clear.py
42 |
43 | Change Password
44 | ===============
45 |
46 | Adding or changing passwords is covered in the :doc:`installation` under the
47 | "Security" section.
48 |
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | *********
2 | Changelog
3 | *********
4 |
5 |
6 | v2.0.0 (2019-XX-XX)
7 | ===================
8 |
9 | - Massive rewrite of Starbelly's I/O to use Trio instead of asyncio.
10 | - Upgrade web client to Dart 2 and Angular 5.
11 |
12 |
13 | v1.0.0 (2017-11-03)
14 | ===================
15 |
16 | - Initial release.
17 |
--------------------------------------------------------------------------------
/docs/client-side-logging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/client-side-logging.png
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Starbelly documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Nov 2 15:00:59 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | # import os
21 | # import sys
22 | # sys.path.insert(0, os.path.abspath('.'))
23 |
24 | import pathlib
25 | import sys
26 |
27 | # -- General configuration ------------------------------------------------
28 |
29 | # If your documentation needs a minimal Sphinx version, state it here.
30 | #
31 | # needs_sphinx = '1.0'
32 |
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 | 'sphinx.ext.autodoc',
38 | 'sphinx.ext.intersphinx',
39 | 'sphinx.ext.graphviz',
40 | 'sphinxcontrib_trio',
41 | ]
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # The suffix(es) of source filenames.
47 | # You can specify multiple suffix as a list of string:
48 | #
49 | # source_suffix = ['.rst', '.md']
50 | source_suffix = '.rst'
51 |
52 | # The master toctree document.
53 | master_doc = 'index'
54 |
55 | # General information about the project.
56 | project = 'Starbelly'
57 | copyright = 'Hyperion Gray 2017'
58 | author = 'Mark E. Haase'
59 |
60 | # The version info for the project you're documenting, acts as replacement for
61 | # |version| and |release|, also used in various other places throughout the
62 | # built documents.
63 | #
64 | # The short X.Y version.
65 | import starbelly.version
66 | version = starbelly.version.__version__
67 | # The full version, including alpha/beta/rc tags.
68 | release = version
69 |
70 | # The language for content autogenerated by Sphinx. Refer to documentation
71 | # for a list of supported languages.
72 | #
73 | # This is also used if you do content translation via gettext catalogs.
74 | # Usually you set "language" from the command line for these cases.
75 | language = None
76 |
77 | # List of patterns, relative to source directory, that match files and
78 | # directories to ignore when looking for source files.
79 | # This patterns also effect to html_static_path and html_extra_path
80 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
81 |
82 | # The name of the Pygments (syntax highlighting) style to use.
83 | pygments_style = 'sphinx'
84 |
85 | # If true, `todo` and `todoList` produce output, else they produce nothing.
86 | todo_include_todos = False
87 |
88 |
89 | # -- Options for HTML output ----------------------------------------------
90 |
91 | # The theme to use for HTML and HTML Help pages. See the documentation for
92 | # a list of builtin themes.
93 | #
94 | html_theme = 'sphinx_rtd_theme'
95 |
96 | # Theme options are theme-specific and customize the look and feel of a theme
97 | # further. For a list of options available for each theme, see the
98 | # documentation.
99 | #
100 | # html_theme_options = {}
101 |
102 | # Add any paths that contain custom static files (such as style sheets) here,
103 | # relative to this directory. They are copied after the builtin static files,
104 | # so a file named "default.css" will overwrite the builtin "default.css".
105 | html_static_path = ['_static']
106 |
107 | # Custom sidebar templates, must be a dictionary that maps document names
108 | # to template names.
109 | #
110 | # This is required for the alabaster theme
111 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
112 | html_sidebars = {
113 | '**': [
114 | 'relations.html', # needs 'show_related': True theme option to display
115 | 'searchbox.html',
116 | ]
117 | }
118 |
119 |
120 | # -- Options for HTMLHelp output ------------------------------------------
121 |
122 | # Output file base name for HTML help builder.
123 | htmlhelp_basename = 'Starbellydoc'
124 |
125 |
126 | # -- Options for LaTeX output ---------------------------------------------
127 |
128 | latex_elements = {
129 | # The paper size ('letterpaper' or 'a4paper').
130 | #
131 | # 'papersize': 'letterpaper',
132 |
133 | # The font size ('10pt', '11pt' or '12pt').
134 | #
135 | # 'pointsize': '10pt',
136 |
137 | # Additional stuff for the LaTeX preamble.
138 | #
139 | # 'preamble': '',
140 |
141 | # Latex figure (float) alignment
142 | #
143 | # 'figure_align': 'htbp',
144 | }
145 |
146 | # Grouping the document tree into LaTeX files. List of tuples
147 | # (source start file, target name, title,
148 | # author, documentclass [howto, manual, or own class]).
149 | latex_documents = [
150 | (master_doc, 'Starbelly.tex', 'Starbelly Documentation',
151 | 'Mark E. Haase', 'manual'),
152 | ]
153 |
154 |
155 | # -- Options for manual page output ---------------------------------------
156 |
157 | # One entry per manual page. List of tuples
158 | # (source start file, name, description, authors, manual section).
159 | man_pages = [
160 | (master_doc, 'starbelly', 'Starbelly Documentation',
161 | [author], 1)
162 | ]
163 |
164 |
165 | # -- Options for Texinfo output -------------------------------------------
166 |
167 | # Grouping the document tree into Texinfo files. List of tuples
168 | # (source start file, target name, title, author,
169 | # dir menu entry, description, category)
170 | texinfo_documents = [
171 | (master_doc, 'Starbelly', 'Starbelly Documentation',
172 | author, 'Starbelly', 'One line description of project.',
173 | 'Miscellaneous'),
174 | ]
175 |
176 | # -- Options for extensions -----------------------------------------------
177 |
178 | intersphinx_mapping = {
179 | 'trio': ('https://trio.readthedocs.io/en/stable/', None),
180 | 'yarl': ('https://yarl.readthedocs.io/en/stable/', None),
181 | }
182 |
--------------------------------------------------------------------------------
/docs/configuration.rst:
--------------------------------------------------------------------------------
1 | *******************
2 | Configuration Guide
3 | *******************
4 |
5 | .. contents::
6 | :depth: 2
7 |
8 | Overview
9 | ========
10 |
11 | Starbelly can be configured entirely through its graphical interface. In fact,
12 | this is one of the advantages to using Starbelly: no more arcane configuration
13 | files or custom code! The tradeoff, of course, is that Starbelly has fewer
14 | configuration options than other crawlers and may not be flexible enough to
15 | solve all crawling problems.
16 |
17 | The configuration items are all contained in the *Configuration* submenu on the
18 | left side of the interface.
19 |
20 | CAPTCHA Solvers
21 | ===============
22 |
23 | Starbelly has the ability to automatically log into a website if it has the
24 | appropriate credentials (see `Credentials`_ below). Some login forms may
25 | require a CAPTCHA. In those cases, you may configure a CAPTCHA solving service.
26 | Starbelly supports any CAPTCHA service that is compatible with the Antigate API.
27 | You may create multiple configurations in order to use multiple backend solvers
28 | or just to send different configurations to the same service.
29 |
30 | Once you have created a CAPTCHA solver, specify that CAPTCHA solver in a crawl
31 | policy in order to send login CAPTCHAs to the solving service during crawls.
32 |
33 | Credentials
34 | ===========
35 |
36 | Starbelly has the ability to automatically log into a website if it has the
37 | appropriate credentials. To configure credentials for a site, you only need to
38 | specify a login URL. (If the login URL enables single sign-on for multiple
39 | subdomains, then you should also specify the domain name that you wish to
40 | authenticate on.)
41 |
42 | For each domain, you may set up multiple username & password credentials. When
43 | the crawler encounters that domain during a crawl, it will randomly pick one of
44 | the credentials and attempt to login with it. (The crawler uses machine learning
45 | to identify and parse the login form.)
46 |
47 | Rate Limits
48 | ===========
49 |
50 | The crawler observes rate limits between subsequent requests to a single domain.
51 | For example, with the default delay of 5 seconds, the crawler will wait 5
52 | seconds after a request completes until it initiates another request to that
53 | same domain. Therefore, the crawler will download at most 12 pages per minute
54 | from a single domain using the default rate limit. In practice, it will download
55 | fewer than 12 pages per minute, since each request itself also takes some
56 | non-negligible amount of time.
57 |
58 | Furthermore, rate limits apply across all jobs. For example, if you have two
59 | different jobs crawling one domain, each job will effectively be limited to 6
60 | pages per minute instead of 12.
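
To make the arithmetic above concrete, here is a tiny sketch (a hypothetical helper, not part of Starbelly) that computes the per-domain upper bound:

.. code:: python

    def max_pages_per_minute(delay_seconds, jobs_on_domain=1):
        """Upper bound for one domain, ignoring the time each request takes."""
        return 60.0 / (delay_seconds * jobs_on_domain)

    print(max_pages_per_minute(5))     # 12.0 with the default 5 second delay
    print(max_pages_per_minute(5, 2))  # 6.0 when two jobs share one domain
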
61 |
62 | On the *Rate Limits* configuration screen, you may change the global limit as
63 | well as customize rate limits for specific domains. This allows you to set
64 | shorter delays for domains that can handle more traffic. For example, you
65 | might crawl web servers on your corporate intranet faster than you crawl a
66 | public internet server.
67 |
--------------------------------------------------------------------------------
/docs/dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/dashboard.png
--------------------------------------------------------------------------------
/docs/dashboard_first_crawl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/dashboard_first_crawl.png
--------------------------------------------------------------------------------
/docs/first_crawl.rst:
--------------------------------------------------------------------------------
1 | ****************
2 | Your First Crawl
3 | ****************
4 |
5 | Starbelly offers a lot of ways to fine tune crawling, but for your first crawl,
6 | we will just use all of its default settings. The goal of this section is to
7 | perform a deep crawl of a news site. Here are a few example sites you may want
8 | to try:
9 |
10 | - `CNN <https://www.cnn.com/>`__
11 | - `Fox News <https://www.foxnews.com/>`__
12 | - `New York Times <https://www.nytimes.com/>`__
13 | - `Washington Post <https://www.washingtonpost.com/>`__
14 |
15 | Begin by accessing the Starbelly GUI in your web browser. You should be able to
16 | see the dashboard:
17 |
18 | .. image:: dashboard.png
19 | :alt: screenshot of dashboard
20 |
21 | The dashboard is currently empty because you have not started any crawls yet,
22 | but let us take a moment to get familiar with the interface. On the left side,
23 | under the "Starbelly" logo, is the menu. Click any item in the menu to view it.
24 | There is a gray bar across the top of the screen. The left side of this bar
25 | displays breadcrumbs, e.g. the *home* icon and the *dashboard* icon. The
26 | breadcrumbs help you keep track of where you are, and also allow you to quickly
27 | return to earlier screens. The right side of the bar shows the status
28 | "Connected", meaning that it is connected to the Starbelly server.
29 |
30 | Next, click on *Start Crawl* in the menu.
31 |
32 | .. image:: start_crawl.png
33 | :alt: screenshot of blank crawl form
34 |
35 | This screen allows you to start a crawl. First, enter a seed URL. Second, select
36 | the *Deep Crawl* policy (more on that in a moment). Optionally, you may assign a
37 | name to the crawl. If you do not assign a name, then the crawler will choose a
38 | name for you. Finally, you may assign tags. Tags may be used by consumers of
39 | crawl data, but they do not have any effect on the crawl itself, so leave it
40 | blank for now. (You can edit the tags later if you wish.)
41 |
42 | .. image:: start_crawl_filled_in.png
43 | :alt: screenshot of filled-in crawl form
44 |
45 | When you are ready, click the *Start Crawl* button. You should see a
46 | notification that the crawl is starting. Go back to the Dashboard and you should
47 | now be able to see that your crawl is running. The dashboard updates in realtime
48 | as the crawler downloads documents.
49 |
50 | .. image:: dashboard_first_crawl.png
51 | :alt: dashboard shows the first crawl is running
52 |
53 | You can click on the name of the crawl to view details about that crawl job. The
54 | details screen also updates in real time.
55 |
56 | .. image:: first_crawl_results.png
57 | :alt: detailed look at results of first crawl
58 |
59 | The crawl will continue to run, downloading various documents that it finds,
60 | until it reaches its end. But how does it decide what documents to download, and
61 | how does it decide when the crawl should end? These questions are answered by
62 | consulting the *crawl policy*, which guides the crawler's decision making. In
63 | this example, we used the default *Deep Crawl* policy, which stays inside the
64 | same domain as the seed URL and crawls to depth 10. You may customize this
65 | policy or create any number of your own policies to carefully refine crawler
66 | behavior.
67 |
68 | Where does the crawl data go and what can you do with it? You can view crawl
69 | results inside Starbelly, but this feature is intended to help with debugging,
70 | not as a practical way to use crawl data. Crawl data is stored inside
71 | Starbelly's database until you explicitly delete it (see the *Crawl Results*
72 | screen). Starbelly is just a crawling *frontend*, which means that it is
73 | designed to be plugged into another application that can read the crawl data and
74 | do something useful with it, such as a search engine or a scraping program.
75 |
76 | Now that you have a crawl under your belt, you might want to do a deeper dive
77 | into :doc:`configuration` and :doc:`policy`.
78 |
--------------------------------------------------------------------------------
/docs/first_crawl_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/first_crawl_results.png
--------------------------------------------------------------------------------
/docs/gui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/gui.png
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. image:: logo.png
2 | :height: 150px
3 | :width: 150px
4 | :align: center
5 |
6 | Starbelly
7 | =========
8 |
9 | Starbelly is a user-friendly and highly configurable web crawler front end.
10 | Compared to other crawling systems, such as Nutch or Scrapy, Starbelly trades
11 | off lower scalability for improved usability. Starbelly eschews the arcane
12 | configuration files and custom code required for other crawling systems,
13 | favoring a GUI for configuration and management. Starbelly exposes all of its
14 | features and data through an efficient API, allowing you to build crawling-based
15 | systems on top of it. For example, you might plug in an Elastic Search backend
16 | to build a custom search engine, or plug in a scraper to create a data
17 | collection pipeline.
18 |
19 | .. toctree::
20 | :maxdepth: 1
21 |
22 | installation
23 | first_crawl
24 | configuration
25 | policy
26 | administration
27 | websocket_api
28 | protobuf
29 | development
30 | internals
31 | changelog
32 |
33 | .. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
34 | :target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=agnostic
35 | :alt: define hyperiongray
36 | :width: 500px
37 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | Installation Guide
2 | ==================
3 |
4 | Prerequisites
5 | -------------
6 |
7 | Starbelly is offered as a collection of Docker images and Docker-compose
8 | configurations. If you're not familiar with Docker, it is a system for deploying
9 | multiple software components into individual containers and orchestrating the
10 | entire system.
11 |
12 | First, install `Docker <https://docs.docker.com/engine/install/>`__ using
13 | the instructions for your platform.
14 |
15 | Next, install `Docker Compose <https://docs.docker.com/compose/install/>`__
16 | using the instructions for your platform.
17 |
18 | Docker Compose
19 | --------------
20 |
21 | Docker Compose is used to set up and run multiple Docker containers together.
22 | You should have installed Docker Compose in the previous step. Now you need a
23 | Docker Compose configuration file (usually called ``docker-compose.yml``) that
24 | specifies what containers need to be created and how they should be configured.
25 | A sample docker-compose.yml configuration file is available for Starbelly, but
26 | you may need to tailor this file to your unique environment.
27 |
28 | Download this `zip file
29 | <https://github.com/HyperionGray/starbelly-docker/archive/master.zip>`__
30 | and extract it. (If you have Git installed, you can run ``git clone
31 | git@github.com:HyperionGray/starbelly-docker.git`` instead.) From the
32 | ``starbelly-docker/starbelly`` directory, run the following command:
33 |
34 | .. code::
35 |
36 | $ docker-compose up -d
37 |
38 | This will download the required Docker images, create the corresponding
39 | containers, and then start the entire application on ports 80 and 443. Once
40 | the application has started, open up a browser and try navigating to the host
41 | where you are running Starbelly. The default username and password is "admin".
42 |
43 | You should see the Dashboard:
44 |
45 | .. image:: dashboard.png
46 | :alt: screenshot of dashboard
47 |
48 | If you experience any problems, try using the command ``docker-compose logs``
49 | to view logging output from the Docker containers.
50 |
51 | Security
52 | --------
53 |
54 | If your Starbelly instance is exposed to the internet, then you should
55 | immediately do two things to secure it:
56 |
57 | 1. Change the admin password.
58 | 2. Create TLS certificates.
59 |
60 | The **admin password** is stored in an ``htpasswd``-format file, which can be
61 | created or edited using the ``htpasswd`` command from the Apache2 utilities
62 | package (called ``apache2-utils`` on Ubuntu distributions). Install that package
63 | and then run this command:
64 |
65 | .. code::
66 |
67 | $ htpasswd -c passwd admin
68 | New password:
69 | Re-type new password:
70 | Adding password for user admin
71 |
72 | Type in the new password when prompted. You can change passwords or add
73 | additional passwords to an existing file by running ``htpasswd passwd USER``,
74 | where ``USER`` is the username to change. When you are done, copy the ``passwd``
75 | file into the Docker container and remove the original.
76 |
77 | .. code::
78 |
79 | $ docker cp passwd starbelly-web:/etc/nginx/tls/
80 | $ docker exec starbelly-web nginx -s reload
81 | 2017/11/02 14:29:37 [notice] 1437#1437: signal process started
82 | $ rm passwd
83 |
84 | The default **TLS certificate** is automatically generated and self-signed when
85 | the container is created. If you have a valid domain name for your Starbelly
86 | server, then you should obtain a real certificate for it.
87 |
88 | If you have obtained your own certificates, you can install them as follows,
89 | where ``certificate.pem`` is the full certificate chain in PEM format and
90 | ``privatekey.pem`` is the private key in PEM format.
91 |
92 | .. code::
93 |
94 | $ docker cp certificate.pem starbelly-web:/etc/nginx/tls/server.crt
95 | $ docker cp privatekey.pem starbelly-web:/etc/nginx/tls/server.key
96 | $ docker exec starbelly-web nginx -s reload
97 | 2017/11/02 14:29:37 [notice] 1437#1437: signal process started
98 |
99 | If you do not already have TLS certificates, you may obtain free certificates
100 | from `Let's Encrypt <https://letsencrypt.org/>`__. First, install the certbot
101 | application using the `instructions for your platform
102 | <https://certbot.eff.org/>`__. Now run certbot to create initial certificates,
103 | replacing ``YOUR_DOMAIN`` with the fully qualified domain name of the server and
104 | replacing ``/path/to/starbelly-docker/starbelly`` with the path where you placed
105 | the Starbelly docker configuration files from an earlier step.
106 |
107 | .. code::
108 |
109 | $ cd /path/to/starbelly-docker/starbelly
110 | $ certbot certonly \
111 | --webroot -w certbot-webroot \
112 | -d YOUR_DOMAIN \
113 | --deploy-hook ./deploy-certbot.py
114 | Plugins selected: Authenticator webroot, Installer None
115 | Enter email address (used for urgent renewal and security notices) (Enter 'c' to
116 | cancel): YOUR EMAIL HERE
117 |
118 | -------------------------------------------------------------------------------
119 | Please read the Terms of Service at
120 | https://letsencrypt.org/documents/LE-SA-v1.1.1-August-1-2016.pdf. You must agree
121 | in order to register with the ACME server at
122 | https://acme-staging.api.letsencrypt.org/directory
123 | -------------------------------------------------------------------------------
124 | (A)gree/(C)ancel: a
125 |
126 | ...snip...
127 |
128 | This command will attempt to get TLS certificates from the Let's Encrypt server.
129 | If you've never run certbot on this server before, it will prompt you to enter
130 | a contact e-mail address and agree to the terms of service.
131 |
132 | If certificate generation succeeds, it will install those certificates into the
133 | Docker container. This certificate is valid for 90 days and will need to be
134 | renewed before it expires. Create a daily cron job containing the following
135 | command to ensure that the certificate will be renewed appropriately.
136 |
137 | .. code::
138 |
139 | certbot renew --deploy-hook /path/to/starbelly-docker/starbelly/deploy-certbot.py
140 |
141 | Next Steps
142 | ----------
143 |
144 | Now that you have Starbelly up and running, take a look at :doc:`first_crawl`.
145 |
--------------------------------------------------------------------------------
/docs/jupyter_new_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/jupyter_new_notebook.png
--------------------------------------------------------------------------------
/docs/jupyter_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/jupyter_notebook.png
--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/logo.png
--------------------------------------------------------------------------------
/docs/policy.rst:
--------------------------------------------------------------------------------
1 | ******
2 | Policy
3 | ******
4 |
5 | .. contents::
6 | :depth: 2
7 |
8 | Overview
9 | ========
10 |
11 | The *crawl policy* is one of the most important and powerful concepts in
12 | Starbelly. A policy controls the crawler's behavior and decision making, guiding
13 | which links the crawler follows, what kinds of resources it downloads, and how
14 | long or how far it runs. When you start a crawl job, you must specify which
15 | policy that job should use.
16 |
17 | In this part of the documentation, we take a look at the features of the crawl
18 | policy. To begin, click *Policy* in the Starbelly menu, then click on an
19 | existing policy to view it, or click *New Policy* to create a new policy.
20 |
21 | Authentication
22 | ==============
23 |
24 | The authentication policy determines how a crawler can authenticate itself to a
25 | web site. When the crawler sees a domain in a crawl for the first time, it
26 | checks to see if it has any credentials for that domain. (See the configuration
27 | of Credentials for more information.) If it does, it picks one
28 | of the appropriate credentials at random and tries to login with it. Some login
29 | forms may require a CAPTCHA. In those cases, you may configure a CAPTCHA solver
30 | and specify that solver in the policy.
31 |
32 | Robots.txt
33 | ==========
34 |
35 | `Robots.txt <http://www.robotstxt.org/>`__ is a standard for specifying how
36 | crawlers should interact with websites. By default, Starbelly will attempt to
37 | download a ``robots.txt`` from each domain that it visits, and it will obey the
38 | directives of any such files that it finds. In some circumstances, however,
39 | such as crawling some old sites, it may be useful to ignore or even invert the
40 | directives in a site's robots.txt, which you can configure using the policy.
41 |
42 | URL Normalization
43 | =================
44 |
45 | The crawler attempts to avoid crawling the same URL multiple times. If two links
46 | contain exactly identical URLs, then the crawler will only download that
47 | resource once. On some sites, especially dynamically generated sites, multiple
48 | URLs may refer to the same resource and differ only in the order of URL query
49 | parameters or the values of semantically meaningless query parameters like
50 | session IDs.
51 |
52 | The URL normalization policy allows you to control this behavior. When enabled,
53 | the crawler normalizes URLs using a number of techniques, including:
54 |
55 | - sorting query parameters alphabetically
56 | - converting percent encodings to upper case
57 | - removing URL fragments
58 | - etc.
59 |
60 | You may specify URL query parameters that should be discarded during
61 | normalization. By default, the crawler discards several common session ID
62 | parameters. Alternatively, you can disable URL normalization completely,
63 | although this may result in lots of duplicated downloads.
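
The following sketch illustrates the kind of normalization described above using
only the Python standard library. It is not Starbelly's implementation, and the
session parameter names are just examples:

.. code:: python

    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    SESSION_PARAMS = {'phpsessid', 'jsessionid', 'sid'}  # example parameters

    def normalize(url):
        scheme, netloc, path, query, _fragment = urlsplit(url)
        params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)
                  if k.lower() not in SESSION_PARAMS]
        params.sort()  # sort query parameters alphabetically
        # Re-assemble the URL without the fragment.
        return urlunsplit((scheme, netloc, path, urlencode(params), ''))

    print(normalize('http://example.com/a?b=2&a=1&PHPSESSID=xyz#frag'))
    # http://example.com/a?a=1&b=2
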
64 |
65 | URL Rules
66 | =========
67 |
68 | The URL rules policy controls how a crawler selects links to follow. For each
69 | page that is downloaded, the crawler extracts candidate links. For each candidate
70 | link, the crawler checks the rules one-by-one until a rule matches, then the crawler
71 | applies the matching rule.
72 |
73 | For example, the default *Deep Crawl* policy contains two URL rules:
74 |
75 | 1. If the URL *matches* the regex ``^https?://({SEED_DOMAINS})/`` then *add* ``1.0``.
76 | 2. Else *multiply by* ``0.0``.
77 |
78 | Let's say the crawl is seeded with ``http://foo.com/bar``. The crawler downloads
79 | this document and assigns it a cost of 1.0. Cost is roughly similar to the concept of
80 | *crawl depth* in other crawlers, but it is a bit more sophisticated. Each link
81 | is assigned a cost based on the cost of the document where it was found and the
82 | URL rule that it matches. If a link cost evaluates to zero, then the link is
83 | thrown away. If the link's cost is greater than zero but less than the "Max Cost"
84 | specified in the crawl policy, then the crawler schedules the link to be
85 | fetched. Links are fetched roughly in order of cost, so lower-cost items are
86 | typically fetched before higher-cost items.
87 |
88 | After the crawler downloads the document at ``http://foo.com/bar``, it checks
89 | each link in that document against the URL rules in the policy. For example, if
90 | the link matches the regex in rule #1, then the link will be given a cost of
91 | 2.0: the rule says to add 1.0 to the cost of its parent (which was 1.0).
92 |
93 | If the link matches rule #2, then that rule says to multiply the parent's cost
94 | by zero. This results in the new cost being set to zero, and the crawler
95 | discards links where the cost is zero, so the link will not be followed.
96 |
97 | Although the URL rules are a bit complicated at first, they turn out to be a
98 | very powerful way to guide the crawler. For example, if we step back a bit and
99 | consider the effect of the two rules above, we see that it follows links inside
100 | the seed domain and does not follow links outside the seed domain. In other
101 | words, this is a *deep crawl*!
102 |
103 | If we replace the two rules here with just a single rule that says "Always add
104 | 1.0" , then that would result in a *broad crawl* policy! In fact, you can go
105 | look at the default *Broad Crawl* policy included in Starbelly to confirm that
106 | this is how it works.
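
To make the cost arithmetic concrete, the sketch below applies a list of
(pattern, action, amount) rules to a candidate link. The rule representation is
an assumption for illustration only, not Starbelly's internal data structure:

.. code:: python

    import re

    def link_cost(parent_cost, url, rules):
        '''Apply the first matching rule; a pattern of None matches anything.'''
        for pattern, action, amount in rules:
            if pattern is None or re.search(pattern, url):
                if action == 'add':
                    return parent_cost + amount
                return parent_cost * amount
        return 0.0

    deep_crawl_rules = [
        (r'^https?://(foo\.com)/', 'add', 1.0),  # rule 1: stay in the seed domain
        (None, 'multiply', 0.0),                 # rule 2: everything else
    ]

    print(link_cost(1.0, 'http://foo.com/baz', deep_crawl_rules))  # 2.0 -> crawled
    print(link_cost(1.0, 'http://other.com/x', deep_crawl_rules))  # 0.0 -> discarded
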
107 |
108 | User Agents
109 | ===========
110 |
111 | When the crawler downloads a resource, it sends a *User Agent* string in the
112 | headers. By default, Starbelly sends a user agent that identifies itself with a
113 | version number and includes a URL to its source code repository. You may
114 | customize what user agent is sent using the policy. If you include multiple user
115 | agent strings, one will be chosen at random for each request.
116 |
117 | Proxy Rules
118 | ===========
119 |
120 | By default, the crawler downloads resources directly from their hosts. In some
121 | cases, you may want to proxy requests through an intermediary. The *Proxy Rules*
122 | specify which proxy server should be used for which request, similar to the *URL
123 | Rules* above.
124 |
125 | MIME Type Rules
126 | ===============
127 |
128 | While *URL Rules* determine which links to follow, *MIME Type Rules* determine
129 | what types of resources to download. By default, the crawler only downloads
130 | resources that have a MIME type matching the regex ``^text/``, which matches
131 | plain text and HTML resources. If you want the crawler to download images, for
132 | example, then you would add a new rule like ``^image/`` that would match GIF,
133 | JPEG, and PNG resources.
134 |
135 | The MIME type of a resource is determined by inspecting the ``Content-Type``
136 | header, which means that *MIME Type Rules* are not applied until *after the
137 | crawler downloads headers* for a resource. If the crawler determines that a
138 | resource should not be downloaded, then the crawler closes the connection and
139 | discards any data that has already been downloaded.
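
The sketch below shows the general idea: parse the ``Content-Type`` header and
test it against the MIME type rules in order. The rule structure is assumed for
illustration:

.. code:: python

    import re

    # (pattern, download?) pairs, checked in order.
    mime_rules = [(r'^text/', True), (r'^image/', True), (r'.*', False)]

    def should_download(content_type):
        mime = content_type.split(';')[0].strip()  # drop parameters like charset
        for pattern, download in mime_rules:
            if re.search(pattern, mime):
                return download
        return False

    print(should_download('text/html; charset=utf-8'))  # True
    print(should_download('application/pdf'))           # False
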
140 |
141 | Limits
142 | ======
143 |
144 | The *Limits* policy specifies limits on how far and how long the crawl should
145 | run. If a limit is left blank, then that limit will not be applied to the crawl.
146 |
147 | - Max cost: the crawler will not follow links that have a cost greater than the
148 | one specified here.
149 | - Max duration: the maximum amount of time the crawler should run, in seconds.
150 | - Max items: the maximum number of items that the crawler should download. This
151 | number includes successes, errors, and exceptions.
152 |
--------------------------------------------------------------------------------
/docs/protobuf.rst:
--------------------------------------------------------------------------------
1 | .. _protobuf_messages:
2 |
3 | Protobuf Messages
4 | =================
5 |
6 | The structure of the API is described in :doc:`websocket_api`. The details of
7 | all the individual messages are documented here. The client always sends a
8 | `Request <#.Request>`__ message. The server always sends a `ServerMessage
9 | <#.ServerMessage>`__ message, which contains either a `Response <#.Response>`__
10 | to a request or an `Event <#.Event>`__ belonging to a subscription.
11 |
12 | .. raw:: html
13 | :file: protobuf.html
14 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # RTD does not support Poetry natively, so this is a temporary workaround. See:
2 | # https://github.com/readthedocs/readthedocs.org/issues/4912
3 | sphinx
4 | sphinx-rtd-theme
5 | sphinxcontrib_trio
6 | .
7 |
--------------------------------------------------------------------------------
/docs/rethinkdb_gui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/rethinkdb_gui.png
--------------------------------------------------------------------------------
/docs/start_crawl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/start_crawl.png
--------------------------------------------------------------------------------
/docs/start_crawl_filled_in.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/start_crawl_filled_in.png
--------------------------------------------------------------------------------
/docs/terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/terminal.png
--------------------------------------------------------------------------------
/docs/websocket_api.rst:
--------------------------------------------------------------------------------
1 | .. _api-documentation:
2 |
3 | WebSocket API
4 | =============
5 |
6 | Overview
7 | --------
8 |
9 | The crawler is controlled completely by an API. Clients connect to the crawler
10 | using `websockets
11 | <https://en.wikipedia.org/wiki/WebSocket>`__ and
12 | exchange messages with the crawler using `protobuf messages
13 | <https://developers.google.com/protocol-buffers>`__. The built-in GUI relies
14 | solely on this API, so everything that can be done in the GUI can also be done
15 | with the API – and more!
16 |
17 | One of the central goals for the API is to enable clients to synchronize crawl
18 | results in real time. Most crawling systems are batch-oriented: you run the
19 | crawler for a period of time and then collect the results when the crawl is
20 | finished. Starbelly is streaming-oriented: it can send crawl results to a client
21 | as soon as it downloads them.
22 |
23 | Let's imagine that a crawl has started running and already has 1,000 results. A
24 | client can connect to Starbelly and quickly fetch the first 1,000 results.
25 | Because the crawler is still running, new results will continue to stream in as
26 | the crawler downloads them. If either the server or the client needs to
27 | disconnect for some reason, the client is able to reconnect later and pick up
28 | the stream exactly where it left off.
29 |
30 | Connecting to API
31 | -----------------
32 |
33 | The API is exposed as a websocket service on port 443 at the path ``/ws/``. For
34 | example, if starbelly is running on the host ``starbelly.example.com``, then you
35 | should connect to the web socket using the URL
36 | ``wss://starbelly.example.com/ws/``. By default, Starbelly uses HTTP basic
37 | authentication, so you need to include those credentials when you connect to the
38 | API.
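
For example, a client might connect using the third-party ``trio-websocket``
package (one option among many; the host and credentials below are
placeholders):

.. code:: python

    import base64
    import trio
    from trio_websocket import open_websocket_url

    async def main():
        token = base64.b64encode(b'admin:YOUR_PASSWORD')
        headers = [(b'Authorization', b'Basic ' + token)]
        async with open_websocket_url('wss://starbelly.example.com/ws/',
                                      extra_headers=headers) as ws:
            await ws.send_message(b'...')   # a serialized protobuf Request
            reply = await ws.get_message()  # a serialized ServerMessage
            print('received', len(reply), 'bytes')

    trio.run(main)
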
39 |
40 | Messages
41 | --------
42 |
43 | Starbelly uses ``protobuf`` to encode messages sent between the client and the
44 | server. There are three types of message used in the API:
45 |
46 | 1. Request
47 | 2. Response
48 | 3. Event
49 |
50 | The *request* and *response* messages are created in pairs: the client sends a
51 | *request* to the server and the server sends back exactly one *response* per
52 | request. The response indicates whether the request was successful and may
53 | include other data related to the request.
54 |
55 | Although each request generates a response, the responses are not necessarily
56 | sent back in the same order that the requests are received. If the client sends
57 | two commands very quickly (call them A and B), it may get the responses back in
58 | either order, e.g. A→B or B→A. For this reason, the client should include a
59 | unique ``request_id`` with each request; the server will include the same
60 | ``request_id`` in its response so that the client can track which response goes
61 | with which request. The client can assign request IDs in any manner that it
62 | chooses, but one sensible approach would be to assign an incrementing sequence
63 | of integers.
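
One way a client might correlate responses with requests is sketched below; the
class and its names are hypothetical, not part of any Starbelly client library:

.. code:: python

    import itertools
    import trio

    class RequestTracker:
        '''Hand out incrementing request IDs and match responses to waiters.'''
        def __init__(self):
            self._ids = itertools.count(1)
            self._pending = {}  # request_id -> (trio.Event, one-element list)

        def new_request(self):
            request_id = next(self._ids)
            event, slot = trio.Event(), [None]
            self._pending[request_id] = (event, slot)
            return request_id, event, slot

        def resolve(self, request_id, response):
            event, slot = self._pending.pop(request_id)
            slot[0] = response  # the waiter reads the response from the slot
            event.set()
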
64 |
65 | The third type of message is an *event*, which is pushed from the server to the
66 | client. For example, the client can send a request to subscribe to job status.
67 | The server will send a response containing a subscription ID. Now, whenever a
68 | job has a status event, such as downloading a new resource, the server will send
69 | an event to the client containing the job status data and the corresponding
70 | subscription ID. The client can close the subscription by sending another
71 | request. The server will stop sending event messages and will send a response
72 | indicating that the subscription has been cancelled.
73 |
74 | Protobuf is a binary serialization format that supports common data types like
75 | integers, strings, lists, and maps. It is similar in purpose to JSON, but
76 | protobuf is more efficient in terms of encoding overhead and serialization
77 | speed.
78 |
79 | For complete specification of the available protobuf messages, see
80 | :doc:`protobuf`.
81 |
82 | Example Session
83 | ---------------
84 |
85 | This section shows a complete interaction where a client starts a crawl and
86 | synchronizes crawl results. To begin, the client sends a ``RequestSetJob``
87 | request to the server that includes the seed URL, a policy identifier, and a
88 | crawl name.
89 |
90 | .. code::
91 |
92 | Request {
93 | request_id: 1
94 | Command: RequestSetJob {
95 | run_state: RUNNING
96 | policy_id: d28b379ff3668322bfd5d56e11d4e34e
97 | seeds: "https://cnn.com"
98 | name: "My Crawl"
99 | }
100 | }
101 |
102 | The server will kick off a crawling job and will send a response telling the
103 | client that the job has started successfully and including an identifier for the
104 | new job.
105 |
106 | .. code::
107 |
108 | Response {
109 | request_id: 1
110 | is_success: true
111 | Body: ResponseNewJob {
112 | job_id: 0514478baffd401546b755bf460b5997
113 | }
114 | }
115 |
116 | Notice that the response includes the request ID sent by the client, so
117 | we know that this is a response to the above request.
118 |
119 | This response tells us that the crawl is starting, but we would like to keep
120 | track of the crawl's progress and know when it finishes. The next step is to
121 | send a subscription request for job status events.
122 |
123 | .. code::
124 |
125 | Request {
126 | request_id: 2
127 | Command: RequestSubscribeJobStatus {
128 | min_interval: 3.0
129 | }
130 | }
131 |
132 | This subscription provides high-level job status for *all* crawl jobs, including
133 | data like how many items have been downloaded, how many pages had errors, how
135 | many pages resulted in exceptions, etc. Job status can change rapidly when the
135 | crawler is busy, because each item downloaded counts as a change in job status.
136 | The ``min_interval`` parameter specifies the minimum amount of time in between
137 | job status events sent by the server. In this example, if there are multiple job
138 | status events, the server will batch them together and send at most 1 event
139 | every 3 seconds for this subscription. On the other hand, if the crawl is very
140 | slow, then it may send events even less frequently than that.
141 |
142 | The server will create the subscription and respond with a subscription
143 | identifier.
144 |
145 | .. code::
146 |
147 | Response {
148 | request_id: 2
149 | is_success: true
150 | Body: ResponseNewSubscription {
151 | subscription_id: 300
152 | }
153 | }
154 |
155 | When the client first subscribes to job status, the crawler will send the
156 | complete status of each currently running job. For example, if the crawler has
157 | already downloaded one item, the job status may look like this:
158 |
159 | .. code::
160 |
161 | Event {
162 | subscription_id: 300
163 | Body: JobList {
164 | jobs: {
165 | job_id: 0514478baffd401546b755bf460b5997
166 | seeds: "https://cnn.com"
167 | policy: d28b379ff3668322bfd5d56e11d4e34e
168 | name: "My Crawl"
169 | run_state: RUNNING
170 | started_at: "2017-11-03T10:14:42.194744"
171 | item_count: 1
172 | http_success_count: 1
173 | http_error_count: 0
174 | exception_count: 0
175 | http_status_counts: {
176 | 200: 1
177 | }
178 | }
179 | }
180 | }
181 |
182 | After sending complete job status, the crawler will send small updates as the
183 | job status changes. For example, after the crawler downloads a second item, it
184 | will send an event like this:
185 |
186 | .. code::
187 |
188 | Event {
189 | subscription_id: 300
190 | Body: JobList {
191 | jobs: {
192 | job_id: 0514478baffd401546b755bf460b5997
193 | item_count: 2
194 | http_success_count: 2
195 | http_status_counts: {
196 | 200: 2
197 | }
198 | }
199 | }
200 | }
201 |
202 | Notice how the second message is much smaller: it only contains the fields that
203 | have changed since the previous event. This is how the job status subscription
204 | allows clients to efficiently keep track of the status of all jobs. This API is
205 | used in the GUI to power the Dashboard and Results screens.
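
A client can keep its own view of job status by merging each partial event into
a local cache, for example (a minimal sketch, assuming dict-like job data keyed
by ``job_id``):

.. code:: python

    jobs = {}  # job_id -> latest known status fields

    def apply_job_event(event_jobs):
        '''Merge a complete or partial job status event into the cache.'''
        for job in event_jobs:
            jobs.setdefault(job['job_id'], {}).update(job)

    job_id = '0514478baffd401546b755bf460b5997'
    apply_job_event([{'job_id': job_id, 'item_count': 1, 'http_success_count': 1}])
    apply_job_event([{'job_id': job_id, 'item_count': 2, 'http_success_count': 2}])
    print(jobs[job_id]['item_count'])  # 2
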
206 |
207 | For a complete list of API messages, see :ref:`protobuf_messages`.
208 |
209 | Web Client
210 | ----------
211 |
212 | The crawler GUI is implemented as a stand-alone application written in Dart, and
213 | it interacts with the Starbelly server solely through the public API. Therefore,
214 | anything that you can do in the GUI can also be done through the API.
215 |
216 | https://github.com/hyperiongray/starbelly-web-client
217 |
218 | Python Client
219 | -------------
220 |
221 | A very basic and incomplete Python client library implementation is available:
222 |
223 | https://github.com/hyperiongray/starbelly-python-client
224 |
225 | This client library will be improved over time and made more stable, but for
226 | now it may be used as a reference implementation.
227 |
--------------------------------------------------------------------------------
/integration/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | This module contains integration tests.
3 |
4 | These tests rely on a RethinkDB server running on localhost 28015.
5 | '''
6 | from functools import wraps
7 |
8 | import pytest
9 | from rethinkdb import RethinkDB
10 | import trio
11 |
12 | from starbelly.config import get_config
13 |
14 | # Add this project to the Python path:
15 | from os.path import dirname
16 | from sys import path
17 | path.append(dirname(dirname(__file__)))
18 |
19 |
20 | class fail_after:
21 | ''' This decorator fails if the runtime of the decorated function (as
22 | measured by the Trio clock) exceeds the specified value. '''
23 | def __init__(self, seconds):
24 | self._seconds = seconds
25 |
26 | def __call__(self, fn):
27 | @wraps(fn)
28 | async def wrapper(*args, **kwargs):
29 | with trio.move_on_after(self._seconds) as cancel_scope:
30 | await fn(*args, **kwargs)
31 | if cancel_scope.cancelled_caught:
32 | pytest.fail('Test runtime exceeded the maximum {} seconds'
33 | .format(self._seconds))
34 | return wrapper
35 |
36 |
37 | @pytest.fixture
38 | async def db_pool(nursery):
39 | r = RethinkDB()
40 | r.set_loop_type('trio')
41 | db_config = get_config()['database']
42 | db_pool = r.ConnectionPool(
43 | host=db_config['host'],
44 | port=db_config['port'],
45 | db='integration_testing',
46 | user=db_config['super_user'],
47 | password=db_config['super_password'],
48 | nursery=nursery
49 | )
50 | async with db_pool.connection() as conn:
51 | await r.db_create('integration_testing').run(conn)
52 | yield db_pool
53 | async with db_pool.connection() as conn:
54 | await r.db_drop('integration_testing').run(conn)
55 | await db_pool.close()
56 |
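# A sketch of how an integration test can use the fixture and decorator above.
# The table name and assertion are illustrative; any ReQL query is run the
# same way through the pooled connection.
@fail_after(10)
async def example_db_roundtrip(db_pool):
    r = RethinkDB()
    r.set_loop_type('trio')
    async with db_pool.connection() as conn:
        # Create a table in the temporary integration_testing database and
        # confirm that it shows up in the table list.
        await r.table_create('example').run(conn)
        assert 'example' in await r.table_list().run(conn)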
--------------------------------------------------------------------------------
/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
--------------------------------------------------------------------------------
/notebooks/Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | " # Example Notebook\n",
8 | " \n",
9 | " This notebook shows how to use the Starbelly shell from inside a Jupyter notebook."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Starbelly v1.2.0-dev Shell\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "%autoawait trio\n",
27 | "from pathlib import Path\n",
28 | "from sys import path\n",
29 | "path.append(str(Path().resolve().parent))\n",
30 | "from tools.shell import *"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/plain": [
41 | "'starbelly-app'"
42 | ]
43 | },
44 | "execution_count": 2,
45 | "metadata": {},
46 | "output_type": "execute_result"
47 | }
48 | ],
49 | "source": [
50 | "config['database']['user']"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "15:24:09 [tools.shell] INFO: Hello, world!\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "logger.info('Hello, world!')"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "async def foo():\n",
77 | " await trio.sleep(1)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "await foo()"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 6,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "trio.run(foo)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 7,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "RethinkDB Cursor: [\n",
108 | " {'name': 'Deep Crawl'},\n",
109 | " {'name': 'Broad Crawl'},\n",
110 | "]\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "policy_query = r.table('policy').pluck('name')\n",
116 | "policy_results = run_query(policy_query)\n",
117 | "print_results(policy_results)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": []
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "Python 3.7 (starbelly)",
131 | "language": "python",
132 | "name": "starbelly"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.7.0"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 2
149 | }
150 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "starbelly"
3 | version = "2.0.0-dev"
4 | description = "Streaming crawler with a graphical user interface"
5 | authors = ["Mark E. Haase "]
6 | license = "MIT"
7 |
8 | [tool.poetry.dependencies]
9 | python = "^3.7"
10 | aiohttp = "^3.6.2"
11 | aiohttp-socks = "^0.3.9"
12 | beautifulsoup4 = "^4.9.1"
13 | cchardet = "^2.1.6"
14 | feedparser = "^5.2.1"
15 | formasaurus = {url = "https://hyperiongray.s3-us-west-2.amazonaws.com/formasaurus-0.9.0.tar.gz"}
16 | lxml = "^4.5.1"
17 | protobuf = "^3.12.2"
18 | psutil = "^5.7.0"
19 | python-dateutil = "^2.8.1"
20 | python-mimeparse = "^1.6.0"
21 | pytz = "==2020.1"
22 | rethinkdb = {url = "https://hyperiongray.s3-us-west-2.amazonaws.com/rethinkdb-2.4.7%2Bstarbelly.tar.gz"}
23 | robotexclusionrulesparser = "^1.7.1"
24 | trio = "^0.15.1"
25 | trio-asyncio = "^0.11.0"
26 | trio-websocket = "^0.8.0"
27 | w3lib = "1.22.0"
28 | watchdog = "^0.10.2"
29 | yarl = "^1.4.2"
30 | scikit-learn = ">=0.18" # required for formasaurus
31 | scipy = "^1.4.1" # required for formasaurus
32 | sklearn-crfsuite = ">=0.3.1" # required for formasaurus
33 | joblib = "^0.15.1" # required for formasaurus
34 | tldextract = "^2.2.2" # required for formasaurus
35 |
36 | [tool.poetry.dev-dependencies]
37 | coverage = "^5.1"
38 | coveralls = "^2.0.0"
39 | pytest-cov = "^2.9.0"
40 | pytest-mock = "^3.1.0"
41 | sphinxcontrib-trio = "^1.1.2"
42 | sphinx-rtd-theme = "^0.4.3"
43 | pytest-trio = "^0.6.0"
44 | trustme = "^0.6.0"
45 |
46 | [build-system]
47 | requires = ["poetry>=0.12"]
48 | build-backend = "poetry.masonry.api"
49 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | log_level = debug
3 | trio_mode = true
4 |
--------------------------------------------------------------------------------
/starbelly/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/starbelly/__init__.py
--------------------------------------------------------------------------------
/starbelly/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import signal
5 | import subprocess
6 | import sys
7 | import time
8 |
9 | from watchdog.events import FileSystemEventHandler
10 | from watchdog.observers import Observer
11 |
12 | from .bootstrap import Bootstrap
13 | from .config import get_config, get_path
14 |
15 |
16 | class ProcessWatchdog(FileSystemEventHandler):
17 | ''' Handle watchdog events by restarting a subprocess. '''
18 |
19 | def __init__(self):
20 | ''' Constructor. '''
21 |
22 | self._logger = logging.getLogger('watchdog')
23 | self._process = None
24 |
25 | def dispatch(self, event):
26 | ''' Restart the subprocess if a source/config file changed. '''
27 |
28 | path = event.src_path
29 | file = os.path.basename(path)
30 | descr = '{} was {}'.format(event.src_path, event.event_type)
31 |
32 | if (file.endswith('.py') and not file.startswith('test_')) or \
33 | file.endswith('.ini'):
34 | self._logger.info('%s (Reloading)', descr)
35 | self.terminate_process()
36 | self.start_process()
37 |
38 | def join(self):
39 | ''' Wait for subprocess to exit. '''
40 | if self._process is not None:
41 | self._process.wait()
42 |
43 | def start_process(self):
44 | ''' Start the subprocess. '''
45 |
46 | if self._process is not None:
47 | msg = 'Cannot start subprocess if it is already running.'
48 | raise RuntimeError(msg)
49 |
50 | time.sleep(1)
51 | args = [sys.executable, '-m', __package__] + sys.argv[1:]
52 | new_env = dict(os.environ)
53 | new_env['WATCHDOG_RUNNING'] = '1'
54 | self._process = subprocess.Popen(args, env=new_env)
55 |
56 | def terminate_process(self):
57 | ''' Terminate the subprocess. '''
58 | if self._process is not None:
59 | try:
60 | self._process.send_signal(signal.SIGTERM)
61 | self._process.wait()
62 | self._process = None
63 | except ProcessLookupError:
64 | pass # The process already died.
65 |
66 |
67 | class Reloader:
68 | ''' Reloads the subprocess when a source file is modified. '''
69 | def __init__(self):
70 | ''' Constructor. '''
71 | self._logger = logging.getLogger('reloader')
72 | self._observer = None
73 | self._running = False
74 | self._watchdog = None
75 |
76 | def run(self):
77 | ''' Run the reloader. '''
78 |
79 | self._logger.info('Running with reloader...')
80 | self._watchdog = ProcessWatchdog()
81 | self._watchdog.start_process()
82 |
83 | self._observer = Observer()
84 | self._observer.schedule(
85 | self._watchdog, str(get_path('starbelly')), recursive=True)
86 | self._observer.start()
87 |
88 | while True:
89 | time.sleep(1)
90 |
91 | def shutdown(self, signum, _):
92 | ''' Exit the reloader. '''
93 | signame = signal.Signals(signum).name
94 | self._logger.info('Caught %s (shutting down)', signame)
95 | self._watchdog.terminate_process()
96 | self._observer.stop()
97 | self._observer.join()
98 | sys.exit(0)
99 |
100 |
101 | def configure_logging(log_level, error_log):
102 | ''' Set default format and output stream for logging. '''
103 | log_format = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
104 | log_date_format = '%Y-%m-%d %H:%M:%S'
105 | log_formatter = logging.Formatter(log_format, log_date_format)
106 | log_level = getattr(logging, log_level.upper())
107 | log_handler = logging.StreamHandler(sys.stderr)
108 | log_handler.setFormatter(log_formatter)
109 | log_handler.setLevel(log_level)
110 | logger = logging.getLogger()
111 | logger.addHandler(log_handler)
112 | logger.setLevel(log_level)
113 | if log_level < logging.INFO:
114 | logging.getLogger('watchdog').setLevel(logging.INFO)
115 | logging.getLogger('trio-websocket').setLevel(logging.INFO)
116 |
117 | if error_log is not None:
118 | exc_handler = logging.FileHandler(error_log)
119 | exc_handler.setFormatter(log_formatter)
120 | exc_handler.setLevel(logging.ERROR)
121 | logger.addHandler(exc_handler)
122 |
123 |
124 | def get_args():
125 | ''' Parse command line arguments. '''
126 | arg_parser = argparse.ArgumentParser(description='Starbelly')
127 | arg_parser.add_argument(
128 | '--log-level',
129 | default='warning',
130 | metavar='LEVEL',
131 | choices=['debug', 'info', 'warning', 'error', 'critical'],
132 | help='Set logging verbosity (default: warning)'
133 | )
134 | arg_parser.add_argument(
135 | '--ip',
136 | default='127.0.0.1',
137 | help='The IP address to bind to (default: 127.0.0.1)'
138 | )
139 | arg_parser.add_argument(
140 | '--port',
141 | type=int,
142 | default=8000,
143 | help='The TCP port to bind to (default: 8000)'
144 | )
145 | arg_parser.add_argument(
146 | '--reload',
147 | action='store_true',
148 | help='Auto-reload when code or static assets are modified.'
149 | )
150 | arg_parser.add_argument(
151 | '--error-log',
152 | help='Copy error logs to the specified file.'
153 | )
154 | return arg_parser.parse_args()
155 |
156 |
157 | def main():
158 | ''' Set up watchdog or run starbelly. '''
159 | args = get_args()
160 | configure_logging(args.log_level, args.error_log)
161 | config = get_config()
162 |
163 | if args.reload and os.getenv('WATCHDOG_RUNNING') is None:
164 | reloader = Reloader()
165 | signal.signal(signal.SIGINT, reloader.shutdown)
166 | signal.signal(signal.SIGTERM, reloader.shutdown)
167 | reloader.run()
168 | else:
169 | bootstrap = Bootstrap(config, args)
170 | bootstrap.run()
171 |
172 |
173 | if __name__ == '__main__':
174 | main()
175 |
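# Typical invocations (all of these flags are defined in get_args() above):
#
#   python -m starbelly --log-level debug --reload
#   python -m starbelly --ip 0.0.0.0 --port 8000 --error-log error.log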
--------------------------------------------------------------------------------
/starbelly/backoff.py:
--------------------------------------------------------------------------------
1 | import trio
2 |
3 |
4 | class ExponentialBackoff:
5 | ''' An experimental class: this makes it simple to write loops that poll
6 | a resource and back off when the resource is not ready.
7 |
8 | For example, if you are polling the database for some new records, you might
9 | wait 1 second and then try again. If there are still no records, then you
10 | wait 2 seconds before trying again, then 4 seconds, then 8, etc.
11 |
12 | This is written as an async iterator, so you can just loop over it and it
13 | will automatically delay in between loop iterations.
14 | '''
15 | def __init__(self, min_=0.25, max_=64):
16 | '''
17 | Constructor.
18 |
19 | :param float min_: The initial delay between loop iterations.
20 | :param int max_: The maximum delay.
21 | '''
22 | self._backoff = min_
23 | self._initial = True
24 | self._min = min_
25 | self._max = max_
26 |
27 | def __repr__(self):
28 | return '<ExponentialBackoff: {}>'.format(self._backoff)
29 |
30 | def __aiter__(self):
31 | ''' This instance is an async iterator. '''
32 | return self
33 |
34 | async def __anext__(self):
35 | ''' Add a delay in between loop iterations. (No delay for the first
36 | iteration.) '''
37 | if self._initial:
38 | backoff = 0
39 | self._initial = False
40 | else:
41 | backoff = self._backoff
42 | await trio.sleep(backoff)
43 | return backoff
44 |
45 | def increase(self):
46 | ''' Double the current backoff, but not if it would exceed this
47 | instance's max value. '''
48 | if self._backoff <= self._max // 2:
49 | self._backoff *= 2
50 |
51 | def decrease(self):
52 | ''' Halve the current backoff, but not if it would drop below the minimum. '''
53 | if self._backoff >= 2 * self._min:
54 | self._backoff /= 2
55 |
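# A sketch of the polling pattern described in the class docstring. The
# database object and its get_new_records() coroutine are hypothetical.
async def wait_for_records(db):
    backoff = ExponentialBackoff(min_=1, max_=16)
    async for _ in backoff:  # the first iteration does not sleep
        records = await db.get_new_records()
        if records:
            return records
        # Nothing yet: double the delay before the next poll.
        backoff.increase()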
--------------------------------------------------------------------------------
/starbelly/bootstrap.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | import logging
3 |
4 | from rethinkdb import RethinkDB
5 | from rethinkdb.trio_net.net_trio import Connection as RethinkDBTrioConnection
6 | import trio
7 |
8 | from .db import (
9 | BootstrapDb,
10 | CrawlFrontierDb,
11 | CrawlManagerDb,
12 | CrawlExtractorDb,
13 | CrawlStorageDb,
14 | LoginDb,
15 | ScheduleDb,
16 | ServerDb,
17 | SubscriptionDb,
18 | )
19 | from .job import CrawlManager, StatsTracker
20 | from .rate_limiter import RateLimiter
21 | from .resource_monitor import ResourceMonitor
22 | from .robots import RobotsTxtManager
23 | from .schedule import Scheduler
24 | from .server import Server
25 |
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 |
30 | class Bootstrap:
31 | ''' Main class for bootstrapping the crawler. '''
32 | def __init__(self, config, args):
33 | '''
34 | Constructor.
35 |
36 | :param config: Output of config parser.
37 | :param args: Output of argparse.
38 |
39 | '''
40 | self._args = args
41 | self._config = config
42 |
43 | def run(self):
44 | ''' Run the main task on the event loop. '''
45 | logger.info('Starbelly is starting...')
46 | try:
47 | trio.run(self._main,
48 | restrict_keyboard_interrupt_to_checkpoints=True)
49 | except KeyboardInterrupt:
50 | logger.warning('Quitting due to KeyboardInterrupt')
51 | logger.info('Starbelly has stopped.')
52 |
53 | def _db_pool(self, nursery):
54 | '''
55 | Create a database connection pool.
56 |
57 | :param nursery: A Trio nursery to spawn database connections in.
58 | :returns: A RethinkDB connection pool.
59 | '''
60 | r = RethinkDB()
61 | r.set_loop_type('trio')
62 | db_config = self._config['database']
63 | return r.ConnectionPool(
64 | connection_type=RethinkDBTrioConnection,
65 | host=db_config['host'],
66 | port=db_config['port'],
67 | db=db_config['db'],
68 | user=db_config['user'],
69 | password=db_config['password'],
70 | nursery=nursery
71 | )
72 |
73 | async def _main(self):
74 | '''
75 | The main task.
76 |
77 | :returns: This function runs until cancelled.
78 | '''
79 | # Create db pool & objects
80 | async with trio.open_nursery() as nursery:
81 | db_pool = self._db_pool(nursery)
82 | bootstrap_db = BootstrapDb(db_pool)
83 | crawl_db = CrawlManagerDb(db_pool)
84 | extractor_db = CrawlExtractorDb(db_pool)
85 | frontier_db = CrawlFrontierDb(db_pool)
86 | login_db = LoginDb(db_pool)
87 | schedule_db = ScheduleDb(db_pool)
88 | storage_db = CrawlStorageDb(db_pool)
89 | logger.info('Doing startup check...')
90 | await bootstrap_db.startup_check()
91 |
92 | # Create a rate limiter
93 | rate_limiter = RateLimiter(capacity=1_000)
94 | logger.info('Initializing rate limiter...')
95 | rate_limits = await bootstrap_db.get_rate_limits()
96 | for rate_limit in rate_limits:
97 | rate_limiter.set_rate_limit(rate_limit['token'],
98 | rate_limit['delay'])
99 | logger.info('Rate limiter is initialized.')
100 |
101 | # Create a robots.txt manager
102 | robots_txt_manager = RobotsTxtManager(db_pool)
103 |
104 | # Create a tracker for job stats and initialize with jobs that are
105 | # unfinished or recently finished.
106 | recent_period = timedelta(hours=24)
107 | recent_dt = datetime.now(tz=timezone.utc) - recent_period
108 | stats_tracker = StatsTracker(recent_period)
109 | dashboard_jobs = await crawl_db.get_dashboard_jobs(recent_dt)
110 | for job_doc in dashboard_jobs:
111 | stats_dict = {
112 | 'id': job_doc['id'],
113 | 'run_state': job_doc['run_state'],
114 | 'name': job_doc['name'],
115 | 'seeds': job_doc['seeds'],
116 | 'tags': job_doc['tags'],
117 | 'started_at': job_doc['started_at'],
118 | 'completed_at': job_doc['completed_at'],
119 | 'item_count': job_doc['item_count'],
120 | 'http_success_count': job_doc['http_success_count'],
121 | 'http_error_count': job_doc['http_error_count'],
122 | 'exception_count': job_doc['exception_count'],
123 | 'http_status_counts': job_doc['http_status_counts'],
124 | }
125 | stats_tracker.add_job(stats_dict)
126 |
127 | # Create a crawl manager
128 | crawl_manager = CrawlManager(rate_limiter, stats_tracker,
129 | robots_txt_manager, crawl_db, frontier_db, extractor_db,
130 | storage_db, login_db)
131 |
132 | # Create a resource monitor: one sample per second and keep 1 minute of
133 | # history.
134 | resource_monitor = ResourceMonitor(interval=1.0, buffer_size=60,
135 | crawl_resources_fn=crawl_manager.get_resource_usage,
136 | rate_limiter=rate_limiter)
137 |
138 | # Create a scheduler
139 | scheduler = Scheduler(schedule_db, crawl_manager)
140 |
141 | # Create a server
142 | server_db = ServerDb(db_pool)
143 | subscription_db = SubscriptionDb(db_pool)
144 | server = Server(self._args.ip, self._args.port, server_db,
145 | subscription_db, crawl_manager, rate_limiter, resource_monitor,
146 | stats_tracker, scheduler)
147 |
148 | # Run all the components
149 | await nursery.start(crawl_manager.run, name='Crawl Manager')
150 | nursery.start_soon(rate_limiter.run, name='Rate Limiter')
151 | nursery.start_soon(resource_monitor.run, name='Resource Monitor')
152 | nursery.start_soon(scheduler.run, name='Scheduler')
153 | await nursery.start(server.run, name='Server')
154 |
--------------------------------------------------------------------------------
/starbelly/captcha.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from uuid import UUID
3 |
4 | import starbelly.starbelly_pb2
5 |
6 |
7 | class CaptchaSolver:
8 | ''' An interface for a CAPTCHA solving service. '''
9 | def __init__(self, doc):
10 | '''
11 | Constructor.
12 |
13 | :param dict doc: A database document.
14 | '''
15 | self.id = doc['id']
16 | self.name = doc['name']
17 | self.service_url = doc['service_url']
18 | self.api_key = doc['api_key']
19 | self.require_phrase = doc['require_phrase']
20 | self.case_sensitive = doc['case_sensitive']
21 | self.characters = doc['characters']
22 | self.require_math = doc['require_math']
23 | self.min_length = doc.get('min_length', 0)
24 | self.max_length = doc.get('max_length', 0)
25 |
26 | def get_command(self, img_data):
27 | '''
28 | Return a JSON API command.
29 |
30 | :param bytes img_data: The image data for the CAPTCHA.
31 | :returns: A command that can be serialized to JSON.
32 | :rtype: dict
33 | '''
34 | img_b64 = base64.b64encode(img_data).decode('ascii')
35 |
36 | if self.characters == 'ALPHANUMERIC':
37 | numeric = 0
38 | elif self.characters == 'NUMERIC_ONLY':
39 | numeric = 1
40 | elif self.characters == 'ALPHA_ONLY':
41 | numeric = 2
42 | else:
43 | raise Exception('Invalid characters setting: {}'.format(
44 | self.characters))
45 |
46 | return {
47 | 'clientKey': self.api_key,
48 | 'task': {
49 | 'type': 'ImageToTextTask',
50 | 'body': img_b64,
51 | 'phrase': self.require_phrase,
52 | 'case': self.case_sensitive,
53 | 'numeric': numeric,
54 | 'math': self.require_math,
55 | 'minLength': self.min_length,
56 | 'maxLength': self.max_length,
57 | }
58 | }
59 |
60 |
61 | def captcha_doc_to_pb(doc):
62 | '''
63 | Convert CAPTCHA solver from database document to protobuf.
64 |
65 | :param dict doc: A database document.
66 | :returns: A protobuf message.
67 | '''
68 | pb = starbelly.starbelly_pb2.CaptchaSolver()
69 | pb.name = doc['name']
70 | pb.solver_id = UUID(doc['id']).bytes
71 | pb.created_at = doc['created_at'].isoformat()
72 | pb.updated_at = doc['updated_at'].isoformat()
73 | type_ = doc['type']
74 | if type_ == 'antigate':
75 | pb.antigate.CopyFrom(_antigate_doc_to_pb(doc))
76 | else:
77 | raise Exception('Unknown CAPTCHA solver type ({})'.format(type_))
78 | return pb
79 |
80 |
81 | def _antigate_doc_to_pb(doc):
82 | '''
83 | Convert Antigate CAPTCHA solver from database doc to protobuf.
84 |
85 | :param dict doc: A database document.
86 | :returns: A protobuf message.
87 | '''
88 | pb = starbelly.starbelly_pb2.CaptchaSolverAntigate()
89 | pb.service_url = doc['service_url']
90 | pb.api_key = doc['api_key']
91 | pb.require_phrase = doc['require_phrase']
92 | pb.case_sensitive = doc['case_sensitive']
93 | pb.characters = starbelly.starbelly_pb2.CaptchaSolverAntigateCharacters \
94 | .Value(doc['characters'])
95 | pb.require_math = doc['require_math']
96 | if 'min_length' in doc:
97 | pb.min_length = doc['min_length']
98 | if 'max_length' in doc:
99 | pb.max_length = doc['max_length']
100 | return pb
101 |
102 |
103 | def captcha_pb_to_doc(pb):
104 | '''
105 | Convert CAPTCHA solver from protobuf to database document.
106 |
107 | :param pb: A protobuf message.
108 | :returns: A database document.
109 | :rtype: dict
110 | '''
111 | if pb.name.strip() == '':
112 | raise Exception('Name is required.')
113 | doc = {'name': pb.name}
114 | if pb.HasField('solver_id'):
115 | doc['id'] = str(UUID(bytes=pb.solver_id))
116 | type_ = pb.WhichOneof('SolverType')
117 | if type_ == 'antigate':
118 | doc.update(_antigate_pb_to_doc(pb))
119 | else:
120 | raise Exception('Unknown CAPTCHA solver type ({})'.format(type_))
121 | return doc
122 |
123 |
124 | def _antigate_pb_to_doc(pb):
125 | '''
126 | Convert Antigate CAPTCHA solver from protobuf to database document.
127 |
128 | :param pb: A protobuf message.
129 | :returns: A database document.
130 | :rtype: dict
131 | '''
132 | antigate = pb.antigate
133 | doc = {
134 | 'service_url': antigate.service_url,
135 | 'api_key': antigate.api_key,
136 | 'require_phrase': antigate.require_phrase,
137 | 'case_sensitive': antigate.case_sensitive,
138 | 'characters': starbelly.starbelly_pb2.CaptchaSolverAntigateCharacters \
139 | .Name(antigate.characters),
140 | 'require_math': antigate.require_math,
141 | 'type': 'antigate',
142 | }
143 | if antigate.HasField('min_length'):
144 | doc['min_length'] = antigate.min_length
145 | if antigate.HasField('max_length'):
146 | doc['max_length'] = antigate.max_length
147 | return doc
148 |
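# A sketch of building a solver command from a database document. Every value
# below is made up for illustration; the required keys are exactly the ones
# read by CaptchaSolver.__init__() above.
def example_captcha_command(img_data):
    solver = CaptchaSolver({
        'id': 'example-id',
        'name': 'example-antigate',
        'service_url': 'https://captcha.example/api/',
        'api_key': 'not-a-real-key',
        'require_phrase': False,
        'case_sensitive': True,
        'characters': 'ALPHANUMERIC',
        'require_math': False,
    })
    # Returns the JSON-serializable ImageToTextTask payload.
    return solver.get_command(img_data)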
--------------------------------------------------------------------------------
/starbelly/config.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import pathlib
3 |
4 |
5 | _root = pathlib.Path(__file__).resolve().parent.parent
6 |
7 |
8 | def get_path(relpath):
9 | ''' Get absolute path to a project-relative path. '''
10 | return _root / relpath
11 |
12 |
13 | def get_config():
14 | '''
15 | Read the application configuration from the standard configuration files.
16 |
17 | :rtype: ConfigParser
18 | '''
19 | config_dir = get_path("conf")
20 | config_files = [
21 | config_dir / "system.ini",
22 | config_dir / "local.ini",
23 | ]
24 | config = configparser.ConfigParser()
25 | config.optionxform = str
26 | config.read(config_files)
27 | return config
28 |
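# A sketch of reading settings from the merged configuration. The 'database'
# section and its host/port keys follow conf/system.ini and the usage in
# bootstrap.py.
def example_database_settings():
    config = get_config()
    db = config['database']
    return db['host'], int(db['port'])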
--------------------------------------------------------------------------------
/starbelly/extractor.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import logging
3 |
4 | from bs4 import BeautifulSoup
5 | import cchardet
6 | import feedparser
7 | import mimeparse
8 | import trio
9 | import w3lib.encoding
10 | import yarl
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 | chardet = lambda s: cchardet.detect(s).get('encoding')
15 |
16 |
17 | class CrawlExtractor:
18 | ''' Extract URLs from crawled items and add them to the frontier table. '''
19 | def __init__(self, job_id, db, send_channel, receive_channel, policy,
20 | downloader, robots_txt_manager, old_urls, stats, batch_size=100):
21 | '''
22 | Constructor.
23 |
24 | :param str job_id: The ID of the job to extract responses for.
25 | :param starbelly.db.CrawlExtractorDb db: A database layer.
26 | :param trio.SendChannel send_channel: A channel that sends
27 | DownloadResponse instances.
28 | :param trio.ReceiveChannel receive_channel: A channel that receives
29 | DownloadResponse instances.
30 | :param starbelly.policy.Policy policy: A policy for computing costs.
31 | :param starbelly.downloader.Downloader downloader: A downloader used for this job.
32 | :param starbelly.robots.RobotsTxtManager robots_txt_manager: A robots.txt manager.
33 | :param set old_urls: A set of hashed URLs that this crawl has seen before.
34 | These URLs will not be added to the crawl frontier a second time.
35 | :param dict stats: A dictionary of crawl statistics.
36 | :param int batch_size: The maximum size of inserts to do in a single
37 | database query. If more items than this are extracted from a
38 | document, then multiple queries will be issued.
39 | '''
40 | self._job_id = job_id
41 | self._db = db
42 | self._send_channel = send_channel
43 | self._receive_channel = receive_channel
44 | self._policy = policy
45 | self._downloader = downloader
46 | self._robots_txt_manager = robots_txt_manager
47 | self._old_urls = old_urls
48 | self._stats = stats
49 | self._batch_size = batch_size
50 |
51 | def __repr__(self):
52 | ''' Report crawl job ID. '''
53 | return '<CrawlExtractor job_id={}>'.format(self._job_id[:8])
54 |
55 | @property
56 | def old_urls(self):
57 | return self._old_urls
58 |
59 | async def run(self):
60 | '''
61 | Read responses from extraction channel and add them to the frontier.
62 |
63 | :returns: This function runs until cancelled.
64 | '''
65 | async for response in self._receive_channel:
66 | try:
67 | if response.is_success:
68 | await self._extract(response)
69 | except Exception:
70 | logger.exception('%r Extractor exception on %r', self, response)
71 | finally:
72 | await self._db.delete_frontier_item(response.frontier_id)
73 | await self._send_channel.send(response)
74 |
75 | async def _extract(self, response):
76 | '''
77 | Find links in a response body and put them in the frontier.
78 |
79 | :param starbelly.downloader.DownloadResponse response:
80 | '''
81 | logger.debug('%r Extracting links from %s', self, response.url)
82 | extracted_urls = await trio.run_sync_in_worker_thread(
83 | extract_urls, response)
84 | insert_items = list()
85 |
86 | for counter, url in enumerate(extracted_urls):
87 | # Check if the policy allows us to follow this URL.
88 | new_cost = self._policy.url_rules.get_cost(response.cost, url)
89 | exceeds_max_cost = self._policy.limits.exceeds_max_cost(new_cost)
90 | if new_cost <= 0 or exceeds_max_cost:
91 | continue
92 | robots_ok = await self._robots_txt_manager.is_allowed(url,
93 | self._policy, self._downloader)
94 | if not robots_ok:
95 | continue
96 |
97 | # Normalize and hash URL.
98 | url_can = self._policy.url_normalization.normalize(url)
99 | hash_ = hashlib.blake2b(url_can.encode('ascii'), digest_size=16)
100 | url_hash = hash_.digest()
101 |
102 | # If we haven't seen this URL before, it should be added to the
103 | # frontier.
104 | if url_hash not in self._old_urls:
105 | logger.debug('%r Adding URL %s (cost=%0.2f)', self, url,
106 | new_cost)
107 | insert_items.append({
108 | 'cost': new_cost,
109 | 'job_id': self._job_id,
110 | 'url': url,
111 | 'in_flight': False,
112 | })
113 | self._old_urls.add(url_hash)
114 |
115 | # Don't monopolize the event loop:
116 | if counter % self._batch_size == self._batch_size - 1:
117 | await trio.sleep(0)
118 |
119 | # Insert items in batches
120 | start = 0
121 | while start < len(insert_items):
122 | end = min(start + self._batch_size, len(insert_items))
123 | self._stats['frontier_size'] += end - start
124 | await self._db.insert_frontier_items(insert_items[start:end])
125 | start = end
126 |
127 |
128 | def extract_urls(response):
129 | '''
130 | Extract URLs from a response body.
131 |
132 | Any relative URLs found in the response body are converted to absolute URLs
133 | using the original request URL.
134 |
135 | :param starbelly.downloader.DownloadResponse response: A response to
136 | extract URLs from.
137 | :returns: A list of URLs.
138 | :rtype: list[str]
139 | '''
140 | extracted_urls = list()
141 |
142 | if response.is_success:
143 | base_url = response.url
144 | type_, subtype, _ = mimeparse.parse_mime_type(response.content_type)
145 |
146 | if type_ == 'text' and subtype == 'html' or \
147 | type_ == 'application' and subtype == 'xhtml+xml':
148 | extracted_urls = _extract_html(response)
149 | elif type_ == 'application' and subtype == 'atom+xml' or \
150 | type_ == 'application' and subtype == 'rss+xml':
151 | extracted_urls = _extract_feed(response)
152 | else:
153 | raise ValueError('Unsupported MIME in extract_urls(): {} (url={})'
154 | .format(response.content_type, base_url))
155 |
156 | return extracted_urls
157 |
158 |
159 | def _extract_feed(response):
160 | '''
161 | Extract links from Atom or RSS feeds.
162 |
163 | :param starbelly.downloader.DownloadResponse response: An Atom/RSS response
164 | to extract URLs from.
165 | :returns: A list of URLs.
166 | :rtype: list[str]
167 | '''
168 | doc = feedparser.parse(response.body)
169 | return [entry.link for entry in doc.entries]
170 |
171 |
172 | def _extract_html(response):
173 | '''
174 | Extract links from HTML document tags.
175 |
176 | :param starbelly.downloader.DownloadResponse response: An HTML response to
177 | extract URLs from.
178 | :returns: A list of URLs.
179 | :rtype: list[str]
180 | '''
181 | _, html = w3lib.encoding.html_to_unicode(
182 | response.content_type,
183 | response.body,
184 | auto_detect_fun=chardet
185 | )
186 |
187 | doc = BeautifulSoup(html, 'lxml')
188 | base_tag = doc.head.base
189 | base_url = None
190 |
191 | if base_tag is not None:
192 | base_href = base_tag.get('href')
193 | if base_href is not None:
194 | base_url = yarl.URL(base_href)
195 |
196 | if base_url is None:
197 | base_url = yarl.URL(response.url)
198 |
199 | extracted_urls = list()
200 |
201 | for anchor in doc.find_all('a', href=True):
202 | href = anchor.get('href')
203 |
204 | try:
205 | parsed_href = yarl.URL(href)
206 | except:
207 | logger.exception('Rejecting malformed URL base=%s url=%s',
208 | str(response.url), href)
209 | continue
210 |
211 | absolute_href = base_url.join(parsed_href)
212 |
213 | if absolute_href.scheme in ('http', 'https'):
214 | extracted_urls.append(str(absolute_href))
215 |
216 | return extracted_urls
217 |
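# A sketch of calling extract_urls() directly. SimpleNamespace stands in for
# starbelly.downloader.DownloadResponse and provides only the attributes read
# above; the HTML body is made up for illustration.
def example_extract():
    from types import SimpleNamespace
    response = SimpleNamespace(
        is_success=True,
        url='https://example.com/index.html',
        content_type='text/html',
        body=b'<html><head></head><body><a href="/about">About</a></body></html>',
    )
    # Relative links are resolved against the page URL, so this returns
    # ['https://example.com/about'].
    return extract_urls(response)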
--------------------------------------------------------------------------------
/starbelly/frontier.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | import logging
3 |
4 | from rethinkdb import RethinkDB
5 | from yarl import URL
6 |
7 | from .backoff import ExponentialBackoff
8 | from .downloader import DownloadRequest
9 |
10 |
11 | r = RethinkDB()
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | class FrontierExhaustionError(Exception):
16 | ''' Indicates that the frontier has no items remaining (in-flight or
17 | otherwise). '''
18 |
19 |
20 | @dataclass
21 | class FrontierItem:
22 | ''' Represents a resource that should be crawled. '''
23 | frontier_id: bytes
24 | job_id: bytes
25 | url: str
26 | cost: float
27 |
28 | @classmethod
29 | def from_doc(cls, doc):
30 | '''
31 | Create a frontier item from a database document.
32 |
33 | :param dict doc: A database document.
34 | '''
35 | return cls(doc['id'], doc['job_id'], doc['url'], doc['cost'])
36 |
37 |
38 | class CrawlFrontier:
39 | ''' Contains the logic for managing a crawl frontier, i.e. the URLs that
40 | have already been crawled and the URLs that are remaining to be crawled. '''
41 | def __init__(self, job_id, db, send_channel, login_manager, policy, stats):
42 | '''
43 | Constructor
44 |
45 | :param str job_id: The ID of the job corresponding to this frontier.
46 | :param starbelly.db.CrawlFrontierDb db: A database layer.
47 | :param trio.SendChannel send_channel: This channel is used to send
48 | ``FrontierItem`` that need to be downloaded, e.g. to send to the
49 | rate limiter.
50 | :param starbelly.login.LoginManager login_manager: Used when the
51 | frontier sees an unauthenticated domain and needs to log in.
52 | :param starbelly.policy.Policy policy: The policy to use.
53 | :param dict stats: A dictionary of crawl statistics.
54 | '''
55 | self._job_id = job_id
56 | self._db = db
57 | self._send_channel = send_channel
58 | self._login_manager = login_manager
59 | self._policy = policy
60 | self._authenticated_domains = set()
61 | self._stats = stats
62 |
63 | def __repr__(self):
64 | ''' Include job ID in the repr. '''
65 | return '<CrawlFrontier job_id={}>'.format(self._job_id[:8])
66 |
67 | async def run(self):
68 | '''
69 | This task takes items off the frontier and sends them to the rate
70 | limiter.
71 |
72 | :returns: This function runs until cancelled.
73 | '''
74 | await self._initialize()
75 |
76 | while True:
77 | frontier_items = await self._get_batch()
78 | for item in frontier_items:
79 | if self._policy.authentication.is_enabled():
80 | domain = URL(item.url).host
81 | if domain not in self._authenticated_domains:
82 | await self._login_manager.login(domain)
83 | self._authenticated_domains.add(domain)
84 | logger.debug('%r Sending: %r', self, item)
85 | request = DownloadRequest.from_frontier_item(item)
86 | await self._send_channel.send(request)
87 |
88 | async def _initialize(self):
89 | ''' Initialize frontier database documents. '''
90 |
91 | size = await self._db.get_frontier_size(self._job_id)
92 | logger.info('%r Initialization complete (size=%d)', self, size)
93 | self._stats['frontier_size'] = size
94 |
95 | async def _get_batch(self, size=10):
96 | '''
97 | Get a batch of items from the frontier table, ordered by ascending cost.
98 | If no items are available, poll the database until items become available.
99 |
100 | :param int size:
101 | :returns: A batch of frontier items.
102 | :rtype: list[FrontierItem]
103 | '''
104 | backoff = ExponentialBackoff(min_=1, max_=16)
105 | async for _ in backoff:
106 | docs = await self._db.get_frontier_batch(self._job_id, size)
107 | if docs:
108 | self._stats['frontier_size'] -= len(docs)
109 | break
110 | else:
111 | if await self._db.any_in_flight(self._job_id):
112 | backoff.increase()
113 | else:
114 | raise FrontierExhaustionError()
115 |
116 | return [FrontierItem.from_doc(doc) for doc in docs]
117 |
--------------------------------------------------------------------------------
/starbelly/login.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from functools import partial
3 | import logging
4 | import random
5 |
6 | import aiohttp
7 | import cchardet
8 | import formasaurus
9 | import trio
10 | import trio_asyncio
11 | import w3lib.encoding
12 | from yarl import URL
13 |
14 | from .downloader import DownloadRequest
15 |
16 |
17 | logger = logging.getLogger(__name__)
18 | chardet = lambda s: cchardet.detect(s).get("encoding")
19 |
20 |
21 | def get_captcha_image_element(form):
22 | """
22 | Return the <img> element in an lxml form that contains the CAPTCHA.
24 |
25 | NOTE: This assumes the first image in the form is the CAPTCHA image. If
26 | a form has multiple images, maybe use the etree .sourceline attribute to
27 | figure out which image is closer to the CAPTCHA input? Or crawl through
28 | the element tree to find the image?
29 |
30 | :param form: An lxml form element.
31 | :returns: An lxml image element.
32 | """
33 | img_el = form.find(".//img")
34 | if img_el is None:
35 | raise Exception("Cannot locate CAPTCHA image")
36 | return img_el
37 |
38 |
39 | def select_login_fields(fields):
40 | """
41 | Select the fields with the highest probability for the username, password, and captcha classes.
42 |
43 | :param dict fields: Nested dictionary containing label probabilities
44 | for each form element.
45 | :returns: (username field, password field, captcha field)
46 | :rtype: tuple
47 | """
48 | username_field = None
49 | username_prob = 0
50 | password_field = None
51 | password_prob = 0
52 | captcha_field = None
53 | captcha_prob = 0
54 |
55 | for field_name, labels in fields.items():
56 | for label, prob in labels.items():
57 | if label in ("username", "username or email") and prob > username_prob:
58 | username_field = field_name
59 | username_prob = prob
60 | elif label == "password" and prob > password_prob:
61 | password_field = field_name
62 | password_prob = prob
63 | elif label == "captcha" and prob > captcha_prob:
64 | captcha_field = field_name
65 | captcha_prob = prob
66 |
67 | return username_field, password_field, captcha_field
68 |
69 |
70 | def select_login_form(forms):
71 | """
72 | Select the form with the highest probability of being a login form.
73 |
74 | :param dict forms: Nested dict containing label probabilities for each
75 | form.
76 | :returns: (login form, login meta)
77 | :rtype: tuple
78 | """
79 | login_form = None
80 | login_meta = None
81 | login_prob = 0
82 |
83 | for form, meta in forms:
84 | for type_, prob in meta["form"].items():
85 | if type_ == "login" and prob > login_prob:
86 | login_form = form
87 | login_meta = meta
88 | login_prob = prob
89 |
90 | return login_form, login_meta
91 |
92 |
93 | class LoginManager:
94 | def __init__(self, job_id, db, policy, downloader):
95 | """
96 | Constructor
97 |
98 | :param starbelly.db.LoginDb db: A database layer.
99 | """
100 | self._job_id = job_id
101 | self._db = db
102 | self._policy = policy
103 | self._downloader = downloader
104 |
105 | async def login(self, domain):
106 | """
107 | Attempt a login for the given domain.
108 |
109 | :param str domain: The domain to log into.
110 | """
111 | domain_login = await self._db.get_login(domain)
112 | if domain_login is None:
113 | return
114 |
115 | # Ensure login has users
116 | if not domain_login.get("users", []):
117 | logger.warning("No users for login: %s", domain_login)
118 | return
119 |
120 | # Select random user from domain_login
121 | user = random.choice(domain_login["users"])
122 | masked_pass = user["password"][:2] + "******"
123 | logger.info(
124 | "Attempting login: domain=%s with user=%s password=%s",
125 | domain,
126 | user["username"],
127 | masked_pass,
128 | )
129 | request = DownloadRequest(
130 | frontier_id=None,
131 | job_id=self._job_id,
132 | method="GET",
133 | url=domain_login["login_url"],
134 | form_data=None,
135 | cost=1.0,
136 | )
137 | response = await self._downloader.download(request)
138 | if not response.is_success:
139 | logger.error("Login aborted: cannot fetch %s", response.url)
140 | return
141 | try:
142 | action, method, data = await self._get_login_form(
143 | response, user["username"], user["password"]
144 | )
145 | except Exception as e:
146 | logger.exception("Cannot parse login form: %s", e)
147 | return
148 | logger.info("Login action=%s method=%s data=%r", action, method, data)
149 | request = DownloadRequest(
150 | frontier_id=None,
151 | job_id=self._job_id,
152 | method=method,
153 | url=action,
154 | form_data=data,
155 | cost=1.0,
156 | )
157 | response = await self._downloader.download(request)
158 | if not response.is_success:
159 | logger.error(
160 | "Login failed action=%s (see downloader log for" " details)", action
161 | )
162 |
163 | async def _download_captcha_image(self, img_src):
164 | """
165 | Download and return a CAPTCHA image.
166 |
167 | :param str img_src: The URL to download the image from.
168 | :rtype: bytes
169 | """
170 | logger.info("Downloading CAPTCHA image src=%s", img_src)
171 | request = DownloadRequest(
172 | frontier_id=None,
173 | job_id=None,
174 | method="GET",
175 | url=img_src,
176 | form_data=None,
177 | cost=0,
178 | )
179 | response = await self._downloader.download(request)
180 |
181 | if response.status_code == 200 and response.body is not None:
182 | img_data = response.body
183 | else:
184 | raise Exception("Failed to download CAPTCHA image src={}".format(img_src))
185 |
186 | return img_data
187 |
188 | async def _get_login_form(self, response, username, password):
189 | """
190 | Attempt to extract login form action and form data from a response,
191 | substituting the provided ``username`` and ``password`` into the
192 | corresponding fields. Returns the data needed to POST a login request.
193 |
194 | :param starbelly.downloader.DownloadResponse response:
195 | :param str username: The username to log in with.
196 | :param str password: The password to log in with.
197 | :returns: (action, method, fields)
198 | :rtype: tuple
199 | """
200 | _, html = w3lib.encoding.html_to_unicode(
201 | response.content_type, response.body, auto_detect_fun=chardet
202 | )
203 |
204 | forms = await trio.run_sync_in_worker_thread(
205 | partial(formasaurus.extract_forms, html, proba=True)
206 | )
207 | form, meta = select_login_form(forms)
208 |
209 | if form is None:
210 | raise Exception("Can't find login form")
211 |
212 | login_field, password_field, captcha_field = select_login_fields(meta["fields"])
213 | if login_field is None or password_field is None:
214 | raise Exception("Can't find username/password fields")
215 |
216 | form.fields[login_field] = username
217 | form.fields[password_field] = password
218 |
219 | if captcha_field is not None:
220 | if self._policy.captcha_solver is None:
221 | raise Exception(
222 | "CAPTCHA required for login url={} but there is"
223 | " no CAPTCHA solver available".format(response.url)
224 | )
225 |
226 | img_el = get_captcha_image_element(form)
227 | img_src = str(URL(response.url).join(URL(img_el.get("src"))))
228 | img_data = await self._download_captcha_image(img_src)
229 | captcha_text = await self._solve_captcha_asyncio(img_data)
230 | form.fields[captcha_field] = captcha_text
231 |
232 | form_action = URL(response.url).join(URL(form.action))
233 | return form_action, form.method, dict(form.fields)
234 |
235 | @trio_asyncio.aio_as_trio
236 | async def _solve_captcha_asyncio(self, img_data):
237 | """
238 | Send an image CAPTCHA to an external solver and return the solution.
239 | This function uses aiohttp and therefore must run on the asyncio loop.
240 |
241 | :param bytes img_data: The CAPTCHA image.
242 | :rtype: str
243 | """
244 | solver = self._policy.captcha_solver
245 | solution = None
246 | task_url = str(URL(solver.service_url).join(URL("createTask")))
247 | poll_url = str(URL(solver.service_url).join(URL("getTaskResult")))
248 |
249 | # This doesn't use the downloader object because this is a third party
250 | # and is not the subject of our crawl.
251 | async with aiohttp.ClientSession() as session:
252 | # Send CAPTCHA task to service
253 | command = solver.get_command(img_data)
254 | async with session.post(task_url, json=command) as response:
255 | result = await response.json()
256 | if result["errorId"] != 0:
257 | raise Exception("CAPTCHA API error {}".format(result["errorId"]))
258 | task_id = result["taskId"]
259 | logger.info("Sent image to CAPTCHA API task_id=%d", task_id)
260 |
261 | # Poll for task completion. (Try 6 times.)
262 | solution = None
263 | for attempt in range(6):
264 | await asyncio.sleep(5)
265 | command = {
266 | "clientKey": solver.api_key,
267 | "taskId": task_id,
268 | }
269 | logger.info(
270 | "Polling for CAPTCHA solution task_id=%d," " attempt=%d",
271 | task_id,
272 | attempt + 1,
273 | )
274 | async with session.post(poll_url, json=command) as response:
275 | result = await response.json()
276 | if result["errorId"] != 0:
277 | raise Exception(
278 | "CAPTCHA API error {}".format(result["errorId"])
279 | )
280 | solution = result["solution"]["text"]
281 | break
282 |
283 | if solution is None:
284 | raise Exception("CAPTCHA API never completed task")
285 |
286 | return solution
287 |
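# A sketch of the field selection helper with made-up formasaurus output.
# Each candidate field maps to label probabilities, and the helper picks the
# most likely field for each role.
def example_field_selection():
    fields = {
        'user': {'username': 0.93, 'password': 0.01},
        'pass': {'password': 0.97},
        'code': {'captcha': 0.88},
    }
    # Returns ('user', 'pass', 'code').
    return select_login_fields(fields)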
--------------------------------------------------------------------------------
/starbelly/resource_monitor.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 | from datetime import datetime, timezone
3 | import logging
4 |
5 | import psutil
6 | import trio
7 |
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | class ResourceMonitor:
13 | '''
14 | Keep track of consumption and usage statistics for various resources.
15 | '''
16 | def __init__(self, interval, buffer_size, crawl_resources_fn, rate_limiter):
17 | '''
18 | Constructor.
19 |
20 | :param float interval: The number of seconds to wait between
21 | measurements.
22 | :param int buffer_size: The number of measurements to store in the
23 | internal buffer.
24 | :param callable crawl_resources_fn: A function that will return a dict
25 | of crawl resources.
26 | :param starbelly.rate_limiter.RateLimiter rate_limiter:
27 | '''
28 | self._interval = interval
29 | self._crawl_resources_fn = crawl_resources_fn
30 | self._rate_limiter = rate_limiter
31 | self._measurements = deque(maxlen=buffer_size)
32 | self._channels = list()
33 |
34 | def get_channel(self, channel_size):
35 | '''
36 | Get a statistics channel. The resource monitor will send measurements to
37 | this channel until the receive end is closed. Note that if the channel
38 | is full, the resource monitor does not block! It will drop messages
39 | instead.
40 |
41 | :param int channel_size: The size of the channel to create.
42 | :returns: A channel that will receive resource statistics at regular
43 | intervals.
44 | :rtype: trio.ReceiveChannel
45 | '''
46 | logger.debug('Creating new channel with size=%d', channel_size)
47 | send_channel, recv_channel = trio.open_memory_channel(channel_size)
48 | self._channels.append(send_channel)
49 | return recv_channel
50 |
51 | def history(self, n=None):
52 | '''
53 | Return the most recent ``n`` measurements.
54 |
55 | :param int n: The number of measurements to retrieve. If ``n`` is None
56 | or there are fewer than ``n`` measurements, return all measurements.
57 | :rtype: list
58 | '''
59 | # A deque can't be sliced, so we have to do some extra work to return
60 | # the most recent measurements from the end.
61 | history_iter = iter(self._measurements)
62 | if n is not None:
63 | for _ in range(len(self._measurements) - n):
64 | next(history_iter)
65 | return list(history_iter)
66 |
67 | async def run(self):
68 | '''
69 | Run the resource monitor.
70 |
71 | :returns: Runs until cancelled.
72 | '''
73 | next_run = trio.current_time() + self._interval
74 | while True:
75 | measurement = self._measure()
76 | self._measurements.append(measurement)
77 | to_remove = set()
78 | for channel in self._channels:
79 | try:
80 | channel.send_nowait(measurement)
81 | except trio.WouldBlock:
82 | continue
83 | except trio.BrokenResourceError:
84 | to_remove.add(channel)
85 | for channel in to_remove:
86 | logger.debug('Removing closed channel')
87 | self._channels.remove(channel)
88 | sleep_time = next_run - trio.current_time()
89 | while sleep_time < 0:
90 | sleep_time += self._interval
91 | await trio.sleep(sleep_time)
92 | next_run += self._interval
93 |
94 | def _measure(self):
95 | '''
96 | Record one set of measurements.
97 |
98 | :rtype: dict
99 | '''
100 | measurement = dict()
101 | measurement['timestamp'] = datetime.now(timezone.utc)
102 |
103 | # CPUs
104 | measurement['cpus'] = psutil.cpu_percent(percpu=True)
105 |
106 | # Memory
107 | vm = psutil.virtual_memory()
108 | measurement['memory_used'] = vm.used
109 | measurement['memory_total'] = vm.total
110 |
111 | # Disks
112 | measurement['disks'] = list()
113 | for partition in psutil.disk_partitions():
114 | disk = dict()
115 | disk['mount'] = partition.mountpoint
116 | usage = psutil.disk_usage(disk['mount'])
117 | disk['used'] = usage.used
118 | disk['total'] = usage.total
119 | measurement['disks'].append(disk)
120 |
121 | # Networks
122 | measurement['networks'] = list()
123 | for name, nic in psutil.net_io_counters(pernic=True).items():
124 | net = dict()
125 | net['name'] = name
126 | net['sent'] = nic.bytes_sent
127 | net['received'] = nic.bytes_recv
128 | measurement['networks'].append(net)
129 |
130 | # Crawl Job Resources
131 | measurement['jobs'] = list()
132 | crawl_resources = self._crawl_resources_fn()
133 | for job in crawl_resources['jobs']:
134 | measurement['jobs'].append(job.copy())
135 |
136 | # Crawl Global Resources
137 | measurement['current_downloads'] = crawl_resources['current_downloads']
138 | measurement['maximum_downloads'] = crawl_resources['maximum_downloads']
139 | measurement['rate_limiter'] = self._rate_limiter.item_count
140 |
141 | return measurement
142 |
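# A sketch of consuming the measurement stream from another task. The keys
# read here are the ones produced by _measure() above.
async def example_log_memory(resource_monitor):
    recv_channel = resource_monitor.get_channel(channel_size=10)
    async with recv_channel:
        async for measurement in recv_channel:
            used_mib = measurement['memory_used'] / (1024 * 1024)
            logger.info('Memory used: %.1f MiB', used_mib)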
--------------------------------------------------------------------------------
/starbelly/robots.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from datetime import datetime, timezone
3 | import logging
4 |
5 | from robotexclusionrulesparser import RobotExclusionRulesParser
6 | from rethinkdb import RethinkDB
7 | from yarl import URL
8 | import trio
9 |
10 | from .downloader import DownloadRequest
11 |
12 |
13 | r = RethinkDB()
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | class RobotsTxtManager:
18 | ''' Store and manage robots.txt files. '''
19 | def __init__(self, db_pool, max_age=24*60*60, max_cache=1e3):
20 | '''
21 | Constructor.
22 |
23 | :param db_pool: A DB connection pool.
24 | :param int max_age: The maximum age before a robots.txt is downloaded
25 | again.
26 | :param int max_cache: The maximum number of robots.txt files to cache
27 | in memory.
28 | '''
29 | self._db_pool = db_pool
30 | self._events = dict()
31 | self._cache = OrderedDict()
32 | self._max_age = max_age
33 | self._max_cache = max_cache
34 |
35 | async def is_allowed(self, url, policy, downloader):
36 | '''
37 | Return True if ``url`` is allowed by the applicable robots.txt file.
38 |
39 | This fetches the applicable robots.txt if we don't have a recent copy
40 | of it cached in memory or in the database. The ``policy`` is used if a
41 | robots.txt file needs to be fetched from the network.
42 |
43 | :param str url: Check this URL to see if the robots.txt and accompanying
44 | policy permit access to it.
45 | :param Policy policy:
46 | :param Downloader downloader:
47 | :rtype: bool
48 | '''
49 | if policy.robots_txt.usage == 'IGNORE':
50 | # No need to fetch robots.txt.
51 | return True
52 |
53 | robots_url = str(URL(url).with_path('robots.txt')
54 | .with_query(None)
55 | .with_fragment(None))
56 |
57 | # Check if cache has a current copy of robots.txt.
58 | try:
59 | robots = self._cache[robots_url]
60 | if robots.is_older_than(self._max_age):
61 | del self._cache[robots_url]
62 | robots = None
63 | else:
64 | self._cache.move_to_end(robots_url)
65 | except KeyError:
66 | robots = None
67 |
68 | # Do we need to fetch robots into cache?
69 | if robots is None:
70 | try:
71 | # If another task is fetching it, then just wait for that task.
72 | await self._events[robots_url].wait()
73 | robots = self._cache[robots_url]
74 | except KeyError:
75 | # Create a new task to fetch it.
76 | self._events[robots_url] = trio.Event()
77 | robots = await self._get_robots(robots_url, downloader)
78 | event = self._events.pop(robots_url)
79 | event.set()
80 |
81 | # Note: we only check the first user agent.
82 | user_agent = policy.user_agents.get_first_user_agent()
83 | robots_decision = robots.is_allowed(user_agent, url)
84 | if policy.robots_txt.usage == 'OBEY':
85 | return robots_decision
86 | return not robots_decision
87 |
88 | async def _get_robots(self, robots_url, downloader):
89 | '''
90 | Locate and return a robots.txt file.
91 |
92 | Look for a non-expired robots.txt file, first in the database and then on
93 | the network. Wherever the robots file is found, it is placed into the
94 | cache and then returned.
95 |
96 | If we get a copy from the network, then we also store a copy in the
97 | database. If we cannot get a copy from the network (e.g. 404 error) and
98 | we have a database copy, then we update the database copy's expiration.
99 | If we cannot get a copy from database or network, then we create a
100 | permissive robots.txt and use that instead.
101 |
102 | :param str robots_url: Fetch the file at this URL.
103 | :param Downloader downloader:
104 | :rtype: RobotsTxt
105 | '''
106 | # Check DB. If not there (or expired), check network.
107 | now = datetime.now(timezone.utc)
108 | robots_doc = await self._get_robots_from_db(robots_url)
109 |
110 | if robots_doc is None or \
111 | (now - robots_doc['updated_at']).total_seconds() > self._max_age:
112 | robots_file = await self._get_robots_from_net(robots_url,
113 | downloader)
114 | else:
115 | robots_file = None
116 |
117 | if robots_doc is None:
118 | # No local copy: create a new local copy. If robots_file is None, it
119 | # will be treated as a permissive RobotsTxt.
120 | logger.info('Saving new robots.txt file: %s', robots_url)
121 | robots_doc = {
122 | 'file': robots_file,
123 | 'updated_at': now,
124 | 'url': robots_url,
125 | }
126 | robots = RobotsTxt(robots_doc)
127 | else:
128 | # If we have a network copy, use that to update local copy.
129 | # Otherwise, just update the local copy's timestamp.
130 | robots = RobotsTxt(robots_doc)
131 | logger.info('Updating robots.txt file: %s', robots_url)
132 | if robots_file is not None:
133 | robots_doc['file'] = robots_file
134 | else:
135 | del robots_doc['file']
136 |
137 | robots_doc['updated_at'] = now
138 | del robots_doc['url']
139 |
140 | # Upsert robots_docs.
141 | await self._save_robots_to_db(robots_doc)
142 |
143 | # Add to cache before setting the event to avoid a race condition.
144 | self._cache[robots_url] = robots
145 | self._cache.move_to_end(robots_url)
146 | if len(self._cache) > self._max_cache:
147 | self._cache.popitem(last=False)
148 | return robots
149 |
150 | async def _get_robots_from_db(self, robots_url):
151 | '''
152 | Get robots document from the database.
153 |
154 | Returns None if it doesn't exist in the database.
155 |
156 | :param str robots_url: The URL of the robots.txt file.
157 | :returns: A database document.
158 | :rtype: dict
159 | '''
160 | query = r.table('robots_txt').get_all(robots_url, index='url').nth(0)
161 |
162 | async with self._db_pool.connection() as conn:
163 | try:
164 | db_robots = await query.run(conn)
165 | except r.ReqlNonExistenceError:
166 | db_robots = None
167 |
168 | return db_robots
169 |
170 | async def _get_robots_from_net(self, robots_url, downloader):
171 | '''
172 | Get robots.txt file from the network.
173 |
174 | Returns None if the file cannot be fetched (e.g. 404 error).
175 |
176 | :param str robots_url: Fetch the robots.txt file at this URL.
177 | :param Downloader downloader:
178 | :returns: Contents of robots.txt file or None if it couldn't be
179 | downloaded.
180 | :rtype: str
181 | '''
182 |
183 | logger.info('Fetching robots.txt: %s', robots_url)
184 | request = DownloadRequest(frontier_id=None, job_id=None, method='GET',
185 | url=robots_url, form_data=None, cost=0)
186 | response = await downloader.download(request, skip_mime=True)
187 |
188 | if response.status_code == 200 and response.body is not None:
189 | # There are no invalid byte sequences in latin1 encoding, so this
190 | # should always succeed.
191 | robots_file = response.body.decode('latin1')
192 | else:
193 | robots_file = None
194 |
195 | return robots_file
196 |
197 | async def _save_robots_to_db(self, robots_doc):
198 | async with self._db_pool.connection() as conn:
199 | await (
200 | r.table('robots_txt')
201 | .insert(robots_doc, conflict='update')
202 | .run(conn)
203 | )
204 |
205 |
206 | class RobotsTxt:
207 | '''
208 | Wrapper around robots.txt parser that adds the date the file was fetched.
209 |
210 | If the ``robots_file`` is None or cannot be parsed, then it's treated as a
211 | highly permissive robots.txt.
212 | '''
213 | def __init__(self, robots_doc):
214 | ''' Initialize from database document representation. '''
215 | self._updated_at = robots_doc['updated_at']
216 | self._robots = RobotExclusionRulesParser()
217 |
218 | if robots_doc['file'] is not None:
219 | # The parser never throws an exception, it just ignores things that
220 | # it doesn't understand.
221 | self._robots.parse(robots_doc['file'])
222 |
223 | def is_allowed(self, user_agent, url):
224 | '''
225 | Return True if ``url`` is allowed by this robots.txt file.
226 |
227 |         :param str user_agent: The user agent that wants to access the URL.
228 | :param str url: The URL that the user agent wants to access.
229 | :rtype: bool
230 | '''
231 | return self._robots.is_allowed(user_agent, url)
232 |
233 | def is_older_than(self, age):
234 | '''
235 | Return True if this robots file is older than ``age``.
236 |
237 |         :param float age: The age threshold, in seconds.
238 | :rtype: bool
239 | '''
240 |         return (datetime.now(timezone.utc) - self._updated_at).total_seconds() >= age
241 |
--------------------------------------------------------------------------------
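
A minimal usage sketch (not part of the repository) of the permissive fallback described in the docstrings above: a RobotsTxt built from a document whose 'file' field is None allows every URL. The user agent string and URL below are arbitrary assumptions.

    from datetime import datetime, timezone
    from starbelly.robots import RobotsTxt

    # A database document with no cached robots.txt body (e.g. after a 404).
    doc = {'file': None, 'updated_at': datetime.now(timezone.utc)}
    robots = RobotsTxt(doc)
    # With no rules parsed, every request is allowed.
    assert robots.is_allowed('TestAgent/1.0', 'https://example.com/private/page')
    # A freshly created wrapper is not older than one hour (3600 seconds).
    assert not robots.is_older_than(3600)
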
/starbelly/server/__init__.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from datetime import datetime, timedelta
3 | from functools import partial
4 | import gzip
5 | import inspect
6 | import logging
7 | import operator
8 | from time import time
9 | from urllib.parse import urlparse
10 |
11 | import dateutil.parser
12 | from dateutil.tz import tzlocal
13 | from google.protobuf.message import DecodeError
14 | import rethinkdb as r
15 | from rethinkdb.errors import ReqlNonExistenceError
16 | from trio_websocket import ConnectionClosed, serve_websocket
17 | import trio
18 |
19 | from starbelly.subscription import SubscriptionManager
20 | from starbelly.starbelly_pb2 import Request, Response, ServerMessage
21 |
22 | # Define API handler decorator before importing API implementations, because
23 | # those implementations use this decorator to register themselves with the
24 | # server.
25 | _handlers = dict()
26 | def api_handler(handler):
27 | '''
28 | This decorator registers a function as a callable command through the
29 | API server.
30 | '''
31 | _handlers[handler.__name__] = handler
32 | return handler
33 |
34 |
35 | class InvalidRequestException(Exception):
36 | ''' Indicates a request is invalid. '''
37 |
38 |
39 | # pylint: disable=cyclic-import, wrong-import-position
40 | from .captcha import *
41 | from .job import *
42 | from .login import *
43 | from .policy import *
44 | from .rate_limit import *
45 | from .schedule import *
46 | from .subscription import *
47 | from .system import *
48 |
49 |
50 | logger = logging.getLogger(__name__)
51 |
52 |
53 | class Server:
54 | ''' Handles websocket connections from clients and command dispatching. '''
55 |
56 | def __init__(self, host, port, server_db, subscription_db, crawl_manager,
57 | rate_limiter, resource_monitor, stats_tracker, scheduler):
58 | '''
59 | Constructor
60 |
61 | :param str host: The hostname to serve on.
62 | :param int port: The port to serve on, or zero to automatically pick a
63 | port.
64 | :param starbelly.db.ServerDb server_db:
65 | :param starbelly.db.SubscriptionDb subscription_db:
66 | :param starbelly.job.CrawlManager crawl_manager:
67 |         :param starbelly.rate_limiter.RateLimiter rate_limiter:
68 | :param starbelly.resource_monitor.ResourceMonitor resource_monitor:
69 | :param starbelly.job.StatsTracker stats_tracker:
70 | :param starbelly.schedule.Scheduler scheduler:
71 | '''
72 | self._host = host
73 | self._port = port
74 | self._server_db = server_db
75 | self._subscription_db = subscription_db
76 | self._crawl_manager = crawl_manager
77 | self._rate_limiter = rate_limiter
78 | self._resource_monitor = resource_monitor
79 | self._stats_tracker = stats_tracker
80 | self._scheduler = scheduler
81 |
82 | @property
83 | def port(self):
84 | return self._port
85 |
86 | async def run(self, *, task_status=trio.TASK_STATUS_IGNORED):
87 | '''
88 | Run the websocket server.
89 |
90 | To ensure that the server is ready, call ``await
91 | nursery.start(server.run)``.
92 |
93 | :returns: Runs until cancelled.
94 | '''
95 | logger.info('Starting server on %s:%d', self._host, self._port)
96 | async with trio.open_nursery() as nursery:
97 | serve_fn = partial(serve_websocket, self._handle_connection,
98 | self._host, self._port, ssl_context=None,
99 | handler_nursery=nursery)
100 | server = await nursery.start(serve_fn, name='Connection Listener')
101 | self._port = server.port
102 | task_status.started()
103 | logger.info('Server stopped')
104 |
105 | async def _handle_connection(self, request):
106 | '''
107 | Handle an incoming connection.
108 |
109 | :param request: A WebSocket connection request.
110 | '''
111 | headers = dict(request.headers)
112 | ws = await request.accept()
113 | client = '{}:{}'.format(ws.remote.address, ws.remote.port)
114 | logger.info('Connection opened: client=%s path=%s', client, ws.path)
115 | connection = Connection(client, ws, self._server_db,
116 | self._subscription_db, self._crawl_manager, self._rate_limiter,
117 | self._resource_monitor, self._stats_tracker, self._scheduler)
118 | await connection.run()
119 |
120 |
121 | class Connection:
122 | def __init__(self, client, ws, server_db, subscription_db, crawl_manager,
123 | rate_limiter, resource_monitor, stats_tracker, scheduler):
124 | '''
125 | Constructor.
126 |
127 |         :param str client: Description of the client (IP address and port) that
128 | opened this connection.
129 | :param trio_websocket.WebSocketConnection ws: A websocket connection.
130 |         :param starbelly.db.ServerDb server_db: A database layer.
131 |         :param starbelly.db.SubscriptionDb subscription_db: A database layer.
132 |         :param starbelly.job.CrawlManager crawl_manager: A crawl manager.
133 |         :param starbelly.rate_limiter.RateLimiter rate_limiter: A rate limiter.
134 | :param starbelly.resource_monitor.ResourceMonitor resource_monitor: A
135 | resource monitor.
136 | :param starbelly.schedule.Scheduler scheduler: A scheduler.
137 |         :param starbelly.job.StatsTracker stats_tracker: A stats tracker.
138 |
139 |         A SubscriptionManager is created internally by ``run()``.
140 | '''
141 | self._client = client
142 | self._ws = ws
143 | self._server_db = server_db
144 | self._subscription_db = subscription_db
145 | self._crawl_manager = crawl_manager
146 | self._rate_limiter = rate_limiter
147 | self._resource_monitor = resource_monitor
148 | self._scheduler = scheduler
149 | self._subscription_db = subscription_db
150 | self._nursery = None
151 | self._stats_tracker = stats_tracker
152 | self._subscription_manager = None
153 |
154 | async def run(self):
155 | '''
156 | Run the connection: read requests and send responses.
157 |
158 | This opens an internal nursery in case background tasks, like
159 | subscriptions, need to be started.
160 |
161 | :returns: This runs until the connection is closed.
162 | '''
163 | try:
164 | async with trio.open_nursery() as nursery:
165 | self._nursery = nursery
166 | self._subscription_manager = SubscriptionManager(
167 | self._subscription_db, nursery, self._ws)
168 | while True:
169 | request_data = await self._ws.get_message()
170 | nursery.start_soon(self._handle_request, request_data,
171 | name='Request Handler')
172 | except ConnectionClosed:
173 | logger.info('Connection closed for %s', self._client)
174 | except:
175 | logger.exception('Connection exception')
176 | finally:
177 | await self._ws.aclose()
178 |
179 | async def _handle_request(self, request_data):
180 | '''
181 | Handle a single API request.
182 |
183 |         :param bytes request_data: A serialized protobuf request.
184 | '''
185 | start = trio.current_time()
186 | message = ServerMessage()
187 | message.response.is_success = False
188 | request = None
189 |
190 | try:
191 | # Prepare response.
192 | request = Request.FromString(request_data)
193 | message.response.request_id = request.request_id
194 |
195 | # Find an appropriate handler.
196 | command_name = request.WhichOneof('Command')
197 | if command_name is None:
198 | raise InvalidRequestException('No command specified')
199 | command = getattr(request, command_name)
200 | try:
201 | handler = _handlers[command_name]
202 | except KeyError:
203 | raise InvalidRequestException('Invalid command name: {}'
204 | .format(command_name)) from None
205 |
206 | # Inject dependencies into argument list, then call the handler.
207 | argspec = inspect.getfullargspec(handler)
208 | args = list()
209 | for var in argspec[0]:
210 | if var == 'command':
211 | args.append(command)
212 | elif var == 'crawl_manager':
213 | args.append(self._crawl_manager)
214 | elif var == 'nursery':
215 | args.append(self._nursery)
216 | elif var == 'rate_limiter':
217 | args.append(self._rate_limiter)
218 | elif var == 'resource_monitor':
219 | args.append(self._resource_monitor)
220 | elif var == 'response':
221 | args.append(message.response)
222 | elif var == 'scheduler':
223 | args.append(self._scheduler)
224 | elif var == 'server_db':
225 | args.append(self._server_db)
226 | elif var == 'subscription_manager':
227 | args.append(self._subscription_manager)
228 | elif var == 'stats_tracker':
229 | args.append(self._stats_tracker)
230 | elif var == 'websocket':
231 | args.append(self._ws)
232 | else:
233 | raise Exception('Unknown dependency "{}" in handler {}()'
234 | .format(var, command_name))
235 |
236 | await handler(*args)
237 | message.response.is_success = True
238 | elapsed = trio.current_time() - start
239 | logger.info('Request OK %s %s %0.3fs', self._client,
240 | command_name, elapsed)
241 | except DecodeError:
242 | # Failure to decode a protobuf message means that the connection
243 | # is severely damaged; raise to the nursery so we can close the
244 | # entire connection.
245 | raise
246 | except InvalidRequestException as ire:
247 | error_message = str(ire)
248 | logger.error('Request ERROR %s %s (%s)', command_name,
249 | self._client, error_message)
250 | message.response.error_message = error_message
251 | except:
252 | logger.exception('Exception while handling request:\n%r',
253 | request)
254 | message.response.error_message = 'A server exception occurred'
255 |
256 | message_data = message.SerializeToString()
257 | await self._ws.send_message(message_data)
258 |
--------------------------------------------------------------------------------
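
An illustrative sketch (not from the repository) of the handler registration and dependency injection shown above: the decorator keys the handler by its function name, which must match a field in the Request protobuf's 'Command' oneof, and Connection._handle_request injects arguments by inspecting the handler's parameter names. The handler below is hypothetical and would not be dispatched unless a matching command field existed.

    from starbelly.server import api_handler

    @api_handler
    async def example_command(command, response, server_db):
        # 'command' receives the matching protobuf sub-message, 'response' the
        # outgoing Response message, and 'server_db' the ServerDb layer; any
        # parameter name not recognized by _handle_request raises an exception.
        pass
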
/starbelly/server/captcha.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | import logging
3 | from uuid import UUID
4 |
5 | from . import api_handler, InvalidRequestException
6 | from ..captcha import captcha_doc_to_pb, captcha_pb_to_doc
7 |
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | @api_handler
13 | async def delete_captcha_solver(command, server_db):
14 |     ''' Delete a CAPTCHA solver. '''
15 | solver_id = str(UUID(bytes=command.solver_id))
16 | try:
17 | await server_db.delete_captcha_solver(solver_id)
18 | except ValueError as ve:
19 | raise InvalidRequestException(str(ve)) from None
20 |
21 |
22 | @api_handler
23 | async def get_captcha_solver(command, response, server_db):
24 | ''' Get a CAPTCHA solver. '''
25 | solver_id = str(UUID(bytes=command.solver_id))
26 | doc = await server_db.get_captcha_solver(solver_id)
27 |
28 | if doc is None:
29 | raise InvalidRequestException('No CAPTCHA solver found for that ID')
30 |
31 | response.solver.CopyFrom(captcha_doc_to_pb(doc))
32 |
33 |
34 | @api_handler
35 | async def list_captcha_solvers(command, response, server_db):
36 | ''' Return a list of CAPTCHA solvers. '''
37 | limit = command.page.limit
38 | offset = command.page.offset
39 | count, docs = await server_db.list_captcha_solvers(limit, offset)
40 |
41 | for doc in docs:
42 | solver = response.list_captcha_solvers.solvers.add()
43 | solver.CopyFrom(captcha_doc_to_pb(doc))
44 |
45 | response.list_captcha_solvers.total = count
46 |
47 |
48 | @api_handler
49 | async def set_captcha_solver(command, response, server_db):
50 | ''' Create or update CAPTCHA solver. '''
51 | now = datetime.now(timezone.utc)
52 | doc = captcha_pb_to_doc(command.solver)
53 | new_id = await server_db.set_captcha_solver(doc, now)
54 | if new_id:
55 | response.new_solver.solver_id = UUID(new_id).bytes
56 |
--------------------------------------------------------------------------------
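
The handlers above (and the job, policy, and schedule handlers later in this dump) repeatedly convert between the 16-byte UUIDs used in protobuf messages and the string form stored in the database. A standalone sketch of that round trip, using an arbitrary example ID:

    from uuid import UUID

    solver_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
    solver_id_bytes = UUID(solver_id).bytes   # value carried in a protobuf field
    assert len(solver_id_bytes) == 16
    assert str(UUID(bytes=solver_id_bytes)) == solver_id
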
/starbelly/server/job.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import logging
3 | from uuid import UUID
4 | from yarl import URL
5 |
6 | import dateutil.parser
7 |
8 | from . import api_handler, InvalidRequestException
9 | from ..policy import Policy
10 | from ..starbelly_pb2 import JobRunState as PbRunState
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | @api_handler
17 | async def delete_job(command, server_db, stats_tracker):
18 | """ Delete a job. """
19 | job_id = str(UUID(bytes=command.job_id))
20 | await server_db.delete_job(job_id)
21 | stats_tracker.delete_job(job_id)
22 |
23 |
24 | @api_handler
25 | async def get_job(command, response, server_db):
26 | """ Get status for a single job. """
27 | job_id = str(UUID(bytes=command.job_id))
28 | job_doc = await server_db.get_job(job_id)
29 | if not job_doc:
30 | raise InvalidRequestException(f"No job exists with ID={job_id}")
31 |
32 | job = response.job
33 | job.job_id = UUID(job_doc["id"]).bytes
34 | for seed in job_doc["seeds"]:
35 | job.seeds.append(seed)
36 | for tag in job_doc["tags"]:
37 | job.tags.append(tag)
38 | Policy.convert_doc_to_pb(job_doc["policy"], job.policy)
39 | job.name = job_doc["name"]
40 | job.item_count = job_doc["item_count"]
41 | job.http_success_count = job_doc["http_success_count"]
42 | job.http_error_count = job_doc["http_error_count"]
43 | job.exception_count = job_doc["exception_count"]
44 | job.started_at = job_doc["started_at"].isoformat()
45 | if job_doc["completed_at"] is not None:
46 | job.completed_at = job_doc["completed_at"].isoformat()
47 | run_state = job_doc["run_state"].upper()
48 | job.run_state = PbRunState.Value(run_state)
49 | http_status_counts = job_doc["http_status_counts"]
50 | for status_code, count in http_status_counts.items():
51 | job.http_status_counts[int(status_code)] = count
52 |
53 |
54 | @api_handler
55 | async def get_job_items(command, response, server_db):
56 | """ Get a page of items (crawl responses) from a job. """
57 | job_id = str(UUID(bytes=command.job_id))
58 | limit = command.page.limit
59 | offset = command.page.offset
60 | count, items = await server_db.get_job_items(
61 | job_id,
62 | limit,
63 | offset,
64 | command.include_success,
65 | command.include_error,
66 | command.include_exception,
67 | )
68 | response.list_items.total = count
69 | compression_ok = command.compression_ok
70 | for item_doc in items:
71 | item = response.list_items.items.add()
72 |
73 | if item_doc["join"] is None:
74 | item.is_compressed = False
75 | elif item_doc["join"]["is_compressed"] and not compression_ok:
76 | item.body = gzip.decompress(item_doc["join"]["body"])
77 | item.is_compressed = False
78 | else:
79 | item.body = item_doc["join"]["body"]
80 | item.is_compressed = item_doc["join"]["is_compressed"]
81 | if "content_type" in item_doc:
82 | item.content_type = item_doc["content_type"]
83 | if "exception" in item_doc:
84 | item.exception = item_doc["exception"]
85 | if "status_code" in item_doc:
86 | item.status_code = item_doc["status_code"]
87 | header_iter = iter(item_doc.get("headers", []))
88 | for key in header_iter:
89 | value = next(header_iter)
90 | header = item.headers.add()
91 | header.key = key
92 | header.value = value
93 | item.cost = item_doc["cost"]
94 | item.job_id = UUID(item_doc["job_id"]).bytes
95 | item.completed_at = item_doc["completed_at"].isoformat()
96 | item.started_at = item_doc["started_at"].isoformat()
97 | item.duration = item_doc["duration"]
98 | item.url = item_doc["url"]
99 | item.url_can = item_doc["canonical_url"]
100 | item.is_success = item_doc["is_success"]
101 |
102 |
103 | @api_handler
104 | async def list_jobs(command, response, server_db):
105 | """ Return a list of jobs. """
106 | limit = command.page.limit
107 | offset = command.page.offset
108 | if command.HasField("started_after"):
109 | started_after = dateutil.parser.parse(command.started_after)
110 | else:
111 | started_after = None
112 | tag = command.tag if command.HasField("tag") else None
113 | schedule_id = (
114 | str(UUID(bytes=command.schedule_id))
115 | if command.HasField("schedule_id")
116 | else None
117 | )
118 | count, jobs = await server_db.list_jobs(
119 | limit, offset, started_after, tag, schedule_id
120 | )
121 | response.list_jobs.total = count
122 |
123 | for job_doc in jobs:
124 | job = response.list_jobs.jobs.add()
125 | job.job_id = UUID(job_doc["id"]).bytes
126 | job.name = job_doc["name"]
127 | for seed in job_doc["seeds"]:
128 | job.seeds.append(seed)
129 | for tag in job_doc["tags"]:
130 | job.tags.append(tag)
131 | job.item_count = job_doc["item_count"]
132 | job.http_success_count = job_doc["http_success_count"]
133 | job.http_error_count = job_doc["http_error_count"]
134 | job.exception_count = job_doc["exception_count"]
135 | job.started_at = job_doc["started_at"].isoformat()
136 | if job_doc["completed_at"] is not None:
137 | job.completed_at = job_doc["completed_at"].isoformat()
138 | run_state = job_doc["run_state"].upper()
139 | job.run_state = PbRunState.Value(run_state)
140 | http_status_counts = job_doc["http_status_counts"]
141 | for status_code, count in http_status_counts.items():
142 | job.http_status_counts[int(status_code)] = count
143 |
144 |
145 | @api_handler
146 | async def set_job(command, crawl_manager, response):
147 | """ Create or update job metadata. """
148 | if command.HasField("job_id"):
149 | # Update run state of existing job.
150 | job_id = str(UUID(bytes=command.job_id))
151 | if command.HasField("run_state"):
152 | run_state = command.run_state
153 | if run_state == PbRunState.Value("CANCELLED"):
154 | await crawl_manager.cancel_job(job_id)
155 | elif run_state == PbRunState.Value("PAUSED"):
156 | await crawl_manager.pause_job(job_id)
157 | elif run_state == PbRunState.Value("RUNNING"):
158 | await crawl_manager.resume_job(job_id)
159 | else:
160 | raise InvalidRequestException(
161 | f"Not allowed to set job run state: {run_state}"
162 | )
163 | else:
164 | # Create new job.
165 | if not command.policy_id:
166 | raise InvalidRequestException('"policy_id" is required')
167 | if not command.seeds:
168 | raise InvalidRequestException('"seeds" is required')
169 | name = command.name
170 | policy_id = str(UUID(bytes=command.policy_id))
171 | seeds = [s.strip() for s in command.seeds]
172 | tags = [t.strip() for t in command.tags]
173 |
174 | if name.strip() == "":
175 | url = URL(seeds[0])
176 | name = url.host
177 | if len(seeds) > 1:
178 |                 name += " & {} more".format(len(seeds) - 1)
179 |
180 | job_id = await crawl_manager.start_job(name, seeds, tags, policy_id)
181 | response.new_job.job_id = UUID(job_id).bytes
182 |
--------------------------------------------------------------------------------
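
get_job_items above re-pairs response headers that are stored as a flat alternating list of keys and values (see CrawlStorage._save_response later in this dump). A standalone sketch of that iter()/next() pairing idiom, with made-up header data:

    flat_headers = ['CONTENT-TYPE', 'text/html', 'SERVER', 'nginx']
    header_iter = iter(flat_headers)
    # Each loop iteration consumes a key, and next() consumes its value.
    pairs = [(key, next(header_iter)) for key in header_iter]
    assert pairs == [('CONTENT-TYPE', 'text/html'), ('SERVER', 'nginx')]
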
/starbelly/server/login.py:
--------------------------------------------------------------------------------
1 | from . import api_handler, InvalidRequestException
2 |
3 |
4 | @api_handler
5 | async def delete_domain_login(command, server_db):
6 | ''' Delete a domain login and all of its users. '''
7 | await server_db.delete_domain_login(command.domain)
8 |
9 |
10 | @api_handler
11 | async def get_domain_login(command, response, server_db):
12 | ''' Get a domain login. '''
13 | domain = command.domain
14 | domain_login = await server_db.get_domain_login(domain)
15 | if domain_login is None:
16 | raise InvalidRequestException('No domain credentials found for'
17 | ' domain={}'.format(domain))
18 | response.domain_login.domain = domain_login['domain']
19 | response.domain_login.login_url = domain_login['login_url']
20 | if domain_login['login_test'] is not None:
21 | response.domain_login.login_test = domain_login['login_test']
22 |
23 | for user in domain_login['users']:
24 | dl_user = response.domain_login.users.add()
25 | dl_user.username = user['username']
26 | dl_user.password = user['password']
27 | dl_user.working = user['working']
28 |
29 |
30 | @api_handler
31 | async def list_domain_logins(command, response, server_db):
32 | ''' Return a list of domain logins. '''
33 | limit = command.page.limit
34 | offset = command.page.offset
35 | count, docs = await server_db.list_domain_logins(limit, offset)
36 | response.list_domain_logins.total = count
37 | for doc in docs:
38 | dl = response.list_domain_logins.logins.add()
39 | dl.domain = doc['domain']
40 | dl.login_url = doc['login_url']
41 | if doc['login_test'] is not None:
42 | dl.login_test = doc['login_test']
43 | for user_doc in doc['users']:
44 | user = dl.users.add()
45 | user.username = user_doc['username']
46 | user.password = user_doc['password']
47 | user.working = user_doc['working']
48 |
49 |
50 | @api_handler
51 | async def set_domain_login(command, server_db):
52 | ''' Create or update a domain login. '''
53 | domain_login = command.login
54 |
55 | if not domain_login.HasField('domain'):
56 | raise InvalidRequestException('domain is required.')
57 |
58 | domain = domain_login.domain
59 | doc = await server_db.get_domain_login(domain)
60 | if doc is None:
61 | if not domain_login.HasField('login_url'):
62 | raise InvalidRequestException('login_url is required to'
63 | ' create a domain login.')
64 | doc = {
65 | 'domain': domain,
66 | 'login_url': domain_login.login_url,
67 | 'login_test': None,
68 | }
69 |
70 | if domain_login.HasField('login_url'):
71 | doc['login_url'] = domain_login.login_url
72 |
73 | if domain_login.HasField('login_test'):
74 | doc['login_test'] = domain_login.login_test
75 |
76 | doc['users'] = list()
77 |
78 | for user in domain_login.users:
79 | doc['users'].append({
80 | 'username': user.username,
81 | 'password': user.password,
82 | 'working': user.working,
83 | })
84 |
85 | await server_db.set_domain_login(doc)
86 |
--------------------------------------------------------------------------------
/starbelly/server/policy.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from uuid import UUID
3 |
4 | from . import api_handler
5 | from ..policy import Policy
6 | from ..version import __version__
7 |
8 |
9 | @api_handler
10 | async def delete_policy(command, server_db):
11 | ''' Delete a policy. '''
12 | policy_id = str(UUID(bytes=command.policy_id))
13 | await server_db.delete_policy(policy_id)
14 |
15 |
16 | @api_handler
17 | async def get_policy(command, response, server_db):
18 | ''' Get a single policy. '''
19 | policy_id = str(UUID(bytes=command.policy_id))
20 | policy_doc = await server_db.get_policy(policy_id)
21 | Policy.convert_doc_to_pb(policy_doc, response.policy)
22 |
23 |
24 | @api_handler
25 | async def list_policies(command, response, server_db):
26 | ''' Get a list of policies. '''
27 | limit = command.page.limit
28 | offset = command.page.offset
29 | count, docs = await server_db.list_policies(limit, offset)
30 | response.list_policies.total = count
31 |
32 | for policy_doc in docs:
33 | policy = response.list_policies.policies.add()
34 | policy.policy_id = UUID(policy_doc['id']).bytes
35 | policy.name = policy_doc['name']
36 | policy.created_at = policy_doc['created_at'].isoformat()
37 | policy.updated_at = policy_doc['updated_at'].isoformat()
38 |
39 | return response
40 |
41 |
42 | @api_handler
43 | async def set_policy(command, response, server_db):
44 | '''
45 | Create or update a single policy.
46 |
47 | If the policy ID is set, then update the corresponding policy.
48 | Otherwise, create a new policy.
49 | '''
50 | policy_doc = Policy.convert_pb_to_doc(command.policy)
51 | # Validate policy by trying to instantiate a Policy object, which will
52 | # raise an exception if the policy is invalid.
53 | Policy(policy_doc, version=__version__,
54 | seeds=['http://test1.com', 'http://test2.org'])
55 | now = datetime.now(timezone.utc)
56 | new_id = await server_db.set_policy(policy_doc, now)
57 | if new_id is not None:
58 | response.new_policy.policy_id = UUID(new_id).bytes
59 |
--------------------------------------------------------------------------------
/starbelly/server/rate_limit.py:
--------------------------------------------------------------------------------
1 | from . import api_handler, InvalidRequestException
2 | from ..rate_limiter import get_domain_token, GLOBAL_RATE_LIMIT_TOKEN
3 |
4 |
5 | @api_handler
6 | async def list_rate_limits(command, response, server_db):
7 | ''' Get a page of rate limits. '''
8 | limit = command.page.limit
9 | offset = command.page.offset
10 | count, rate_limits = await server_db.list_rate_limits(limit, offset)
11 | response.list_rate_limits.total = count
12 |
13 | for rate_limit in rate_limits:
14 | rl = response.list_rate_limits.rate_limits.add()
15 | rl.name = rate_limit['name']
16 | rl.token = rate_limit['token']
17 | rl.delay = rate_limit['delay']
18 | if rl.name.startswith('domain:'):
19 | rl.domain = rl.name.split(':')[1]
20 |
21 |
22 | @api_handler
23 | async def set_rate_limit(command, rate_limiter, server_db):
24 | ''' Set a rate limit. '''
25 | delay = command.delay if command.HasField('delay') else None
26 |
27 | if command.HasField('domain'):
28 | # Set a specific rate limit.
29 | domain = command.domain
30 | token = get_domain_token(domain)
31 | name = 'domain:{}'.format(domain)
32 | else:
33 | # Set global rate limit.
34 | if delay is None:
35 | raise InvalidRequestException(
36 | 'Cannot delete the global rate limit.')
37 | token = GLOBAL_RATE_LIMIT_TOKEN
38 | name = 'Global Rate Limit'
39 |
40 | await server_db.set_rate_limit(name, token, delay)
41 | if delay is None:
42 | rate_limiter.delete_rate_limit(token)
43 | else:
44 | rate_limiter.set_rate_limit(token, delay)
45 |
--------------------------------------------------------------------------------
/starbelly/server/schedule.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from uuid import UUID
3 |
4 | from . import api_handler
5 | from ..schedule import Schedule
6 | from ..starbelly_pb2 import JobRunState as PbRunState
7 |
8 |
9 | @api_handler
10 | async def delete_schedule(command, scheduler, server_db):
11 | ''' Delete a job schedule. '''
12 | schedule_id = str(UUID(bytes=command.schedule_id))
13 | await server_db.delete_schedule(schedule_id)
14 | scheduler.remove_schedule(schedule_id)
15 |
16 |
17 | @api_handler
18 | async def get_schedule(command, response, server_db):
19 | ''' Get metadata for a job schedule. '''
20 | schedule_id = str(UUID(bytes=command.schedule_id))
21 | doc = await server_db.get_schedule(schedule_id)
22 | if doc is None:
23 | response.is_success = False
24 | response.error_message = f'No schedule exists with ID={schedule_id}'
25 | else:
26 | pb = response.schedule
27 | Schedule.from_doc(doc).to_pb(pb)
28 |
29 |
30 | @api_handler
31 | async def list_schedules(command, response, server_db):
32 | ''' Return a list of job schedules. '''
33 | limit = command.page.limit
34 | offset = command.page.offset
35 | count, schedules = await server_db.list_schedules(limit, offset)
36 | response.list_schedules.total = count
37 | for doc in schedules:
38 | pb = response.list_schedules.schedules.add()
39 | Schedule.from_doc(doc).to_pb(pb)
40 |
41 |
42 | @api_handler
43 | async def list_schedule_jobs(command, response, server_db):
44 |     ''' Return a list of jobs created by a given job schedule. '''
45 | schedule_id = str(UUID(bytes=command.schedule_id))
46 | limit = command.page.limit
47 | offset = command.page.offset
48 | count, jobs = await server_db.list_schedule_jobs(schedule_id, limit, offset)
49 | response.list_schedule_jobs.total = count
50 | for job_doc in jobs:
51 | job = response.list_schedule_jobs.jobs.add()
52 | job.job_id = UUID(job_doc['id']).bytes
53 | job.name = job_doc['name']
54 | for seed in job_doc['seeds']:
55 | job.seeds.append(seed)
56 | for tag in job_doc['tags']:
57 | job.tags.append(tag)
58 | job.item_count = job_doc['item_count']
59 | job.http_success_count = job_doc['http_success_count']
60 | job.http_error_count = job_doc['http_error_count']
61 | job.exception_count = job_doc['exception_count']
62 | job.started_at = job_doc['started_at'].isoformat()
63 | if job_doc['completed_at'] is not None:
64 | job.completed_at = job_doc['completed_at'].isoformat()
65 | run_state = job_doc['run_state'].upper()
66 | job.run_state = PbRunState.Value(run_state)
67 | http_status_counts = job_doc['http_status_counts']
68 | for status_code, count in http_status_counts.items():
69 | job.http_status_counts[int(status_code)] = count
70 |
71 |
72 | @api_handler
73 | async def set_schedule(command, response, scheduler, server_db):
74 | ''' Create or update job schedule metadata. '''
75 | doc = Schedule.from_pb(command.schedule).to_doc()
76 | now = datetime.now(timezone.utc)
77 | schedule_id = await server_db.set_schedule(doc, now)
78 | if schedule_id:
79 | response.new_schedule.schedule_id = UUID(schedule_id).bytes
80 | else:
81 | schedule_id = str(UUID(bytes=command.schedule.schedule_id))
82 | scheduler.remove_schedule(schedule_id)
83 | if command.schedule.enabled:
84 | schedule_doc = await server_db.get_schedule(schedule_id)
85 |             _, job_docs = await server_db.list_schedule_jobs(schedule_id,
86 |                 limit=1, offset=0)
87 | try:
88 | latest_job_doc = job_docs[0]
89 | except IndexError:
90 | latest_job_doc = None
91 | scheduler.add_schedule(schedule_doc, latest_job_doc)
92 |
--------------------------------------------------------------------------------
/starbelly/server/subscription.py:
--------------------------------------------------------------------------------
1 | from uuid import UUID
2 |
3 | import trio.hazmat
4 |
5 | from . import api_handler
6 |
7 |
8 | @api_handler
9 | async def subscribe_job_sync(command, crawl_manager, response,
10 | subscription_manager):
11 | ''' Handle the subscribe crawl items command. '''
12 | job_id = str(UUID(bytes=command.job_id))
13 | compression_ok = command.compression_ok
14 | job_state_recv = crawl_manager.get_job_state_channel()
15 | sync_token = command.sync_token if command.HasField('sync_token') else None
16 | sub_id = subscription_manager.subscribe_job_sync(job_id, compression_ok,
17 | job_state_recv, sync_token)
18 | response.new_subscription.subscription_id = sub_id
19 |
20 |
21 | @api_handler
22 | async def subscribe_job_status(command, response, subscription_manager,
23 | stats_tracker):
24 | ''' Handle the subscribe crawl status command. '''
25 | sub_id = subscription_manager.subscribe_job_status(stats_tracker,
26 | command.min_interval)
27 | response.new_subscription.subscription_id = sub_id
28 |
29 |
30 | @api_handler
31 | async def subscribe_resource_monitor(command, response, resource_monitor,
32 | subscription_manager):
33 | ''' Handle the subscribe resource monitor command. '''
34 | sub_id = subscription_manager.subscribe_resource_monitor(resource_monitor,
35 | command.history)
36 | response.new_subscription.subscription_id = sub_id
37 |
38 |
39 | @api_handler
40 | async def subscribe_task_monitor(command, response, subscription_manager):
41 | ''' Handle the subscribe task monitor command. '''
42 | root_task = trio.hazmat.current_root_task()
43 | sub_id = subscription_manager.subscribe_task_monitor(command.period,
44 | root_task)
45 | response.new_subscription.subscription_id = sub_id
46 |
47 |
48 | @api_handler
49 | async def unsubscribe(command, subscription_manager):
50 | ''' Handle an unsubscribe command. '''
51 | sub_id = command.subscription_id
52 | subscription_manager.cancel_subscription(sub_id)
53 |
--------------------------------------------------------------------------------
/starbelly/server/system.py:
--------------------------------------------------------------------------------
1 | import cProfile
2 | import operator
3 | import pstats
4 |
5 | import trio
6 |
7 | from . import api_handler, InvalidRequestException
8 |
9 |
10 | @api_handler
11 | async def performance_profile(command, response):
12 | ''' Run CPU profiler. '''
13 | profile = cProfile.Profile()
14 | profile.enable()
15 | await trio.sleep(command.duration)
16 | profile.disable()
17 |
18 | # pstats sorting only works when you use pstats printing... so we have
19 | # to build our own data structure in order to sort it.
20 | pr_stats = pstats.Stats(profile)
21 | stats = list()
22 | for key, value in pr_stats.stats.items():
23 | stats.append({
24 | 'file': key[0],
25 | 'line_number': key[1],
26 | 'function': key[2],
27 | 'calls': value[0],
28 | 'non_recursive_calls': value[1],
29 | 'total_time': value[2],
30 | 'cumulative_time': value[3],
31 | })
32 |
33 | try:
34 | stats.sort(key=operator.itemgetter(command.sort_by), reverse=True)
35 | except KeyError:
36 | raise InvalidRequestException('Invalid sort key: {}'
37 | .format(command.sort_by))
38 |
39 | response.performance_profile.total_calls = pr_stats.total_calls
40 | response.performance_profile.total_time = pr_stats.total_tt
41 |
42 | for stat in stats[:command.top_n]:
43 | function = response.performance_profile.functions.add()
44 | function.file = stat['file']
45 | function.line_number = stat['line_number']
46 | function.function = stat['function']
47 | function.calls = stat['calls']
48 | function.non_recursive_calls = stat['non_recursive_calls']
49 | function.total_time = stat['total_time']
50 | function.cumulative_time = stat['cumulative_time']
51 |
--------------------------------------------------------------------------------
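
The sort in performance_profile relies on building a plain list of dicts and ordering it with operator.itemgetter, since pstats only sorts while doing its own printing. A tiny standalone sketch of that idiom with invented timing numbers:

    import operator

    stats = [
        {'function': 'parse', 'total_time': 0.5, 'calls': 10},
        {'function': 'fetch', 'total_time': 1.2, 'calls': 3},
    ]
    stats.sort(key=operator.itemgetter('total_time'), reverse=True)
    assert [s['function'] for s in stats] == ['fetch', 'parse']
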
/starbelly/storage.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import hashlib
3 | import logging
4 | import functools
5 |
6 | import mimeparse
7 | import trio
8 |
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def should_compress_body(response):
14 | '''
15 | Returns true if the response body should be compressed.
16 |
17 | This logic can be amended over time to add additional MIME types that
18 | should be compressed.
19 |
20 | :param starbelly.downloader.DownloadResponse response:
21 | '''
22 | should_compress = False
23 | type_, subtype, _ = mimeparse.parse_mime_type(response.content_type)
24 | if type_ == 'text':
25 | should_compress = True
26 | elif type_ == 'application' and subtype in ('json', 'pdf'):
27 | should_compress = True
28 | return should_compress
29 |
30 |
31 | class CrawlStorage:
32 | ''' This class stores crawl items in the database. '''
33 | def __init__(self, job_id, db, send_channel, receive_channel, policy,
34 | sequence):
35 | '''
36 | Constructor
37 |
38 | :param str job_id: The job to store items for.
39 | :param starbelly.db.CrawlStorageDb db: Database layer.
40 |         :param starbelly.policy.Policy policy: A policy to use for determining which
41 | responses to save.
42 | :param sequence: An iterator that returns a sequence number for each
43 | item to be saved.
44 | '''
45 | self._job_id = job_id
46 | self._db = db
47 | self._send_channel = send_channel
48 | self._receive_channel = receive_channel
49 | self._policy = policy
50 | self._sequence = sequence
51 |
52 | def __repr__(self):
53 | ''' Put job ID in repr. '''
54 |         return '<CrawlStorage job_id={}>'.format(self._job_id[:8])
55 |
56 | async def run(self):
57 | '''
58 |         Read items from the receive channel and save them to the database.
59 |
60 | :returns: This function runs until cancelled.
61 | '''
62 | async for response in self._receive_channel:
63 | await self._save_response(response)
64 | await self._db.update_job_stats(self._job_id, response)
65 | await self._send_channel.send(response)
66 |
67 | async def _save_response(self, response):
68 | '''
69 | Save a response to the database.
70 |
71 | :param starbelly.downloader.DownloadResponse response:
72 | '''
73 | response_doc = {
74 | 'completed_at': response.completed_at,
75 | 'cost': response.cost,
76 | 'duration': response.duration,
77 | 'job_id': self._job_id,
78 | 'started_at': response.started_at,
79 | 'url': response.url.human_repr(),
80 | 'canonical_url': response.canonical_url,
81 | }
82 |
83 | if response.exception is None:
84 | response_doc['completed_at'] = response.completed_at
85 | response_doc['content_type'] = response.content_type
86 | response_doc['is_success'] = response.status_code // 100 == 2
87 | response_doc['status_code'] = response.status_code
88 | compress_body = should_compress_body(response)
89 |
90 | headers = list()
91 | for key, value in response.headers.items():
92 | headers.append(key.upper())
93 | headers.append(value)
94 | response_doc['headers'] = headers
95 |
96 | body_hash = hashlib.blake2b(response.body, digest_size=16).digest()
97 | if compress_body:
98 | body = await trio.run_sync_in_worker_thread(functools.partial(
99 | gzip.compress, response.body, compresslevel=6))
100 | else:
101 | body = response.body
102 |
103 | response_doc['body_id'] = body_hash
104 | response_body_doc = {
105 | 'id': body_hash,
106 | 'body': body,
107 | 'is_compressed': compress_body,
108 | }
109 | else:
110 | response_doc['exception'] = response.exception
111 | response_doc['is_success'] = False
112 | response_body_doc = None
113 |
114 | response_doc['sequence'] = next(self._sequence)
115 | await self._db.save_response(response_doc, response_body_doc)
116 |
--------------------------------------------------------------------------------
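
A small illustrative check (not part of the repository) of should_compress_body above: text types and the listed application subtypes are compressed, other types are not. SimpleNamespace stands in for a DownloadResponse here and is an assumption of the sketch.

    from types import SimpleNamespace
    from starbelly.storage import should_compress_body

    html = SimpleNamespace(content_type='text/html; charset=utf-8')
    pdf = SimpleNamespace(content_type='application/pdf')
    png = SimpleNamespace(content_type='image/png')
    assert should_compress_body(html)
    assert should_compress_body(pdf)
    assert not should_compress_body(png)
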
/starbelly/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '2.0.0-dev'
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from functools import wraps
3 | import pathlib
4 | from os.path import dirname
5 | from sys import path
6 | from unittest.mock import Mock
7 |
8 | import pytest
9 | import trio
10 | import trio_asyncio
11 |
12 |
13 | # Add this project to the Python path.
14 | path.append(dirname(dirname(__file__)))
15 |
16 |
17 | @contextmanager
18 | def assert_min_elapsed(seconds):
19 | '''
20 | Fail the test if the execution of a block takes less than ``seconds``.
21 | '''
22 | start = trio.current_time()
23 | yield
24 | elapsed = trio.current_time() - start
25 | assert elapsed >= seconds, 'Completed in under {} seconds'.format(seconds)
26 |
27 |
28 | @contextmanager
29 | def assert_max_elapsed(seconds):
30 | '''
31 | Fail the test if the execution of a block takes longer than ``seconds``.
32 | '''
33 | try:
34 | with trio.fail_after(seconds):
35 | yield
36 | except trio.TooSlowError:
37 | pytest.fail('Failed to complete within {} seconds'.format(seconds))
38 |
39 |
40 | @contextmanager
41 | def assert_elapsed(seconds, delta=0.1):
42 | '''
43 | Fail the test if the execution of a block takes more than seconds+delta time
44 | or less than seconds-delta time.
45 | '''
46 | with assert_min_elapsed(seconds-delta), assert_max_elapsed(seconds+delta):
47 | yield
48 |
49 |
50 | class AsyncMock:
51 | ''' A mock that acts like an async def function. '''
52 | def __init__(self, return_value=None, return_values=None, raises=None,
53 | side_effect=None):
54 | self._raises = None
55 | self._side_effect = None
56 | self._return_value = None
57 | self._index = None
58 | self._call_count = 0
59 | self._call_args = None
60 | self._call_kwargs = None
61 |
62 | if raises:
63 | self._raises = raises
64 | elif return_values:
65 | self._return_value = return_values
66 | self._index = 0
67 | elif side_effect:
68 |             self._side_effect = side_effect
69 | else:
70 | self._return_value = return_value
71 |
72 | @property
73 | def call_args(self):
74 | return self._call_args
75 |
76 | @property
77 | def call_kwargs(self):
78 | return self._call_kwargs
79 |
80 | @property
81 | def called(self):
82 | return self._call_count > 0
83 |
84 | @property
85 | def call_count(self):
86 | return self._call_count
87 |
88 | async def __call__(self, *args, **kwargs):
89 | self._call_args = args
90 | self._call_kwargs = kwargs
91 | self._call_count += 1
92 | if self._raises:
93 |             raise self._raises
94 | elif self._side_effect:
95 | return await self._side_effect(*args, **kwargs)
96 | elif self._index is not None:
97 | return_index = self._index
98 | self._index += 1
99 | return self._return_value[return_index]
100 | else:
101 | return self._return_value
102 |
103 |
104 | async def async_iter(iter):
105 | '''
106 | Convert a synchronous iterable into an async iterator.
107 |
108 | :param iterable iter:
109 | '''
110 | for item in iter:
111 | await trio.sleep(0)
112 | yield item
113 |
114 |
115 | @pytest.fixture
116 | async def asyncio_loop():
117 | ''' Open an asyncio loop. Useful for things like aiohttp.CookieJar that
118 | require a global loop. '''
119 | async with trio_asyncio.open_loop() as loop:
120 | yield loop
121 |
122 |
123 | class fail_after:
124 | ''' This decorator fails if the runtime of the decorated function (as
125 | measured by the Trio clock) exceeds the specified value. '''
126 | def __init__(self, seconds):
127 | self._seconds = seconds
128 |
129 | def __call__(self, fn):
130 | @wraps(fn)
131 | async def wrapper(*args, **kwargs):
132 | with trio.move_on_after(self._seconds) as cancel_scope:
133 | await fn(*args, **kwargs)
134 | if cancel_scope.cancelled_caught:
135 | pytest.fail('Test runtime exceeded the maximum {} seconds'
136 | .format(self._seconds))
137 | return wrapper
138 |
--------------------------------------------------------------------------------
/tests/test_async_mock.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from . import AsyncMock
4 |
5 |
6 | async def test_async_mock_no_return():
7 | foo = AsyncMock()
8 | assert await foo() is None
9 |
10 |
11 | async def test_async_mock_single_return():
12 | foo = AsyncMock(return_value=1)
13 | assert await foo() == 1
14 | assert await foo() == 1
15 |
16 |
17 | async def test_async_mock_multiple_returns():
18 | foo = AsyncMock(return_values=(1,2))
19 | assert await foo() == 1
20 | assert await foo() == 2
21 |
22 |
23 | async def test_async_mock_raises():
24 | foo = AsyncMock(raises=Exception)
25 | with pytest.raises(Exception):
26 | await foo()
27 |
--------------------------------------------------------------------------------
/tests/test_backoff.py:
--------------------------------------------------------------------------------
1 | from . import assert_elapsed
2 | from starbelly.backoff import ExponentialBackoff
3 |
4 |
5 | async def test_backoff_no_change(autojump_clock):
6 | ''' Backoff starts at 1, so 3 iterations takes ~2 seconds. '''
7 | with assert_elapsed(2):
8 | loop_count = 0
9 | async for _ in ExponentialBackoff(min_=1, max_=64):
10 | loop_count += 1
11 | if loop_count == 3:
12 | break
13 |
14 |
15 | async def test_backoff_increase(autojump_clock):
16 | ''' Increase backoff on each loop. Backoffs should be equal to 1, 2, 4,
17 | 8, 16, 16, but the first value is skipped, so the total is ~46 seconds. '''
18 | with assert_elapsed(seconds=46):
19 | loop_count = 0
20 | backoff = ExponentialBackoff(min_=1, max_=16)
21 | async for n in backoff:
22 | backoff.increase()
23 | loop_count += 1
24 | if loop_count == 6: break
25 |
26 |
27 | async def test_backoff_returns_value(autojump_clock):
28 | ''' Backoff returns the current value. Increase up to max and then decrease
29 | back to starting point. '''
30 | backoff = ExponentialBackoff(min_=1, max_=8)
31 | assert await backoff.__anext__() == 0
32 | assert await backoff.__anext__() == 1
33 | backoff.increase()
34 | assert await backoff.__anext__() == 2
35 | backoff.increase()
36 | assert await backoff.__anext__() == 4
37 | backoff.increase()
38 | assert await backoff.__anext__() == 8
39 | backoff.increase()
40 | assert await backoff.__anext__() == 8
41 | backoff.decrease()
42 | assert await backoff.__anext__() == 4
43 | backoff.decrease()
44 | assert await backoff.__anext__() == 2
45 | backoff.decrease()
46 | assert await backoff.__anext__() == 1
47 | backoff.decrease()
48 | assert await backoff.__anext__() == 1
49 |
--------------------------------------------------------------------------------
/tests/test_captcha.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from uuid import UUID
3 |
4 | import pytest
5 |
6 | from starbelly.captcha import (
7 | CaptchaSolver,
8 | captcha_doc_to_pb,
9 | captcha_pb_to_doc,
10 | )
11 | from starbelly.starbelly_pb2 import CaptchaSolverAntigateCharacters
12 |
13 |
14 | def test_captcha_command():
15 | captcha_doc = {
16 | 'id': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
17 | 'name': 'Captcha #1',
18 | 'service_url': 'https://captcha.example/service.php',
19 | 'api_key': 'FAKE-API-KEY',
20 | 'require_phrase': False,
21 | 'case_sensitive': True,
22 | 'characters': 'ALPHANUMERIC',
23 | 'require_math': False,
24 | }
25 | solver = CaptchaSolver(captcha_doc)
26 | img_data = b'\x01\x02\x03\x04'
27 | command = solver.get_command(img_data)
28 | assert command['clientKey'] == 'FAKE-API-KEY'
29 | assert command['task']['type'] == 'ImageToTextTask'
30 | assert command['task']['body'] == 'AQIDBA==' # Base64 of img_data
31 | assert not command['task']['phrase']
32 | assert command['task']['case']
33 | assert command['task']['numeric'] == 0
34 | assert not command['task']['math']
35 | assert command['task']['minLength'] == 0
36 | assert command['task']['maxLength'] == 0
37 |
38 |
39 | def test_captcha_doc_to_pb():
40 | captcha_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa')
41 | captcha_doc = {
42 | 'id': str(captcha_id),
43 | 'type': 'antigate',
44 | 'created_at': datetime(2019, 1, 26, 15, 30, 0, tzinfo=timezone.utc),
45 | 'updated_at': datetime(2019, 1, 26, 15, 35, 0, tzinfo=timezone.utc),
46 | 'name': 'Captcha #1',
47 | 'service_url': 'https://captcha.example/service.php',
48 | 'api_key': 'FAKE-API-KEY',
49 | 'require_phrase': False,
50 | 'case_sensitive': True,
51 | 'characters': 'ALPHANUMERIC',
52 | 'require_math': False,
53 | }
54 | pb_captcha = captcha_doc_to_pb(captcha_doc)
55 | assert pb_captcha.name == 'Captcha #1'
56 | assert pb_captcha.solver_id == captcha_id.bytes
57 | assert pb_captcha.created_at == '2019-01-26T15:30:00+00:00'
58 | assert pb_captcha.updated_at == '2019-01-26T15:35:00+00:00'
59 | assert pb_captcha.antigate.service_url == \
60 | 'https://captcha.example/service.php'
61 | assert pb_captcha.antigate.api_key == 'FAKE-API-KEY'
62 | assert not pb_captcha.antigate.require_phrase
63 | assert pb_captcha.antigate.case_sensitive
64 | assert pb_captcha.antigate.characters == \
65 | CaptchaSolverAntigateCharacters.Value('ALPHANUMERIC')
66 | assert not pb_captcha.antigate.require_math
67 |
68 | captcha_doc = captcha_pb_to_doc(pb_captcha)
69 | assert captcha_doc['id'] == str(captcha_id)
70 | assert captcha_doc['name'] == 'Captcha #1'
71 | assert captcha_doc['type'] == 'antigate'
72 | assert captcha_doc['service_url'] == 'https://captcha.example/service.php'
73 | assert captcha_doc['api_key'] == 'FAKE-API-KEY'
74 | assert captcha_doc['require_phrase'] == False
75 | assert captcha_doc['case_sensitive'] == True
76 | assert captcha_doc['characters'] == 'ALPHANUMERIC'
77 | assert captcha_doc['require_math'] == False
78 |
--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import starbelly.config
4 |
5 |
6 | LOCAL_INI = '''[database]
7 | host = starbelly-host
8 | db = starbelly-db
9 | user = starbelly-app
10 | password = normalpass
11 | super_user = starbelly-admin
12 | super_password = superpass'''
13 |
14 |
15 | SYSTEM_INI = '''[database]
16 | host =
17 | port = 28015
18 | db =
19 | user =
20 | password =
21 | super_user =
22 | super_password =
23 |
24 | [rate_limiter]
25 | capacity = 10000'''
26 |
27 |
28 | def test_get_config(tmp_path):
29 | # Hack: modify the module's private _root variable to point at our temp
30 | # directory.
31 | starbelly.config._root = tmp_path
32 |
33 | # Create temp configuration files.
34 | config_dir = tmp_path / 'conf'
35 | config_dir.mkdir()
36 |
37 | with (config_dir / 'local.ini').open('w') as f:
38 | f.write(LOCAL_INI)
39 |
40 | with (config_dir / 'system.ini').open('w') as f:
41 | f.write(SYSTEM_INI)
42 |
43 | # Read configuration.
44 | config = starbelly.config.get_config()
45 | import logging
46 |     logging.debug('sections %r', config.sections())
47 | db = config['database']
48 | rl = config['rate_limiter']
49 |
50 | assert db['host'] == 'starbelly-host'
51 | assert db['port'] == '28015'
52 | assert db['db'] == 'starbelly-db'
53 | assert rl['capacity'] == '10000'
54 |
--------------------------------------------------------------------------------
/tests/test_crawl.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/tests/test_crawl.py
--------------------------------------------------------------------------------
/tests/test_frontier.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from unittest.mock import Mock
3 |
4 | import pytest
5 | import trio
6 |
7 | from . import AsyncMock
8 | from starbelly.frontier import (
9 | CrawlFrontier,
10 | FrontierItem,
11 | FrontierExhaustionError,
12 | )
13 | from starbelly.policy import Policy
14 |
15 |
16 | def make_policy():
17 | created_at = datetime(2018,12,31,13,47,00)
18 | policy_doc = {
19 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
20 | 'name': 'Test',
21 | 'created_at': created_at,
22 | 'updated_at': created_at,
23 | 'authentication': {
24 | 'enabled': True,
25 | },
26 | 'limits': {
27 | 'max_cost': 10,
28 | 'max_duration': 3600,
29 | 'max_items': 10_000,
30 | },
31 | 'mime_type_rules': [
32 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
33 | {'save': False},
34 | ],
35 | 'proxy_rules': [],
36 | 'robots_txt': {
37 | 'usage': 'IGNORE',
38 | },
39 | 'url_normalization': {
40 | 'enabled': True,
41 | 'strip_parameters': ['b'],
42 | },
43 | 'url_rules': [
44 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
45 | 'pattern': '^https?://({SEED_DOMAINS})/'},
46 | {'action': 'MULTIPLY', 'amount': 0},
47 | ],
48 | 'user_agents': [
49 | {'name': 'Test User Agent'}
50 | ]
51 | }
52 | return Policy(policy_doc, '1.0.0', ['https://frontier.example'])
53 |
54 |
55 | async def test_frontier_exhaustion(nursery):
56 | # Set up test fixtures
57 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
58 | db = Mock()
59 | db.any_in_flight = AsyncMock()
60 | db.get_frontier_batch = AsyncMock(return_value=list())
61 | db.get_frontier_size = AsyncMock(return_value=5)
62 | send_channel, recv_channel = trio.open_memory_channel(0)
63 | login_manager = Mock()
64 | login_manager.login = AsyncMock()
65 | policy = make_policy()
66 | stats = dict()
67 | frontier = CrawlFrontier(job_id, db, send_channel, login_manager, policy,
68 | stats)
69 |
70 | # This test has an empty frontier, so it should raise an exhaustion error
71 | # in its run() method.
72 | with pytest.raises(FrontierExhaustionError):
73 | await frontier.run()
74 |
75 |
76 | async def test_frontier_batches(autojump_clock, nursery):
77 | # Set up test fixtures
78 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
79 | db = Mock()
80 | db.any_in_flight = AsyncMock()
81 | batch1 = [{
82 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
83 | 'cost': 1.0,
84 | 'job_id': job_id,
85 | 'url': 'https://frontier.example/1',
86 | 'in_flight': False,
87 | },{
88 | 'id': 'cccccccc-cccc-cccc-cccc-cccccccccccc',
89 | 'cost': 2.0,
90 | 'job_id': job_id,
91 | 'url': 'https://frontier.example/2',
92 | 'in_flight': False,
93 | }]
94 | batch2 = [{
95 | 'id': 'dddddddd-dddd-dddd-dddd-dddddddddddd',
96 | 'cost': 3.0,
97 | 'job_id': job_id,
98 | 'url': 'https://frontier.example/3',
99 | 'in_flight': False,
100 | }]
101 | db.get_frontier_batch = AsyncMock(return_values=(batch1, batch2, []))
102 | db.get_frontier_size = AsyncMock(return_value=5)
103 | send_channel, recv_channel = trio.open_memory_channel(0)
104 | login_manager = Mock()
105 | login_manager.login = AsyncMock()
106 | policy = make_policy()
107 | stats = dict()
108 | frontier = CrawlFrontier(job_id, db, send_channel, login_manager, policy,
109 | stats)
110 |     assert repr(frontier) == '<CrawlFrontier job_id=aaaaaaaa>'
111 | nursery.start_soon(frontier.run)
112 |
113 | # Wait for the first item from the frontier. It should trigger the login
114 | # manager to log in to this domain, and also check the robots.txt to see if
115 | # the item is allowed.
116 | item1 = await recv_channel.receive()
117 | assert login_manager.login.call_count == 1
118 | assert login_manager.login.call_args[0] == 'frontier.example'
119 | assert str(item1.url) == 'https://frontier.example/1'
120 | assert item1.cost == 1.0
121 | assert item1.job_id == job_id
122 | assert item1.frontier_id == 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'
123 |
--------------------------------------------------------------------------------
/tests/test_job.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | import logging
3 | import pickle
4 | from unittest.mock import Mock
5 |
6 | import pytest
7 | import trio
8 |
9 | from . import AsyncMock, asyncio_loop, fail_after
10 | from starbelly.frontier import FrontierExhaustionError
11 | from starbelly.job import (
12 | PipelineTerminator,
13 | RunState,
14 | StatsTracker,
15 | CrawlManager,
16 | )
17 |
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 |
22 | def make_policy_doc():
23 | created_at = datetime(2019, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
24 | return {
25 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
26 | 'name': 'Test Policy',
27 | 'created_at': created_at,
28 | 'updated_at': created_at,
29 | 'authentication': {'enabled': True},
30 | 'limits': {
31 | 'max_cost': 10,
32 | 'max_duration': 3600,
33 | 'max_items': 10_000,
34 | },
35 | 'mime_type_rules': [
36 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
37 | {'save': False},
38 | ],
39 | 'proxy_rules': [],
40 | 'robots_txt': {
41 | 'usage': 'IGNORE',
42 | },
43 | 'url_normalization': {
44 | 'enabled': True,
45 | 'strip_parameters': ['PHPSESSID'],
46 | },
47 | 'url_rules': [
48 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
49 | 'pattern': '^https?://({SEED_DOMAINS})/'},
50 | {'action': 'MULTIPLY', 'amount': 0},
51 | ],
52 | 'user_agents': [
53 | {'name': 'Test User Agent'}
54 | ]
55 | }
56 |
57 |
58 | @fail_after(3)
59 | async def test_start_job(asyncio_loop, nursery):
60 | # Set up fixtures
61 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
62 | policy_id = 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'
63 | rate_limiter = Mock()
64 | rate_limiter.remove_job = AsyncMock()
65 | stats_tracker = StatsTracker(timedelta(seconds=60))
66 | robots_txt_manager = Mock()
67 | manager_db = Mock()
68 | manager_db.clear_frontier = AsyncMock()
69 | manager_db.create_job = AsyncMock(job_id)
70 | manager_db.finish_job = AsyncMock()
71 | manager_db.get_max_sequence = AsyncMock(100)
72 | manager_db.get_policy = AsyncMock(make_policy_doc())
73 | manager_db.run_job = AsyncMock()
74 | frontier_db = Mock()
75 | frontier_db.any_in_flight = AsyncMock(False)
76 | frontier_db.get_frontier_batch = AsyncMock({})
77 | frontier_db.get_frontier_size = AsyncMock(0)
78 | frontier_db.run = AsyncMock()
79 | extractor_db = Mock()
80 | storage_db = Mock()
81 | login_db = Mock()
82 | crawl_manager = CrawlManager(rate_limiter, stats_tracker,
83 | robots_txt_manager, manager_db, frontier_db, extractor_db, storage_db,
84 | login_db)
85 |
86 | # Run the crawl manager and start a new job
87 | await nursery.start(crawl_manager.run)
88 | await crawl_manager.start_job('Test Job', ['https://seed.example'],
89 | ['tag1'], policy_id)
90 |
91 | # Wait for the crawler to tell us that the job is running.
92 | recv_channel = crawl_manager.get_job_state_channel()
93 | state_event = await recv_channel.receive()
94 | assert state_event.run_state == RunState.RUNNING
95 |
96 | resources = crawl_manager.get_resource_usage()
97 | assert resources['maximum_downloads'] == 20
98 | assert resources['current_downloads'] == 0
99 | assert resources['jobs'][0]['id'] == job_id
100 | assert resources['jobs'][0]['name'] == 'Test Job'
101 | assert resources['jobs'][0]['current_downloads'] == 0
102 |
103 | # The job has an empty frontier, so it will quit immediately after starting.
104 | # Wait for the completed job state.
105 | state_event = await recv_channel.receive()
106 | assert state_event.run_state == RunState.COMPLETED
107 |
108 | # Make sure the manager interacted with other objects correctly.
109 | assert manager_db.clear_frontier.call_args[0] == job_id
110 | assert manager_db.finish_job.call_args[0] == job_id
111 | assert manager_db.finish_job.call_args[1] == RunState.COMPLETED
112 | assert manager_db.get_policy.call_args[0] == policy_id
113 | assert manager_db.run_job.call_args[0] == job_id
114 | assert frontier_db.get_frontier_batch.call_args[0] == job_id
115 |
116 | stats = stats_tracker.snapshot()
117 | assert stats[0]['id'] == job_id
118 | assert stats[0]['name'] == 'Test Job'
119 | assert stats[0]['run_state'] == RunState.COMPLETED
120 | assert stats[0]['seeds'] == ['https://seed.example']
121 | assert stats[0]['tags'] == ['tag1']
122 |
123 |
124 | @fail_after(3)
125 | async def test_pause_resume_cancel(asyncio_loop, nursery):
126 | # Set up fixtures
127 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
128 | created_at = datetime(2019, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
129 | job_doc = {
130 | 'id': job_id,
131 | 'name': 'Test Job',
132 | 'seeds': ['https://seed1.example', 'https://seed2.example'],
133 | 'tags': [],
134 | 'run_state': RunState.PAUSED,
135 | 'old_urls': b'\x80\x03cbuiltins\nset\nq\x00]q\x01C\x10\xad\xb6\x93\x9b'
136 | b'\xac\x92\xd8\xfd\xc0\x8dJ\x94^\x8d\xe5~q\x02a\x85q\x03Rq'
137 | b'\x04.',
138 | 'started_at': created_at,
139 | 'completed_at': None,
140 | 'duration': None,
141 | 'item_count': 0,
142 | 'http_success_count': 0,
143 | 'http_error_count': 0,
144 | 'exception_count': 0,
145 | 'http_status_counts': {},
146 | 'schedule_id': 'cccccccc-cccc-cccc-cccc-cccccccccccc',
147 | 'policy': {
148 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
149 | 'name': 'Test Policy',
150 | 'created_at': created_at,
151 | 'updated_at': created_at,
152 | 'authentication': {
153 | 'enabled': False,
154 | },
155 | 'captcha_solver_id': None,
156 | 'limits': {
157 | 'max_cost': 10,
158 | 'max_duration': 3600,
159 | 'max_items': 10_000,
160 | },
161 | 'mime_type_rules': [
162 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
163 | {'save': False},
164 | ],
165 | 'proxy_rules': [],
166 | 'robots_txt': {
167 | 'usage': 'IGNORE',
168 | },
169 | 'url_normalization': {
170 | 'enabled': True,
171 | 'strip_parameters': [],
172 | },
173 | 'url_rules': [
174 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
175 | 'pattern': '^https?://({SEED_DOMAINS})/'},
176 | {'action': 'MULTIPLY', 'amount': 0},
177 | ],
178 | 'user_agents': [
179 | {'name': 'Test User Agent'}
180 | ],
181 | },
182 | }
183 |
184 | rate_limiter = Mock()
185 | rate_limiter.remove_job = AsyncMock()
186 | stats_tracker = StatsTracker(timedelta(seconds=60))
187 | robots_txt_manager = Mock()
188 | manager_db = Mock()
189 | manager_db.clear_frontier = AsyncMock()
190 | manager_db.create_job = AsyncMock(job_id)
191 | manager_db.finish_job = AsyncMock()
192 | manager_db.get_max_sequence = AsyncMock(100)
193 | manager_db.get_policy = AsyncMock(make_policy_doc())
194 | manager_db.resume_job = AsyncMock(job_doc)
195 | manager_db.pause_job = AsyncMock()
196 | manager_db.run_job = AsyncMock()
197 | frontier_db = Mock()
198 | frontier_db.any_in_flight = AsyncMock(True)
199 | frontier_db.get_frontier_batch = AsyncMock({})
200 | frontier_db.get_frontier_size = AsyncMock(0)
201 | frontier_db.run = AsyncMock()
202 | extractor_db = Mock()
203 | storage_db = Mock()
204 | login_db = Mock()
205 | crawl_manager = CrawlManager(rate_limiter, stats_tracker,
206 | robots_txt_manager, manager_db, frontier_db, extractor_db, storage_db,
207 | login_db)
208 |
209 | # Run the crawl manager and start a new job
210 | await nursery.start(crawl_manager.run)
211 | await crawl_manager.start_job(job_doc['name'], job_doc['seeds'],
212 | job_doc['tags'], job_doc['policy']['id'])
213 |
214 | # Wait for the crawler to tell us that the job is running.
215 | recv_channel = crawl_manager.get_job_state_channel()
216 | state_event = await recv_channel.receive()
217 | assert state_event.run_state == RunState.RUNNING
218 |
219 | # Now pause and wait for the paused event.
220 | await crawl_manager.pause_job(job_id)
221 | state_event = await recv_channel.receive()
222 | assert state_event.run_state == RunState.PAUSED
223 | assert manager_db.pause_job.call_args[0] == job_id
224 | # There are two "old URLs": the seed URLs.
225 | assert len(pickle.loads(manager_db.pause_job.call_args[1])) == 2
226 | assert stats_tracker.snapshot()[0]['run_state'] == RunState.PAUSED
227 |
228 | # Now resume and wait for the running event.
229 | await crawl_manager.resume_job(job_id)
230 | state_event = await recv_channel.receive()
231 | assert state_event.run_state == RunState.RUNNING
232 | assert manager_db.resume_job.call_args[0] == job_id
233 |
234 |     # Now cancel and wait for the cancelled event.
235 | await crawl_manager.cancel_job(job_id)
236 | state_event = await recv_channel.receive()
237 | assert state_event.run_state == RunState.CANCELLED
238 | assert manager_db.finish_job.call_args[0] == job_id
239 | assert manager_db.finish_job.call_args[1] == RunState.CANCELLED
240 |
--------------------------------------------------------------------------------
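
The test_pause_resume_cancel test above walks a job through its full lifecycle using only CrawlManager's public methods and the job state channel. A condensed, hypothetical usage sketch of that same flow, assuming an already-constructed crawl_manager and a known job_id and policy_id as in the test, with all mock setup and error handling omitted:

    import trio

    from starbelly.job import RunState


    async def drive_job_lifecycle(crawl_manager, job_id, policy_id):
        '''Hypothetical driver mirroring the calls made in the tests above.'''
        async with trio.open_nursery() as nursery:
            # Run the crawl manager in the background.
            await nursery.start(crawl_manager.run)

            # Start a job and wait for the RUNNING state event.
            await crawl_manager.start_job('Demo Job', ['https://seed.example'],
                ['demo'], policy_id)
            recv_channel = crawl_manager.get_job_state_channel()
            assert (await recv_channel.receive()).run_state == RunState.RUNNING

            # Pause, resume, and cancel, waiting for each state transition.
            await crawl_manager.pause_job(job_id)
            assert (await recv_channel.receive()).run_state == RunState.PAUSED

            await crawl_manager.resume_job(job_id)
            assert (await recv_channel.receive()).run_state == RunState.RUNNING

            await crawl_manager.cancel_job(job_id)
            assert (await recv_channel.receive()).run_state == RunState.CANCELLED

            # Stop the background crawl manager task.
            nursery.cancel_scope.cancel()

The state channel is the only sequencing signal the tests rely on, which keeps them free of sleeps and polling.
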
/tests/test_login.py:
--------------------------------------------------------------------------------
1 | from base64 import b64decode
2 | from datetime import datetime, timezone
3 | from functools import partial
4 | from unittest.mock import Mock
5 |
6 | from aiohttp import CookieJar
7 | import trio
8 | from yarl import URL
9 |
10 | from . import asyncio_loop, AsyncMock
11 | from starbelly.captcha import CaptchaSolver
12 | from starbelly.downloader import Downloader, DownloadResponse
13 | from starbelly.login import LoginManager
14 | from starbelly.policy import Policy
15 |
16 |
17 | def make_policy(captcha_port=80):
18 | policy_doc = {
19 | 'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
20 | 'name': 'Test',
21 | 'created_at': datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc),
22 | 'updated_at': datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc),
23 | 'authentication': {
24 | 'enabled': False,
25 | },
26 | 'captcha_solver': {
27 | 'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f8',
28 | 'name': 'Example CAPTCHA',
29 | 'service_url': 'http://127.0.0.1:{}'.format(captcha_port),
30 | 'api_key': None,
31 | 'require_phrase': False,
32 | 'case_sensitive': True,
33 | 'characters': 'ALPHANUMERIC',
34 | 'require_math': False,
35 | },
36 | 'limits': {
37 | 'max_cost': 10,
38 | 'max_duration': 3600,
39 | 'max_items': 10_000,
40 | },
41 | 'mime_type_rules': [
42 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
43 | {'save': False},
44 | ],
45 | 'proxy_rules': [],
46 | 'robots_txt': {
47 | 'usage': 'IGNORE',
48 | },
49 | 'url_normalization': {
50 | 'enabled': True,
51 | 'strip_parameters': [],
52 | },
53 | 'url_rules': [
54 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
55 | 'pattern': '^https?://({SEED_DOMAINS})/'},
56 | {'action': 'MULTIPLY', 'amount': 0},
57 | ],
58 | 'user_agents': [
59 | {'name': 'Test User Agent'}
60 | ]
61 | }
62 | return Policy(policy_doc, '1.0.0', ['https://login.example'])
63 |
64 |
65 | async def test_login_form():
66 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
67 | db = Mock()
68 | login = {
69 | 'domain': 'login.example',
70 | 'login_url': 'https://login.example/index',
71 | 'users': [{'username': 'john', 'password': 'fake'}]
72 | }
73 | db.get_login = AsyncMock(return_value=login)
74 | policy = make_policy()
75 | downloader = Mock()
76 | html1 = \
77 | b'''
78 | Login Test
79 |
80 |