├── .gitignore
├── .pylintrc
├── .readthedocs.yml
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── conf
│   ├── .gitignore
│   ├── local.ini.template
│   └── system.ini
├── dev
│   ├── .gitignore
│   ├── docker-compose.yml
│   ├── gencert.py
│   └── nginx.conf
├── docs
│   ├── .gitignore
│   ├── Makefile
│   ├── README.txt
│   ├── administration.rst
│   ├── changelog.rst
│   ├── client-side-logging.png
│   ├── conf.py
│   ├── configuration.rst
│   ├── dashboard.png
│   ├── dashboard_first_crawl.png
│   ├── development.rst
│   ├── first_crawl.rst
│   ├── first_crawl_results.png
│   ├── gui.png
│   ├── index.rst
│   ├── installation.rst
│   ├── internals.rst
│   ├── jupyter_new_notebook.png
│   ├── jupyter_notebook.png
│   ├── logo.png
│   ├── policy.rst
│   ├── protobuf.html
│   ├── protobuf.rst
│   ├── requirements.txt
│   ├── rethinkdb_gui.png
│   ├── start_crawl.png
│   ├── start_crawl_filled_in.png
│   ├── terminal.png
│   └── websocket_api.rst
├── integration
│   ├── __init__.py
│   ├── test_db.py
│   └── test_subscription.py
├── notebooks
│   ├── .gitignore
│   ├── Example.ipynb
│   └── Frontier Reloading.ipynb
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── starbelly
│   ├── __init__.py
│   ├── __main__.py
│   ├── backoff.py
│   ├── bootstrap.py
│   ├── captcha.py
│   ├── config.py
│   ├── db.py
│   ├── downloader.py
│   ├── extractor.py
│   ├── frontier.py
│   ├── job.py
│   ├── login.py
│   ├── policy.py
│   ├── rate_limiter.py
│   ├── resource_monitor.py
│   ├── robots.py
│   ├── schedule.py
│   ├── server
│   │   ├── __init__.py
│   │   ├── captcha.py
│   │   ├── job.py
│   │   ├── login.py
│   │   ├── policy.py
│   │   ├── rate_limit.py
│   │   ├── schedule.py
│   │   ├── subscription.py
│   │   └── system.py
│   ├── starbelly_pb2.py
│   ├── storage.py
│   ├── subscription.py
│   └── version.py
├── tests
│   ├── __init__.py
│   ├── test_async_mock.py
│   ├── test_backoff.py
│   ├── test_captcha.py
│   ├── test_config.py
│   ├── test_crawl.py
│   ├── test_downloader.py
│   ├── test_extractor.py
│   ├── test_frontier.py
│   ├── test_job.py
│   ├── test_login.py
│   ├── test_policy.py
│   ├── test_rate_limiter.py
│   ├── test_resource_monitor.py
│   ├── test_robots.py
│   ├── test_schedule.py
│   ├── test_server.py
│   ├── test_storage.py
│   └── test_subscription.py
└── tools
    ├── __init__.py
    ├── clear.py
    ├── container_init.py
    ├── delete_dangling_bodies.py
    └── shell.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | .coverage
3 | .coverage.*
4 | .ipynb_checkpoints
5 | .pytest_cache
6 | .vscode
7 | __pycache__
8 | README.html
9 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
2 | version: 2
3 |
4 | sphinx:
5 | configuration: docs/conf.py
6 |
7 | python:
8 | version: 3.7
9 | install:
10 | - requirements: docs/requirements.txt
11 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | git:
4 | depth: 1
5 |
6 | matrix:
7 | include:
8 | - python: 3.7
9 | dist: xenial
10 | sudo: yes
11 |
12 | before_install:
13 | - pip install poetry
14 |
15 | install:
16 | - poetry install
17 |
18 | script:
19 | - poetry run make test
20 |
21 | after_success:
22 | - poetry run make coverage
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Hyperion Gray LLC
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # The targets in this makefile should be executed inside Poetry, i.e. `poetry run make
2 | # docs`.
3 |
4 | .PHONY: docs
5 |
6 | docs:
7 | $(MAKE) -C docs html
8 |
9 | test:
10 | pytest tests/ --cov=starbelly --cov-report=term-missing
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Starbelly is a user-friendly web crawler that is easy to deploy and configure.
4 | Learn more at
5 | [starbelly.readthedocs.io](http://starbelly.readthedocs.io/en/latest/).
6 |
7 | [Build Status](https://travis-ci.org/HyperionGray/starbelly)
8 | [Coverage](https://coveralls.io/github/HyperionGray/starbelly)
9 | [Documentation](https://starbelly.readthedocs.io)
10 |
11 |
12 | # LICENSE
13 |
14 | Starbelly is under a proprietary license. Please contact Hyperion Gray at acaceres@hyperiongray.com
15 |
16 | ---
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/conf/.gitignore:
--------------------------------------------------------------------------------
1 | local.ini
2 |
--------------------------------------------------------------------------------
/conf/local.ini.template:
--------------------------------------------------------------------------------
1 | [database]
2 |
3 | host =
4 | db =
5 | user =
6 | password =
7 | super_user =
8 | super_password =
9 |
--------------------------------------------------------------------------------
/conf/system.ini:
--------------------------------------------------------------------------------
1 | [database]
2 |
3 | host =
4 | port = 28015
5 | db =
6 | user =
7 | password =
8 | super_user =
9 | super_password =
10 |
11 | [rate_limiter]
12 |
13 | capacity = 10000
14 |
--------------------------------------------------------------------------------
/dev/.gitignore:
--------------------------------------------------------------------------------
1 | ca.crt
2 | server.crt
3 |
--------------------------------------------------------------------------------
/dev/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 |
3 | volumes:
4 | db_data: {}
5 |
6 | services:
7 | db:
8 | network_mode: host
9 | image: rethinkdb:2.4
10 | container_name: starbelly-dev-db
11 | volumes:
12 | - db_data:/data
13 |
14 | web:
15 | network_mode: host
16 | image: nginx:1.15
17 | container_name: starbelly-dev-web
18 | volumes:
19 | - "${PWD}/nginx.conf:/etc/nginx/conf.d/default.conf"
20 | - "${PWD}/server.crt:/etc/nginx/server.crt"
21 |
--------------------------------------------------------------------------------
/dev/gencert.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pathlib
3 | import sys
4 |
5 | import trustme
6 |
7 |
8 | logging.basicConfig(level=logging.INFO)
9 |
10 |
11 | def main():
12 | if len(sys.argv) != 2:
13 | logging.error('Usage: %s <hostname>', sys.argv[0])
14 | sys.exit(1)
15 | hostname = sys.argv[1]
16 | here = pathlib.Path(__file__).parent
17 | ca_path = here / 'ca.crt'
18 | server_path = here / 'server.crt'
19 | logging.info('Creating self-signed certificate for "%s"', hostname)
20 | ca_cert = trustme.CA()
21 | ca_cert.cert_pem.write_to_path(ca_path)
22 | logging.info(' * CA certificate: {}'.format(ca_path))
23 | server_cert = ca_cert.issue_server_cert(hostname)
24 | server_cert.private_key_and_cert_chain_pem.write_to_path(server_path)
25 | logging.info(' * Server certificate: {}'.format(server_path))
26 | logging.info('Done')
27 |
28 |
29 | if __name__ == '__main__':
30 | main()
31 |
--------------------------------------------------------------------------------
/dev/nginx.conf:
--------------------------------------------------------------------------------
1 | server {
2 | listen 80 default_server;
3 | return 301 https://$host$request_uri;
4 | }
5 |
6 | server {
7 | listen 443 ssl default_server;
8 | ssl_certificate /etc/nginx/server.crt;
9 | ssl_certificate_key /etc/nginx/server.crt;
10 | ssl_protocols TLSv1.2;
11 | ssl_ciphers HIGH:!aNULL:!MD5;
12 | ssl_session_cache shared:TLSCACHE:1m;
13 | add_header Strict-Transport-Security "max-age=31536000";
14 |
15 | location /ws/ {
16 | proxy_pass http://localhost:8000;
17 | proxy_http_version 1.1;
18 | proxy_set_header Upgrade $http_upgrade;
19 | proxy_set_header Connection "upgrade";
20 | }
21 |
22 | # Reverse proxy to `pub serve`
23 | location / {
24 | proxy_pass http://localhost:8081;
25 | proxy_http_version 1.1;
26 | proxy_intercept_errors on;
27 | error_page 404 =200 /index.html;
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 |
3 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = Starbelly
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/README.txt:
--------------------------------------------------------------------------------
1 | To build documentation in this directory, install `sphinx` and
2 | `sphinx_rtd_theme` packages from pip, then run `make html`.
3 |
--------------------------------------------------------------------------------
/docs/administration.rst:
--------------------------------------------------------------------------------
1 | *******************
2 | Administrator Guide
3 | *******************
4 |
5 | .. contents::
6 | :depth: 2
7 |
8 |
9 | Overview
10 | ========
11 |
12 | This section goes over some common tasks that you may need to perform as a
13 | Starbelly administrator. In the examples below, if a command prompt is prefixed
14 | with a container name, then that indicates that the command must be run inside
15 | a specific Docker container. For example, if you see this:
16 |
17 | .. code::
18 |
19 | starbelly-dev-app:/starbelly# ls /usr/local/etc
20 | jupyter
21 |
22 | Then that command should be run inside of the ``starbelly-dev-app`` container.
23 | To obtain a shell inside that container, run:
24 |
25 | .. code::
26 |
27 | $ docker exec -it starbelly-dev-app /bin/bash
28 | starbelly-dev-app#
29 |
30 | You can use the same technique to get a shell inside the ``starbelly-dev-db`` or
31 | ``starbelly-dev-web`` containers.
32 |
33 | Clear Database
34 | ==============
35 |
36 | To clear all data from the database, including crawl data, job data, and other
37 | state:
38 |
39 | .. code::
40 |
41 | starbelly-dev-app:/starbelly# python tools/clear.py
42 |
43 | Change Password
44 | ===============
45 |
46 | Adding or changing passwords is covered in the :doc:`installation` under the
47 | "Security" section.
48 |
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | *********
2 | Changelog
3 | *********
4 |
5 |
6 | v2.0.0 (2019-XX-XX)
7 | ===================
8 |
9 | - Massive rewrite of Starbelly's I/O to use Trio instead of asyncio.
10 | - Upgrade web client to Dart 2 and Angular 5.
11 |
12 |
13 | v1.0.0 (2017-11-03)
14 | ===================
15 |
16 | - Initial release.
17 |
--------------------------------------------------------------------------------
/docs/client-side-logging.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/client-side-logging.png
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Starbelly documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Nov 2 15:00:59 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | # import os
21 | # import sys
22 | # sys.path.insert(0, os.path.abspath('.'))
23 |
24 | import pathlib
25 | import sys
26 |
27 | # -- General configuration ------------------------------------------------
28 |
29 | # If your documentation needs a minimal Sphinx version, state it here.
30 | #
31 | # needs_sphinx = '1.0'
32 |
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 | 'sphinx.ext.autodoc',
38 | 'sphinx.ext.intersphinx',
39 | 'sphinx.ext.graphviz',
40 | 'sphinxcontrib_trio',
41 | ]
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # The suffix(es) of source filenames.
47 | # You can specify multiple suffix as a list of string:
48 | #
49 | # source_suffix = ['.rst', '.md']
50 | source_suffix = '.rst'
51 |
52 | # The master toctree document.
53 | master_doc = 'index'
54 |
55 | # General information about the project.
56 | project = 'Starbelly'
57 | copyright = 'Hyperion Gray 2017'
58 | author = 'Mark E. Haase'
59 |
60 | # The version info for the project you're documenting, acts as replacement for
61 | # |version| and |release|, also used in various other places throughout the
62 | # built documents.
63 | #
64 | # The short X.Y version.
65 | import starbelly.version
66 | version = starbelly.version.__version__
67 | # The full version, including alpha/beta/rc tags.
68 | release = version
69 |
70 | # The language for content autogenerated by Sphinx. Refer to documentation
71 | # for a list of supported languages.
72 | #
73 | # This is also used if you do content translation via gettext catalogs.
74 | # Usually you set "language" from the command line for these cases.
75 | language = None
76 |
77 | # List of patterns, relative to source directory, that match files and
78 | # directories to ignore when looking for source files.
79 | # This patterns also effect to html_static_path and html_extra_path
80 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
81 |
82 | # The name of the Pygments (syntax highlighting) style to use.
83 | pygments_style = 'sphinx'
84 |
85 | # If true, `todo` and `todoList` produce output, else they produce nothing.
86 | todo_include_todos = False
87 |
88 |
89 | # -- Options for HTML output ----------------------------------------------
90 |
91 | # The theme to use for HTML and HTML Help pages. See the documentation for
92 | # a list of builtin themes.
93 | #
94 | html_theme = 'sphinx_rtd_theme'
95 |
96 | # Theme options are theme-specific and customize the look and feel of a theme
97 | # further. For a list of options available for each theme, see the
98 | # documentation.
99 | #
100 | # html_theme_options = {}
101 |
102 | # Add any paths that contain custom static files (such as style sheets) here,
103 | # relative to this directory. They are copied after the builtin static files,
104 | # so a file named "default.css" will overwrite the builtin "default.css".
105 | html_static_path = ['_static']
106 |
107 | # Custom sidebar templates, must be a dictionary that maps document names
108 | # to template names.
109 | #
110 | # This is required for the alabaster theme
111 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
112 | html_sidebars = {
113 | '**': [
114 | 'relations.html', # needs 'show_related': True theme option to display
115 | 'searchbox.html',
116 | ]
117 | }
118 |
119 |
120 | # -- Options for HTMLHelp output ------------------------------------------
121 |
122 | # Output file base name for HTML help builder.
123 | htmlhelp_basename = 'Starbellydoc'
124 |
125 |
126 | # -- Options for LaTeX output ---------------------------------------------
127 |
128 | latex_elements = {
129 | # The paper size ('letterpaper' or 'a4paper').
130 | #
131 | # 'papersize': 'letterpaper',
132 |
133 | # The font size ('10pt', '11pt' or '12pt').
134 | #
135 | # 'pointsize': '10pt',
136 |
137 | # Additional stuff for the LaTeX preamble.
138 | #
139 | # 'preamble': '',
140 |
141 | # Latex figure (float) alignment
142 | #
143 | # 'figure_align': 'htbp',
144 | }
145 |
146 | # Grouping the document tree into LaTeX files. List of tuples
147 | # (source start file, target name, title,
148 | # author, documentclass [howto, manual, or own class]).
149 | latex_documents = [
150 | (master_doc, 'Starbelly.tex', 'Starbelly Documentation',
151 | 'Mark E. Haase', 'manual'),
152 | ]
153 |
154 |
155 | # -- Options for manual page output ---------------------------------------
156 |
157 | # One entry per manual page. List of tuples
158 | # (source start file, name, description, authors, manual section).
159 | man_pages = [
160 | (master_doc, 'starbelly', 'Starbelly Documentation',
161 | [author], 1)
162 | ]
163 |
164 |
165 | # -- Options for Texinfo output -------------------------------------------
166 |
167 | # Grouping the document tree into Texinfo files. List of tuples
168 | # (source start file, target name, title, author,
169 | # dir menu entry, description, category)
170 | texinfo_documents = [
171 | (master_doc, 'Starbelly', 'Starbelly Documentation',
172 | author, 'Starbelly', 'One line description of project.',
173 | 'Miscellaneous'),
174 | ]
175 |
176 | # -- Options for extensions -----------------------------------------------
177 |
178 | intersphinx_mapping = {
179 | 'trio': ('https://trio.readthedocs.io/en/stable/', None),
180 | 'yarl': ('https://yarl.readthedocs.io/en/stable/', None),
181 | }
182 |
--------------------------------------------------------------------------------
/docs/configuration.rst:
--------------------------------------------------------------------------------
1 | *******************
2 | Configuration Guide
3 | *******************
4 |
5 | .. contents::
6 | :depth: 2
7 |
8 | Overview
9 | ========
10 |
11 | Starbelly can be configured entirely through its graphical interface. In fact,
12 | this is one of the advantages to using Starbelly: no more arcane configuration
13 | files or custom code! The tradeoff, of course, is that Starbelly has fewer
14 | configuration options than other crawlers and may not be flexible enough to
15 | solve all crawling problems.
16 |
17 | The configuration items are all contained in the *Configuration* submenu on the
18 | left side of the interface.
19 |
20 | CAPTCHA Solvers
21 | ===============
22 |
23 | Starbelly has the ability to automatically log into a website if it has the
24 | appropriate credentials (see `Credentials`_ below). Some login forms may
25 | require a CAPTCHA. In those cases, you may configure a CAPTCHA solving service.
26 | Starbelly supports any CAPTCHA service that is compatible with the Antigate API.
27 | You may create multiple configurations in order to use multiple backend solvers
28 | or just to send different configurations to the same service.
29 |
30 | Once you have created a CAPTCHA solver, specify that CAPTCHA solver in a crawl
31 | policy in order to send login CAPTCHAs to the solving service during crawls.
32 |
33 | Credentials
34 | ===========
35 |
36 | Starbelly has the ability to automatically log into a website if it has the
37 | appropriate credentials. To configure credentials for a site, you only need to
38 | specify a login URL. (If the login URL enables single sign-on for multiple
39 | subdomains, then you should also specify the domain name that you wish to
40 | authenticate on.)
41 |
42 | For each domain, you may set up multiple username & password credentials. When
43 | the crawler encounters that domain during a crawl, it will randomly pick one of
44 | the credentials and attempt to login with it. (The crawler uses machine learning
45 | to identify and parse the login form.)
46 |
47 | Rate Limits
48 | ===========
49 |
50 | The crawler observes rate limits between subsequent requests to a single domain.
51 | For example, with the default delay of 5 seconds, the crawler will wait 5
52 | seconds after a request completes until it initiates another request to that
53 | same domain. Therefore, the crawler will download at most 12 pages per minute
54 | from a single domain using the default rate limit. In practice, it will download
55 | fewer than 12 pages per minute, since each request itself also takes some
56 | non-negligible amount of time.
57 |
58 | Furthermore, rate limits apply across all jobs. For example, if you have two
59 | different jobs crawling one domain, each job will effectively be limited to 6
60 | pages per minute instead of 12.
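
To make the arithmetic above concrete, here is a tiny sketch (a hypothetical helper, not part of Starbelly) that computes the per-domain upper bound:

.. code:: python

    def max_pages_per_minute(delay_seconds, jobs_on_domain=1):
        """Upper bound for one domain, ignoring the time each request takes."""
        return 60.0 / (delay_seconds * jobs_on_domain)

    print(max_pages_per_minute(5))     # 12.0 with the default 5 second delay
    print(max_pages_per_minute(5, 2))  # 6.0 when two jobs share one domain
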
61 |
62 | On the *Rate Limits* configuration screen, you may change the global limit as
63 | well as customize rate limits for specific domains. This allows you to set
64 | shorter delays for domains that can handle more traffic. For example, you
65 | might crawl web servers on your corporate intranet faster than you crawl a
66 | public internet server.
67 |
--------------------------------------------------------------------------------
/docs/dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/dashboard.png
--------------------------------------------------------------------------------
/docs/dashboard_first_crawl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/dashboard_first_crawl.png
--------------------------------------------------------------------------------
/docs/first_crawl.rst:
--------------------------------------------------------------------------------
1 | ****************
2 | Your First Crawl
3 | ****************
4 |
5 | Starbelly offers a lot of ways to fine tune crawling, but for your first crawl,
6 | we will just use all of its default settings. The goal of this section is to
7 | perform a deep crawl of a news site. Here are a few example sites you may want
8 | to try:
9 |
10 | - `CNN <https://www.cnn.com/>`__
11 | - `Fox News <https://www.foxnews.com/>`__
12 | - `New York Times <https://www.nytimes.com/>`__
13 | - `Washington Post <https://www.washingtonpost.com/>`__
14 |
15 | Begin by accessing the Starbelly GUI in your web browser. You should be able to
16 | see the dashboard:
17 |
18 | .. image:: dashboard.png
19 | :alt: screenshot of dashboard
20 |
21 | The dashboard is currently empty because you have not started any crawls yet,
22 | but let us take a moment to get familiar with the interface. On the left side,
23 | under the "Starbelly" logo, is the menu. Click any item in the menu to view it.
24 | There is a gray bar across the top of the screen. The left side of this bar
25 | displays breadcrumbs, e.g. the *home* icon and the *dashboard* icon. The
26 | breadcrumbs help you keep track of where you are, and also allow you to quickly
27 | return to earlier screens. The right side of the bar shows the status
28 | "Connected", meaning that it is connected to the Starbelly server.
29 |
30 | Next, click on *Start Crawl* in the menu.
31 |
32 | .. image:: start_crawl.png
33 | :alt: screenshot of blank crawl form
34 |
35 | This screen allows you to start a crawl. First, enter a seed URL. Second, select
36 | the *Deep Crawl* policy (more on that in a moment). Optionally, you may assign a
37 | name to the crawl. If you do not assign a name, then the crawler will choose a
38 | name for you. Finally, you may assign tags. Tags may be used by consumers of
39 | crawl data, but they do not have any effect on the crawl itself, so leave it
40 | blank for now. (You can edit the tags later if you wish.)
41 |
42 | .. image:: start_crawl_filled_in.png
43 | :alt: screenshot of filled-in crawl form
44 |
45 | When you are ready, click the *Start Crawl* button. You should see a
46 | notification that the crawl is starting. Go back to the Dashboard and you should
47 | now be able to see that your crawl is running. The dashboard updates in realtime
48 | as the crawler downloads documents.
49 |
50 | .. image:: dashboard_first_crawl.png
51 | :alt: dashboard shows the first crawl is running
52 |
53 | You can click on the name of the crawl to view details about that crawl job. The
54 | details screen also updates in real time.
55 |
56 | .. image:: first_crawl_results.png
57 | :alt: detailed look at results of first crawl
58 |
59 | The crawl will continue to run, downloading various documents that it finds,
60 | until it reaches its end. But how does it decide what documents to download, and
61 | how does it decide when the crawl should end? These questions are answered by
62 | consulting the *crawl policy*, which guides the crawler's decision making. In
63 | this example, we used the default *Deep Crawl* policy, which stays inside the
64 | same domain as the seed URL and crawls to depth 10. You may customize this
65 | policy or create any number of your own policies to carefully refine crawler
66 | behavior.
67 |
68 | Where does the crawl data go and what can you do with it? You can view crawl
69 | results inside Starbelly, but this feature is intended to help with debugging,
70 | not as a practical way to use crawl data. Crawl data is stored inside
71 | Starbelly's database until you explicitly delete it (see the *Crawl Results*
72 | screen). Starbelly is just a crawling *frontend*, which means that it is
73 | designed to be plugged into another application that can read the crawl data and
74 | do something useful with it, such as a search engine or a scraping program.
75 |
76 | Now that you have a crawl under your belt, you might want to do a deeper dive
77 | into :doc:`configuration` and :doc:`policy`.
78 |
--------------------------------------------------------------------------------
/docs/first_crawl_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/first_crawl_results.png
--------------------------------------------------------------------------------
/docs/gui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/gui.png
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. image:: logo.png
2 | :height: 150px
3 | :width: 150px
4 | :align: center
5 |
6 | Starbelly
7 | =========
8 |
9 | Starbelly is a user-friendly and highly configurable web crawler front end.
10 | Compared to other crawling systems, such as Nutch or Scrapy, Starbelly trades
11 | off lower scalability for improved usability. Starbelly eschews the arcane
12 | configuration files and custom code required for other crawling systems,
13 | favoring a GUI for configuration and management. Starbelly exposes all of its
14 | features and data through an efficient API, allowing you to build crawling-based
15 | systems on top of it. For example, you might plug in an Elastic Search backend
16 | to build a custom search engine, or plug in a scraper to create a data
17 | collection pipeline.
18 |
19 | .. toctree::
20 | :maxdepth: 1
21 |
22 | installation
23 | first_crawl
24 | configuration
25 | policy
26 | administration
27 | websocket_api
28 | protobuf
29 | development
30 | internals
31 | changelog
32 |
33 | .. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg
34 | :target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=agnostic
35 | :alt: define hyperiongray
36 | :width: 500px
37 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | Installation Guide
2 | ==================
3 |
4 | Prerequisites
5 | -------------
6 |
7 | Starbelly is offered as a collection of Docker images and Docker-compose
8 | configurations. If you're not familiar with Docker, it is a system for deploying
9 | multiple software components into individual containers and orchestrating the
10 | entire system.
11 |
12 | First, install `Docker <https://docs.docker.com/engine/install/>`__ using
13 | the instructions for your platform.
14 |
15 | Next, install `Docker Compose <https://docs.docker.com/compose/install/>`__
16 | using the instructions for your platform.
17 |
18 | Docker Compose
19 | --------------
20 |
21 | Docker Compose is used to set up and run multiple Docker containers together.
22 | You should have installed Docker Compose in the previous step. Now you need a
23 | Docker Compose configuration file (usually called ``docker-compose.yml``) that
24 | specifies what containers need to be created and how they should be configured.
25 | A sample docker-compose.yml configuration file is available for Starbelly, but
26 | you may need to tailor this file to your unique environment.
27 |
28 | Download this `zip file
29 | <https://github.com/HyperionGray/starbelly-docker/archive/master.zip>`__
30 | and extract it. (If you have Git installed, you can run ``git clone
31 | git@github.com:HyperionGray/starbelly-docker.git`` instead.) From the
32 | ``starbelly-docker/starbelly`` directory, run the following command:
33 |
34 | .. code::
35 |
36 | $ docker-compose up -d
37 |
38 | This will download the required Docker images, create the corresponding
39 | containers, and then start the entire application on ports 80 and 443. Once
40 | the application has started, open up a browser and try navigating to the host
41 | where you are running Starbelly. The default username and password is "admin".
42 |
43 | You should see the Dashboard:
44 |
45 | .. image:: dashboard.png
46 | :alt: screenshot of dashboard
47 |
48 | If you experience any problems, try using the command ``docker-compose logs``
49 | to view logging output from the Docker containers.
50 |
51 | Security
52 | --------
53 |
54 | If your Starbelly instance is exposed to the internet, then you should
55 | immediately do two things to secure it:
56 |
57 | 1. Change the admin password.
58 | 2. Create TLS certificates.
59 |
60 | The **admin password** is stored in an ``htpasswd``-format file, which can be
61 | created or edited using the ``htpasswd`` command from the Apache2 utilities
62 | package (called ``apache2-utils`` on Ubuntu distributions). Install that package
63 | and then run this command:
64 |
65 | .. code::
66 |
67 | $ htpasswd -c passwd admin
68 | New password:
69 | Re-type new password:
70 | Adding password for user admin
71 |
72 | Type in the new password when prompted. You can change passwords or add
73 | additional passwords to an existing file by running ``htpasswd passwd USER``,
74 | where ``USER`` is the username to change. When you are done, copy the ``passwd``
75 | file into the Docker container and remove the original.
76 |
77 | .. code::
78 |
79 | $ docker cp passwd starbelly-web:/etc/nginx/tls/
80 | $ docker exec starbelly-web nginx -s reload
81 | 2017/11/02 14:29:37 [notice] 1437#1437: signal process started
82 | $ rm passwd
83 |
84 | The default **TLS certificate** is automatically generated and self-signed when
85 | the container is created. If you have a valid domain name for your Starbelly
86 | server, then you should obtain a real certificate for it.
87 |
88 | If you have obtained your own certificates, you can install them as follows,
89 | where ``certificate.pem`` is the full certificate chain in PEM format and
90 | ``privatekey.pem`` is the private key in PEM format.
91 |
92 | .. code::
93 |
94 | $ docker cp certificate.pem starbelly-web:/etc/nginx/tls/server.crt
95 | $ docker cp privatekey.pem starbelly-web:/etc/nginx/tls/server.key
96 | $ docker exec starbelly-web nginx -s reload
97 | 2017/11/02 14:29:37 [notice] 1437#1437: signal process started
98 |
99 | If you do not already have TLS certificates, you may obtain free certificates
100 | from `Let's Encrypt <https://letsencrypt.org/>`__. First, install the certbot
101 | application using the `instructions for your platform
102 | <https://certbot.eff.org/>`__. Now run certbot to create initial certificates,
103 | replacing ``YOUR_DOMAIN`` with the fully qualified domain name of the server and
104 | replacing ``/path/to/starbelly-docker/starbelly`` with the path where you placed
105 | the Starbelly docker configuration files from an earlier step.
106 |
107 | .. code::
108 |
109 | $ cd /path/to/starbelly-docker/starbelly
110 | $ certbot certonly \
111 | --webroot -w certbot-webroot \
112 | -d YOUR_DOMAIN \
113 | --deploy-hook ./deploy-certbot.py
114 | Plugins selected: Authenticator webroot, Installer None
115 | Enter email address (used for urgent renewal and security notices) (Enter 'c' to
116 | cancel): YOUR EMAIL HERE
117 |
118 | -------------------------------------------------------------------------------
119 | Please read the Terms of Service at
120 | https://letsencrypt.org/documents/LE-SA-v1.1.1-August-1-2016.pdf. You must agree
121 | in order to register with the ACME server at
122 | https://acme-staging.api.letsencrypt.org/directory
123 | -------------------------------------------------------------------------------
124 | (A)gree/(C)ancel: a
125 |
126 | ...snip...
127 |
128 | This command will attempt to get TLS certificates from the Let's Encrypt server.
129 | If you've never run certbot on this server before, it will prompt you to enter
130 | a contact e-mail address and agree to the terms of service.
131 |
132 | If certificate generation succeeds, it will install those certificates into the
133 | Docker container. This certificate is valid for 90 days and will need to be
134 | renewed before it expires. Create a daily cron job containing the following
135 | command to ensure that the certificate will be renewed appropriately.
136 |
137 | .. code::
138 |
139 | certbot renew --deploy-hook /path/to/starbelly-docker/starbelly/deploy-certbot.py
140 |
141 | Next Steps
142 | ----------
143 |
144 | Now that you have Starbelly up and running, take a look at :doc:`first_crawl`.
145 |
--------------------------------------------------------------------------------
/docs/jupyter_new_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/jupyter_new_notebook.png
--------------------------------------------------------------------------------
/docs/jupyter_notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/jupyter_notebook.png
--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/logo.png
--------------------------------------------------------------------------------
/docs/policy.rst:
--------------------------------------------------------------------------------
1 | ******
2 | Policy
3 | ******
4 |
5 | .. contents::
6 | :depth: 2
7 |
8 | Overview
9 | ========
10 |
11 | The *crawl policy* is one of the most important and powerful concepts in
12 | Starbelly. A policy controls the crawler's behavior and decision making, guiding
13 | which links the crawler follows, what kinds of resources it downloads, and how
14 | long or how far it runs. When you start a crawl job, you must specify which
15 | policy that job should use.
16 |
17 | In this part of the documentation, we take a look at the features of the crawl
18 | policy. To begin, click *Policy* in the Starbelly menu, then click on an
19 | existing policy to view it, or click *New Policy* to create a new policy.
20 |
21 | Authentication
22 | ==============
23 |
24 | The authentication policy determines how a crawler can authenticate itself to a
25 | web site. When the crawler sees a domain in a crawl for the first time, it
26 | checks to see if it has any credentials for that domain. (See the configuration
27 | of Credentials for more information.) If it does, it picks one
28 | of the appropriate credentials at random and tries to login with it. Some login
29 | forms may require a CAPTCHA. In those cases, you may configure a CAPTCHA solver
30 | and specify that solver in the policy.
31 |
32 | Robots.txt
33 | ==========
34 |
35 | `Robots.txt <http://www.robotstxt.org/>`__ is a standard for specifying how
36 | crawlers should interact with websites. By default, Starbelly will attempt to
37 | download a ``robots.txt`` from each domain that it visits, and it will obey the
38 | directives of any such files that it finds. In some circumstances, however,
39 | such as crawling some old sites, it may be useful to ignore or even invert the
40 | directives in a site's robots.txt, which you can configure using the policy.
41 |
42 | URL Normalization
43 | =================
44 |
45 | The crawler attempts to avoid crawling the same URL multiple times. If two links
46 | contain exactly identical URLs, then the crawler will only download that
47 | resource once. On some sites, especially dynamically generated sites, multiple
48 | URLs may refer to the same resource and differ only in the order of URL query
49 | parameters or the values of semantically meaningless query parameters like
50 | session IDs.
51 |
52 | The URL normalization policy allows you to control this behavior. When enabled,
53 | the crawler normalizes URLs using a number of techniques, including:
54 |
55 | - sorting query parameters alphabetically
56 | - converting percent encodings to upper case
57 | - removing URL fragments
58 | - etc.
59 |
60 | You may specify URL query parameters that should be discarded during
61 | normalization. By default, the crawler discards several common session ID
62 | parameters. Alternatively, you can disable URL normalization completely,
63 | although this may result in lots of duplicated downloads.
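
The following sketch illustrates the kind of normalization described above using
only the Python standard library. It is not Starbelly's implementation, and the
session parameter names are just examples:

.. code:: python

    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    SESSION_PARAMS = {'phpsessid', 'jsessionid', 'sid'}  # example parameters

    def normalize(url):
        scheme, netloc, path, query, _fragment = urlsplit(url)
        params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)
                  if k.lower() not in SESSION_PARAMS]
        params.sort()  # sort query parameters alphabetically
        # Re-assemble the URL without the fragment.
        return urlunsplit((scheme, netloc, path, urlencode(params), ''))

    print(normalize('http://example.com/a?b=2&a=1&PHPSESSID=xyz#frag'))
    # http://example.com/a?a=1&b=2
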
64 |
65 | URL Rules
66 | =========
67 |
68 | The URL rules policy controls how a crawler selects links to follow. For each
69 | page that is downloaded, the crawler extracts candidate links. For each candidate
70 | link, the crawler checks the rules one-by-one until a rule matches, then the crawler
71 | applies the matching rule.
72 |
73 | For example, the default *Deep Crawl* policy contains two URL rules:
74 |
75 | 1. If the URL *matches* the regex ``^https?://({SEED_DOMAINS})/`` then *add* ``1.0``.
76 | 2. Else *multiply by* ``0.0``.
77 |
78 | Let's say the crawl is seeded with ``http://foo.com/bar``. The crawler downloads
79 | this document and assigns it a cost of 1.0. Cost is roughly similar to the concept of
80 | *crawl depth* in other crawlers, but it is a bit more sophisticated. Each link
81 | is assigned a cost based on the cost of the document where it was found and the
82 | URL rule that it matches. If a link cost evaluates to zero, then the link is
83 | thrown away. If the link's cost is greater than zero but less than the "Max Cost"
84 | specified in the crawl policy, then the crawler schedules the link to be
85 | fetched. Links are fetched roughly in order of cost, so lower-cost items are
86 | typically fetched before higher-cost items.
87 |
88 | After the crawler downloads the document at ``http://foo.com/bar``, it checks
89 | each link in that document against the URL rules in the policy. For example, if
90 | the link matches the regex in rule #1, then the link will be given a cost of
91 | 2.0: the rule says to add 1.0 to the cost of its parent (which was 1.0).
92 |
93 | If the link matches rule #2, then that rule says to multiply the parent's cost
94 | by zero. This results in the new cost being set to zero, and the crawler
95 | discards links where the cost is zero, so the link will not be followed.
96 |
97 | Although the URL rules are a bit complicated at first, they turn out to be a
98 | very powerful way to guide the crawler. For example, if we step back a bit and
99 | consider the effect of the two rules above, we see that it follows links inside
100 | the seed domain and does not follow links outside the seed domain. In other
101 | words, this is a *deep crawl*!
102 |
103 | If we replace the two rules here with just a single rule that says "Always add
104 | 1.0" , then that would result in a *broad crawl* policy! In fact, you can go
105 | look at the default *Broad Crawl* policy included in Starbelly to confirm that
106 | this is how it works.
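
To make the cost arithmetic concrete, the sketch below applies a list of
(pattern, action, amount) rules to a candidate link. The rule representation is
an assumption for illustration only, not Starbelly's internal data structure:

.. code:: python

    import re

    def link_cost(parent_cost, url, rules):
        '''Apply the first matching rule; a pattern of None matches anything.'''
        for pattern, action, amount in rules:
            if pattern is None or re.search(pattern, url):
                if action == 'add':
                    return parent_cost + amount
                return parent_cost * amount
        return 0.0

    deep_crawl_rules = [
        (r'^https?://(foo\.com)/', 'add', 1.0),  # rule 1: stay in the seed domain
        (None, 'multiply', 0.0),                 # rule 2: everything else
    ]

    print(link_cost(1.0, 'http://foo.com/baz', deep_crawl_rules))  # 2.0 -> crawled
    print(link_cost(1.0, 'http://other.com/x', deep_crawl_rules))  # 0.0 -> discarded
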
107 |
108 | User Agents
109 | ===========
110 |
111 | When the crawler downloads a resource, it sends a *User Agent* string in the
112 | headers. By default, Starbelly sends a user agent that identifies itself with a
113 | version number and includes a URL to its source code repository. You may
114 | customize what user agent is sent using the policy. If you include multiple user
115 | agent strings, one will be chosen at random for each request.
116 |
117 | Proxy Rules
118 | ===========
119 |
120 | By default, the crawler downloads resources directly from their hosts. In some
121 | cases, you may want to proxy requests through an intermediary. The *Proxy Rules*
122 | specify which proxy server should be used for which request, similar to the *URL
123 | Rules* above.
124 |
125 | MIME Type Rules
126 | ===============
127 |
128 | While *URL Rules* determine which links to follow, *MIME Type Rules* determine
129 | what types of resources to download. By default, the crawler only downloads
130 | resources that have a MIME type matching the regex ``^text/``, which matches
131 | plain text and HTML resources. If you want the crawler to download images, for
132 | example, then you would add a new rule like ``^image/`` that would match GIF,
133 | JPEG, and PNG resources.
134 |
135 | The MIME type of a resource is determined by inspecting the ``Content-Type``
136 | header, which means that *MIME Type Rules* are not applied until *after the
137 | crawler downloads headers* for a resource. If the crawler determines that a
138 | resource should not be downloaded, then the crawler closes the connection and
139 | discards any data that has already been downloaded.
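
The sketch below shows the general idea: parse the ``Content-Type`` header and
test it against the MIME type rules in order. The rule structure is assumed for
illustration:

.. code:: python

    import re

    # (pattern, download?) pairs, checked in order.
    mime_rules = [(r'^text/', True), (r'^image/', True), (r'.*', False)]

    def should_download(content_type):
        mime = content_type.split(';')[0].strip()  # drop parameters like charset
        for pattern, download in mime_rules:
            if re.search(pattern, mime):
                return download
        return False

    print(should_download('text/html; charset=utf-8'))  # True
    print(should_download('application/pdf'))           # False
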
140 |
141 | Limits
142 | ======
143 |
144 | The *Limits* policy specifies limits on how far and how long the crawl should
145 | run. If a limit is left blank, then that limit will not be applied to the crawl.
146 |
147 | - Max cost: the crawler will not follow links that have a cost greater than the
148 | one specified here.
149 | - Max duration: the maximum amount of time the crawler should run, in seconds.
150 | - Max items: the maximum number of items that the crawler should download. This
151 | number includes successes, errors, and exceptions.
152 |
--------------------------------------------------------------------------------
/docs/protobuf.rst:
--------------------------------------------------------------------------------
1 | .. _protobuf_messages:
2 |
3 | Protobuf Messages
4 | =================
5 |
6 | The structure of the API is described in :doc:`websocket_api`. The details of
7 | all the individual messages are documented here. The client always sends a
8 | `Request <#.Request>`__ message. The server always sends a `ServerMessage
9 | <#.ServerMessage>`__ message, which contains either a `Response <#.Response>`__
10 | to a request or an `Event <#.Event>`__ belonging to a subscription.
11 |
12 | .. raw:: html
13 | :file: protobuf.html
14 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # RTD does not support Poetry natively, so this is a temporary workaround. See:
2 | # https://github.com/readthedocs/readthedocs.org/issues/4912
3 | sphinx
4 | sphinx-rtd-theme
5 | sphinxcontrib_trio
6 | .
7 |
--------------------------------------------------------------------------------
/docs/rethinkdb_gui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/rethinkdb_gui.png
--------------------------------------------------------------------------------
/docs/start_crawl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/start_crawl.png
--------------------------------------------------------------------------------
/docs/start_crawl_filled_in.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/start_crawl_filled_in.png
--------------------------------------------------------------------------------
/docs/terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/terminal.png
--------------------------------------------------------------------------------
/docs/websocket_api.rst:
--------------------------------------------------------------------------------
1 | .. _api-documentation:
2 |
3 | WebSocket API
4 | =============
5 |
6 | Overview
7 | --------
8 |
9 | The crawler is controlled completely by an API. Clients connect to the crawler
10 | using `websockets
11 | <https://en.wikipedia.org/wiki/WebSocket>`__ and
12 | exchange messages with the crawler using `protobuf messages
13 | <https://developers.google.com/protocol-buffers>`__. The built-in GUI relies
14 | solely on this API, so everything that can be done in the GUI can also be done
15 | with the API – and more!
16 |
17 | One of the central goals for the API is to enable clients to synchronize crawl
18 | results in real time. Most crawling systems are batch-oriented: you run the
19 | crawler for a period of time and then collect the results when the crawl is
20 | finished. Starbelly is streaming-oriented: it can send crawl results to a client
21 | as soon as it downloads them.
22 |
23 | Let's imagine that a crawl has started running and already has 1,000 results. A
24 | client can connect to Starbelly and quickly fetch the first 1,000 results.
25 | Because the crawler is still running, new results will continue to stream in as
26 | the crawler downloads them. If either the server or the client needs to
27 | disconnect for some reason, the client is able to reconnect later and pick up
28 | the stream exactly where it left off.
29 |
30 | Connecting to API
31 | -----------------
32 |
33 | The API is exposed as a websocket service on port 443 at the path ``/ws/``. For
34 | example, if starbelly is running on the host ``starbelly.example.com``, then you
35 | should connect to the web socket using the URL
36 | ``wss://starbelly.example.com/ws/``. By default, Starbelly uses HTTP basic
37 | authentication, so you need to include those credentials when you connect to the
38 | API.
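
For example, a client might connect using the third-party ``trio-websocket``
package (one option among many; the host and credentials below are
placeholders):

.. code:: python

    import base64
    import trio
    from trio_websocket import open_websocket_url

    async def main():
        token = base64.b64encode(b'admin:YOUR_PASSWORD')
        headers = [(b'Authorization', b'Basic ' + token)]
        async with open_websocket_url('wss://starbelly.example.com/ws/',
                                      extra_headers=headers) as ws:
            await ws.send_message(b'...')   # a serialized protobuf Request
            reply = await ws.get_message()  # a serialized ServerMessage
            print('received', len(reply), 'bytes')

    trio.run(main)
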
39 |
40 | Messages
41 | --------
42 |
43 | Starbelly uses ``protobuf`` to encode messages sent between the client and the
44 | server. There are three types of message used in the API:
45 |
46 | 1. Request
47 | 2. Response
48 | 3. Event
49 |
50 | The *request* and *response* messages are created in pairs: the client sends a
51 | *request* to the server and the server sends back exactly one *response* per
52 | request. The response indicates whether the request was successful and may
53 | include other data related to the request.
54 |
55 | Although each request generates a response, the responses are not necessarily
56 | sent back in the same order that the requests are received. If the client sends
57 | two commands very quickly (call them A and B), it may get the responses back in
58 | either order, e.g. A→B or B→A. For this reason, the client should include a
59 | unique ``request_id`` with each request; the server will include the same
60 | ``request_id`` in its response so that the client can track which response goes
61 | with which request. The client can assign request IDs in any manner that it
62 | chooses, but one sensible approach would be to assign an incrementing sequence
63 | of integers.
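
One way a client might correlate responses with requests is sketched below; the
class and its names are hypothetical, not part of any Starbelly client library:

.. code:: python

    import itertools
    import trio

    class RequestTracker:
        '''Hand out incrementing request IDs and match responses to waiters.'''
        def __init__(self):
            self._ids = itertools.count(1)
            self._pending = {}  # request_id -> (trio.Event, one-element list)

        def new_request(self):
            request_id = next(self._ids)
            event, slot = trio.Event(), [None]
            self._pending[request_id] = (event, slot)
            return request_id, event, slot

        def resolve(self, request_id, response):
            event, slot = self._pending.pop(request_id)
            slot[0] = response  # the waiter reads the response from the slot
            event.set()
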
64 |
65 | The third type of message is an *event*, which is pushed from the server to the
66 | client. For example, the client can send a request to subscribe to job status.
67 | The server will send a response containing a subscription ID. Now, whenever a
68 | job has a status event, such as downloading a new resource, the server will send
69 | an event to the client containing the job status data and the corresponding
70 | subscription ID. The client can close the subscription by sending another
71 | request. The server will stop sending event messages and will send a response
72 | indicating that the subscription has been cancelled.
73 |
74 | Protobuf is a binary serialization format that supports common data types like
75 | integers, strings, lists, and maps. It is similar in purpose to JSON, but
76 | protobuf is more efficient in terms of encoding overhead and serialization
77 | speed.
78 |
79 | For complete specification of the available protobuf messages, see
80 | :doc:`protobuf`.
81 |
82 | Example Session
83 | ---------------
84 |
85 | This section shows a complete interaction where a client starts a crawl and
86 | synchronizes crawl results. To begin, the client sends a ``RequestSetJob``
87 | request to the server that includes the seed URL, a policy identifier, and a
88 | crawl name.
89 |
90 | .. code::
91 |
92 | Request {
93 | request_id: 1
94 | Command: RequestSetJob {
95 | run_state: RUNNING
96 | policy_id: d28b379ff3668322bfd5d56e11d4e34e
97 | seeds: "https://cnn.com"
98 | name: "My Crawl"
99 | }
100 | }
101 |
102 | The server will kick off a crawling job and will send a response telling the
103 | client that the job has started successfully and including an identifier for the
104 | new job.
105 |
106 | .. code::
107 |
108 | Response {
109 | request_id: 1
110 | is_success: true
111 | Body: ResponseNewJob {
112 | job_id: 0514478baffd401546b755bf460b5997
113 | }
114 | }
115 |
116 | Notice that the response includes the request ID sent by the client, so
117 | we know that this is a response to the above request.
118 |
119 | This response tells us that the crawl is starting, but we would like to keep
120 | track of the crawl's progress and know when it finishes. The next step is to
121 | send a subscription request for job status events.
122 |
123 | .. code::
124 |
125 | Request {
126 | request_id: 2
127 | Command: RequestSubscribeJobStatus {
128 | min_interval: 3.0
129 | }
130 | }
131 |
132 | This subscription provides high-level job status for *all* crawl jobs, including
133 | data like how many items have been downloaded, how many pages had errors, how
135 | many pages resulted in exceptions, etc. Job status can change rapidly when the
135 | crawler is busy, because each item downloaded counts as a change in job status.
136 | The ``min_interval`` parameter specifies the minimum amount of time in between
137 | job status events sent by the server. In this example, if there are multiple job
138 | status events, the server will batch them together and send at most 1 event
139 | every 3 seconds for this subscription. On the other hand, if the crawl is very
140 | slow, then it may send events even less frequently than that.
141 |
142 | The server will create the subscription and respond with a subscription
143 | identifier.
144 |
145 | .. code::
146 |
147 | Response {
148 | request_id: 2
149 | is_success: true
150 | Body: ResponseNewSubscription {
151 | subscription_id: 300
152 | }
153 | }
154 |
155 | When the client first subscribes to job status, the crawler will send the
156 | complete status of each currently running job. For example, if the crawler has
157 | already downloaded one item, the job status may look like this:
158 |
159 | .. code::
160 |
161 | Event {
162 | subscription_id: 300
163 | Body: JobList {
164 | jobs: {
165 | job_id: 0514478baffd401546b755bf460b5997
166 | seeds: "https://cnn.com"
167 | policy: d28b379ff3668322bfd5d56e11d4e34e
168 | name: "My Crawl"
169 | run_state: RUNNING
170 | started_at: "2017-11-03T10:14:42.194744"
171 | item_count: 1
172 | http_success_count: 1
173 | http_error_count: 0
174 | exception_count: 0
175 | http_status_counts: {
176 | 200: 1
177 | }
178 | }
179 | }
180 | }
181 |
182 | After sending complete job status, the crawler will send small updates as the
183 | job status changes. For example, after the crawler downloads a second item, it
184 | will send an event like this:
185 |
186 | .. code::
187 |
188 | Event {
189 | subscription_id: 300
190 | Body: JobList {
191 | jobs: {
192 | job_id: 0514478baffd401546b755bf460b5997
193 | item_count: 2
194 | http_success_count: 2
195 | http_status_counts: {
196 | 200: 2
197 | }
198 | }
199 | }
200 | }
201 |
202 | Notice how the second message is much smaller: it only contains the fields that
203 | have changed since the previous event. This is how the job status subscription
204 | allows clients to efficiently keep track of the status of all jobs. This API is
205 | used in the GUI to power the Dashboard and Results screens.
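
A client can keep its own view of job status by merging each partial event into
a local cache, for example (a minimal sketch, assuming dict-like job data keyed
by ``job_id``):

.. code:: python

    jobs = {}  # job_id -> latest known status fields

    def apply_job_event(event_jobs):
        '''Merge a complete or partial job status event into the cache.'''
        for job in event_jobs:
            jobs.setdefault(job['job_id'], {}).update(job)

    job_id = '0514478baffd401546b755bf460b5997'
    apply_job_event([{'job_id': job_id, 'item_count': 1, 'http_success_count': 1}])
    apply_job_event([{'job_id': job_id, 'item_count': 2, 'http_success_count': 2}])
    print(jobs[job_id]['item_count'])  # 2
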
206 |
207 | For a complete list of API messages, see :ref:`protobuf_messages`.
208 |
209 | Web Client
210 | ----------
211 |
212 | The crawler GUI is implemented as a stand-alone application written in Dart, and
213 | it interacts with the Starbelly server solely through the public API. Therefore,
214 | anything that you can do in the GUI can also be done through the API.
215 |
216 | https://github.com/hyperiongray/starbelly-web-client
217 |
218 | Python Client
219 | -------------
220 |
221 | A very basic and incomplete Python client library implementation is available:
222 |
223 | https://github.com/hyperiongray/starbelly-python-client
224 |
225 | This client library will be improved over time and made more stable, but for
226 | now it may be used as a reference implementation.
227 |
--------------------------------------------------------------------------------
/integration/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | This module contains integration tests.
3 |
4 | These tests rely on a RethinkDB server running on localhost 28015.
5 | '''
6 | from functools import wraps
7 |
8 | import pytest
9 | from rethinkdb import RethinkDB
10 | import trio
11 |
12 | from starbelly.config import get_config
13 |
14 | # Add this project to the Python path:
15 | from os.path import dirname
16 | from sys import path
17 | path.append(dirname(dirname(__file__)))
18 |
19 |
20 | class fail_after:
21 | ''' This decorator fails if the runtime of the decorated function (as
22 | measured by the Trio clock) exceeds the specified value. '''
23 | def __init__(self, seconds):
24 | self._seconds = seconds
25 |
26 | def __call__(self, fn):
27 | @wraps(fn)
28 | async def wrapper(*args, **kwargs):
29 | with trio.move_on_after(self._seconds) as cancel_scope:
30 | await fn(*args, **kwargs)
31 | if cancel_scope.cancelled_caught:
32 | pytest.fail('Test runtime exceeded the maximum {} seconds'
33 | .format(self._seconds))
34 | return wrapper
35 |
36 |
37 | @pytest.fixture
38 | async def db_pool(nursery):
39 | r = RethinkDB()
40 | r.set_loop_type('trio')
41 | db_config = get_config()['database']
42 | db_pool = r.ConnectionPool(
43 | host=db_config['host'],
44 | port=db_config['port'],
45 | db='integration_testing',
46 | user=db_config['super_user'],
47 | password=db_config['super_password'],
48 | nursery=nursery
49 | )
50 | async with db_pool.connection() as conn:
51 | await r.db_create('integration_testing').run(conn)
52 | yield db_pool
53 | async with db_pool.connection() as conn:
54 | await r.db_drop('integration_testing').run(conn)
55 | await db_pool.close()
56 |
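# A sketch of how an integration test can use the fixture and decorator above.
# The table name and assertion are illustrative; any ReQL query is run the
# same way through the pooled connection.
@fail_after(10)
async def example_db_roundtrip(db_pool):
    r = RethinkDB()
    r.set_loop_type('trio')
    async with db_pool.connection() as conn:
        # Create a table in the temporary integration_testing database and
        # confirm that it shows up in the table list.
        await r.table_create('example').run(conn)
        assert 'example' in await r.table_list().run(conn)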
--------------------------------------------------------------------------------
/notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
--------------------------------------------------------------------------------
/notebooks/Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | " # Example Notebook\n",
8 | " \n",
9 | " This notebook shows how to use the Starbelly shell from inside a Jupyter notebook."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Starbelly v1.2.0-dev Shell\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "%autoawait trio\n",
27 | "from pathlib import Path\n",
28 | "from sys import path\n",
29 | "path.append(str(Path().resolve().parent))\n",
30 | "from tools.shell import *"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/plain": [
41 | "'starbelly-app'"
42 | ]
43 | },
44 | "execution_count": 2,
45 | "metadata": {},
46 | "output_type": "execute_result"
47 | }
48 | ],
49 | "source": [
50 | "config['database']['user']"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "15:24:09 [tools.shell] INFO: Hello, world!\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "logger.info('Hello, world!')"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "async def foo():\n",
77 | " await trio.sleep(1)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "await foo()"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 6,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "trio.run(foo)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 7,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "RethinkDB Cursor: [\n",
108 | " {'name': 'Deep Crawl'},\n",
109 | " {'name': 'Broad Crawl'},\n",
110 | "]\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "policy_query = r.table('policy').pluck('name')\n",
116 | "policy_results = run_query(policy_query)\n",
117 | "print_results(policy_results)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": []
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "Python 3.7 (starbelly)",
131 | "language": "python",
132 | "name": "starbelly"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "3.7.0"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 2
149 | }
150 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "starbelly"
3 | version = "2.0.0-dev"
4 | description = "Streaming crawler with a graphical user interface"
5 | authors = ["Mark E. Haase "]
6 | license = "MIT"
7 |
8 | [tool.poetry.dependencies]
9 | python = "^3.7"
10 | aiohttp = "^3.6.2"
11 | aiohttp-socks = "^0.3.9"
12 | beautifulsoup4 = "^4.9.1"
13 | cchardet = "^2.1.6"
14 | feedparser = "^5.2.1"
15 | formasaurus = {url = "https://hyperiongray.s3-us-west-2.amazonaws.com/formasaurus-0.9.0.tar.gz"}
16 | lxml = "^4.5.1"
17 | protobuf = "^3.12.2"
18 | psutil = "^5.7.0"
19 | python-dateutil = "^2.8.1"
20 | python-mimeparse = "^1.6.0"
21 | pytz = "==2020.1"
22 | rethinkdb = {url = "https://hyperiongray.s3-us-west-2.amazonaws.com/rethinkdb-2.4.7%2Bstarbelly.tar.gz"}
23 | robotexclusionrulesparser = "^1.7.1"
24 | trio = "^0.15.1"
25 | trio-asyncio = "^0.11.0"
26 | trio-websocket = "^0.8.0"
27 | w3lib = "1.22.0"
28 | watchdog = "^0.10.2"
29 | yarl = "^1.4.2"
30 | scikit-learn = ">=0.18" # required for formasaurus
31 | scipy = "^1.4.1" # required for formasaurus
32 | sklearn-crfsuite = ">=0.3.1" # required for formasaurus
33 | joblib = "^0.15.1" # required for formasaurus
34 | tldextract = "^2.2.2" # required for formasaurus
35 |
36 | [tool.poetry.dev-dependencies]
37 | coverage = "^5.1"
38 | coveralls = "^2.0.0"
39 | pytest-cov = "^2.9.0"
40 | pytest-mock = "^3.1.0"
41 | sphinxcontrib-trio = "^1.1.2"
42 | sphinx-rtd-theme = "^0.4.3"
43 | pytest-trio = "^0.6.0"
44 | trustme = "^0.6.0"
45 |
46 | [build-system]
47 | requires = ["poetry>=0.12"]
48 | build-backend = "poetry.masonry.api"
49 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | log_level = debug
3 | trio_mode = true
4 |
--------------------------------------------------------------------------------
/starbelly/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/starbelly/__init__.py
--------------------------------------------------------------------------------
/starbelly/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import signal
5 | import subprocess
6 | import sys
7 | import time
8 |
9 | from watchdog.events import FileSystemEventHandler
10 | from watchdog.observers import Observer
11 |
12 | from .bootstrap import Bootstrap
13 | from .config import get_config, get_path
14 |
15 |
16 | class ProcessWatchdog(FileSystemEventHandler):
17 | ''' Handle watchdog events by restarting a subprocess. '''
18 |
19 | def __init__(self):
20 | ''' Constructor. '''
21 |
22 | self._logger = logging.getLogger('watchdog')
23 | self._process = None
24 |
25 | def dispatch(self, event):
26 | ''' Restart the subprocess if a source/config file changed. '''
27 |
28 | path = event.src_path
29 | file = os.path.basename(path)
30 | descr = '{} was {}'.format(event.src_path, event.event_type)
31 |
32 | if (file.endswith('.py') and not file.startswith('test_')) or \
33 | file.endswith('.ini'):
34 | self._logger.info('%s (Reloading)', descr)
35 | self.terminate_process()
36 | self.start_process()
37 |
38 | def join(self):
39 | ''' Wait for subprocess to exit. '''
40 | if self._process is not None:
41 | self._process.wait()
42 |
43 | def start_process(self):
44 | ''' Start the subprocess. '''
45 |
46 | if self._process is not None:
47 | msg = 'Cannot start subprocess if it is already running.'
48 | raise RuntimeError(msg)
49 |
50 | time.sleep(1)
51 | args = [sys.executable, '-m', __package__] + sys.argv[1:]
52 | new_env = dict(os.environ)
53 | new_env['WATCHDOG_RUNNING'] = '1'
54 | self._process = subprocess.Popen(args, env=new_env)
55 |
56 | def terminate_process(self):
57 | ''' Terminate the subprocess. '''
58 | if self._process is not None:
59 | try:
60 | self._process.send_signal(signal.SIGTERM)
61 | self._process.wait()
62 | self._process = None
63 | except ProcessLookupError:
64 | pass # The process already died.
65 |
66 |
67 | class Reloader:
68 | ''' Reloads the subprocess when a source file is modified. '''
69 | def __init__(self):
70 | ''' Constructor. '''
71 | self._logger = logging.getLogger('reloader')
72 | self._observer = None
73 | self._running = False
74 | self._watchdog = None
75 |
76 | def run(self):
77 | ''' Run the reloader. '''
78 |
79 | self._logger.info('Running with reloader...')
80 | self._watchdog = ProcessWatchdog()
81 | self._watchdog.start_process()
82 |
83 | self._observer = Observer()
84 | self._observer.schedule(
85 | self._watchdog, str(get_path('starbelly')), recursive=True)
86 | self._observer.start()
87 |
88 | while True:
89 | time.sleep(1)
90 |
91 | def shutdown(self, signum, _):
92 | ''' Exit the reloader. '''
93 | signame = signal.Signals(signum).name
94 | self._logger.info('Caught %s (shutting down)', signame)
95 | self._watchdog.terminate_process()
96 | self._observer.stop()
97 | self._observer.join()
98 | sys.exit(0)
99 |
100 |
101 | def configure_logging(log_level, error_log):
102 | ''' Set default format and output stream for logging. '''
103 | log_format = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
104 | log_date_format = '%Y-%m-%d %H:%M:%S'
105 | log_formatter = logging.Formatter(log_format, log_date_format)
106 | log_level = getattr(logging, log_level.upper())
107 | log_handler = logging.StreamHandler(sys.stderr)
108 | log_handler.setFormatter(log_formatter)
109 | log_handler.setLevel(log_level)
110 | logger = logging.getLogger()
111 | logger.addHandler(log_handler)
112 | logger.setLevel(log_level)
113 | if log_level < logging.INFO:
114 | logging.getLogger('watchdog').setLevel(logging.INFO)
115 | logging.getLogger('trio-websocket').setLevel(logging.INFO)
116 |
117 | if error_log is not None:
118 | exc_handler = logging.FileHandler(error_log)
119 | exc_handler.setFormatter(log_formatter)
120 | exc_handler.setLevel(logging.ERROR)
121 | logger.addHandler(exc_handler)
122 |
123 |
124 | def get_args():
125 | ''' Parse command line arguments. '''
126 | arg_parser = argparse.ArgumentParser(description='Starbelly')
127 | arg_parser.add_argument(
128 | '--log-level',
129 | default='warning',
130 | metavar='LEVEL',
131 | choices=['debug', 'info', 'warning', 'error', 'critical'],
132 | help='Set logging verbosity (default: warning)'
133 | )
134 | arg_parser.add_argument(
135 | '--ip',
136 | default='127.0.0.1',
137 | help='The IP address to bind to (default: 127.0.0.1)'
138 | )
139 | arg_parser.add_argument(
140 | '--port',
141 | type=int,
142 | default=8000,
143 | help='The TCP port to bind to (default: 8000)'
144 | )
145 | arg_parser.add_argument(
146 | '--reload',
147 | action='store_true',
148 | help='Auto-reload when code or static assets are modified.'
149 | )
150 | arg_parser.add_argument(
151 | '--error-log',
152 | help='Copy error logs to the specified file.'
153 | )
154 | return arg_parser.parse_args()
155 |
156 |
157 | def main():
158 | ''' Set up watchdog or run starbelly. '''
159 | args = get_args()
160 | configure_logging(args.log_level, args.error_log)
161 | config = get_config()
162 |
163 | if args.reload and os.getenv('WATCHDOG_RUNNING') is None:
164 | reloader = Reloader()
165 | signal.signal(signal.SIGINT, reloader.shutdown)
166 | signal.signal(signal.SIGTERM, reloader.shutdown)
167 | reloader.run()
168 | else:
169 | bootstrap = Bootstrap(config, args)
170 | bootstrap.run()
171 |
172 |
173 | if __name__ == '__main__':
174 | main()
175 |
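# Typical invocations (all of these flags are defined in get_args() above):
#
#   python -m starbelly --log-level debug --reload
#   python -m starbelly --ip 0.0.0.0 --port 8000 --error-log error.log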
--------------------------------------------------------------------------------
/starbelly/backoff.py:
--------------------------------------------------------------------------------
1 | import trio
2 |
3 |
4 | class ExponentialBackoff:
5 | ''' An experimental class: this makes it simple to write loops that poll
6 | a resource and back off when the resource is not ready.
7 |
8 | For example, if you are polling the database for some new records, you might
9 | wait 1 second and then try again. If there are still no records, then you
10 | wait 2 seconds before trying again, then 4 seconds, then 8, etc.
11 |
12 | This is written as an async iterator, so you can just loop over it and it
13 | will automatically delay in between loop iterations.
14 | '''
15 | def __init__(self, min_=0.25, max_=64):
16 | '''
17 | Constructor.
18 |
19 | :param float min_: The initial delay between loop iterations.
20 | :param int max_: The maximum delay.
21 | '''
22 | self._backoff = min_
23 | self._initial = True
24 | self._min = min_
25 | self._max = max_
26 |
27 | def __repr__(self):
28 | return '<ExponentialBackoff: {}>'.format(self._backoff)
29 |
30 | def __aiter__(self):
31 | ''' This instance is an async iterator. '''
32 | return self
33 |
34 | async def __anext__(self):
35 | ''' Add a delay in between loop iterations. (No delay for the first
36 | iteration.) '''
37 | if self._initial:
38 | backoff = 0
39 | self._initial = False
40 | else:
41 | backoff = self._backoff
42 | await trio.sleep(backoff)
43 | return backoff
44 |
45 | def increase(self):
46 | ''' Double the current backoff, but not if it would exceed this
47 | instance's max value. '''
48 | if self._backoff <= self._max // 2:
49 | self._backoff *= 2
50 |
51 | def decrease(self):
52 | ''' Halve the current backoff, but not if it would drop below the minimum. '''
53 | if self._backoff >= 2 * self._min:
54 | self._backoff /= 2
55 |
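# A sketch of the polling pattern described in the class docstring. The
# database object and its get_new_records() coroutine are hypothetical.
async def wait_for_records(db):
    backoff = ExponentialBackoff(min_=1, max_=16)
    async for _ in backoff:  # the first iteration does not sleep
        records = await db.get_new_records()
        if records:
            return records
        # Nothing yet: double the delay before the next poll.
        backoff.increase()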
--------------------------------------------------------------------------------
/starbelly/bootstrap.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | import logging
3 |
4 | from rethinkdb import RethinkDB
5 | from rethinkdb.trio_net.net_trio import Connection as RethinkDBTrioConnection
6 | import trio
7 |
8 | from .db import (
9 | BootstrapDb,
10 | CrawlFrontierDb,
11 | CrawlManagerDb,
12 | CrawlExtractorDb,
13 | CrawlStorageDb,
14 | LoginDb,
15 | ScheduleDb,
16 | ServerDb,
17 | SubscriptionDb,
18 | )
19 | from .job import CrawlManager, StatsTracker
20 | from .rate_limiter import RateLimiter
21 | from .resource_monitor import ResourceMonitor
22 | from .robots import RobotsTxtManager
23 | from .schedule import Scheduler
24 | from .server import Server
25 |
26 |
27 | logger = logging.getLogger(__name__)
28 |
29 |
30 | class Bootstrap:
31 | ''' Main class for bootstrapping the crawler. '''
32 | def __init__(self, config, args):
33 | '''
34 | Constructor.
35 |
36 | :param config: Output of config parser.
37 | :param args: Output of argparse.
38 |
39 | '''
40 | self._args = args
41 | self._config = config
42 |
43 | def run(self):
44 | ''' Run the main task on the event loop. '''
45 | logger.info('Starbelly is starting...')
46 | try:
47 | trio.run(self._main,
48 | restrict_keyboard_interrupt_to_checkpoints=True)
49 | except KeyboardInterrupt:
50 | logger.warning('Quitting due to KeyboardInterrupt')
51 | logger.info('Starbelly has stopped.')
52 |
53 | def _db_pool(self, nursery):
54 | '''
55 | Create a database connection pool.
56 |
57 | :param nursery: A Trio nursery to spawn database connections in.
58 | :returns: A RethinkDB connection pool.
59 | '''
60 | r = RethinkDB()
61 | r.set_loop_type('trio')
62 | db_config = self._config['database']
63 | return r.ConnectionPool(
64 | connection_type=RethinkDBTrioConnection,
65 | host=db_config['host'],
66 | port=db_config['port'],
67 | db=db_config['db'],
68 | user=db_config['user'],
69 | password=db_config['password'],
70 | nursery=nursery
71 | )
72 |
73 | async def _main(self):
74 | '''
75 | The main task.
76 |
77 | :returns: This function runs until cancelled.
78 | '''
79 | # Create db pool & objects
80 | async with trio.open_nursery() as nursery:
81 | db_pool = self._db_pool(nursery)
82 | bootstrap_db = BootstrapDb(db_pool)
83 | crawl_db = CrawlManagerDb(db_pool)
84 | extractor_db = CrawlExtractorDb(db_pool)
85 | frontier_db = CrawlFrontierDb(db_pool)
86 | login_db = LoginDb(db_pool)
87 | schedule_db = ScheduleDb(db_pool)
88 | storage_db = CrawlStorageDb(db_pool)
89 | logger.info('Doing startup check...')
90 | await bootstrap_db.startup_check()
91 |
92 | # Create a rate limiter
93 | rate_limiter = RateLimiter(capacity=1_000)
94 | logger.info('Initializing rate limiter...')
95 | rate_limits = await bootstrap_db.get_rate_limits()
96 | for rate_limit in rate_limits:
97 | rate_limiter.set_rate_limit(rate_limit['token'],
98 | rate_limit['delay'])
99 | logger.info('Rate limiter is initialized.')
100 |
101 | # Create a robots.txt manager
102 | robots_txt_manager = RobotsTxtManager(db_pool)
103 |
104 | # Create a tracker for job stats and initialize with jobs that are
105 | # unfinished or recently finished.
106 | recent_period = timedelta(hours=24)
107 | recent_dt = datetime.now(tz=timezone.utc) - recent_period
108 | stats_tracker = StatsTracker(recent_period)
109 | dashboard_jobs = await crawl_db.get_dashboard_jobs(recent_dt)
110 | for job_doc in dashboard_jobs:
111 | stats_dict = {
112 | 'id': job_doc['id'],
113 | 'run_state': job_doc['run_state'],
114 | 'name': job_doc['name'],
115 | 'seeds': job_doc['seeds'],
116 | 'tags': job_doc['tags'],
117 | 'started_at': job_doc['started_at'],
118 | 'completed_at': job_doc['completed_at'],
119 | 'item_count': job_doc['item_count'],
120 | 'http_success_count': job_doc['http_success_count'],
121 | 'http_error_count': job_doc['http_error_count'],
122 | 'exception_count': job_doc['exception_count'],
123 | 'http_status_counts': job_doc['http_status_counts'],
124 | }
125 | stats_tracker.add_job(stats_dict)
126 |
127 | # Create a crawl manager
128 | crawl_manager = CrawlManager(rate_limiter, stats_tracker,
129 | robots_txt_manager, crawl_db, frontier_db, extractor_db,
130 | storage_db, login_db)
131 |
132 | # Create a resource monitor: one sample per second and keep 1 minute of
133 | # history.
134 | resource_monitor = ResourceMonitor(interval=1.0, buffer_size=60,
135 | crawl_resources_fn=crawl_manager.get_resource_usage,
136 | rate_limiter=rate_limiter)
137 |
138 | # Create a scheduler
139 | scheduler = Scheduler(schedule_db, crawl_manager)
140 |
141 | # Create a server
142 | server_db = ServerDb(db_pool)
143 | subscription_db = SubscriptionDb(db_pool)
144 | server = Server(self._args.ip, self._args.port, server_db,
145 | subscription_db, crawl_manager, rate_limiter, resource_monitor,
146 | stats_tracker, scheduler)
147 |
148 | # Run all the components
149 | await nursery.start(crawl_manager.run, name='Crawl Manager')
150 | nursery.start_soon(rate_limiter.run, name='Rate Limiter')
151 | nursery.start_soon(resource_monitor.run, name='Resource Monitor')
152 | nursery.start_soon(scheduler.run, name='Scheduler')
153 | await nursery.start(server.run, name='Server')
154 |
--------------------------------------------------------------------------------
/starbelly/captcha.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from uuid import UUID
3 |
4 | import starbelly.starbelly_pb2
5 |
6 |
7 | class CaptchaSolver:
8 | ''' An interface for a CAPTCHA solving service. '''
9 | def __init__(self, doc):
10 | '''
11 | Constructor.
12 |
13 | :param dict doc: A database document.
14 | '''
15 | self.id = doc['id']
16 | self.name = doc['name']
17 | self.service_url = doc['service_url']
18 | self.api_key = doc['api_key']
19 | self.require_phrase = doc['require_phrase']
20 | self.case_sensitive = doc['case_sensitive']
21 | self.characters = doc['characters']
22 | self.require_math = doc['require_math']
23 | self.min_length = doc.get('min_length', 0)
24 | self.max_length = doc.get('max_length', 0)
25 |
26 | def get_command(self, img_data):
27 | '''
28 | Return a JSON API command.
29 |
30 | :param bytes img_data: The image data for the CAPTCHA.
31 | :returns: A command that can be serialized to JSON.
32 | :rtype: dict
33 | '''
34 | img_b64 = base64.b64encode(img_data).decode('ascii')
35 |
36 | if self.characters == 'ALPHANUMERIC':
37 | numeric = 0
38 | elif self.characters == 'NUMERIC_ONLY':
39 | numeric = 1
40 | elif self.characters == 'ALPHA_ONLY':
41 | numeric = 2
42 | else:
43 | raise Exception('Invalid characters setting: {}'.format(
44 | self.characters))
45 |
46 | return {
47 | 'clientKey': self.api_key,
48 | 'task': {
49 | 'type': 'ImageToTextTask',
50 | 'body': img_b64,
51 | 'phrase': self.require_phrase,
52 | 'case': self.case_sensitive,
53 | 'numeric': numeric,
54 | 'math': self.require_math,
55 | 'minLength': self.min_length,
56 | 'maxLength': self.max_length,
57 | }
58 | }
59 |
60 |
61 | def captcha_doc_to_pb(doc):
62 | '''
63 | Convert CAPTCHA solver from database document to protobuf.
64 |
65 | :param dict doc: A database document.
66 | :returns: A protobuf message.
67 | '''
68 | pb = starbelly.starbelly_pb2.CaptchaSolver()
69 | pb.name = doc['name']
70 | pb.solver_id = UUID(doc['id']).bytes
71 | pb.created_at = doc['created_at'].isoformat()
72 | pb.updated_at = doc['updated_at'].isoformat()
73 | type_ = doc['type']
74 | if type_ == 'antigate':
75 | pb.antigate.CopyFrom(_antigate_doc_to_pb(doc))
76 | else:
77 | raise Exception('Unknown CAPTCHA solver type ({})'.format(type_))
78 | return pb
79 |
80 |
81 | def _antigate_doc_to_pb(doc):
82 | '''
83 | Convert Antigate CAPTCHA solver from database doc to protobuf.
84 |
85 | :param dict doc: A database document.
86 | :returns: A protobuf message.
87 | '''
88 | pb = starbelly.starbelly_pb2.CaptchaSolverAntigate()
89 | pb.service_url = doc['service_url']
90 | pb.api_key = doc['api_key']
91 | pb.require_phrase = doc['require_phrase']
92 | pb.case_sensitive = doc['case_sensitive']
93 | pb.characters = starbelly.starbelly_pb2.CaptchaSolverAntigateCharacters \
94 | .Value(doc['characters'])
95 | pb.require_math = doc['require_math']
96 | if 'min_length' in doc:
97 | pb.min_length = doc['min_length']
98 | if 'max_length' in doc:
99 | pb.max_length = doc['max_length']
100 | return pb
101 |
102 |
103 | def captcha_pb_to_doc(pb):
104 | '''
105 | Convert CAPTCHA solver from protobuf to database document.
106 |
107 | :param pb: A protobuf message.
108 | :returns: A database document.
109 | :rtype: dict
110 | '''
111 | if pb.name.strip() == '':
112 | raise Exception('Name is required.')
113 | doc = {'name': pb.name}
114 | if pb.HasField('solver_id'):
115 | doc['id'] = str(UUID(bytes=pb.solver_id))
116 | type_ = pb.WhichOneof('SolverType')
117 | if type_ == 'antigate':
118 | doc.update(_antigate_pb_to_doc(pb))
119 | else:
120 | raise Exception('Unknown CAPTCHA solver type ({})'.format(type_))
121 | return doc
122 |
123 |
124 | def _antigate_pb_to_doc(pb):
125 | '''
126 | Convert Antigate CAPTCHA solver from protobuf to database document.
127 |
128 | :param pb: A protobuf message.
129 | :returns: A database document.
130 | :rtype: dict
131 | '''
132 | antigate = pb.antigate
133 | doc = {
134 | 'service_url': antigate.service_url,
135 | 'api_key': antigate.api_key,
136 | 'require_phrase': antigate.require_phrase,
137 | 'case_sensitive': antigate.case_sensitive,
138 | 'characters': starbelly.starbelly_pb2.CaptchaSolverAntigateCharacters \
139 | .Name(antigate.characters),
140 | 'require_math': antigate.require_math,
141 | 'type': 'antigate',
142 | }
143 | if antigate.HasField('min_length'):
144 | doc['min_length'] = antigate.min_length
145 | if antigate.HasField('max_length'):
146 | doc['max_length'] = antigate.max_length
147 | return doc
148 |
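# A sketch of building a solver command from a database document. Every value
# below is made up for illustration; the required keys are exactly the ones
# read by CaptchaSolver.__init__() above.
def example_captcha_command(img_data):
    solver = CaptchaSolver({
        'id': 'example-id',
        'name': 'example-antigate',
        'service_url': 'https://captcha.example/api/',
        'api_key': 'not-a-real-key',
        'require_phrase': False,
        'case_sensitive': True,
        'characters': 'ALPHANUMERIC',
        'require_math': False,
    })
    # Returns the JSON-serializable ImageToTextTask payload.
    return solver.get_command(img_data)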
--------------------------------------------------------------------------------
/starbelly/config.py:
--------------------------------------------------------------------------------
1 | import configparser
2 | import pathlib
3 |
4 |
5 | _root = pathlib.Path(__file__).resolve().parent.parent
6 |
7 |
8 | def get_path(relpath):
9 | ''' Get absolute path to a project-relative path. '''
10 | return _root / relpath
11 |
12 |
13 | def get_config():
14 | '''
15 | Read the application configuration from the standard configuration files.
16 |
17 | :rtype: ConfigParser
18 | '''
19 | config_dir = get_path("conf")
20 | config_files = [
21 | config_dir / "system.ini",
22 | config_dir / "local.ini",
23 | ]
24 | config = configparser.ConfigParser()
25 | config.optionxform = str
26 | config.read(config_files)
27 | return config
28 |
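# A sketch of reading settings from the merged configuration. The 'database'
# section and its host/port keys follow conf/system.ini and the usage in
# bootstrap.py.
def example_database_settings():
    config = get_config()
    db = config['database']
    return db['host'], int(db['port'])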
--------------------------------------------------------------------------------
/starbelly/extractor.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import logging
3 |
4 | from bs4 import BeautifulSoup
5 | import cchardet
6 | import feedparser
7 | import mimeparse
8 | import trio
9 | import w3lib.encoding
10 | import yarl
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 | chardet = lambda s: cchardet.detect(s).get('encoding')
15 |
16 |
17 | class CrawlExtractor:
18 | ''' Extract URLs from crawled items and add them to the frontier table. '''
19 | def __init__(self, job_id, db, send_channel, receive_channel, policy,
20 | downloader, robots_txt_manager, old_urls, stats, batch_size=100):
21 | '''
22 | Constructor.
23 |
24 | :param str job_id: The ID of the job to extract responses for.
25 | :param starbelly.db.CrawlExtractorDb db: A database layer.
26 | :param trio.SendChannel send_channel: A channel that sends
27 | DownloadResponse instances.
28 | :param trio.ReceiveChannel receive_channel: A channel that receives
29 | DownloadResponse instances.
30 | :param starbelly.policy.Policy policy: A policy for computing costs.
31 | :param starbelly.downloader.Downloader downloader: A downloader used for this job.
32 | :param starbelly.robots.RobotsTxtManager robots_txt_manager: A robots.txt manager.
33 | :param set old_urls: A set of hashed URLs that this crawl has seen before.
34 | These URLs will not be added to the crawl frontier a second time.
35 | :param dict stats: A dictionary of crawl statistics.
36 | :param int batch_size: The maximum size of inserts to do in a single
37 | database query. If more items than this are extracted from a
38 | document, then multiple queries will be issued.
39 | '''
40 | self._job_id = job_id
41 | self._db = db
42 | self._send_channel = send_channel
43 | self._receive_channel = receive_channel
44 | self._policy = policy
45 | self._downloader = downloader
46 | self._robots_txt_manager = robots_txt_manager
47 | self._old_urls = old_urls
48 | self._stats = stats
49 | self._batch_size = batch_size
50 |
51 | def __repr__(self):
52 | ''' Report crawl job ID. '''
53 | return '<CrawlExtractor job_id={}>'.format(self._job_id[:8])
54 |
55 | @property
56 | def old_urls(self):
57 | return self._old_urls
58 |
59 | async def run(self):
60 | '''
61 | Read responses from extraction channel and add them to the frontier.
62 |
63 | :returns: This function runs until cancelled.
64 | '''
65 | async for response in self._receive_channel:
66 | try:
67 | if response.is_success:
68 | await self._extract(response)
69 | except Exception:
70 | logger.exception('%r Extractor exception on %r', self, response)
71 | finally:
72 | await self._db.delete_frontier_item(response.frontier_id)
73 | await self._send_channel.send(response)
74 |
75 | async def _extract(self, response):
76 | '''
77 | Find links in a response body and put them in the frontier.
78 |
79 | :param starbelly.downloader.DownloadResponse response:
80 | '''
81 | logger.debug('%r Extracting links from %s', self, response.url)
82 | extracted_urls = await trio.run_sync_in_worker_thread(
83 | extract_urls, response)
84 | insert_items = list()
85 |
86 | for counter, url in enumerate(extracted_urls):
87 | # Check if the policy allows us to follow this URL.
88 | new_cost = self._policy.url_rules.get_cost(response.cost, url)
89 | exceeds_max_cost = self._policy.limits.exceeds_max_cost(new_cost)
90 | if new_cost <= 0 or exceeds_max_cost:
91 | continue
92 | robots_ok = await self._robots_txt_manager.is_allowed(url,
93 | self._policy, self._downloader)
94 | if not robots_ok:
95 | continue
96 |
97 | # Normalize and hash URL.
98 | url_can = self._policy.url_normalization.normalize(url)
99 | hash_ = hashlib.blake2b(url_can.encode('ascii'), digest_size=16)
100 | url_hash = hash_.digest()
101 |
102 | # If we haven't seen this URL before, it should be added to the
103 | # frontier.
104 | if url_hash not in self._old_urls:
105 | logger.debug('%r Adding URL %s (cost=%0.2f)', self, url,
106 | new_cost)
107 | insert_items.append({
108 | 'cost': new_cost,
109 | 'job_id': self._job_id,
110 | 'url': url,
111 | 'in_flight': False,
112 | })
113 | self._old_urls.add(url_hash)
114 |
115 | # Don't monopolize the event loop:
116 | if counter % self._batch_size == self._batch_size - 1:
117 | await trio.sleep(0)
118 |
119 | # Insert items in batches
120 | start = 0
121 | while start < len(insert_items):
122 | end = min(start + self._batch_size, len(insert_items))
123 | self._stats['frontier_size'] += end - start
124 | await self._db.insert_frontier_items(insert_items[start:end])
125 | start = end
126 |
127 |
128 | def extract_urls(response):
129 | '''
130 | Extract URLs from a response body.
131 |
132 | Any relative URLs found in the response body are converted to absolute URLs
133 | using the original request URL.
134 |
135 | :param starbelly.downloader.DownloadResponse response: A response to
136 | extract URLs from.
137 | :returns: A list of URLs.
138 | :rtype: list[str]
139 | '''
140 | extracted_urls = list()
141 |
142 | if response.is_success:
143 | base_url = response.url
144 | type_, subtype, _ = mimeparse.parse_mime_type(response.content_type)
145 |
146 | if type_ == 'text' and subtype == 'html' or \
147 | type_ == 'application' and subtype == 'xhtml+xml':
148 | extracted_urls = _extract_html(response)
149 | elif type_ == 'application' and subtype == 'atom+xml' or \
150 | type_ == 'application' and subtype == 'rss+xml':
151 | extracted_urls = _extract_feed(response)
152 | else:
153 | raise ValueError('Unsupported MIME in extract_urls(): {} (url={})'
154 | .format(response.content_type, base_url))
155 |
156 | return extracted_urls
157 |
158 |
159 | def _extract_feed(response):
160 | '''
161 | Extract links from Atom or RSS feeds.
162 |
163 | :param starbelly.downloader.DownloadResponse response: An Atom/RSS response
164 | to extract URLs from.
165 | :returns: A list of URLs.
166 | :rtype: list[str]
167 | '''
168 | doc = feedparser.parse(response.body)
169 | return [entry.link for entry in doc.entries]
170 |
171 |
172 | def _extract_html(response):
173 | '''
174 | Extract links from HTML document tags.
175 |
176 | :param starbelly.downloader.DownloadResponse response: An HTML response to
177 | extract URLs from.
178 | :returns: A list of URLs.
179 | :rtype: list[str]
180 | '''
181 | _, html = w3lib.encoding.html_to_unicode(
182 | response.content_type,
183 | response.body,
184 | auto_detect_fun=chardet
185 | )
186 |
187 | doc = BeautifulSoup(html, 'lxml')
188 | base_tag = doc.head.base
189 | base_url = None
190 |
191 | if base_tag is not None:
192 | base_href = base_tag.get('href')
193 | if base_href is not None:
194 | base_url = yarl.URL(base_href)
195 |
196 | if base_url is None:
197 | base_url = yarl.URL(response.url)
198 |
199 | extracted_urls = list()
200 |
201 | for anchor in doc.find_all('a', href=True):
202 | href = anchor.get('href')
203 |
204 | try:
205 | parsed_href = yarl.URL(href)
206 | except:
207 | logger.exception('Rejecting malformed URL base=%s url=%s',
208 | str(response.url), href)
209 | continue
210 |
211 | absolute_href = base_url.join(parsed_href)
212 |
213 | if absolute_href.scheme in ('http', 'https'):
214 | extracted_urls.append(str(absolute_href))
215 |
216 | return extracted_urls
217 |
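# A sketch of calling extract_urls() directly. SimpleNamespace stands in for
# starbelly.downloader.DownloadResponse and provides only the attributes read
# above; the HTML body is made up for illustration.
def example_extract():
    from types import SimpleNamespace
    response = SimpleNamespace(
        is_success=True,
        url='https://example.com/index.html',
        content_type='text/html',
        body=b'<html><head></head><body><a href="/about">About</a></body></html>',
    )
    # Relative links are resolved against the page URL, so this returns
    # ['https://example.com/about'].
    return extract_urls(response)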
--------------------------------------------------------------------------------
/starbelly/frontier.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | import logging
3 |
4 | from rethinkdb import RethinkDB
5 | from yarl import URL
6 |
7 | from .backoff import ExponentialBackoff
8 | from .downloader import DownloadRequest
9 |
10 |
11 | r = RethinkDB()
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | class FrontierExhaustionError(Exception):
16 | ''' Indicates that the frontier has no items remaining (in-flight or
17 | otherwise). '''
18 |
19 |
20 | @dataclass
21 | class FrontierItem:
22 | ''' Represents a resource that should be crawled. '''
23 | frontier_id: bytes
24 | job_id: bytes
25 | url: str
26 | cost: float
27 |
28 | @classmethod
29 | def from_doc(cls, doc):
30 | '''
31 | Create a frontier item from a database document.
32 |
33 | :param dict doc: A database document.
34 | '''
35 | return cls(doc['id'], doc['job_id'], doc['url'], doc['cost'])
36 |
37 |
38 | class CrawlFrontier:
39 | ''' Contains the logic for managing a crawl frontier, i.e. the URLs that
40 | have already been crawled and the URLs that are remaining to be crawled. '''
41 | def __init__(self, job_id, db, send_channel, login_manager, policy, stats):
42 | '''
43 | Constructor
44 |
45 | :param str job_id: The ID of the job corresponding to this frontier.
46 | :param starbelly.db.CrawlFrontierDb db: A database layer.
47 | :param trio.SendChannel send_channel: This channel is used to send
48 | ``FrontierItem`` that need to be downloaded, e.g. to send to the
49 | rate limiter.
50 | :param starbelly.login.LoginManager login_manager: Used when the
51 | frontier sees an unauthenticated domain and needs to log in.
52 | :param starbelly.policy.Policy policy: The policy to use.
53 | :param dict stats: A dictionary of crawl statistics.
54 | '''
55 | self._job_id = job_id
56 | self._db = db
57 | self._send_channel = send_channel
58 | self._login_manager = login_manager
59 | self._policy = policy
60 | self._authenticated_domains = set()
61 | self._stats = stats
62 |
63 | def __repr__(self):
64 | ''' Include job ID in the repr. '''
65 | return '<CrawlFrontier job_id={}>'.format(self._job_id[:8])
66 |
67 | async def run(self):
68 | '''
69 | This task takes items off the frontier and sends them to the rate
70 | limiter.
71 |
72 | :returns: This function runs until cancelled.
73 | '''
74 | await self._initialize()
75 |
76 | while True:
77 | frontier_items = await self._get_batch()
78 | for item in frontier_items:
79 | if self._policy.authentication.is_enabled():
80 | domain = URL(item.url).host
81 | if domain not in self._authenticated_domains:
82 | await self._login_manager.login(domain)
83 | self._authenticated_domains.add(domain)
84 | logger.debug('%r Sending: %r', self, item)
85 | request = DownloadRequest.from_frontier_item(item)
86 | await self._send_channel.send(request)
87 |
88 | async def _initialize(self):
89 | ''' Initialize frontier database documents. '''
90 |
91 | size = await self._db.get_frontier_size(self._job_id)
92 | logger.info('%r Initialization complete (size=%d)', self, size)
93 | self._stats['frontier_size'] = size
94 |
95 | async def _get_batch(self, size=10):
96 | '''
97 | Get a batch of items from the frontier table, ordered by ascending cost.
98 | If no items are available, poll the database until items become available.
99 |
100 | :param int size:
101 | :returns: A batch of frontier items.
102 | :rtype: list[FrontierItem]
103 | '''
104 | backoff = ExponentialBackoff(min_=1, max_=16)
105 | async for _ in backoff:
106 | docs = await self._db.get_frontier_batch(self._job_id, size)
107 | if docs:
108 | self._stats['frontier_size'] -= len(docs)
109 | break
110 | else:
111 | if await self._db.any_in_flight(self._job_id):
112 | backoff.increase()
113 | else:
114 | raise FrontierExhaustionError()
115 |
116 | return [FrontierItem.from_doc(doc) for doc in docs]
117 |
--------------------------------------------------------------------------------
/starbelly/login.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from functools import partial
3 | import logging
4 | import random
5 |
6 | import aiohttp
7 | import cchardet
8 | import formasaurus
9 | import trio
10 | import trio_asyncio
11 | import w3lib.encoding
12 | from yarl import URL
13 |
14 | from .downloader import DownloadRequest
15 |
16 |
17 | logger = logging.getLogger(__name__)
18 | chardet = lambda s: cchardet.detect(s).get("encoding")
19 |
20 |
21 | def get_captcha_image_element(form):
22 | """
22 | Return the <img> element in an lxml form that contains the CAPTCHA.
24 |
25 | NOTE: This assumes the first image in the form is the CAPTCHA image. If
26 | a form has multiple images, maybe use the etree .sourceline attribute to
27 | figure out which image is closer to the CAPTCHA input? Or crawl through
28 | the element tree to find the image?
29 |
30 | :param form: An lxml form element.
31 | :returns: An lxml image element.
32 | """
33 | img_el = form.find(".//img")
34 | if img_el is None:
35 | raise Exception("Cannot locate CAPTCHA image")
36 | return img_el
37 |
38 |
39 | def select_login_fields(fields):
40 | """
41 | Select the fields with the highest probability for the username, password, and captcha classes.
42 |
43 | :param dict fields: Nested dictionary containing label probabilities
44 | for each form element.
45 | :returns: (username field, password field, captcha field)
46 | :rtype: tuple
47 | """
48 | username_field = None
49 | username_prob = 0
50 | password_field = None
51 | password_prob = 0
52 | captcha_field = None
53 | captcha_prob = 0
54 |
55 | for field_name, labels in fields.items():
56 | for label, prob in labels.items():
57 | if label in ("username", "username or email") and prob > username_prob:
58 | username_field = field_name
59 | username_prob = prob
60 | elif label == "password" and prob > password_prob:
61 | password_field = field_name
62 | password_prob = prob
63 | elif label == "captcha" and prob > captcha_prob:
64 | captcha_field = field_name
65 | captcha_prob = prob
66 |
67 | return username_field, password_field, captcha_field
68 |
69 |
70 | def select_login_form(forms):
71 | """
72 | Select the form with the highest probability of being a login form.
73 |
74 | :param dict forms: Nested dict containing label probabilities for each
75 | form.
76 | :returns: (login form, login meta)
77 | :rtype: tuple
78 | """
79 | login_form = None
80 | login_meta = None
81 | login_prob = 0
82 |
83 | for form, meta in forms:
84 | for type_, prob in meta["form"].items():
85 | if type_ == "login" and prob > login_prob:
86 | login_form = form
87 | login_meta = meta
88 | login_prob = prob
89 |
90 | return login_form, login_meta
91 |
92 |
93 | class LoginManager:
94 | def __init__(self, job_id, db, policy, downloader):
95 | """
96 | Constructor
97 |
98 | :param starbelly.db.LoginDb db: A database layer.
99 | """
100 | self._job_id = job_id
101 | self._db = db
102 | self._policy = policy
103 | self._downloader = downloader
104 |
105 | async def login(self, domain):
106 | """
107 | Attempt a login for the given domain.
108 |
109 | :param str domain: The domain to log into.
110 | """
111 | domain_login = await self._db.get_login(domain)
112 | if domain_login is None:
113 | return
114 |
115 | # Ensure login has users
116 | if not domain_login.get("users", []):
117 | logger.warning("No users for login: %s", domain_login)
118 | return
119 |
120 | # Select random user from domain_login
121 | user = random.choice(domain_login["users"])
122 | masked_pass = user["password"][:2] + "******"
123 | logger.info(
124 | "Attempting login: domain=%s with user=%s password=%s",
125 | domain,
126 | user["username"],
127 | masked_pass,
128 | )
129 | request = DownloadRequest(
130 | frontier_id=None,
131 | job_id=self._job_id,
132 | method="GET",
133 | url=domain_login["login_url"],
134 | form_data=None,
135 | cost=1.0,
136 | )
137 | response = await self._downloader.download(request)
138 | if not response.is_success:
139 | logger.error("Login aborted: cannot fetch %s", response.url)
140 | return
141 | try:
142 | action, method, data = await self._get_login_form(
143 | response, user["username"], user["password"]
144 | )
145 | except Exception as e:
146 | logger.exception("Cannot parse login form: %s", e)
147 | return
148 | logger.info("Login action=%s method=%s data=%r", action, method, data)
149 | request = DownloadRequest(
150 | frontier_id=None,
151 | job_id=self._job_id,
152 | method=method,
153 | url=action,
154 | form_data=data,
155 | cost=1.0,
156 | )
157 | response = await self._downloader.download(request)
158 | if not response.is_success:
159 | logger.error(
160 | "Login failed action=%s (see downloader log for" " details)", action
161 | )
162 |
163 | async def _download_captcha_image(self, img_src):
164 | """
165 | Download and return a CAPTCHA image.
166 |
167 | :param str img_src: The URL to download the image from.
168 | :rtype: bytes
169 | """
170 | logger.info("Downloading CAPTCHA image src=%s", img_src)
171 | request = DownloadRequest(
172 | frontier_id=None,
173 | job_id=None,
174 | method="GET",
175 | url=img_src,
176 | form_data=None,
177 | cost=0,
178 | )
179 | response = await self._downloader.download(request)
180 |
181 | if response.status_code == 200 and response.body is not None:
182 | img_data = response.body
183 | else:
184 | raise Exception("Failed to download CAPTCHA image src={}".format(img_src))
185 |
186 | return img_data
187 |
188 | async def _get_login_form(self, response, username, password):
189 | """
190 | Attempt to extract login form action and form data from a response,
191 | substituting the provided ``username`` and ``password`` into the
192 | corresponding fields. Returns the data needed to POST a login request.
193 |
194 | :param starbelly.downloader.DownloadResponse response:
195 | :param str username: The username to log in with.
196 | :param str password: The password to log in with.
197 | :returns: (action, method, fields)
198 | :rtype: tuple
199 | """
200 | _, html = w3lib.encoding.html_to_unicode(
201 | response.content_type, response.body, auto_detect_fun=chardet
202 | )
203 |
204 | forms = await trio.run_sync_in_worker_thread(
205 | partial(formasaurus.extract_forms, html, proba=True)
206 | )
207 | form, meta = select_login_form(forms)
208 |
209 | if form is None:
210 | raise Exception("Can't find login form")
211 |
212 | login_field, password_field, captcha_field = select_login_fields(meta["fields"])
213 | if login_field is None or password_field is None:
214 | raise Exception("Can't find username/password fields")
215 |
216 | form.fields[login_field] = username
217 | form.fields[password_field] = password
218 |
219 | if captcha_field is not None:
220 | if self._policy.captcha_solver is None:
221 | raise Exception(
222 | "CAPTCHA required for login url={} but there is"
223 | " no CAPTCHA solver available".format(response.url)
224 | )
225 |
226 | img_el = get_captcha_image_element(form)
227 | img_src = str(URL(response.url).join(URL(img_el.get("src"))))
228 | img_data = await self._download_captcha_image(img_src)
229 | captcha_text = await self._solve_captcha_asyncio(img_data)
230 | form.fields[captcha_field] = captcha_text
231 |
232 | form_action = URL(response.url).join(URL(form.action))
233 | return form_action, form.method, dict(form.fields)
234 |
235 | @trio_asyncio.aio_as_trio
236 | async def _solve_captcha_asyncio(self, img_data):
237 | """
238 | Send an image CAPTCHA to an external solver and return the solution.
239 | This function uses aiohttp and therefore must run on the asyncio loop.
240 |
241 | :param bytes img_data: The CAPTCHA image.
242 | :rtype: str
243 | """
244 | solver = self._policy.captcha_solver
245 | solution = None
246 | task_url = str(URL(solver.service_url).join(URL("createTask")))
247 | poll_url = str(URL(solver.service_url).join(URL("getTaskResult")))
248 |
249 | # This doesn't use the downloader object because this is a third party
250 | # and is not the subject of our crawl.
251 | async with aiohttp.ClientSession() as session:
252 | # Send CAPTCHA task to service
253 | command = solver.get_command(img_data)
254 | async with session.post(task_url, json=command) as response:
255 | result = await response.json()
256 | if result["errorId"] != 0:
257 | raise Exception("CAPTCHA API error {}".format(result["errorId"]))
258 | task_id = result["taskId"]
259 | logger.info("Sent image to CAPTCHA API task_id=%d", task_id)
260 |
261 | # Poll for task completion. (Try 6 times.)
262 | solution = None
263 | for attempt in range(6):
264 | await asyncio.sleep(5)
265 | command = {
266 | "clientKey": solver.api_key,
267 | "taskId": task_id,
268 | }
269 | logger.info(
270 | "Polling for CAPTCHA solution task_id=%d," " attempt=%d",
271 | task_id,
272 | attempt + 1,
273 | )
274 | async with session.post(poll_url, json=command) as response:
275 | result = await response.json()
276 | if result["errorId"] != 0:
277 | raise Exception(
278 | "CAPTCHA API error {}".format(result["errorId"])
279 | )
280 | solution = result["solution"]["text"]
281 | break
282 |
283 | if solution is None:
284 | raise Exception("CAPTCHA API never completed task")
285 |
286 | return solution
287 |
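# A sketch of the field selection helper with made-up formasaurus output.
# Each candidate field maps to label probabilities, and the helper picks the
# most likely field for each role.
def example_field_selection():
    fields = {
        'user': {'username': 0.93, 'password': 0.01},
        'pass': {'password': 0.97},
        'code': {'captcha': 0.88},
    }
    # Returns ('user', 'pass', 'code').
    return select_login_fields(fields)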
--------------------------------------------------------------------------------
/starbelly/resource_monitor.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 | from datetime import datetime, timezone
3 | import logging
4 |
5 | import psutil
6 | import trio
7 |
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | class ResourceMonitor:
13 | '''
14 | Keep track of consumption and usage statistics for various resources.
15 | '''
16 | def __init__(self, interval, buffer_size, crawl_resources_fn, rate_limiter):
17 | '''
18 | Constructor.
19 |
20 | :param float interval: The number of seconds to wait between
21 | measurements.
22 | :param int buffer_size: The number of measurements to store in the
23 | internal buffer.
24 | :param callable crawl_resources_fn: A function that will return a dict
25 | of crawl resources.
26 | :param starbelly.rate_limiter.RateLimiter rate_limiter:
27 | '''
28 | self._interval = interval
29 | self._crawl_resources_fn = crawl_resources_fn
30 | self._rate_limiter = rate_limiter
31 | self._measurements = deque(maxlen=buffer_size)
32 | self._channels = list()
33 |
34 | def get_channel(self, channel_size):
35 | '''
36 | Get a statistics channel. The resource monitor will send measurements to
37 | this channel until the receive end is closed. Note that if the channel
38 | is full, the resource monitor does not block! It will drop messages
39 | instead.
40 |
41 | :param int channel_size: The size of the channel to create.
42 | :returns: A channel that will receive resource statistics at regular
43 | intervals.
44 | :rtype: trio.ReceiveChannel
45 | '''
46 | logger.debug('Creating new channel with size=%d', channel_size)
47 | send_channel, recv_channel = trio.open_memory_channel(channel_size)
48 | self._channels.append(send_channel)
49 | return recv_channel
50 |
51 | def history(self, n=None):
52 | '''
53 | Return the most recent ``n`` measurements.
54 |
55 | :param int n: The number of measurements to retrieve. If ``n`` is None
56 | or there are fewer than ``n`` measurements, return all measurements.
57 | :rtype: list
58 | '''
59 | # A deque can't be sliced, so we have to do some extra work to return
60 | # the most recent measurements from the end.
61 | history_iter = iter(self._measurements)
62 | if n is not None:
63 | for _ in range(len(self._measurements) - n):
64 | next(history_iter)
65 | return list(history_iter)
66 |
67 | async def run(self):
68 | '''
69 | Run the resource monitor.
70 |
71 | :returns: Runs until cancelled.
72 | '''
73 | next_run = trio.current_time() + self._interval
74 | while True:
75 | measurement = self._measure()
76 | self._measurements.append(measurement)
77 | to_remove = set()
78 | for channel in self._channels:
79 | try:
80 | channel.send_nowait(measurement)
81 | except trio.WouldBlock:
82 | continue
83 | except trio.BrokenResourceError:
84 | to_remove.add(channel)
85 | for channel in to_remove:
86 | logger.debug('Removing closed channel')
87 | self._channels.remove(channel)
88 | sleep_time = next_run - trio.current_time()
89 | while sleep_time < 0:
90 | sleep_time += self._interval
91 | await trio.sleep(sleep_time)
92 | next_run += self._interval
93 |
94 | def _measure(self):
95 | '''
96 | Record one set of measurements.
97 |
98 | :rtype: dict
99 | '''
100 | measurement = dict()
101 | measurement['timestamp'] = datetime.now(timezone.utc)
102 |
103 | # CPUs
104 | measurement['cpus'] = psutil.cpu_percent(percpu=True)
105 |
106 | # Memory
107 | vm = psutil.virtual_memory()
108 | measurement['memory_used'] = vm.used
109 | measurement['memory_total'] = vm.total
110 |
111 | # Disks
112 | measurement['disks'] = list()
113 | for partition in psutil.disk_partitions():
114 | disk = dict()
115 | disk['mount'] = partition.mountpoint
116 | usage = psutil.disk_usage(disk['mount'])
117 | disk['used'] = usage.used
118 | disk['total'] = usage.total
119 | measurement['disks'].append(disk)
120 |
121 | # Networks
122 | measurement['networks'] = list()
123 | for name, nic in psutil.net_io_counters(pernic=True).items():
124 | net = dict()
125 | net['name'] = name
126 | net['sent'] = nic.bytes_sent
127 | net['received'] = nic.bytes_recv
128 | measurement['networks'].append(net)
129 |
130 | # Crawl Job Resources
131 | measurement['jobs'] = list()
132 | crawl_resources = self._crawl_resources_fn()
133 | for job in crawl_resources['jobs']:
134 | measurement['jobs'].append(job.copy())
135 |
136 | # Crawl Global Resources
137 | measurement['current_downloads'] = crawl_resources['current_downloads']
138 | measurement['maximum_downloads'] = crawl_resources['maximum_downloads']
139 | measurement['rate_limiter'] = self._rate_limiter.item_count
140 |
141 | return measurement
142 |
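# A sketch of consuming the measurement stream from another task. The keys
# read here are the ones produced by _measure() above.
async def example_log_memory(resource_monitor):
    recv_channel = resource_monitor.get_channel(channel_size=10)
    async with recv_channel:
        async for measurement in recv_channel:
            used_mib = measurement['memory_used'] / (1024 * 1024)
            logger.info('Memory used: %.1f MiB', used_mib)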
--------------------------------------------------------------------------------
/starbelly/robots.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from datetime import datetime, timezone
3 | import logging
4 |
5 | from robotexclusionrulesparser import RobotExclusionRulesParser
6 | from rethinkdb import RethinkDB
7 | from yarl import URL
8 | import trio
9 |
10 | from .downloader import DownloadRequest
11 |
12 |
13 | r = RethinkDB()
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | class RobotsTxtManager:
18 | ''' Store and manage robots.txt files. '''
19 | def __init__(self, db_pool, max_age=24*60*60, max_cache=1e3):
20 | '''
21 | Constructor.
22 |
23 | :param db_pool: A DB connection pool.
24 | :param int max_age: The maximum age before a robots.txt is downloaded
25 | again.
26 | :param int max_cache: The maximum number of robots.txt files to cache
27 | in memory.
28 | '''
29 | self._db_pool = db_pool
30 | self._events = dict()
31 | self._cache = OrderedDict()
32 | self._max_age = max_age
33 | self._max_cache = max_cache
34 |
35 | async def is_allowed(self, url, policy, downloader):
36 | '''
37 | Return True if ``url`` is allowed by the applicable robots.txt file.
38 |
39 | This fetches the applicable robots.txt if we don't have a recent copy
40 | of it cached in memory or in the database. The ``policy`` is used if a
41 | robots.txt file needs to be fetched from the network.
42 |
43 | :param str url: Check this URL to see if the robots.txt and accompanying
44 | policy permit access to it.
45 | :param Policy policy:
46 | :param Downloader downloader:
47 | :rtype: bool
48 | '''
49 | if policy.robots_txt.usage == 'IGNORE':
50 | # No need to fetch robots.txt.
51 | return True
52 |
53 | robots_url = str(URL(url).with_path('robots.txt')
54 | .with_query(None)
55 | .with_fragment(None))
56 |
57 | # Check if cache has a current copy of robots.txt.
58 | try:
59 | robots = self._cache[robots_url]
60 | if robots.is_older_than(self._max_age):
61 | del self._cache[robots_url]
62 | robots = None
63 | else:
64 | self._cache.move_to_end(robots_url)
65 | except KeyError:
66 | robots = None
67 |
68 | # Do we need to fetch robots into cache?
69 | if robots is None:
70 | try:
71 | # If another task is fetching it, then just wait for that task.
72 | await self._events[robots_url].wait()
73 | robots = self._cache[robots_url]
74 | except KeyError:
75 | # Create a new task to fetch it.
76 | self._events[robots_url] = trio.Event()
77 | robots = await self._get_robots(robots_url, downloader)
78 | event = self._events.pop(robots_url)
79 | event.set()
80 |
81 | # Note: we only check the first user agent.
82 | user_agent = policy.user_agents.get_first_user_agent()
83 | robots_decision = robots.is_allowed(user_agent, url)
84 | if policy.robots_txt.usage == 'OBEY':
85 | return robots_decision
86 | return not robots_decision
87 |
88 | async def _get_robots(self, robots_url, downloader):
89 | '''
90 | Locate and return a robots.txt file.
91 |
92 | Look for a non-expired robots.txt file, first in the database and then on
93 | the network. Wherever the robots file is found, it is placed into the
94 | cache and then returned.
95 |
96 | If we get a copy from the network, then we also store a copy in the
97 | database. If we cannot get a copy from the network (e.g. 404 error) and
98 | we have a database copy, then we update the database copy's expiration.
99 | If we cannot get a copy from database or network, then we create a
100 | permissive robots.txt and use that instead.
101 |
102 | :param str robots_url: Fetch the file at this URL.
103 | :param Downloader downloader:
104 | :rtype: RobotsTxt
105 | '''
106 | # Check DB. If not there (or expired), check network.
107 | now = datetime.now(timezone.utc)
108 | robots_doc = await self._get_robots_from_db(robots_url)
109 |
110 | if robots_doc is None or \
111 | (now - robots_doc['updated_at']).total_seconds() > self._max_age:
112 | robots_file = await self._get_robots_from_net(robots_url,
113 | downloader)
114 | else:
115 | robots_file = None
116 |
117 | if robots_doc is None:
118 | # No local copy: create a new local copy. If robots_file is None, it
119 | # will be treated as a permissive RobotsTxt.
120 | logger.info('Saving new robots.txt file: %s', robots_url)
121 | robots_doc = {
122 | 'file': robots_file,
123 | 'updated_at': now,
124 | 'url': robots_url,
125 | }
126 | robots = RobotsTxt(robots_doc)
127 | else:
128 | # If we have a network copy, use that to update local copy.
129 | # Otherwise, just update the local copy's timestamp.
130 | robots = RobotsTxt(robots_doc)
131 | logger.info('Updating robots.txt file: %s', robots_url)
132 | if robots_file is not None:
133 | robots_doc['file'] = robots_file
134 | else:
135 | del robots_doc['file']
136 |
137 | robots_doc['updated_at'] = now
138 | del robots_doc['url']
139 |
140 | # Upsert robots_docs.
141 | await self._save_robots_to_db(robots_doc)
142 |
143 | # Add to cache before setting the event to avoid a race condition.
144 | self._cache[robots_url] = robots
145 | self._cache.move_to_end(robots_url)
146 | if len(self._cache) > self._max_cache:
147 | self._cache.popitem(last=False)
148 | return robots
149 |
150 | async def _get_robots_from_db(self, robots_url):
151 | '''
152 | Get robots document from the database.
153 |
154 | Returns None if it doesn't exist in the database.
155 |
156 | :param str robots_url: The URL of the robots.txt file.
157 | :returns: A database document.
158 | :rtype: dict
159 | '''
160 | query = r.table('robots_txt').get_all(robots_url, index='url').nth(0)
161 |
162 | async with self._db_pool.connection() as conn:
163 | try:
164 | db_robots = await query.run(conn)
165 | except r.ReqlNonExistenceError:
166 | db_robots = None
167 |
168 | return db_robots
169 |
170 | async def _get_robots_from_net(self, robots_url, downloader):
171 | '''
172 | Get robots.txt file from the network.
173 |
174 | Returns None if the file cannot be fetched (e.g. 404 error).
175 |
176 | :param str robots_url: Fetch the robots.txt file at this URL.
177 | :param Downloader downloader:
178 | :returns: Contents of robots.txt file or None if it couldn't be
179 | downloaded.
180 | :rtype: str
181 | '''
182 |
183 | logger.info('Fetching robots.txt: %s', robots_url)
184 | request = DownloadRequest(frontier_id=None, job_id=None, method='GET',
185 | url=robots_url, form_data=None, cost=0)
186 | response = await downloader.download(request, skip_mime=True)
187 |
188 | if response.status_code == 200 and response.body is not None:
189 | # There are no invalid byte sequences in latin1 encoding, so this
190 | # should always succeed.
191 | robots_file = response.body.decode('latin1')
192 | else:
193 | robots_file = None
194 |
195 | return robots_file
196 |
197 | async def _save_robots_to_db(self, robots_doc):
198 | async with self._db_pool.connection() as conn:
199 | await (
200 | r.table('robots_txt')
201 | .insert(robots_doc, conflict='update')
202 | .run(conn)
203 | )
204 |
205 |
206 | class RobotsTxt:
207 | '''
208 | Wrapper around robots.txt parser that adds the date the file was fetched.
209 |
210 | If the ``robots_file`` is None or cannot be parsed, then it's treated as a
211 | highly permissive robots.txt.
212 | '''
213 | def __init__(self, robots_doc):
214 | ''' Initialize from database document representation. '''
215 | self._updated_at = robots_doc['updated_at']
216 | self._robots = RobotExclusionRulesParser()
217 |
218 | if robots_doc['file'] is not None:
219 | # The parser never throws an exception, it just ignores things that
220 | # it doesn't understand.
221 | self._robots.parse(robots_doc['file'])
222 |
223 | def is_allowed(self, user_agent, url):
224 | '''
225 | Return True if ``url`` is allowed by this robots.txt file.
226 |
227 |         :param str user_agent: The user agent that wants to access the URL.
228 | :param str url: The URL that the user agent wants to access.
229 | :rtype: bool
230 | '''
231 | return self._robots.is_allowed(user_agent, url)
232 |
233 | def is_older_than(self, age):
234 | '''
235 | Return True if this robots file is older than ``age``.
236 |
237 |         :param float age: The age threshold, in seconds.
238 | :rtype: bool
239 | '''
240 |         return (datetime.now(timezone.utc) - self._updated_at).total_seconds() >= age
241 |
--------------------------------------------------------------------------------
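
A minimal usage sketch (not part of the repository) of the permissive fallback described in the docstrings above: a RobotsTxt built from a document whose 'file' field is None allows every URL. The user agent string and URL below are arbitrary assumptions.

    from datetime import datetime, timezone
    from starbelly.robots import RobotsTxt

    # A database document with no cached robots.txt body (e.g. after a 404).
    doc = {'file': None, 'updated_at': datetime.now(timezone.utc)}
    robots = RobotsTxt(doc)
    # With no rules parsed, every request is allowed.
    assert robots.is_allowed('TestAgent/1.0', 'https://example.com/private/page')
    # A freshly created wrapper is not older than one hour (3600 seconds).
    assert not robots.is_older_than(3600)
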
/starbelly/server/__init__.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from datetime import datetime, timedelta
3 | from functools import partial
4 | import gzip
5 | import inspect
6 | import logging
7 | import operator
8 | from time import time
9 | from urllib.parse import urlparse
10 |
11 | import dateutil.parser
12 | from dateutil.tz import tzlocal
13 | from google.protobuf.message import DecodeError
14 | import rethinkdb as r
15 | from rethinkdb.errors import ReqlNonExistenceError
16 | from trio_websocket import ConnectionClosed, serve_websocket
17 | import trio
18 |
19 | from starbelly.subscription import SubscriptionManager
20 | from starbelly.starbelly_pb2 import Request, Response, ServerMessage
21 |
22 | # Define API handler decorator before importing API implementations, because
23 | # those implementations use this decorator to register themselves with the
24 | # server.
25 | _handlers = dict()
26 | def api_handler(handler):
27 | '''
28 | This decorator registers a function as a callable command through the
29 | API server.
30 | '''
31 | _handlers[handler.__name__] = handler
32 | return handler
33 |
34 |
35 | class InvalidRequestException(Exception):
36 | ''' Indicates a request is invalid. '''
37 |
38 |
39 | # pylint: disable=cyclic-import, wrong-import-position
40 | from .captcha import *
41 | from .job import *
42 | from .login import *
43 | from .policy import *
44 | from .rate_limit import *
45 | from .schedule import *
46 | from .subscription import *
47 | from .system import *
48 |
49 |
50 | logger = logging.getLogger(__name__)
51 |
52 |
53 | class Server:
54 | ''' Handles websocket connections from clients and command dispatching. '''
55 |
56 | def __init__(self, host, port, server_db, subscription_db, crawl_manager,
57 | rate_limiter, resource_monitor, stats_tracker, scheduler):
58 | '''
59 | Constructor
60 |
61 | :param str host: The hostname to serve on.
62 | :param int port: The port to serve on, or zero to automatically pick a
63 | port.
64 | :param starbelly.db.ServerDb server_db:
65 | :param starbelly.db.SubscriptionDb subscription_db:
66 | :param starbelly.job.CrawlManager crawl_manager:
67 |         :param starbelly.rate_limiter.RateLimiter rate_limiter:
68 | :param starbelly.resource_monitor.ResourceMonitor resource_monitor:
69 | :param starbelly.job.StatsTracker stats_tracker:
70 | :param starbelly.schedule.Scheduler scheduler:
71 | '''
72 | self._host = host
73 | self._port = port
74 | self._server_db = server_db
75 | self._subscription_db = subscription_db
76 | self._crawl_manager = crawl_manager
77 | self._rate_limiter = rate_limiter
78 | self._resource_monitor = resource_monitor
79 | self._stats_tracker = stats_tracker
80 | self._scheduler = scheduler
81 |
82 | @property
83 | def port(self):
84 | return self._port
85 |
86 | async def run(self, *, task_status=trio.TASK_STATUS_IGNORED):
87 | '''
88 | Run the websocket server.
89 |
90 | To ensure that the server is ready, call ``await
91 | nursery.start(server.run)``.
92 |
93 | :returns: Runs until cancelled.
94 | '''
95 | logger.info('Starting server on %s:%d', self._host, self._port)
96 | async with trio.open_nursery() as nursery:
97 | serve_fn = partial(serve_websocket, self._handle_connection,
98 | self._host, self._port, ssl_context=None,
99 | handler_nursery=nursery)
100 | server = await nursery.start(serve_fn, name='Connection Listener')
101 | self._port = server.port
102 | task_status.started()
103 | logger.info('Server stopped')
104 |
105 | async def _handle_connection(self, request):
106 | '''
107 | Handle an incoming connection.
108 |
109 | :param request: A WebSocket connection request.
110 | '''
111 | headers = dict(request.headers)
112 | ws = await request.accept()
113 | client = '{}:{}'.format(ws.remote.address, ws.remote.port)
114 | logger.info('Connection opened: client=%s path=%s', client, ws.path)
115 | connection = Connection(client, ws, self._server_db,
116 | self._subscription_db, self._crawl_manager, self._rate_limiter,
117 | self._resource_monitor, self._stats_tracker, self._scheduler)
118 | await connection.run()
119 |
120 |
121 | class Connection:
122 | def __init__(self, client, ws, server_db, subscription_db, crawl_manager,
123 | rate_limiter, resource_monitor, stats_tracker, scheduler):
124 | '''
125 | Constructor.
126 |
127 |         :param str client: Description of the client (IP address and port) that
128 | opened this connection.
129 | :param trio_websocket.WebSocketConnection ws: A websocket connection.
130 |         :param starbelly.db.ServerDb server_db: A database layer.
131 |         :param starbelly.db.SubscriptionDb subscription_db: A database layer.
132 |         :param starbelly.job.CrawlManager crawl_manager: A crawl manager.
133 |         :param starbelly.rate_limiter.RateLimiter rate_limiter: A rate limiter.
134 | :param starbelly.resource_monitor.ResourceMonitor resource_monitor: A
135 | resource monitor.
136 | :param starbelly.schedule.Scheduler scheduler: A scheduler.
137 |         :param starbelly.job.StatsTracker stats_tracker: A stats tracker.
138 |
139 |         A SubscriptionManager is created internally by ``run()``.
140 | '''
141 | self._client = client
142 | self._ws = ws
143 | self._server_db = server_db
144 | self._subscription_db = subscription_db
145 | self._crawl_manager = crawl_manager
146 | self._rate_limiter = rate_limiter
147 | self._resource_monitor = resource_monitor
148 | self._scheduler = scheduler
149 | self._subscription_db = subscription_db
150 | self._nursery = None
151 | self._stats_tracker = stats_tracker
152 | self._subscription_manager = None
153 |
154 | async def run(self):
155 | '''
156 | Run the connection: read requests and send responses.
157 |
158 | This opens an internal nursery in case background tasks, like
159 | subscriptions, need to be started.
160 |
161 | :returns: This runs until the connection is closed.
162 | '''
163 | try:
164 | async with trio.open_nursery() as nursery:
165 | self._nursery = nursery
166 | self._subscription_manager = SubscriptionManager(
167 | self._subscription_db, nursery, self._ws)
168 | while True:
169 | request_data = await self._ws.get_message()
170 | nursery.start_soon(self._handle_request, request_data,
171 | name='Request Handler')
172 | except ConnectionClosed:
173 | logger.info('Connection closed for %s', self._client)
174 | except:
175 | logger.exception('Connection exception')
176 | finally:
177 | await self._ws.aclose()
178 |
179 | async def _handle_request(self, request_data):
180 | '''
181 | Handle a single API request.
182 |
183 |         :param bytes request_data: A serialized protobuf request.
184 | '''
185 | start = trio.current_time()
186 | message = ServerMessage()
187 | message.response.is_success = False
188 | request = None
189 |
190 | try:
191 | # Prepare response.
192 | request = Request.FromString(request_data)
193 | message.response.request_id = request.request_id
194 |
195 | # Find an appropriate handler.
196 | command_name = request.WhichOneof('Command')
197 | if command_name is None:
198 | raise InvalidRequestException('No command specified')
199 | command = getattr(request, command_name)
200 | try:
201 | handler = _handlers[command_name]
202 | except KeyError:
203 | raise InvalidRequestException('Invalid command name: {}'
204 | .format(command_name)) from None
205 |
206 | # Inject dependencies into argument list, then call the handler.
207 | argspec = inspect.getfullargspec(handler)
208 | args = list()
209 | for var in argspec[0]:
210 | if var == 'command':
211 | args.append(command)
212 | elif var == 'crawl_manager':
213 | args.append(self._crawl_manager)
214 | elif var == 'nursery':
215 | args.append(self._nursery)
216 | elif var == 'rate_limiter':
217 | args.append(self._rate_limiter)
218 | elif var == 'resource_monitor':
219 | args.append(self._resource_monitor)
220 | elif var == 'response':
221 | args.append(message.response)
222 | elif var == 'scheduler':
223 | args.append(self._scheduler)
224 | elif var == 'server_db':
225 | args.append(self._server_db)
226 | elif var == 'subscription_manager':
227 | args.append(self._subscription_manager)
228 | elif var == 'stats_tracker':
229 | args.append(self._stats_tracker)
230 | elif var == 'websocket':
231 | args.append(self._ws)
232 | else:
233 | raise Exception('Unknown dependency "{}" in handler {}()'
234 | .format(var, command_name))
235 |
236 | await handler(*args)
237 | message.response.is_success = True
238 | elapsed = trio.current_time() - start
239 | logger.info('Request OK %s %s %0.3fs', self._client,
240 | command_name, elapsed)
241 | except DecodeError:
242 | # Failure to decode a protobuf message means that the connection
243 | # is severely damaged; raise to the nursery so we can close the
244 | # entire connection.
245 | raise
246 | except InvalidRequestException as ire:
247 | error_message = str(ire)
248 | logger.error('Request ERROR %s %s (%s)', command_name,
249 | self._client, error_message)
250 | message.response.error_message = error_message
251 | except:
252 | logger.exception('Exception while handling request:\n%r',
253 | request)
254 | message.response.error_message = 'A server exception occurred'
255 |
256 | message_data = message.SerializeToString()
257 | await self._ws.send_message(message_data)
258 |
--------------------------------------------------------------------------------
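
An illustrative sketch (not from the repository) of the handler registration and dependency injection shown above: the decorator keys the handler by its function name, which must match a field in the Request protobuf's 'Command' oneof, and Connection._handle_request injects arguments by inspecting the handler's parameter names. The handler below is hypothetical and would not be dispatched unless a matching command field existed.

    from starbelly.server import api_handler

    @api_handler
    async def example_command(command, response, server_db):
        # 'command' receives the matching protobuf sub-message, 'response' the
        # outgoing Response message, and 'server_db' the ServerDb layer; any
        # parameter name not recognized by _handle_request raises an exception.
        pass
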
/starbelly/server/captcha.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | import logging
3 | from uuid import UUID
4 |
5 | from . import api_handler, InvalidRequestException
6 | from ..captcha import captcha_doc_to_pb, captcha_pb_to_doc
7 |
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | @api_handler
13 | async def delete_captcha_solver(command, server_db):
14 |     ''' Delete a CAPTCHA solver. '''
15 | solver_id = str(UUID(bytes=command.solver_id))
16 | try:
17 | await server_db.delete_captcha_solver(solver_id)
18 | except ValueError as ve:
19 | raise InvalidRequestException(str(ve)) from None
20 |
21 |
22 | @api_handler
23 | async def get_captcha_solver(command, response, server_db):
24 | ''' Get a CAPTCHA solver. '''
25 | solver_id = str(UUID(bytes=command.solver_id))
26 | doc = await server_db.get_captcha_solver(solver_id)
27 |
28 | if doc is None:
29 | raise InvalidRequestException('No CAPTCHA solver found for that ID')
30 |
31 | response.solver.CopyFrom(captcha_doc_to_pb(doc))
32 |
33 |
34 | @api_handler
35 | async def list_captcha_solvers(command, response, server_db):
36 | ''' Return a list of CAPTCHA solvers. '''
37 | limit = command.page.limit
38 | offset = command.page.offset
39 | count, docs = await server_db.list_captcha_solvers(limit, offset)
40 |
41 | for doc in docs:
42 | solver = response.list_captcha_solvers.solvers.add()
43 | solver.CopyFrom(captcha_doc_to_pb(doc))
44 |
45 | response.list_captcha_solvers.total = count
46 |
47 |
48 | @api_handler
49 | async def set_captcha_solver(command, response, server_db):
50 | ''' Create or update CAPTCHA solver. '''
51 | now = datetime.now(timezone.utc)
52 | doc = captcha_pb_to_doc(command.solver)
53 | new_id = await server_db.set_captcha_solver(doc, now)
54 | if new_id:
55 | response.new_solver.solver_id = UUID(new_id).bytes
56 |
--------------------------------------------------------------------------------
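
The handlers above (and the job, policy, and schedule handlers later in this dump) repeatedly convert between the 16-byte UUIDs used in protobuf messages and the string form stored in the database. A standalone sketch of that round trip, using an arbitrary example ID:

    from uuid import UUID

    solver_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
    solver_id_bytes = UUID(solver_id).bytes   # value carried in a protobuf field
    assert len(solver_id_bytes) == 16
    assert str(UUID(bytes=solver_id_bytes)) == solver_id
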
/starbelly/server/job.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import logging
3 | from uuid import UUID
4 | from yarl import URL
5 |
6 | import dateutil.parser
7 |
8 | from . import api_handler, InvalidRequestException
9 | from ..policy import Policy
10 | from ..starbelly_pb2 import JobRunState as PbRunState
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | @api_handler
17 | async def delete_job(command, server_db, stats_tracker):
18 | """ Delete a job. """
19 | job_id = str(UUID(bytes=command.job_id))
20 | await server_db.delete_job(job_id)
21 | stats_tracker.delete_job(job_id)
22 |
23 |
24 | @api_handler
25 | async def get_job(command, response, server_db):
26 | """ Get status for a single job. """
27 | job_id = str(UUID(bytes=command.job_id))
28 | job_doc = await server_db.get_job(job_id)
29 | if not job_doc:
30 | raise InvalidRequestException(f"No job exists with ID={job_id}")
31 |
32 | job = response.job
33 | job.job_id = UUID(job_doc["id"]).bytes
34 | for seed in job_doc["seeds"]:
35 | job.seeds.append(seed)
36 | for tag in job_doc["tags"]:
37 | job.tags.append(tag)
38 | Policy.convert_doc_to_pb(job_doc["policy"], job.policy)
39 | job.name = job_doc["name"]
40 | job.item_count = job_doc["item_count"]
41 | job.http_success_count = job_doc["http_success_count"]
42 | job.http_error_count = job_doc["http_error_count"]
43 | job.exception_count = job_doc["exception_count"]
44 | job.started_at = job_doc["started_at"].isoformat()
45 | if job_doc["completed_at"] is not None:
46 | job.completed_at = job_doc["completed_at"].isoformat()
47 | run_state = job_doc["run_state"].upper()
48 | job.run_state = PbRunState.Value(run_state)
49 | http_status_counts = job_doc["http_status_counts"]
50 | for status_code, count in http_status_counts.items():
51 | job.http_status_counts[int(status_code)] = count
52 |
53 |
54 | @api_handler
55 | async def get_job_items(command, response, server_db):
56 | """ Get a page of items (crawl responses) from a job. """
57 | job_id = str(UUID(bytes=command.job_id))
58 | limit = command.page.limit
59 | offset = command.page.offset
60 | count, items = await server_db.get_job_items(
61 | job_id,
62 | limit,
63 | offset,
64 | command.include_success,
65 | command.include_error,
66 | command.include_exception,
67 | )
68 | response.list_items.total = count
69 | compression_ok = command.compression_ok
70 | for item_doc in items:
71 | item = response.list_items.items.add()
72 |
73 | if item_doc["join"] is None:
74 | item.is_compressed = False
75 | elif item_doc["join"]["is_compressed"] and not compression_ok:
76 | item.body = gzip.decompress(item_doc["join"]["body"])
77 | item.is_compressed = False
78 | else:
79 | item.body = item_doc["join"]["body"]
80 | item.is_compressed = item_doc["join"]["is_compressed"]
81 | if "content_type" in item_doc:
82 | item.content_type = item_doc["content_type"]
83 | if "exception" in item_doc:
84 | item.exception = item_doc["exception"]
85 | if "status_code" in item_doc:
86 | item.status_code = item_doc["status_code"]
87 | header_iter = iter(item_doc.get("headers", []))
88 | for key in header_iter:
89 | value = next(header_iter)
90 | header = item.headers.add()
91 | header.key = key
92 | header.value = value
93 | item.cost = item_doc["cost"]
94 | item.job_id = UUID(item_doc["job_id"]).bytes
95 | item.completed_at = item_doc["completed_at"].isoformat()
96 | item.started_at = item_doc["started_at"].isoformat()
97 | item.duration = item_doc["duration"]
98 | item.url = item_doc["url"]
99 | item.url_can = item_doc["canonical_url"]
100 | item.is_success = item_doc["is_success"]
101 |
102 |
103 | @api_handler
104 | async def list_jobs(command, response, server_db):
105 | """ Return a list of jobs. """
106 | limit = command.page.limit
107 | offset = command.page.offset
108 | if command.HasField("started_after"):
109 | started_after = dateutil.parser.parse(command.started_after)
110 | else:
111 | started_after = None
112 | tag = command.tag if command.HasField("tag") else None
113 | schedule_id = (
114 | str(UUID(bytes=command.schedule_id))
115 | if command.HasField("schedule_id")
116 | else None
117 | )
118 | count, jobs = await server_db.list_jobs(
119 | limit, offset, started_after, tag, schedule_id
120 | )
121 | response.list_jobs.total = count
122 |
123 | for job_doc in jobs:
124 | job = response.list_jobs.jobs.add()
125 | job.job_id = UUID(job_doc["id"]).bytes
126 | job.name = job_doc["name"]
127 | for seed in job_doc["seeds"]:
128 | job.seeds.append(seed)
129 | for tag in job_doc["tags"]:
130 | job.tags.append(tag)
131 | job.item_count = job_doc["item_count"]
132 | job.http_success_count = job_doc["http_success_count"]
133 | job.http_error_count = job_doc["http_error_count"]
134 | job.exception_count = job_doc["exception_count"]
135 | job.started_at = job_doc["started_at"].isoformat()
136 | if job_doc["completed_at"] is not None:
137 | job.completed_at = job_doc["completed_at"].isoformat()
138 | run_state = job_doc["run_state"].upper()
139 | job.run_state = PbRunState.Value(run_state)
140 | http_status_counts = job_doc["http_status_counts"]
141 | for status_code, count in http_status_counts.items():
142 | job.http_status_counts[int(status_code)] = count
143 |
144 |
145 | @api_handler
146 | async def set_job(command, crawl_manager, response):
147 | """ Create or update job metadata. """
148 | if command.HasField("job_id"):
149 | # Update run state of existing job.
150 | job_id = str(UUID(bytes=command.job_id))
151 | if command.HasField("run_state"):
152 | run_state = command.run_state
153 | if run_state == PbRunState.Value("CANCELLED"):
154 | await crawl_manager.cancel_job(job_id)
155 | elif run_state == PbRunState.Value("PAUSED"):
156 | await crawl_manager.pause_job(job_id)
157 | elif run_state == PbRunState.Value("RUNNING"):
158 | await crawl_manager.resume_job(job_id)
159 | else:
160 | raise InvalidRequestException(
161 | f"Not allowed to set job run state: {run_state}"
162 | )
163 | else:
164 | # Create new job.
165 | if not command.policy_id:
166 | raise InvalidRequestException('"policy_id" is required')
167 | if not command.seeds:
168 | raise InvalidRequestException('"seeds" is required')
169 | name = command.name
170 | policy_id = str(UUID(bytes=command.policy_id))
171 | seeds = [s.strip() for s in command.seeds]
172 | tags = [t.strip() for t in command.tags]
173 |
174 | if name.strip() == "":
175 | url = URL(seeds[0])
176 | name = url.host
177 | if len(seeds) > 1:
178 |                 name += " & {} more".format(len(seeds) - 1)
179 |
180 | job_id = await crawl_manager.start_job(name, seeds, tags, policy_id)
181 | response.new_job.job_id = UUID(job_id).bytes
182 |
--------------------------------------------------------------------------------
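
get_job_items above re-pairs response headers that are stored as a flat alternating list of keys and values (see CrawlStorage._save_response later in this dump). A standalone sketch of that iter()/next() pairing idiom, with made-up header data:

    flat_headers = ['CONTENT-TYPE', 'text/html', 'SERVER', 'nginx']
    header_iter = iter(flat_headers)
    # Each loop iteration consumes a key, and next() consumes its value.
    pairs = [(key, next(header_iter)) for key in header_iter]
    assert pairs == [('CONTENT-TYPE', 'text/html'), ('SERVER', 'nginx')]
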
/starbelly/server/login.py:
--------------------------------------------------------------------------------
1 | from . import api_handler, InvalidRequestException
2 |
3 |
4 | @api_handler
5 | async def delete_domain_login(command, server_db):
6 | ''' Delete a domain login and all of its users. '''
7 | await server_db.delete_domain_login(command.domain)
8 |
9 |
10 | @api_handler
11 | async def get_domain_login(command, response, server_db):
12 | ''' Get a domain login. '''
13 | domain = command.domain
14 | domain_login = await server_db.get_domain_login(domain)
15 | if domain_login is None:
16 | raise InvalidRequestException('No domain credentials found for'
17 | ' domain={}'.format(domain))
18 | response.domain_login.domain = domain_login['domain']
19 | response.domain_login.login_url = domain_login['login_url']
20 | if domain_login['login_test'] is not None:
21 | response.domain_login.login_test = domain_login['login_test']
22 |
23 | for user in domain_login['users']:
24 | dl_user = response.domain_login.users.add()
25 | dl_user.username = user['username']
26 | dl_user.password = user['password']
27 | dl_user.working = user['working']
28 |
29 |
30 | @api_handler
31 | async def list_domain_logins(command, response, server_db):
32 | ''' Return a list of domain logins. '''
33 | limit = command.page.limit
34 | offset = command.page.offset
35 | count, docs = await server_db.list_domain_logins(limit, offset)
36 | response.list_domain_logins.total = count
37 | for doc in docs:
38 | dl = response.list_domain_logins.logins.add()
39 | dl.domain = doc['domain']
40 | dl.login_url = doc['login_url']
41 | if doc['login_test'] is not None:
42 | dl.login_test = doc['login_test']
43 | for user_doc in doc['users']:
44 | user = dl.users.add()
45 | user.username = user_doc['username']
46 | user.password = user_doc['password']
47 | user.working = user_doc['working']
48 |
49 |
50 | @api_handler
51 | async def set_domain_login(command, server_db):
52 | ''' Create or update a domain login. '''
53 | domain_login = command.login
54 |
55 | if not domain_login.HasField('domain'):
56 | raise InvalidRequestException('domain is required.')
57 |
58 | domain = domain_login.domain
59 | doc = await server_db.get_domain_login(domain)
60 | if doc is None:
61 | if not domain_login.HasField('login_url'):
62 | raise InvalidRequestException('login_url is required to'
63 | ' create a domain login.')
64 | doc = {
65 | 'domain': domain,
66 | 'login_url': domain_login.login_url,
67 | 'login_test': None,
68 | }
69 |
70 | if domain_login.HasField('login_url'):
71 | doc['login_url'] = domain_login.login_url
72 |
73 | if domain_login.HasField('login_test'):
74 | doc['login_test'] = domain_login.login_test
75 |
76 | doc['users'] = list()
77 |
78 | for user in domain_login.users:
79 | doc['users'].append({
80 | 'username': user.username,
81 | 'password': user.password,
82 | 'working': user.working,
83 | })
84 |
85 | await server_db.set_domain_login(doc)
86 |
--------------------------------------------------------------------------------
/starbelly/server/policy.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from uuid import UUID
3 |
4 | from . import api_handler
5 | from ..policy import Policy
6 | from ..version import __version__
7 |
8 |
9 | @api_handler
10 | async def delete_policy(command, server_db):
11 | ''' Delete a policy. '''
12 | policy_id = str(UUID(bytes=command.policy_id))
13 | await server_db.delete_policy(policy_id)
14 |
15 |
16 | @api_handler
17 | async def get_policy(command, response, server_db):
18 | ''' Get a single policy. '''
19 | policy_id = str(UUID(bytes=command.policy_id))
20 | policy_doc = await server_db.get_policy(policy_id)
21 | Policy.convert_doc_to_pb(policy_doc, response.policy)
22 |
23 |
24 | @api_handler
25 | async def list_policies(command, response, server_db):
26 | ''' Get a list of policies. '''
27 | limit = command.page.limit
28 | offset = command.page.offset
29 | count, docs = await server_db.list_policies(limit, offset)
30 | response.list_policies.total = count
31 |
32 | for policy_doc in docs:
33 | policy = response.list_policies.policies.add()
34 | policy.policy_id = UUID(policy_doc['id']).bytes
35 | policy.name = policy_doc['name']
36 | policy.created_at = policy_doc['created_at'].isoformat()
37 | policy.updated_at = policy_doc['updated_at'].isoformat()
38 |
39 | return response
40 |
41 |
42 | @api_handler
43 | async def set_policy(command, response, server_db):
44 | '''
45 | Create or update a single policy.
46 |
47 | If the policy ID is set, then update the corresponding policy.
48 | Otherwise, create a new policy.
49 | '''
50 | policy_doc = Policy.convert_pb_to_doc(command.policy)
51 | # Validate policy by trying to instantiate a Policy object, which will
52 | # raise an exception if the policy is invalid.
53 | Policy(policy_doc, version=__version__,
54 | seeds=['http://test1.com', 'http://test2.org'])
55 | now = datetime.now(timezone.utc)
56 | new_id = await server_db.set_policy(policy_doc, now)
57 | if new_id is not None:
58 | response.new_policy.policy_id = UUID(new_id).bytes
59 |
--------------------------------------------------------------------------------
/starbelly/server/rate_limit.py:
--------------------------------------------------------------------------------
1 | from . import api_handler, InvalidRequestException
2 | from ..rate_limiter import get_domain_token, GLOBAL_RATE_LIMIT_TOKEN
3 |
4 |
5 | @api_handler
6 | async def list_rate_limits(command, response, server_db):
7 | ''' Get a page of rate limits. '''
8 | limit = command.page.limit
9 | offset = command.page.offset
10 | count, rate_limits = await server_db.list_rate_limits(limit, offset)
11 | response.list_rate_limits.total = count
12 |
13 | for rate_limit in rate_limits:
14 | rl = response.list_rate_limits.rate_limits.add()
15 | rl.name = rate_limit['name']
16 | rl.token = rate_limit['token']
17 | rl.delay = rate_limit['delay']
18 | if rl.name.startswith('domain:'):
19 | rl.domain = rl.name.split(':')[1]
20 |
21 |
22 | @api_handler
23 | async def set_rate_limit(command, rate_limiter, server_db):
24 | ''' Set a rate limit. '''
25 | delay = command.delay if command.HasField('delay') else None
26 |
27 | if command.HasField('domain'):
28 | # Set a specific rate limit.
29 | domain = command.domain
30 | token = get_domain_token(domain)
31 | name = 'domain:{}'.format(domain)
32 | else:
33 | # Set global rate limit.
34 | if delay is None:
35 | raise InvalidRequestException(
36 | 'Cannot delete the global rate limit.')
37 | token = GLOBAL_RATE_LIMIT_TOKEN
38 | name = 'Global Rate Limit'
39 |
40 | await server_db.set_rate_limit(name, token, delay)
41 | if delay is None:
42 | rate_limiter.delete_rate_limit(token)
43 | else:
44 | rate_limiter.set_rate_limit(token, delay)
45 |
--------------------------------------------------------------------------------
/starbelly/server/schedule.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from uuid import UUID
3 |
4 | from . import api_handler
5 | from ..schedule import Schedule
6 | from ..starbelly_pb2 import JobRunState as PbRunState
7 |
8 |
9 | @api_handler
10 | async def delete_schedule(command, scheduler, server_db):
11 | ''' Delete a job schedule. '''
12 | schedule_id = str(UUID(bytes=command.schedule_id))
13 | await server_db.delete_schedule(schedule_id)
14 | scheduler.remove_schedule(schedule_id)
15 |
16 |
17 | @api_handler
18 | async def get_schedule(command, response, server_db):
19 | ''' Get metadata for a job schedule. '''
20 | schedule_id = str(UUID(bytes=command.schedule_id))
21 | doc = await server_db.get_schedule(schedule_id)
22 | if doc is None:
23 | response.is_success = False
24 | response.error_message = f'No schedule exists with ID={schedule_id}'
25 | else:
26 | pb = response.schedule
27 | Schedule.from_doc(doc).to_pb(pb)
28 |
29 |
30 | @api_handler
31 | async def list_schedules(command, response, server_db):
32 | ''' Return a list of job schedules. '''
33 | limit = command.page.limit
34 | offset = command.page.offset
35 | count, schedules = await server_db.list_schedules(limit, offset)
36 | response.list_schedules.total = count
37 | for doc in schedules:
38 | pb = response.list_schedules.schedules.add()
39 | Schedule.from_doc(doc).to_pb(pb)
40 |
41 |
42 | @api_handler
43 | async def list_schedule_jobs(command, response, server_db):
44 |     ''' Return a list of jobs created by a given job schedule. '''
45 | schedule_id = str(UUID(bytes=command.schedule_id))
46 | limit = command.page.limit
47 | offset = command.page.offset
48 | count, jobs = await server_db.list_schedule_jobs(schedule_id, limit, offset)
49 | response.list_schedule_jobs.total = count
50 | for job_doc in jobs:
51 | job = response.list_schedule_jobs.jobs.add()
52 | job.job_id = UUID(job_doc['id']).bytes
53 | job.name = job_doc['name']
54 | for seed in job_doc['seeds']:
55 | job.seeds.append(seed)
56 | for tag in job_doc['tags']:
57 | job.tags.append(tag)
58 | job.item_count = job_doc['item_count']
59 | job.http_success_count = job_doc['http_success_count']
60 | job.http_error_count = job_doc['http_error_count']
61 | job.exception_count = job_doc['exception_count']
62 | job.started_at = job_doc['started_at'].isoformat()
63 | if job_doc['completed_at'] is not None:
64 | job.completed_at = job_doc['completed_at'].isoformat()
65 | run_state = job_doc['run_state'].upper()
66 | job.run_state = PbRunState.Value(run_state)
67 | http_status_counts = job_doc['http_status_counts']
68 | for status_code, count in http_status_counts.items():
69 | job.http_status_counts[int(status_code)] = count
70 |
71 |
72 | @api_handler
73 | async def set_schedule(command, response, scheduler, server_db):
74 | ''' Create or update job schedule metadata. '''
75 | doc = Schedule.from_pb(command.schedule).to_doc()
76 | now = datetime.now(timezone.utc)
77 | schedule_id = await server_db.set_schedule(doc, now)
78 | if schedule_id:
79 | response.new_schedule.schedule_id = UUID(schedule_id).bytes
80 | else:
81 | schedule_id = str(UUID(bytes=command.schedule.schedule_id))
82 | scheduler.remove_schedule(schedule_id)
83 | if command.schedule.enabled:
84 | schedule_doc = await server_db.get_schedule(schedule_id)
85 |             _, job_docs = await server_db.list_schedule_jobs(schedule_id,
86 |                 limit=1, offset=0)
87 | try:
88 | latest_job_doc = job_docs[0]
89 | except IndexError:
90 | latest_job_doc = None
91 | scheduler.add_schedule(schedule_doc, latest_job_doc)
92 |
--------------------------------------------------------------------------------
/starbelly/server/subscription.py:
--------------------------------------------------------------------------------
1 | from uuid import UUID
2 |
3 | import trio.hazmat
4 |
5 | from . import api_handler
6 |
7 |
8 | @api_handler
9 | async def subscribe_job_sync(command, crawl_manager, response,
10 | subscription_manager):
11 | ''' Handle the subscribe crawl items command. '''
12 | job_id = str(UUID(bytes=command.job_id))
13 | compression_ok = command.compression_ok
14 | job_state_recv = crawl_manager.get_job_state_channel()
15 | sync_token = command.sync_token if command.HasField('sync_token') else None
16 | sub_id = subscription_manager.subscribe_job_sync(job_id, compression_ok,
17 | job_state_recv, sync_token)
18 | response.new_subscription.subscription_id = sub_id
19 |
20 |
21 | @api_handler
22 | async def subscribe_job_status(command, response, subscription_manager,
23 | stats_tracker):
24 | ''' Handle the subscribe crawl status command. '''
25 | sub_id = subscription_manager.subscribe_job_status(stats_tracker,
26 | command.min_interval)
27 | response.new_subscription.subscription_id = sub_id
28 |
29 |
30 | @api_handler
31 | async def subscribe_resource_monitor(command, response, resource_monitor,
32 | subscription_manager):
33 | ''' Handle the subscribe resource monitor command. '''
34 | sub_id = subscription_manager.subscribe_resource_monitor(resource_monitor,
35 | command.history)
36 | response.new_subscription.subscription_id = sub_id
37 |
38 |
39 | @api_handler
40 | async def subscribe_task_monitor(command, response, subscription_manager):
41 | ''' Handle the subscribe task monitor command. '''
42 | root_task = trio.hazmat.current_root_task()
43 | sub_id = subscription_manager.subscribe_task_monitor(command.period,
44 | root_task)
45 | response.new_subscription.subscription_id = sub_id
46 |
47 |
48 | @api_handler
49 | async def unsubscribe(command, subscription_manager):
50 | ''' Handle an unsubscribe command. '''
51 | sub_id = command.subscription_id
52 | subscription_manager.cancel_subscription(sub_id)
53 |
--------------------------------------------------------------------------------
/starbelly/server/system.py:
--------------------------------------------------------------------------------
1 | import cProfile
2 | import operator
3 | import pstats
4 |
5 | import trio
6 |
7 | from . import api_handler, InvalidRequestException
8 |
9 |
10 | @api_handler
11 | async def performance_profile(command, response):
12 | ''' Run CPU profiler. '''
13 | profile = cProfile.Profile()
14 | profile.enable()
15 | await trio.sleep(command.duration)
16 | profile.disable()
17 |
18 | # pstats sorting only works when you use pstats printing... so we have
19 | # to build our own data structure in order to sort it.
20 | pr_stats = pstats.Stats(profile)
21 | stats = list()
22 | for key, value in pr_stats.stats.items():
23 | stats.append({
24 | 'file': key[0],
25 | 'line_number': key[1],
26 | 'function': key[2],
27 | 'calls': value[0],
28 | 'non_recursive_calls': value[1],
29 | 'total_time': value[2],
30 | 'cumulative_time': value[3],
31 | })
32 |
33 | try:
34 | stats.sort(key=operator.itemgetter(command.sort_by), reverse=True)
35 | except KeyError:
36 | raise InvalidRequestException('Invalid sort key: {}'
37 | .format(command.sort_by))
38 |
39 | response.performance_profile.total_calls = pr_stats.total_calls
40 | response.performance_profile.total_time = pr_stats.total_tt
41 |
42 | for stat in stats[:command.top_n]:
43 | function = response.performance_profile.functions.add()
44 | function.file = stat['file']
45 | function.line_number = stat['line_number']
46 | function.function = stat['function']
47 | function.calls = stat['calls']
48 | function.non_recursive_calls = stat['non_recursive_calls']
49 | function.total_time = stat['total_time']
50 | function.cumulative_time = stat['cumulative_time']
51 |
--------------------------------------------------------------------------------
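
The sort in performance_profile relies on building a plain list of dicts and ordering it with operator.itemgetter, since pstats only sorts while doing its own printing. A tiny standalone sketch of that idiom with invented timing numbers:

    import operator

    stats = [
        {'function': 'parse', 'total_time': 0.5, 'calls': 10},
        {'function': 'fetch', 'total_time': 1.2, 'calls': 3},
    ]
    stats.sort(key=operator.itemgetter('total_time'), reverse=True)
    assert [s['function'] for s in stats] == ['fetch', 'parse']
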
/starbelly/storage.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import hashlib
3 | import logging
4 | import functools
5 |
6 | import mimeparse
7 | import trio
8 |
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def should_compress_body(response):
14 | '''
15 | Returns true if the response body should be compressed.
16 |
17 | This logic can be amended over time to add additional MIME types that
18 | should be compressed.
19 |
20 | :param starbelly.downloader.DownloadResponse response:
21 | '''
22 | should_compress = False
23 | type_, subtype, _ = mimeparse.parse_mime_type(response.content_type)
24 | if type_ == 'text':
25 | should_compress = True
26 | elif type_ == 'application' and subtype in ('json', 'pdf'):
27 | should_compress = True
28 | return should_compress
29 |
30 |
31 | class CrawlStorage:
32 | ''' This class stores crawl items in the database. '''
33 | def __init__(self, job_id, db, send_channel, receive_channel, policy,
34 | sequence):
35 | '''
36 | Constructor
37 |
38 | :param str job_id: The job to store items for.
39 | :param starbelly.db.CrawlStorageDb db: Database layer.
40 |         :param starbelly.policy.Policy policy: A policy to use for determining which
41 | responses to save.
42 | :param sequence: An iterator that returns a sequence number for each
43 | item to be saved.
44 | '''
45 | self._job_id = job_id
46 | self._db = db
47 | self._send_channel = send_channel
48 | self._receive_channel = receive_channel
49 | self._policy = policy
50 | self._sequence = sequence
51 |
52 | def __repr__(self):
53 | ''' Put job ID in repr. '''
54 |         return '<CrawlStorage job_id={}>'.format(self._job_id[:8])
55 |
56 | async def run(self):
57 | '''
58 |         Read items from the receive channel and save them to the database.
59 |
60 | :returns: This function runs until cancelled.
61 | '''
62 | async for response in self._receive_channel:
63 | await self._save_response(response)
64 | await self._db.update_job_stats(self._job_id, response)
65 | await self._send_channel.send(response)
66 |
67 | async def _save_response(self, response):
68 | '''
69 | Save a response to the database.
70 |
71 | :param starbelly.downloader.DownloadResponse response:
72 | '''
73 | response_doc = {
74 | 'completed_at': response.completed_at,
75 | 'cost': response.cost,
76 | 'duration': response.duration,
77 | 'job_id': self._job_id,
78 | 'started_at': response.started_at,
79 | 'url': response.url.human_repr(),
80 | 'canonical_url': response.canonical_url,
81 | }
82 |
83 | if response.exception is None:
84 | response_doc['completed_at'] = response.completed_at
85 | response_doc['content_type'] = response.content_type
86 | response_doc['is_success'] = response.status_code // 100 == 2
87 | response_doc['status_code'] = response.status_code
88 | compress_body = should_compress_body(response)
89 |
90 | headers = list()
91 | for key, value in response.headers.items():
92 | headers.append(key.upper())
93 | headers.append(value)
94 | response_doc['headers'] = headers
95 |
96 | body_hash = hashlib.blake2b(response.body, digest_size=16).digest()
97 | if compress_body:
98 | body = await trio.run_sync_in_worker_thread(functools.partial(
99 | gzip.compress, response.body, compresslevel=6))
100 | else:
101 | body = response.body
102 |
103 | response_doc['body_id'] = body_hash
104 | response_body_doc = {
105 | 'id': body_hash,
106 | 'body': body,
107 | 'is_compressed': compress_body,
108 | }
109 | else:
110 | response_doc['exception'] = response.exception
111 | response_doc['is_success'] = False
112 | response_body_doc = None
113 |
114 | response_doc['sequence'] = next(self._sequence)
115 | await self._db.save_response(response_doc, response_body_doc)
116 |
--------------------------------------------------------------------------------
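
A small illustrative check (not part of the repository) of should_compress_body above: text types and the listed application subtypes are compressed, other types are not. SimpleNamespace stands in for a DownloadResponse here and is an assumption of the sketch.

    from types import SimpleNamespace
    from starbelly.storage import should_compress_body

    html = SimpleNamespace(content_type='text/html; charset=utf-8')
    pdf = SimpleNamespace(content_type='application/pdf')
    png = SimpleNamespace(content_type='image/png')
    assert should_compress_body(html)
    assert should_compress_body(pdf)
    assert not should_compress_body(png)
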
/starbelly/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '2.0.0-dev'
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | from functools import wraps
3 | import pathlib
4 | from os.path import dirname
5 | from sys import path
6 | from unittest.mock import Mock
7 |
8 | import pytest
9 | import trio
10 | import trio_asyncio
11 |
12 |
13 | # Add this project to the Python path.
14 | path.append(dirname(dirname(__file__)))
15 |
16 |
17 | @contextmanager
18 | def assert_min_elapsed(seconds):
19 | '''
20 | Fail the test if the execution of a block takes less than ``seconds``.
21 | '''
22 | start = trio.current_time()
23 | yield
24 | elapsed = trio.current_time() - start
25 | assert elapsed >= seconds, 'Completed in under {} seconds'.format(seconds)
26 |
27 |
28 | @contextmanager
29 | def assert_max_elapsed(seconds):
30 | '''
31 | Fail the test if the execution of a block takes longer than ``seconds``.
32 | '''
33 | try:
34 | with trio.fail_after(seconds):
35 | yield
36 | except trio.TooSlowError:
37 | pytest.fail('Failed to complete within {} seconds'.format(seconds))
38 |
39 |
40 | @contextmanager
41 | def assert_elapsed(seconds, delta=0.1):
42 | '''
43 | Fail the test if the execution of a block takes more than seconds+delta time
44 | or less than seconds-delta time.
45 | '''
46 | with assert_min_elapsed(seconds-delta), assert_max_elapsed(seconds+delta):
47 | yield
48 |
49 |
50 | class AsyncMock:
51 | ''' A mock that acts like an async def function. '''
52 | def __init__(self, return_value=None, return_values=None, raises=None,
53 | side_effect=None):
54 | self._raises = None
55 | self._side_effect = None
56 | self._return_value = None
57 | self._index = None
58 | self._call_count = 0
59 | self._call_args = None
60 | self._call_kwargs = None
61 |
62 | if raises:
63 | self._raises = raises
64 | elif return_values:
65 | self._return_value = return_values
66 | self._index = 0
67 | elif side_effect:
68 |             self._side_effect = side_effect
69 | else:
70 | self._return_value = return_value
71 |
72 | @property
73 | def call_args(self):
74 | return self._call_args
75 |
76 | @property
77 | def call_kwargs(self):
78 | return self._call_kwargs
79 |
80 | @property
81 | def called(self):
82 | return self._call_count > 0
83 |
84 | @property
85 | def call_count(self):
86 | return self._call_count
87 |
88 | async def __call__(self, *args, **kwargs):
89 | self._call_args = args
90 | self._call_kwargs = kwargs
91 | self._call_count += 1
92 | if self._raises:
93 |             raise self._raises
94 | elif self._side_effect:
95 | return await self._side_effect(*args, **kwargs)
96 | elif self._index is not None:
97 | return_index = self._index
98 | self._index += 1
99 | return self._return_value[return_index]
100 | else:
101 | return self._return_value
102 |
103 |
104 | async def async_iter(iter):
105 | '''
106 | Convert a synchronous iterable into an async iterator.
107 |
108 | :param iterable iter:
109 | '''
110 | for item in iter:
111 | await trio.sleep(0)
112 | yield item
113 |
114 |
115 | @pytest.fixture
116 | async def asyncio_loop():
117 | ''' Open an asyncio loop. Useful for things like aiohttp.CookieJar that
118 | require a global loop. '''
119 | async with trio_asyncio.open_loop() as loop:
120 | yield loop
121 |
122 |
123 | class fail_after:
124 | ''' This decorator fails if the runtime of the decorated function (as
125 | measured by the Trio clock) exceeds the specified value. '''
126 | def __init__(self, seconds):
127 | self._seconds = seconds
128 |
129 | def __call__(self, fn):
130 | @wraps(fn)
131 | async def wrapper(*args, **kwargs):
132 | with trio.move_on_after(self._seconds) as cancel_scope:
133 | await fn(*args, **kwargs)
134 | if cancel_scope.cancelled_caught:
135 | pytest.fail('Test runtime exceeded the maximum {} seconds'
136 | .format(self._seconds))
137 | return wrapper
138 |
--------------------------------------------------------------------------------
/tests/test_async_mock.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from . import AsyncMock
4 |
5 |
6 | async def test_async_mock_no_return():
7 | foo = AsyncMock()
8 | assert await foo() is None
9 |
10 |
11 | async def test_async_mock_single_return():
12 | foo = AsyncMock(return_value=1)
13 | assert await foo() == 1
14 | assert await foo() == 1
15 |
16 |
17 | async def test_async_mock_multiple_returns():
18 | foo = AsyncMock(return_values=(1,2))
19 | assert await foo() == 1
20 | assert await foo() == 2
21 |
22 |
23 | async def test_async_mock_raises():
24 | foo = AsyncMock(raises=Exception)
25 | with pytest.raises(Exception):
26 | await foo()
27 |
--------------------------------------------------------------------------------
/tests/test_backoff.py:
--------------------------------------------------------------------------------
1 | from . import assert_elapsed
2 | from starbelly.backoff import ExponentialBackoff
3 |
4 |
5 | async def test_backoff_no_change(autojump_clock):
6 | ''' Backoff starts at 1, so 3 iterations takes ~2 seconds. '''
7 | with assert_elapsed(2):
8 | loop_count = 0
9 | async for _ in ExponentialBackoff(min_=1, max_=64):
10 | loop_count += 1
11 | if loop_count == 3:
12 | break
13 |
14 |
15 | async def test_backoff_increase(autojump_clock):
16 | ''' Increase backoff on each loop. Backoffs should be equal to 1, 2, 4,
17 | 8, 16, 16, but the first value is skipped, so the total is ~46 seconds. '''
18 | with assert_elapsed(seconds=46):
19 | loop_count = 0
20 | backoff = ExponentialBackoff(min_=1, max_=16)
21 | async for n in backoff:
22 | backoff.increase()
23 | loop_count += 1
24 | if loop_count == 6: break
25 |
26 |
27 | async def test_backoff_returns_value(autojump_clock):
28 | ''' Backoff returns the current value. Increase up to max and then decrease
29 | back to starting point. '''
30 | backoff = ExponentialBackoff(min_=1, max_=8)
31 | assert await backoff.__anext__() == 0
32 | assert await backoff.__anext__() == 1
33 | backoff.increase()
34 | assert await backoff.__anext__() == 2
35 | backoff.increase()
36 | assert await backoff.__anext__() == 4
37 | backoff.increase()
38 | assert await backoff.__anext__() == 8
39 | backoff.increase()
40 | assert await backoff.__anext__() == 8
41 | backoff.decrease()
42 | assert await backoff.__anext__() == 4
43 | backoff.decrease()
44 | assert await backoff.__anext__() == 2
45 | backoff.decrease()
46 | assert await backoff.__anext__() == 1
47 | backoff.decrease()
48 | assert await backoff.__anext__() == 1
49 |
--------------------------------------------------------------------------------
/tests/test_captcha.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from uuid import UUID
3 |
4 | import pytest
5 |
6 | from starbelly.captcha import (
7 | CaptchaSolver,
8 | captcha_doc_to_pb,
9 | captcha_pb_to_doc,
10 | )
11 | from starbelly.starbelly_pb2 import CaptchaSolverAntigateCharacters
12 |
13 |
14 | def test_captcha_command():
15 | captcha_doc = {
16 | 'id': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
17 | 'name': 'Captcha #1',
18 | 'service_url': 'https://captcha.example/service.php',
19 | 'api_key': 'FAKE-API-KEY',
20 | 'require_phrase': False,
21 | 'case_sensitive': True,
22 | 'characters': 'ALPHANUMERIC',
23 | 'require_math': False,
24 | }
25 | solver = CaptchaSolver(captcha_doc)
26 | img_data = b'\x01\x02\x03\x04'
27 | command = solver.get_command(img_data)
28 | assert command['clientKey'] == 'FAKE-API-KEY'
29 | assert command['task']['type'] == 'ImageToTextTask'
30 | assert command['task']['body'] == 'AQIDBA==' # Base64 of img_data
31 | assert not command['task']['phrase']
32 | assert command['task']['case']
33 | assert command['task']['numeric'] == 0
34 | assert not command['task']['math']
35 | assert command['task']['minLength'] == 0
36 | assert command['task']['maxLength'] == 0
37 |
38 |
39 | def test_captcha_doc_to_pb():
40 | captcha_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa')
41 | captcha_doc = {
42 | 'id': str(captcha_id),
43 | 'type': 'antigate',
44 | 'created_at': datetime(2019, 1, 26, 15, 30, 0, tzinfo=timezone.utc),
45 | 'updated_at': datetime(2019, 1, 26, 15, 35, 0, tzinfo=timezone.utc),
46 | 'name': 'Captcha #1',
47 | 'service_url': 'https://captcha.example/service.php',
48 | 'api_key': 'FAKE-API-KEY',
49 | 'require_phrase': False,
50 | 'case_sensitive': True,
51 | 'characters': 'ALPHANUMERIC',
52 | 'require_math': False,
53 | }
54 | pb_captcha = captcha_doc_to_pb(captcha_doc)
55 | assert pb_captcha.name == 'Captcha #1'
56 | assert pb_captcha.solver_id == captcha_id.bytes
57 | assert pb_captcha.created_at == '2019-01-26T15:30:00+00:00'
58 | assert pb_captcha.updated_at == '2019-01-26T15:35:00+00:00'
59 | assert pb_captcha.antigate.service_url == \
60 | 'https://captcha.example/service.php'
61 | assert pb_captcha.antigate.api_key == 'FAKE-API-KEY'
62 | assert not pb_captcha.antigate.require_phrase
63 | assert pb_captcha.antigate.case_sensitive
64 | assert pb_captcha.antigate.characters == \
65 | CaptchaSolverAntigateCharacters.Value('ALPHANUMERIC')
66 | assert not pb_captcha.antigate.require_math
67 |
68 | captcha_doc = captcha_pb_to_doc(pb_captcha)
69 | assert captcha_doc['id'] == str(captcha_id)
70 | assert captcha_doc['name'] == 'Captcha #1'
71 | assert captcha_doc['type'] == 'antigate'
72 | assert captcha_doc['service_url'] == 'https://captcha.example/service.php'
73 | assert captcha_doc['api_key'] == 'FAKE-API-KEY'
74 | assert captcha_doc['require_phrase'] == False
75 | assert captcha_doc['case_sensitive'] == True
76 | assert captcha_doc['characters'] == 'ALPHANUMERIC'
77 | assert captcha_doc['require_math'] == False
78 |
--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | import starbelly.config
4 |
5 |
6 | LOCAL_INI = '''[database]
7 | host = starbelly-host
8 | db = starbelly-db
9 | user = starbelly-app
10 | password = normalpass
11 | super_user = starbelly-admin
12 | super_password = superpass'''
13 |
14 |
15 | SYSTEM_INI = '''[database]
16 | host =
17 | port = 28015
18 | db =
19 | user =
20 | password =
21 | super_user =
22 | super_password =
23 |
24 | [rate_limiter]
25 | capacity = 10000'''
26 |
27 |
28 | def test_get_config(tmp_path):
29 | # Hack: modify the module's private _root variable to point at our temp
30 | # directory.
31 | starbelly.config._root = tmp_path
32 |
33 | # Create temp configuration files.
34 | config_dir = tmp_path / 'conf'
35 | config_dir.mkdir()
36 |
37 | with (config_dir / 'local.ini').open('w') as f:
38 | f.write(LOCAL_INI)
39 |
40 | with (config_dir / 'system.ini').open('w') as f:
41 | f.write(SYSTEM_INI)
42 |
43 | # Read configuration.
44 | config = starbelly.config.get_config()
45 | import logging
46 |     logging.debug('sections %r', config.sections())
47 | db = config['database']
48 | rl = config['rate_limiter']
49 |
50 | assert db['host'] == 'starbelly-host'
51 | assert db['port'] == '28015'
52 | assert db['db'] == 'starbelly-db'
53 | assert rl['capacity'] == '10000'
54 |
--------------------------------------------------------------------------------
/tests/test_crawl.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/tests/test_crawl.py
--------------------------------------------------------------------------------
/tests/test_frontier.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from unittest.mock import Mock
3 |
4 | import pytest
5 | import trio
6 |
7 | from . import AsyncMock
8 | from starbelly.frontier import (
9 | CrawlFrontier,
10 | FrontierItem,
11 | FrontierExhaustionError,
12 | )
13 | from starbelly.policy import Policy
14 |
15 |
16 | def make_policy():
17 | created_at = datetime(2018,12,31,13,47,00)
18 | policy_doc = {
19 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
20 | 'name': 'Test',
21 | 'created_at': created_at,
22 | 'updated_at': created_at,
23 | 'authentication': {
24 | 'enabled': True,
25 | },
26 | 'limits': {
27 | 'max_cost': 10,
28 | 'max_duration': 3600,
29 | 'max_items': 10_000,
30 | },
31 | 'mime_type_rules': [
32 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
33 | {'save': False},
34 | ],
35 | 'proxy_rules': [],
36 | 'robots_txt': {
37 | 'usage': 'IGNORE',
38 | },
39 | 'url_normalization': {
40 | 'enabled': True,
41 | 'strip_parameters': ['b'],
42 | },
43 | 'url_rules': [
44 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
45 | 'pattern': '^https?://({SEED_DOMAINS})/'},
46 | {'action': 'MULTIPLY', 'amount': 0},
47 | ],
48 | 'user_agents': [
49 | {'name': 'Test User Agent'}
50 | ]
51 | }
52 | return Policy(policy_doc, '1.0.0', ['https://frontier.example'])
53 |
54 |
55 | async def test_frontier_exhaustion(nursery):
56 | # Set up test fixtures
57 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
58 | db = Mock()
59 | db.any_in_flight = AsyncMock()
60 | db.get_frontier_batch = AsyncMock(return_value=list())
61 | db.get_frontier_size = AsyncMock(return_value=5)
62 | send_channel, recv_channel = trio.open_memory_channel(0)
63 | login_manager = Mock()
64 | login_manager.login = AsyncMock()
65 | policy = make_policy()
66 | stats = dict()
67 | frontier = CrawlFrontier(job_id, db, send_channel, login_manager, policy,
68 | stats)
69 |
70 | # This test has an empty frontier, so it should raise an exhaustion error
71 | # in its run() method.
72 | with pytest.raises(FrontierExhaustionError):
73 | await frontier.run()
74 |
75 |
76 | async def test_frontier_batches(autojump_clock, nursery):
77 | # Set up test fixtures
78 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
79 | db = Mock()
80 | db.any_in_flight = AsyncMock()
81 | batch1 = [{
82 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
83 | 'cost': 1.0,
84 | 'job_id': job_id,
85 | 'url': 'https://frontier.example/1',
86 | 'in_flight': False,
87 | },{
88 | 'id': 'cccccccc-cccc-cccc-cccc-cccccccccccc',
89 | 'cost': 2.0,
90 | 'job_id': job_id,
91 | 'url': 'https://frontier.example/2',
92 | 'in_flight': False,
93 | }]
94 | batch2 = [{
95 | 'id': 'dddddddd-dddd-dddd-dddd-dddddddddddd',
96 | 'cost': 3.0,
97 | 'job_id': job_id,
98 | 'url': 'https://frontier.example/3',
99 | 'in_flight': False,
100 | }]
101 | db.get_frontier_batch = AsyncMock(return_values=(batch1, batch2, []))
102 | db.get_frontier_size = AsyncMock(return_value=5)
103 | send_channel, recv_channel = trio.open_memory_channel(0)
104 | login_manager = Mock()
105 | login_manager.login = AsyncMock()
106 | policy = make_policy()
107 | stats = dict()
108 | frontier = CrawlFrontier(job_id, db, send_channel, login_manager, policy,
109 | stats)
110 |     assert repr(frontier) == '<CrawlFrontier job_id=aaaaaaaa>'
111 | nursery.start_soon(frontier.run)
112 |
113 | # Wait for the first item from the frontier. It should trigger the login
114 | # manager to log in to this domain, and also check the robots.txt to see if
115 | # the item is allowed.
116 | item1 = await recv_channel.receive()
117 | assert login_manager.login.call_count == 1
118 | assert login_manager.login.call_args[0] == 'frontier.example'
119 | assert str(item1.url) == 'https://frontier.example/1'
120 | assert item1.cost == 1.0
121 | assert item1.job_id == job_id
122 | assert item1.frontier_id == 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'
123 |
--------------------------------------------------------------------------------
/tests/test_job.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta, timezone
2 | import logging
3 | import pickle
4 | from unittest.mock import Mock
5 |
6 | import pytest
7 | import trio
8 |
9 | from . import AsyncMock, asyncio_loop, fail_after
10 | from starbelly.frontier import FrontierExhaustionError
11 | from starbelly.job import (
12 | PipelineTerminator,
13 | RunState,
14 | StatsTracker,
15 | CrawlManager,
16 | )
17 |
18 |
19 | logger = logging.getLogger(__name__)
20 |
21 |
22 | def make_policy_doc():
23 | created_at = datetime(2019, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
24 | return {
25 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
26 | 'name': 'Test Policy',
27 | 'created_at': created_at,
28 | 'updated_at': created_at,
29 | 'authentication': {'enabled': True},
30 | 'limits': {
31 | 'max_cost': 10,
32 | 'max_duration': 3600,
33 | 'max_items': 10_000,
34 | },
35 | 'mime_type_rules': [
36 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
37 | {'save': False},
38 | ],
39 | 'proxy_rules': [],
40 | 'robots_txt': {
41 | 'usage': 'IGNORE',
42 | },
43 | 'url_normalization': {
44 | 'enabled': True,
45 | 'strip_parameters': ['PHPSESSID'],
46 | },
47 | 'url_rules': [
48 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
49 | 'pattern': '^https?://({SEED_DOMAINS})/'},
50 | {'action': 'MULTIPLY', 'amount': 0},
51 | ],
52 | 'user_agents': [
53 | {'name': 'Test User Agent'}
54 | ]
55 | }
56 |
57 |
58 | @fail_after(3)
59 | async def test_start_job(asyncio_loop, nursery):
60 | # Set up fixtures
61 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
62 | policy_id = 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'
63 | rate_limiter = Mock()
64 | rate_limiter.remove_job = AsyncMock()
65 | stats_tracker = StatsTracker(timedelta(seconds=60))
66 | robots_txt_manager = Mock()
67 | manager_db = Mock()
68 | manager_db.clear_frontier = AsyncMock()
69 | manager_db.create_job = AsyncMock(job_id)
70 | manager_db.finish_job = AsyncMock()
71 | manager_db.get_max_sequence = AsyncMock(100)
72 | manager_db.get_policy = AsyncMock(make_policy_doc())
73 | manager_db.run_job = AsyncMock()
74 | frontier_db = Mock()
75 | frontier_db.any_in_flight = AsyncMock(False)
76 | frontier_db.get_frontier_batch = AsyncMock({})
77 | frontier_db.get_frontier_size = AsyncMock(0)
78 | frontier_db.run = AsyncMock()
79 | extractor_db = Mock()
80 | storage_db = Mock()
81 | login_db = Mock()
82 | crawl_manager = CrawlManager(rate_limiter, stats_tracker,
83 | robots_txt_manager, manager_db, frontier_db, extractor_db, storage_db,
84 | login_db)
85 |
86 | # Run the crawl manager and start a new job
87 | await nursery.start(crawl_manager.run)
88 | await crawl_manager.start_job('Test Job', ['https://seed.example'],
89 | ['tag1'], policy_id)
90 |
91 | # Wait for the crawler to tell us that the job is running.
92 | recv_channel = crawl_manager.get_job_state_channel()
93 | state_event = await recv_channel.receive()
94 | assert state_event.run_state == RunState.RUNNING
95 |
96 | resources = crawl_manager.get_resource_usage()
97 | assert resources['maximum_downloads'] == 20
98 | assert resources['current_downloads'] == 0
99 | assert resources['jobs'][0]['id'] == job_id
100 | assert resources['jobs'][0]['name'] == 'Test Job'
101 | assert resources['jobs'][0]['current_downloads'] == 0
102 |
103 | # The job has an empty frontier, so it will quit immediately after starting.
104 | # Wait for the completed job state.
105 | state_event = await recv_channel.receive()
106 | assert state_event.run_state == RunState.COMPLETED
107 |
108 | # Make sure the manager interacted with other objects correctly.
109 | assert manager_db.clear_frontier.call_args[0] == job_id
110 | assert manager_db.finish_job.call_args[0] == job_id
111 | assert manager_db.finish_job.call_args[1] == RunState.COMPLETED
112 | assert manager_db.get_policy.call_args[0] == policy_id
113 | assert manager_db.run_job.call_args[0] == job_id
114 | assert frontier_db.get_frontier_batch.call_args[0] == job_id
115 |
116 | stats = stats_tracker.snapshot()
117 | assert stats[0]['id'] == job_id
118 | assert stats[0]['name'] == 'Test Job'
119 | assert stats[0]['run_state'] == RunState.COMPLETED
120 | assert stats[0]['seeds'] == ['https://seed.example']
121 | assert stats[0]['tags'] == ['tag1']
122 |
123 |
124 | @fail_after(3)
125 | async def test_pause_resume_cancel(asyncio_loop, nursery):
126 | # Set up fixtures
127 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
128 | created_at = datetime(2019, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
129 | job_doc = {
130 | 'id': job_id,
131 | 'name': 'Test Job',
132 | 'seeds': ['https://seed1.example', 'https://seed2.example'],
133 | 'tags': [],
134 | 'run_state': RunState.PAUSED,
135 | 'old_urls': b'\x80\x03cbuiltins\nset\nq\x00]q\x01C\x10\xad\xb6\x93\x9b'
136 | b'\xac\x92\xd8\xfd\xc0\x8dJ\x94^\x8d\xe5~q\x02a\x85q\x03Rq'
137 | b'\x04.',
138 | 'started_at': created_at,
139 | 'completed_at': None,
140 | 'duration': None,
141 | 'item_count': 0,
142 | 'http_success_count': 0,
143 | 'http_error_count': 0,
144 | 'exception_count': 0,
145 | 'http_status_counts': {},
146 | 'schedule_id': 'cccccccc-cccc-cccc-cccc-cccccccccccc',
147 | 'policy': {
148 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
149 | 'name': 'Test Policy',
150 | 'created_at': created_at,
151 | 'updated_at': created_at,
152 | 'authentication': {
153 | 'enabled': False,
154 | },
155 | 'captcha_solver_id': None,
156 | 'limits': {
157 | 'max_cost': 10,
158 | 'max_duration': 3600,
159 | 'max_items': 10_000,
160 | },
161 | 'mime_type_rules': [
162 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
163 | {'save': False},
164 | ],
165 | 'proxy_rules': [],
166 | 'robots_txt': {
167 | 'usage': 'IGNORE',
168 | },
169 | 'url_normalization': {
170 | 'enabled': True,
171 | 'strip_parameters': [],
172 | },
173 | 'url_rules': [
174 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
175 | 'pattern': '^https?://({SEED_DOMAINS})/'},
176 | {'action': 'MULTIPLY', 'amount': 0},
177 | ],
178 | 'user_agents': [
179 | {'name': 'Test User Agent'}
180 | ],
181 | },
182 | }
183 |
184 | rate_limiter = Mock()
185 | rate_limiter.remove_job = AsyncMock()
186 | stats_tracker = StatsTracker(timedelta(seconds=60))
187 | robots_txt_manager = Mock()
188 | manager_db = Mock()
189 | manager_db.clear_frontier = AsyncMock()
190 | manager_db.create_job = AsyncMock(job_id)
191 | manager_db.finish_job = AsyncMock()
192 | manager_db.get_max_sequence = AsyncMock(100)
193 | manager_db.get_policy = AsyncMock(make_policy_doc())
194 | manager_db.resume_job = AsyncMock(job_doc)
195 | manager_db.pause_job = AsyncMock()
196 | manager_db.run_job = AsyncMock()
197 | frontier_db = Mock()
198 | frontier_db.any_in_flight = AsyncMock(True)
199 | frontier_db.get_frontier_batch = AsyncMock({})
200 | frontier_db.get_frontier_size = AsyncMock(0)
201 | frontier_db.run = AsyncMock()
202 | extractor_db = Mock()
203 | storage_db = Mock()
204 | login_db = Mock()
205 | crawl_manager = CrawlManager(rate_limiter, stats_tracker,
206 | robots_txt_manager, manager_db, frontier_db, extractor_db, storage_db,
207 | login_db)
208 |
209 | # Run the crawl manager and start a new job
210 | await nursery.start(crawl_manager.run)
211 | await crawl_manager.start_job(job_doc['name'], job_doc['seeds'],
212 | job_doc['tags'], job_doc['policy']['id'])
213 |
214 | # Wait for the crawler to tell us that the job is running.
215 | recv_channel = crawl_manager.get_job_state_channel()
216 | state_event = await recv_channel.receive()
217 | assert state_event.run_state == RunState.RUNNING
218 |
219 | # Now pause and wait for the paused event.
220 | await crawl_manager.pause_job(job_id)
221 | state_event = await recv_channel.receive()
222 | assert state_event.run_state == RunState.PAUSED
223 | assert manager_db.pause_job.call_args[0] == job_id
224 | # There are two "old URLs": the seed URLs.
225 | assert len(pickle.loads(manager_db.pause_job.call_args[1])) == 2
226 | assert stats_tracker.snapshot()[0]['run_state'] == RunState.PAUSED
227 |
228 | # Now resume and wait for the running event.
229 | await crawl_manager.resume_job(job_id)
230 | state_event = await recv_channel.receive()
231 | assert state_event.run_state == RunState.RUNNING
232 | assert manager_db.resume_job.call_args[0] == job_id
233 |
234 |     # Now cancel and wait for the cancelled event.
235 | await crawl_manager.cancel_job(job_id)
236 | state_event = await recv_channel.receive()
237 | assert state_event.run_state == RunState.CANCELLED
238 | assert manager_db.finish_job.call_args[0] == job_id
239 | assert manager_db.finish_job.call_args[1] == RunState.CANCELLED
240 |
--------------------------------------------------------------------------------
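
The test_pause_resume_cancel test above walks a job through its full lifecycle using only CrawlManager's public methods and the job state channel. A condensed, hypothetical usage sketch of that same flow, assuming an already-constructed crawl_manager and a known job_id and policy_id as in the test, with all mock setup and error handling omitted:

    import trio

    from starbelly.job import RunState


    async def drive_job_lifecycle(crawl_manager, job_id, policy_id):
        '''Hypothetical driver mirroring the calls made in the tests above.'''
        async with trio.open_nursery() as nursery:
            # Run the crawl manager in the background.
            await nursery.start(crawl_manager.run)

            # Start a job and wait for the RUNNING state event.
            await crawl_manager.start_job('Demo Job', ['https://seed.example'],
                ['demo'], policy_id)
            recv_channel = crawl_manager.get_job_state_channel()
            assert (await recv_channel.receive()).run_state == RunState.RUNNING

            # Pause, resume, and cancel, waiting for each state transition.
            await crawl_manager.pause_job(job_id)
            assert (await recv_channel.receive()).run_state == RunState.PAUSED

            await crawl_manager.resume_job(job_id)
            assert (await recv_channel.receive()).run_state == RunState.RUNNING

            await crawl_manager.cancel_job(job_id)
            assert (await recv_channel.receive()).run_state == RunState.CANCELLED

            # Stop the background crawl manager task.
            nursery.cancel_scope.cancel()

The state channel is the only sequencing signal the tests rely on, which keeps them free of sleeps and polling.
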
/tests/test_login.py:
--------------------------------------------------------------------------------
1 | from base64 import b64decode
2 | from datetime import datetime, timezone
3 | from functools import partial
4 | from unittest.mock import Mock
5 |
6 | from aiohttp import CookieJar
7 | import trio
8 | from yarl import URL
9 |
10 | from . import asyncio_loop, AsyncMock
11 | from starbelly.captcha import CaptchaSolver
12 | from starbelly.downloader import Downloader, DownloadResponse
13 | from starbelly.login import LoginManager
14 | from starbelly.policy import Policy
15 |
16 |
17 | def make_policy(captcha_port=80):
18 | policy_doc = {
19 | 'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7',
20 | 'name': 'Test',
21 | 'created_at': datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc),
22 | 'updated_at': datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc),
23 | 'authentication': {
24 | 'enabled': False,
25 | },
26 | 'captcha_solver': {
27 | 'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f8',
28 | 'name': 'Example CAPTCHA',
29 | 'service_url': 'http://127.0.0.1:{}'.format(captcha_port),
30 | 'api_key': None,
31 | 'require_phrase': False,
32 | 'case_sensitive': True,
33 | 'characters': 'ALPHANUMERIC',
34 | 'require_math': False,
35 | },
36 | 'limits': {
37 | 'max_cost': 10,
38 | 'max_duration': 3600,
39 | 'max_items': 10_000,
40 | },
41 | 'mime_type_rules': [
42 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True},
43 | {'save': False},
44 | ],
45 | 'proxy_rules': [],
46 | 'robots_txt': {
47 | 'usage': 'IGNORE',
48 | },
49 | 'url_normalization': {
50 | 'enabled': True,
51 | 'strip_parameters': [],
52 | },
53 | 'url_rules': [
54 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES',
55 | 'pattern': '^https?://({SEED_DOMAINS})/'},
56 | {'action': 'MULTIPLY', 'amount': 0},
57 | ],
58 | 'user_agents': [
59 | {'name': 'Test User Agent'}
60 | ]
61 | }
62 | return Policy(policy_doc, '1.0.0', ['https://login.example'])
63 |
64 |
65 | async def test_login_form():
66 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
67 | db = Mock()
68 | login = {
69 | 'domain': 'login.example',
70 | 'login_url': 'https://login.example/index',
71 | 'users': [{'username': 'john', 'password': 'fake'}]
72 | }
73 | db.get_login = AsyncMock(return_value=login)
74 | policy = make_policy()
75 | downloader = Mock()
76 | html1 = \
77 | b'''
78 | Login Test
79 |
80 |