├── .gitignore ├── .pylintrc ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── conf ├── .gitignore ├── local.ini.template └── system.ini ├── dev ├── .gitignore ├── docker-compose.yml ├── gencert.py └── nginx.conf ├── docs ├── .gitignore ├── Makefile ├── README.txt ├── administration.rst ├── changelog.rst ├── client-side-logging.png ├── conf.py ├── configuration.rst ├── dashboard.png ├── dashboard_first_crawl.png ├── development.rst ├── first_crawl.rst ├── first_crawl_results.png ├── gui.png ├── index.rst ├── installation.rst ├── internals.rst ├── jupyter_new_notebook.png ├── jupyter_notebook.png ├── logo.png ├── policy.rst ├── protobuf.html ├── protobuf.rst ├── requirements.txt ├── rethinkdb_gui.png ├── start_crawl.png ├── start_crawl_filled_in.png ├── terminal.png └── websocket_api.rst ├── integration ├── __init__.py ├── test_db.py └── test_subscription.py ├── notebooks ├── .gitignore ├── Example.ipynb └── Frontier Reloading.ipynb ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── starbelly ├── __init__.py ├── __main__.py ├── backoff.py ├── bootstrap.py ├── captcha.py ├── config.py ├── db.py ├── downloader.py ├── extractor.py ├── frontier.py ├── job.py ├── login.py ├── policy.py ├── rate_limiter.py ├── resource_monitor.py ├── robots.py ├── schedule.py ├── server │ ├── __init__.py │ ├── captcha.py │ ├── job.py │ ├── login.py │ ├── policy.py │ ├── rate_limit.py │ ├── schedule.py │ ├── subscription.py │ └── system.py ├── starbelly_pb2.py ├── storage.py ├── subscription.py └── version.py ├── tests ├── __init__.py ├── test_async_mock.py ├── test_backoff.py ├── test_captcha.py ├── test_config.py ├── test_crawl.py ├── test_downloader.py ├── test_extractor.py ├── test_frontier.py ├── test_job.py ├── test_login.py ├── test_policy.py ├── test_rate_limiter.py ├── test_resource_monitor.py ├── test_robots.py ├── test_schedule.py ├── test_server.py ├── test_storage.py └── test_subscription.py └── tools ├── __init__.py ├── clear.py ├── container_init.py ├── delete_dangling_bodies.py └── shell.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | .coverage 3 | .coverage.* 4 | .ipynb_checkpoints 5 | .pytest_cache 6 | .vscode 7 | __pycache__ 8 | README.html 9 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 2 | version: 2 3 | 4 | sphinx: 5 | configuration: docs/conf.py 6 | 7 | python: 8 | version: 3.7 9 | install: 10 | - requirements: docs/requirements.txt 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | git: 4 | depth: 1 5 | 6 | matrix: 7 | include: 8 | - python: 3.7 9 | dist: xenial 10 | sudo: yes 11 | 12 | before_install: 13 | - pip install poetry 14 | 15 | install: 16 | - poetry install 17 | 18 | script: 19 | - poetry run make test 20 | 21 | after_success: 22 | - poetry run make coverage 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Hyperion Gray LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and 
associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # The targets in this makefile should be executed inside Poetry, i.e. `poetry run make 2 | # docs`. 3 | 4 | .PHONY: docs 5 | 6 | docs: 7 | $(MAKE) -C docs html 8 | 9 | test: 10 | pytest tests/ --cov=starbelly --cov-report=term-missing 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Starbelly is a user-friendly web crawler that is easy to deploy and configure. 4 | Learn more at 5 | [starbelly.readthedocs.io](http://starbelly.readthedocs.io/en/latest/). 6 | 7 | [![Build Status](https://img.shields.io/travis/com/HyperionGray/starbelly.svg?style=flat-square)](https://travis-ci.org/HyperionGray/starbelly) 8 | [![Coverage](https://img.shields.io/coveralls/github/HyperionGray/starbelly.svg?style=flat-square)](https://coveralls.io/github/HyperionGray/starbelly) 9 | [![Read the Docs](https://img.shields.io/readthedocs/starbelly.svg)](https://starbelly.readthedocs.io) 10 | 11 | 12 | # LICENSE 13 | 14 | Starbelly is under a proprietary license. 
Please contact Hyperion Gray at acaceres@hyperiongray.com 15 | 16 | --- 17 | 18 | define hyperion gray 19 | 20 | -------------------------------------------------------------------------------- /conf/.gitignore: -------------------------------------------------------------------------------- 1 | local.ini 2 | -------------------------------------------------------------------------------- /conf/local.ini.template: -------------------------------------------------------------------------------- 1 | [database] 2 | 3 | host = 4 | db = 5 | user = 6 | password = 7 | super_user = 8 | super_password = 9 | -------------------------------------------------------------------------------- /conf/system.ini: -------------------------------------------------------------------------------- 1 | [database] 2 | 3 | host = 4 | port = 28015 5 | db = 6 | user = 7 | password = 8 | super_user = 9 | super_password = 10 | 11 | [rate_limiter] 12 | 13 | capacity = 10000 14 | -------------------------------------------------------------------------------- /dev/.gitignore: -------------------------------------------------------------------------------- 1 | ca.crt 2 | server.crt 3 | -------------------------------------------------------------------------------- /dev/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | volumes: 4 | db_data: {} 5 | 6 | services: 7 | db: 8 | network_mode: host 9 | image: rethinkdb:2.4 10 | container_name: starbelly-dev-db 11 | volumes: 12 | - db_data:/data 13 | 14 | web: 15 | network_mode: host 16 | image: nginx:1.15 17 | container_name: starbelly-dev-web 18 | volumes: 19 | - "${PWD}/nginx.conf:/etc/nginx/conf.d/default.conf" 20 | - "${PWD}/server.crt:/etc/nginx/server.crt" 21 | -------------------------------------------------------------------------------- /dev/gencert.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import sys 4 | 5 | import trustme 6 | 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | 10 | 11 | def main(): 12 | if len(sys.argv) != 2: 13 | logging.error('Usage: %s ', sys.argv[0]) 14 | sys.exit(1) 15 | hostname = sys.argv[1] 16 | here = pathlib.Path(__file__).parent 17 | ca_path = here / 'ca.crt' 18 | server_path = here / 'server.crt' 19 | logging.info('Creating self-signed certificate for "%s"', hostname) 20 | ca_cert = trustme.CA() 21 | ca_cert.cert_pem.write_to_path(ca_path) 22 | logging.info(' * CA certificate: {}'.format(ca_path)) 23 | server_cert = ca_cert.issue_server_cert(hostname) 24 | server_cert.private_key_and_cert_chain_pem.write_to_path(server_path) 25 | logging.info(' * Server certificate: {}'.format(server_path)) 26 | logging.info('Done') 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /dev/nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80 default_server; 3 | return 301 https://$host$request_uri; 4 | } 5 | 6 | server { 7 | listen 443 ssl default_server; 8 | ssl_certificate /etc/nginx/server.crt; 9 | ssl_certificate_key /etc/nginx/server.crt; 10 | ssl_protocols TLSv1.2; 11 | ssl_ciphers HIGH:!aNULL:!MD5; 12 | ssl_session_cache shared:TLSCACHE:1m; 13 | add_header Strict-Transport-Security "max-age=31536000"; 14 | 15 | location /ws/ { 16 | proxy_pass http://localhost:8000; 17 | proxy_http_version 1.1; 18 | proxy_set_header Upgrade $http_upgrade; 
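        # Together with the Connection header on the next line, this forwards
        # the client's WebSocket upgrade handshake to the crawler API listening
        # on localhost:8000.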
19 | proxy_set_header Connection "upgrade"; 20 | } 21 | 22 | # Reverse proxy to `pub serve` 23 | location / { 24 | proxy_pass http://localhost:8081; 25 | proxy_http_version 1.1; 26 | proxy_intercept_errors on; 27 | error_page 404 =200 /index.html; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | 3 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Starbelly 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/README.txt: -------------------------------------------------------------------------------- 1 | To build documentation in this directory, install `sphinx` and 2 | `sphinx_rtd_theme` packages from pip, then run `make html`. 3 | -------------------------------------------------------------------------------- /docs/administration.rst: -------------------------------------------------------------------------------- 1 | ******************* 2 | Administrator Guide 3 | ******************* 4 | 5 | .. contents:: 6 | :depth: 2 7 | 8 | 9 | Overview 10 | ======== 11 | 12 | This section goes over some common tasks that you may need to perform as a 13 | Starbelly administrator. In the examples below, if a command prompt is prefixed 14 | with a container name, then that indicates that the command must be run inside 15 | a specific Docker container. For example, if you see this: 16 | 17 | .. code:: 18 | 19 | starbelly-dev-app:/starbelly# ls /usr/local/etc 20 | jupyter 21 | 22 | Then that command should be run inside of the ``starbelly-dev-app`` container. 23 | To obtain a shell inside that container, run: 24 | 25 | .. code:: 26 | 27 | $ docker exec -it starbelly-dev-app /bin/bash 28 | starbelly-dev-app# 29 | 30 | You can use the same technique to get a shell inside the ``starbelly-dev-db`` or 31 | ``starbelly-dev-web`` containers. 32 | 33 | Clear Database 34 | ============== 35 | 36 | To clear all data from the database, including crawl data, job data, and other 37 | state: 38 | 39 | .. code:: 40 | 41 | starbelly-dev-app:/starbelly# python tools/clear.py 42 | 43 | Change Password 44 | =============== 45 | 46 | Adding or changing passwords is covered in the :doc:`installation` under the 47 | "Security" section. 48 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | ********* 2 | Changelog 3 | ********* 4 | 5 | 6 | v2.0.0 (2019-XX-XX) 7 | =================== 8 | 9 | - Massive rewrite of Starbelly's I/O to use Trio instead of asyncio. 10 | - Upgrade web client to Dart 2 and Angular 5. 
11 | 12 | 13 | v1.0.0 (2017-11-03) 14 | =================== 15 | 16 | - Initial release. 17 | -------------------------------------------------------------------------------- /docs/client-side-logging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/client-side-logging.png -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Starbelly documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Nov 2 15:00:59 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | import pathlib 25 | import sys 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.graphviz', 40 | 'sphinxcontrib_trio', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = 'Starbelly' 57 | copyright = 'Hyperion Gray 2017' 58 | author = 'Mark E. Haase' 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | import starbelly.version 66 | version = starbelly.version.__version__ 67 | # The full version, including alpha/beta/rc tags. 68 | release = version 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | # This patterns also effect to html_static_path and html_extra_path 80 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 81 | 82 | # The name of the Pygments (syntax highlighting) style to use. 
83 | pygments_style = 'sphinx' 84 | 85 | # If true, `todo` and `todoList` produce output, else they produce nothing. 86 | todo_include_todos = False 87 | 88 | 89 | # -- Options for HTML output ---------------------------------------------- 90 | 91 | # The theme to use for HTML and HTML Help pages. See the documentation for 92 | # a list of builtin themes. 93 | # 94 | html_theme = 'sphinx_rtd_theme' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # 100 | # html_theme_options = {} 101 | 102 | # Add any paths that contain custom static files (such as style sheets) here, 103 | # relative to this directory. They are copied after the builtin static files, 104 | # so a file named "default.css" will overwrite the builtin "default.css". 105 | html_static_path = ['_static'] 106 | 107 | # Custom sidebar templates, must be a dictionary that maps document names 108 | # to template names. 109 | # 110 | # This is required for the alabaster theme 111 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 112 | html_sidebars = { 113 | '**': [ 114 | 'relations.html', # needs 'show_related': True theme option to display 115 | 'searchbox.html', 116 | ] 117 | } 118 | 119 | 120 | # -- Options for HTMLHelp output ------------------------------------------ 121 | 122 | # Output file base name for HTML help builder. 123 | htmlhelp_basename = 'Starbellydoc' 124 | 125 | 126 | # -- Options for LaTeX output --------------------------------------------- 127 | 128 | latex_elements = { 129 | # The paper size ('letterpaper' or 'a4paper'). 130 | # 131 | # 'papersize': 'letterpaper', 132 | 133 | # The font size ('10pt', '11pt' or '12pt'). 134 | # 135 | # 'pointsize': '10pt', 136 | 137 | # Additional stuff for the LaTeX preamble. 138 | # 139 | # 'preamble': '', 140 | 141 | # Latex figure (float) alignment 142 | # 143 | # 'figure_align': 'htbp', 144 | } 145 | 146 | # Grouping the document tree into LaTeX files. List of tuples 147 | # (source start file, target name, title, 148 | # author, documentclass [howto, manual, or own class]). 149 | latex_documents = [ 150 | (master_doc, 'Starbelly.tex', 'Starbelly Documentation', 151 | 'Mark E. Haase', 'manual'), 152 | ] 153 | 154 | 155 | # -- Options for manual page output --------------------------------------- 156 | 157 | # One entry per manual page. List of tuples 158 | # (source start file, name, description, authors, manual section). 159 | man_pages = [ 160 | (master_doc, 'starbelly', 'Starbelly Documentation', 161 | [author], 1) 162 | ] 163 | 164 | 165 | # -- Options for Texinfo output ------------------------------------------- 166 | 167 | # Grouping the document tree into Texinfo files. 
List of tuples 168 | # (source start file, target name, title, author, 169 | # dir menu entry, description, category) 170 | texinfo_documents = [ 171 | (master_doc, 'Starbelly', 'Starbelly Documentation', 172 | author, 'Starbelly', 'One line description of project.', 173 | 'Miscellaneous'), 174 | ] 175 | 176 | # -- Options for extensions ----------------------------------------------- 177 | 178 | intersphinx_mapping = { 179 | 'trio': ('https://trio.readthedocs.io/en/stable/', None), 180 | 'yarl': ('https://yarl.readthedocs.io/en/stable/', None), 181 | } 182 | -------------------------------------------------------------------------------- /docs/configuration.rst: -------------------------------------------------------------------------------- 1 | ******************* 2 | Configuration Guide 3 | ******************* 4 | 5 | .. contents:: 6 | :depth: 2 7 | 8 | Overview 9 | ======== 10 | 11 | Starbelly can be configured entirely through its graphical interface. In fact, 12 | this is one of the advantages to using Starbelly: no more arcane configuration 13 | files or custom code! The tradeoff, of course, is that Starbelly has fewer 14 | configuration options than other crawlers and may not be flexible enough to 15 | solve all crawling problems. 16 | 17 | The configuration items are all contained in the *Configuration* submenu on the 18 | left side of the interface. 19 | 20 | CAPTCHA Solvers 21 | =============== 22 | 23 | Starbelly has the ability to automatically log into a website if it has the 24 | appropriate credentials (see `Credentials`_ below). Some login forms may 25 | require a CAPTCHA. In those cases, you may configure a CAPTCHA solving service. 26 | Starbelly supports any CAPTCHA service that is compatible with the Antigate API. 27 | You may create multiple configurations in order to use multiple backend solvers 28 | or just to send different configurations to the same service. 29 | 30 | Once you have created a CAPTCHA solver, specify that CAPTCHA solver in a crawl 31 | policy in order to send login CAPTCHAs to the solving service during crawls. 32 | 33 | Credentials 34 | =========== 35 | 36 | Starbelly has the ability to automatically log into a website if it has the 37 | appropriate credentials. To configure credentials for a site, you only need to 38 | specify a login URL. (If the login URL enables single sign-on for multiple 39 | subdomains, then you should also specify the domain name that you wish to 40 | authenticate on.) 41 | 42 | For each domain, you may set up multiple username & password credentials. When 43 | the crawler encounters that domain during a crawl, it will randomly pick one of 44 | the credentials and attempt to login with it. (The crawler uses machine learning 45 | to identify and parse the login form.) 46 | 47 | Rate Limits 48 | =========== 49 | 50 | The crawler observes rate limits between subsequent requests to a single domain. 51 | For example, with the default delay of 5 seconds, the crawler will wait 5 52 | seconds after a request completes until it initiates another request to that 53 | same domain. Therfore, the crawler will download at most 12 pages per minute 54 | from a single domain using the default rate limit. In practice, it will download 55 | fewer than 12 pages per minute, since each request itself also takes some 56 | non-negligible amount of time. 57 | 58 | Furthermore, rate limits apply across all jobs. 
For example, if you have two 59 | different jobs crawling one domain, each job will effectively be limited to 6 60 | pages per minute instead of 12. 61 | 62 | On the *Rate Limits* configuration screen, you may change the global limit as 63 | well as customize rate limits for specific domains. This allows you to specify 64 | lower rate limits for domains that can handle higher traffic. For example, you 65 | might crawl web servers on your corporate intranet faster than you crawl a 66 | public internet server. 67 | -------------------------------------------------------------------------------- /docs/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/dashboard.png -------------------------------------------------------------------------------- /docs/dashboard_first_crawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/dashboard_first_crawl.png -------------------------------------------------------------------------------- /docs/first_crawl.rst: -------------------------------------------------------------------------------- 1 | **************** 2 | Your First Crawl 3 | **************** 4 | 5 | Starbelly offers a lot of ways to fine tune crawling, but for your first crawl, 6 | we will just use all of its default settings. The goal of this section is to 7 | perform a deep crawl of a news site. Here are a few example sites you may want 8 | to try: 9 | 10 | - `CNN `__ 11 | - `Fox News `__ 12 | - `New York Times `__ 13 | - `Washington Post `__ 14 | 15 | Begin by accessing the Starbelly GUI in your web browser. You should be able to 16 | see the dashboard: 17 | 18 | .. image:: dashboard.png 19 | :alt: screenshot of dashboard 20 | 21 | The dashboard is currently empty because you have not started any crawls yet, 22 | but let us take a moment to get familiar with the interface. On the left side, 23 | under the "Starbelly" logo, is the menu. Click any item in the menu to view it. 24 | There is a gray bar across the top of the screen. The left side of this bar 25 | displays breadcrumbs, e.g. the *home* icon and the *dashboard* icon. The 26 | breadcrumbs help you keep track of where you are, and also allow you to quickly 27 | return to earlier screens. The right side of the bar shows the status 28 | "Connected", meaning that it is connected to the Starbelly server. 29 | 30 | Next, click on *Start Crawl* in the menu. 31 | 32 | .. image:: start_crawl.png 33 | :alt: screenshot of blank crawl form 34 | 35 | This screen allows you to start a crawl. First, enter a seed URL. Second, select 36 | the *Deep Crawl* policy (more on that in a moment). Optionally, you may assign a 37 | name to the crawl. If you do not assign a name, then the crawler will choose a 38 | name for you. Finally, you may assign tags. Tags may be used by consumers of 39 | crawl data, but they do not have any effect on the crawl itself, so leave it 40 | blank for now. (You can edit the tags later if you wish.) 41 | 42 | .. image:: start_crawl_filled_in.png 43 | :alt: screenshot of filled-in crawl form 44 | 45 | When you are ready, click the *Start Crawl* button. You should see a 46 | notification that the crawl is starting. Go back to the Dashboard and you should 47 | now be able to see that your crawl is running. 
The dashboard updates in realtime 48 | as the crawler downloads documents. 49 | 50 | .. image:: dashboard_first_crawl.png 51 | :alt: dashboard shows the first crawl is running 52 | 53 | You can click on the name of the crawl to view details about that crawl job. The 54 | details screen also updates in real time. 55 | 56 | .. image:: first_crawl_results.png 57 | :alt: detailed look at results of first crawl 58 | 59 | The crawl will continue to run, downloading various documents that it finds, 60 | until it reaches its end. But how does it decide what documents to download, and 61 | how does it decide when the crawl should end? These questions are answered by 62 | consulting the *crawl policy*, which guides the crawler's decision making. In 63 | this example, we used the default *Deep Crawl* policy, which stays inside the 64 | same domain as the seed URL and crawls to depth 10. You may customize this 65 | policy or create any number of your own policies to carefully refine crawler 66 | behavior. 67 | 68 | Where does the crawl data go and what can you do with it? You can view crawl 69 | results inside Starbelly, but this feature is intended to help with debugging, 70 | not as a practical way to use crawl data. Crawl data is stored inside 71 | Starbelly's database until you explicitly delete it (see the *Crawl Results* 72 | screen). Starbelly is just a crawling *frontend*, which means that it is 73 | designed to be plugged into another application that can read the crawl data and 74 | do something useful with it, such as a search engine or a scraping program. 75 | 76 | Now that you have a crawl under your belt, you might want to do a deeper dive 77 | into :doc:`configuration` and :doc:`policy`. 78 | -------------------------------------------------------------------------------- /docs/first_crawl_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/first_crawl_results.png -------------------------------------------------------------------------------- /docs/gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/gui.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. image:: logo.png 2 | :height: 150px 3 | :width: 150px 4 | :align: center 5 | 6 | Starbelly 7 | ========= 8 | 9 | Starbelly is a user-friendly and highly configurable web crawler front end. 10 | Compared to other crawling systems, such as Nutch or Scrapy, Starbelly trades 11 | off lower scalability for improved usability. Starbelly eschews the arcane 12 | configuration files and custom code required for other crawling systems, 13 | favoring a GUI for configuration and managment. Starbelly exposes all of its 14 | features and data through an efficient API, allowing you to build crawling-based 15 | systems on top of it. For example, you might plug in an Elastic Search backend 16 | to build a custom search engine, or plug in a scraper to create a data 17 | collection pipeline. 18 | 19 | .. toctree:: 20 | :maxdepth: 1 21 | 22 | installation 23 | first_crawl 24 | configuration 25 | policy 26 | administration 27 | websocket_api 28 | protobuf 29 | development 30 | internals 31 | changelog 32 | 33 | .. 
image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg 34 | :target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=agnostic 35 | :alt: define hyperiongray 36 | :width: 500px 37 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation Guide 2 | ================== 3 | 4 | Prerequisites 5 | ------------- 6 | 7 | Starbelly is offered as a collection of Docker images and Docker-compose 8 | configurations. If you're not familiar with Docker, it is a system for deploying 9 | multiple software components into individual containers and orchestrating the 10 | entire system. 11 | 12 | First, install `Docker `__ using 13 | the instructions for your platform. 14 | 15 | Next, install `Docker Compose `__ 16 | using the instructions for your platform. 17 | 18 | Docker Compose 19 | -------------- 20 | 21 | Docker Compose is used to set up and run multiple Docker containers together. 22 | You should have installed Docker Compose in the previous step. Now you need a 23 | Docker Compose configuration file (usually called ``docker-compose.yml``) that 24 | specifies what containers need to be created and how they should be configured. 25 | A sample docker-compose.yml configuration file is available for Starbelly, but 26 | you may need to tailor this file to your unique environment. 27 | 28 | Download this `zip file 29 | `__ 30 | and extract it. (If you have Git installed, you can run ``git clone 31 | git@github.com:HyperionGray/starbelly-docker.git`` instead.) From the 32 | ``starbelly-docker/starbelly`` directory, run the following command: 33 | 34 | .. code:: 35 | 36 | $ docker-compose up -d 37 | 38 | This will download the required Docker images, create the corresponding 39 | containers, and then start the entire application on ports 80 and 443. Once 40 | the application has started, open up a browser and try navigating to the host 41 | where you are running Starbelly. The default username and password is "admin". 42 | 43 | You should see the Dashboard: 44 | 45 | .. image:: dashboard.png 46 | :alt: screenshot of dashboard 47 | 48 | If you experience any problems, try using the command ``docker-compose logs`` 49 | to view logging output from the Docker containers. 50 | 51 | Security 52 | -------- 53 | 54 | If your Starbelly instance is exposed to the internet, then you should 55 | immediately do two things to secure it: 56 | 57 | 1. Change the admin password. 58 | 2. Create TLS certificates 59 | 60 | The **admin password** is stored in a file called ``htpasswd`` and it can be 61 | created or edited using the ``htpasswd`` command from the Apache2 utilities 62 | package (called ``apache2-utils`` on Ubuntu distributions). Install that package 63 | and then run this command: 64 | 65 | .. code:: 66 | 67 | $ htpasswd -c passwd admin 68 | New password: 69 | Re-type new password: 70 | Adding password for user admin 71 | 72 | Type in the new password when prompted. You can change passwords or add 73 | additional passwords to an existing file by running ``htpasswd passwd USER``, 74 | where ``USER`` is the username to change. When you are done, copy the ``passwd`` 75 | file into the Docker container and remove the original. 76 | 77 | .. 
code:: 78 | 79 | $ docker cp passwd starbelly-web:/etc/nginx/tls/ 80 | $ docker exec starbelly-web nginx -s reload 81 | 2017/11/02 14:29:37 [notice] 1437#1437: signal process started 82 | $ rm passwd 83 | 84 | The default **TLS certificate** is automatically generated and self-signed when 85 | the container is created. If you have a valid domain name for your Starbelly 86 | server, then you should obtain a real certificate for it. 87 | 88 | If you have obtained your own certificates, you can install them as follows, 89 | where ``certificate.pem`` is the full certificate chain in PEM format and 90 | ``privatekey.pem`` is the private key in PEM format. 91 | 92 | .. code:: 93 | 94 | $ docker cp certificate.pem starbelly-web:/etc/nginx/tls/server.crt 95 | $ docker cp privatekey.pem starbelly-web:/etc/nginx/tls/server.key 96 | $ docker exec starbelly-web nginx -s reload 97 | 2017/11/02 14:29:37 [notice] 1437#1437: signal process started 98 | 99 | If you do not already have TLS certificates, you may obtain free certificates 100 | from `Let's Encrypt `__. First, install the certbot 101 | application using the `instructions for your platform 102 | `__. Now run certbot to create initial certificates, 103 | replacing ``YOUR_DOMAIN`` with the fully qualified domain name of the server and 104 | replacing ``/path/to/starbelly-docker/starbelly`` with the path where you placed 105 | the Starbelly docker configuration files from an earlier step. 106 | 107 | .. code:: 108 | 109 | $ cd /path/to/starbelly-docker/starbelly 110 | $ certbot certonly \ 111 | --webroot -w certbot-webroot \ 112 | -d YOUR_DOMAIN \ 113 | --deploy-hook ./deploy-certbot.py 114 | Plugins selected: Authenticator webroot, Installer None 115 | Enter email address (used for urgent renewal and security notices) (Enter 'c' to 116 | cancel): YOUR EMAIL HERE 117 | 118 | ------------------------------------------------------------------------------- 119 | Please read the Terms of Service at 120 | https://letsencrypt.org/documents/LE-SA-v1.1.1-August-1-2016.pdf. You must agree 121 | in order to register with the ACME server at 122 | https://acme-staging.api.letsencrypt.org/directory 123 | ------------------------------------------------------------------------------- 124 | (A)gree/(C)ancel: a 125 | 126 | ...snip... 127 | 128 | This command will attempt to get TLS certificates from the Let's Encrypt server. 129 | If you've never run certbot on this server before, it will prompt you to enter 130 | a contact e-mail address and agree to the terms of service. 131 | 132 | If certificate generation succeeds, it will install those certificates into the 133 | Docker container. This certificate is valid for 90 days and will need to be 134 | renewed before it expires. Create a daily cron job containing the following 135 | command to ensure that the certificate will be renewed appropriately. 136 | 137 | .. code:: 138 | 139 | certbot renew --deploy-hook /path/to/starbelly-docker/starbelly/deploy-certbot.py 140 | 141 | Next Steps 142 | ---------- 143 | 144 | Now that you have Starbelly up and running, take a look at :doc:`first_crawl`. 
145 | -------------------------------------------------------------------------------- /docs/jupyter_new_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/jupyter_new_notebook.png -------------------------------------------------------------------------------- /docs/jupyter_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/jupyter_notebook.png -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/logo.png -------------------------------------------------------------------------------- /docs/policy.rst: -------------------------------------------------------------------------------- 1 | ****** 2 | Policy 3 | ****** 4 | 5 | .. contents:: 6 | :depth: 2 7 | 8 | Overview 9 | ======== 10 | 11 | The *crawl policy* is one of the most important and powerful concepts in 12 | Starbelly. A policy controls the crawler's behavior and decision making, guiding 13 | which links the crawler follows, what kinds of resources it downloads, and how 14 | long or how far it runs. When you start a crawl job, you must specify which 15 | policy that job should use. 16 | 17 | In this part of the documentation, we take a look at the features of the crawl 18 | policy. To begin, click *Policy* in the Starbelly menu, then click on an 19 | existing policy to view it, or click *New Policy* to create a new policy. 20 | 21 | Authentication 22 | ============== 23 | 24 | The authentication policy determines how a crawler can authenticate itself to a 25 | web site. When the crawler sees a domain in a crawl for the first time, it 26 | checks to see if it has any credentials for that domain. (See the configuration 27 | of Credentials for more information.) If it does, it picks one 28 | of the appropriate credentials at random and tries to login with it. Some login 29 | forms may require a CAPTCHA. In those cases, you may configure a CAPTCHA solver 30 | and specify that solver in the policy. 31 | 32 | Robots.txt 33 | ========== 34 | 35 | `Robots.txt `__ is a standard for specifying how 36 | crawlers should interact with websites. By default, Starbelly will attempt to 37 | download a ``robots.txt`` from each domain that it visits, and it will obey the 38 | directives of any such files that it finds. In some circumstances, however, 39 | such as crawling some old sites, it may be useful to ignore or even invert the 40 | directives in a site's robots.txt, which you can configure using the policy. 41 | 42 | URL Normalization 43 | ================= 44 | 45 | The crawler attempts to avoid crawling the same URL multiple times. If two links 46 | contain exactly identical URLs, then the crawler will only download that 47 | resource once. On some sites, especially dynamically generated sites, multiple 48 | URLs may refer to the same resource and differ only in the order of URL query 49 | parameters or the values of semantically meaningless query parameters like 50 | session IDs. 51 | 52 | The URL normalization policy allows you to control this behavior. 
When enabled, 53 | the crawler normalizes URLS using a number of techniques, including: 54 | 55 | - sorting query parameters alphabetically 56 | - upper case percent encodings 57 | - remove query fragments 58 | - etc. 59 | 60 | You may specify URL query parameters that should be discarded during 61 | normalization. By default, the crawler discards several common session ID 62 | parameters. Alternatively, you can disable URL normalization completely, 63 | although this may result in lots of duplicated downloads. 64 | 65 | URL Rules 66 | ========= 67 | 68 | The URL rules policy controls how a crawler selects links to follow. For each 69 | page that is downloaded, the crawler extracts candidate links. For each candidate 70 | link, the crawler checks the rules one-by-one until a rule matches, then the crawler 71 | applies the matching rule. 72 | 73 | For example, the default *Deep Crawl* policy contains two URL rules: 74 | 75 | 1. If the URL *matches* the regex ``^https?://({SEED_DOMAINS})/`` then *add* ``1.0``. 76 | 2. Else *multiply by* ``0.0``. 77 | 78 | Let's say the URL is seeded with ``http://foo.com/bar``. It downloads this 79 | document and assigns it a cost of 1.0. Cost is roughly similar to the concept of 80 | *crawl depth* in other crawlers, but it is a bit more sophisticated. Each link 81 | is assigned a cost based on the cost of the document where it was found and the 82 | URL rule that it matches. If a link cost evaluates to zero, then the link is 83 | thrown away. If the link is greater than zero but less than the "Max Cost" 84 | specified in the crawl policy, then the crawler schedules the link to be 85 | fetched. Links are fetched roughly in order of cost, so lower-cost items are 86 | typically fetched before higher-cost items. 87 | 88 | After the crawler downloads the document at ``http://foo.com/bar``, it checks 89 | each link in that document against the URL rules in the policy. For example, if 90 | the link matches the regex in rule #1, then the link will be given a score of 91 | 2.0: the rule says to add 1.0 to the cost of its parent (which was 1.0). 92 | 93 | If the link matches rule #2, then that rule says to multiply the parent's cost 94 | by zero. This results in the new cost being set to zero, and the crawler 95 | discards links where the cost is zero, so the link will not be followed. 96 | 97 | Although the URL rules are a bit complicated at first, they turn out to be a 98 | very powerful way to guide the crawler. For example, if we step back a bit and 99 | consider the effect of the two rules above, we see that it follows links inside 100 | the seed domain and does not follow links outside the seed domain. In other 101 | words, this is a *deep crawl*! 102 | 103 | If we replace the two rules here with just a single rule that says "Always add 104 | 1.0" , then that would result in a *broad crawl* policy! In fact, you can go 105 | look at the default *Broad Crawl* policy included in Starbelly to confirm that 106 | this is how it works. 107 | 108 | User Agents 109 | =========== 110 | 111 | When the crawler downloads a resource, it sends a *User Agent* string in the 112 | headers. By default, Starbelly sends a user agent that identifies itself with a 113 | version number and includes a URL to its source code repository. You may 114 | customize what user agent is sent using the policy. If you include multiple user 115 | agent strings, one will be chosen at random for each request. 
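
As a rough illustration, the sketch below shows how a per-request random
choice might look. The code is hypothetical and is not Starbelly's actual
implementation; the first string merely imitates the default agent described
above.

.. code:: python

    import random

    # Hypothetical user agent strings as they might be entered in a policy.
    user_agents = [
        'Starbelly/2.0.0 (+https://github.com/hyperiongray/starbelly)',
        'Mozilla/5.0 (compatible; ExampleBot/1.0)',
    ]

    def request_headers():
        ''' Pick one configured user agent at random for a single request. '''
        return {'User-Agent': random.choice(user_agents)}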
116 | 117 | Proxy Rules 118 | =========== 119 | 120 | By default, the crawler downloads resources directly from their hosts. In some 121 | cases, you may want to proxy requests through an intermediary. The *Proxy Rules* 122 | specify which proxy server should be used for which request, similar to the *URL 123 | Rules* above. 124 | 125 | MIME Type Rules 126 | =============== 127 | 128 | While *URL Rules* determine which links to follow, *MIME Type Rules* determine 129 | what types of resources to download. By default, the crawler only downloads 130 | resources that have a MIME type matching the regex ``^text/``, which matches 131 | plain text and HTML resources. If you want the crawler to download images, for 132 | example, then you would add a new rule like ``^image/*`` that would match GIF, 133 | JPEG, and PNG resources. 134 | 135 | The MIME type of a resource is determined by inspecting the ``Content-Type`` 136 | header, which means that *MIME Type Rules* are not applied until *after the 137 | crawler downloads headers* for a resource. If the crawler determines that a 138 | resource should not be downloaded, then the crawler closes the connection and 139 | discards any data that has already been downloaded. 140 | 141 | Limits 142 | ====== 143 | 144 | The *Limits* policy specifies limits on how far and how long the crawl should 145 | run. If a limit is left blank, then that limit will not be applied to the crawl. 146 | 147 | - Max cost: the crawler will not follow links that have a cost greater than the 148 | one specified here. 149 | - Max duration: the maximum amount of time the crawler should run, in seconds. 150 | - Max items: the maximum number of items that the crawler should download. This 151 | number includes successes, errors, and exceptions. 152 | -------------------------------------------------------------------------------- /docs/protobuf.rst: -------------------------------------------------------------------------------- 1 | .. _protobuf_messages: 2 | 3 | Protobuf Messages 4 | ================= 5 | 6 | The structure of the API is described in :doc:`websocket_api`. The details of 7 | all the individual messages are documened here. The client always sends a 8 | `Request <#.Request>`__ message. The server always sends a `ServerMessage 9 | <#.ServerMessage>`__ message, which contains either a `Response <#.Response>`__ 10 | to a request or an `Event <#.Event>`__ belonging to a subscription. 11 | 12 | .. raw:: html 13 | :file: protobuf.html 14 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # RTD does not support Poetry natively, so this is a temporary workaround. See: 2 | # https://github.com/readthedocs/readthedocs.org/issues/4912 3 | sphinx 4 | sphinx-rtd-theme 5 | sphinxcontrib_trio 6 | . 
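# The bare '.' entry above installs starbelly itself, which lets Sphinx import
# starbelly.version (see docs/conf.py) when Read the Docs builds this
# documentation.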
7 | -------------------------------------------------------------------------------- /docs/rethinkdb_gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/rethinkdb_gui.png -------------------------------------------------------------------------------- /docs/start_crawl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/start_crawl.png -------------------------------------------------------------------------------- /docs/start_crawl_filled_in.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/start_crawl_filled_in.png -------------------------------------------------------------------------------- /docs/terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/docs/terminal.png -------------------------------------------------------------------------------- /docs/websocket_api.rst: -------------------------------------------------------------------------------- 1 | .. _api-documentation: 2 | 3 | WebSocket API 4 | ============= 5 | 6 | Overview 7 | -------- 8 | 9 | The crawler is controlled completely by an API. Clients connect to the crawler 10 | using `websockets 11 | `__ and 12 | exchange messages with the crawler using `protobuf messages 13 | `__. The built-in GUI relies 14 | solely on this API, so everything that can be done in the GUI can also be done 15 | with the API – and more! 16 | 17 | One of the central goals for the API is to enable clients to synchronize crawl 18 | results in real time. Most crawling systems are batch-oriented: you run the 19 | crawler for a period of time and then collect the results when the crawl is 20 | finished. Starbelly is streaming-oriented: it can send crawl results to a client 21 | as soon as it downloads them. 22 | 23 | Let's imagine that a crawl has started running and already has 1,000 results. A 24 | client can connect to Starbelly and quickly fetch the first 1,000 results. 25 | Because the crawler is still running, new results will continue to stream in as 26 | the crawler downloads them. If either the server or the client needs to 27 | disconnect for some reason, the client is able to reconnect later and pick up 28 | the stream exactly where it left off. 29 | 30 | Connecting to API 31 | ----------------- 32 | 33 | The API is exposed as a websocket service on port 443 at the path ``/ws/``. For 34 | example, if starbelly is running on the host ``starbelly.example.com``, then you 35 | should connect to the web socket using the URL 36 | ``wss://starbelly.example.com/ws/``. By default, Starbelly uses HTTP basic 37 | authentication, so you need to include those credentials when you connect to the 38 | API. 39 | 40 | Messages 41 | -------- 42 | 43 | Starbelly uses ``protobuf`` to encode messages sent between the client and the 44 | server. There are three types of message used in the API: 45 | 46 | 1. Request 47 | 2. Response 48 | 3. 
Event 49 | 50 | The *request* and *response* messages are created in pairs: the client sends a 51 | *request* to the server and the server sends back exactly one *response* per 52 | request. The response indicates whether the request was successful and may 53 | include other data related to the request. 54 | 55 | Although each request generates a response, the responses are not necessarily 56 | sent back in the same order that the requests are received. If the client sends 57 | two commands very quickly (call them A and B), it may get the responses back in 58 | either order, e.g. A→B or B→A. For this reason, the client should include a 59 | unique ``request_id`` with each request; the server will include the same 60 | ``request_id`` in its response so that the client can track which response goes 61 | with which request. The client can assign request IDs in any manner that it 62 | chooses, but one sensible approach would be to assign an incrementing sequence 63 | of integers. 64 | 65 | The third type of message is an *event*, which is pushed from the server to the 66 | client. For example, the client can send a request to subscribe to job status. 67 | The server will send a response containing a subscription ID. Now, whenever a 68 | job has a status event, such as downloading a new resource, the server will send 69 | an event to the client containing the job status data and the corresponding 70 | subscription ID. The client can close the subscription by sending another 71 | request. The server will stop sending event messages and will send a response 72 | indicating that the subscription has been cancelled. 73 | 74 | Protobuf is a binary serialization format that supports common data types like 75 | integers, strings, lists, and maps. It is similar in purpose to JSON, but 76 | protobuf is more efficient in terms of encoding overhead and serialization 77 | speed. 78 | 79 | For complete specification of the available protobuf messages, see 80 | :doc:`protobuf`. 81 | 82 | Example Session 83 | --------------- 84 | 85 | This section shows a complete interaction where a client starts a crawl and 86 | synchronizes crawl results. To begin, the client sends a ``RequestSetJob`` 87 | request to the server that includes the seed URL, a policy identifier, and a 88 | crawl name. 89 | 90 | .. code:: 91 | 92 | Request { 93 | request_id: 1 94 | Command: RequestSetJob { 95 | run_state: RUNNING 96 | policy_id: d28b379ff3668322bfd5d56e11d4e34e 97 | seeds: "https://cnn.com" 98 | name: "My Crawl" 99 | } 100 | } 101 | 102 | The server will kick off a crawling job and will send a response telling the 103 | client that the job has started successfully and including an identifier for the 104 | new job. 105 | 106 | .. code:: 107 | 108 | Response { 109 | request_id: 1 110 | is_success: true 111 | Body: ResponseNewJob { 112 | job_id: 0514478baffd401546b755bf460b5997 113 | } 114 | } 115 | 116 | Notice that the response includes the request ID sent by the client, so 117 | we know that this is a response to the above request. 118 | 119 | This response tells us that the crawl is starting, but we would like to keep 120 | track of the crawl's progress and know when it finishes. The next step is to 121 | send a subscription request for job status events. 122 | 123 | .. 
code:: 124 | 125 | Request { 126 | request_id: 2 127 | Command: RequestSubscribeJobStatus { 128 | min_interval: 3.0 129 | } 130 | } 131 | 132 | This subscription provides high-level job status for *all* crawl jobs, including 133 | data like how many items have been downloaded, how many pages had errors, how 134 | many pages results in exceptions, etc. Job status can change rapidly when the 135 | crawler is busy, because each item downloaded counts as a change in job status. 136 | The ``min_interval`` parameter specifies the minimum amount of time in between 137 | job status events sent by the server. In this example, if there are multiple job 138 | status events, the server will batch them together and send at most 1 event 139 | every 3 seconds for this subscription. On the other hand, if the crawl is very 140 | slow, then it may send events even less frequently than that. 141 | 142 | The server will create the subscription and respond with a subscription 143 | identifier. 144 | 145 | .. code:: 146 | 147 | Response { 148 | request_id: 1 149 | is_success: true 150 | Body: ResponseNewSubscription { 151 | subscription_id: 300 152 | } 153 | } 154 | 155 | When the client first subscribes to job status, the crawler will send the 156 | complete status of each currently running job. For example, if the crawler has 157 | already downloaded one item, the job status may look like this: 158 | 159 | .. code:: 160 | 161 | Event { 162 | subscription_id: 300 163 | Body: JobList { 164 | jobs: { 165 | job_id: 0514478baffd401546b755bf460b5997 166 | seeds: "https://cnn.com" 167 | policy: d28b379ff3668322bfd5d56e11d4e34e 168 | name: "My Crawl" 169 | run_state: RUNNING 170 | started_at: "2017-11-03T10:14:42.194744" 171 | item_count: 1 172 | http_success_count: 1 173 | http_error_count: 0 174 | exception_count: 0 175 | http_status_counts: { 176 | 200: 1 177 | } 178 | } 179 | } 180 | } 181 | 182 | After sending complete job status, the crawler will send small updates as the 183 | job status changes. For example, after the crawler downloads a second item, it 184 | will send an event like this: 185 | 186 | .. code:: 187 | 188 | Event { 189 | subscription_id: 300 190 | Body: JobList { 191 | jobs: { 192 | job_id: 0514478baffd401546b755bf460b5997 193 | item_count: 2 194 | http_success_count: 2 195 | http_status_counts: { 196 | 200: 2 197 | } 198 | } 199 | } 200 | } 201 | 202 | Notice how the second message is much smaller: it only contains the fields that 203 | have changed since the previous event. This is how the job status subscription 204 | allows clients to efficiently keep track of the status of all jobs. This API is 205 | used in the GUI to power the Dashboard and Results screens. 206 | 207 | For a complete list of API messages, see :ref:`protobuf_messages`. 208 | 209 | Web Client 210 | ---------- 211 | 212 | The crawler GUI is implemented as a stand-alone application written in Dart, and 213 | it interacts with the Starbelly server solely through the public API. Therefore, 214 | anything that you can do in the GUI can also be done through the API. 215 | 216 | https://github.com/hyperiongray/starbelly-web-client 217 | 218 | Python Client 219 | ------------- 220 | 221 | A very basic and incomplete Python client library implementation is available: 222 | 223 | https://github.com/hyperiongray/starbelly-python-client 224 | 225 | This client library will be improved over time and made more stable, but for 226 | now it may be used as a reference implementation. 
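
Until that client matures, the short sketch below shows what a minimal
interaction with the API might look like using ``trio-websocket`` and the
generated protobuf classes. This is an illustrative sketch only: the
``Request`` and ``ServerMessage`` classes are the ones described above, but
the ``subscribe_job_status`` field name is an assumption based on the message
structures shown in this document, and HTTP basic authentication and TLS
verification are omitted. Consult ``starbelly/starbelly_pb2.py`` and
:doc:`protobuf` for the authoritative message definitions.

.. code:: python

    import itertools

    import trio
    from trio_websocket import open_websocket_url

    # Generated protobuf module that ships with Starbelly.
    from starbelly.starbelly_pb2 import Request, ServerMessage


    async def watch_job_status(url):
        ''' Subscribe to job status and print events as they arrive. '''
        request_ids = itertools.count(1)
        async with open_websocket_url(url) as ws:
            # Build a job status subscription request. The field name below is
            # a guess based on the RequestSubscribeJobStatus message shown
            # earlier in this document.
            request = Request()
            request.request_id = next(request_ids)
            request.subscribe_job_status.min_interval = 3.0
            await ws.send_message(request.SerializeToString())

            while True:
                # Every server message is either a response to one of our
                # requests or an event for an active subscription.
                message = ServerMessage()
                message.ParseFromString(await ws.get_message())
                print(message)


    if __name__ == '__main__':
        trio.run(watch_job_status, 'wss://starbelly.example.com/ws/')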
227 | -------------------------------------------------------------------------------- /integration/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module contains integration tests. 3 | 4 | These tests rely on a RethinkDB server running on localhost 28015. 5 | ''' 6 | from functools import wraps 7 | 8 | import pytest 9 | from rethinkdb import RethinkDB 10 | import trio 11 | 12 | from starbelly.config import get_config 13 | 14 | # Add this project to the Python path: 15 | from os.path import dirname 16 | from sys import path 17 | path.append(dirname(dirname(__file__))) 18 | 19 | 20 | class fail_after: 21 | ''' This decorator fails if the runtime of the decorated function (as 22 | measured by the Trio clock) exceeds the specified value. ''' 23 | def __init__(self, seconds): 24 | self._seconds = seconds 25 | 26 | def __call__(self, fn): 27 | @wraps(fn) 28 | async def wrapper(*args, **kwargs): 29 | with trio.move_on_after(self._seconds) as cancel_scope: 30 | await fn(*args, **kwargs) 31 | if cancel_scope.cancelled_caught: 32 | pytest.fail('Test runtime exceeded the maximum {} seconds' 33 | .format(self._seconds)) 34 | return wrapper 35 | 36 | 37 | @pytest.fixture 38 | async def db_pool(nursery): 39 | r = RethinkDB() 40 | r.set_loop_type('trio') 41 | db_config = get_config()['database'] 42 | db_pool = r.ConnectionPool( 43 | host=db_config['host'], 44 | port=db_config['port'], 45 | db='integration_testing', 46 | user=db_config['super_user'], 47 | password=db_config['super_password'], 48 | nursery=nursery 49 | ) 50 | async with db_pool.connection() as conn: 51 | await r.db_create('integration_testing').run(conn) 52 | yield db_pool 53 | async with db_pool.connection() as conn: 54 | await r.db_drop('integration_testing').run(conn) 55 | await db_pool.close() 56 | -------------------------------------------------------------------------------- /notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /notebooks/Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " # Example Notebook\n", 8 | " \n", 9 | " This notebook shows how to use the Starbelly shell from inside a Jupyter notebook." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Starbelly v1.2.0-dev Shell\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%autoawait trio\n", 27 | "from pathlib import Path\n", 28 | "from sys import path\n", 29 | "path.append(str(Path().resolve().parent))\n", 30 | "from tools.shell import *" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "'starbelly-app'" 42 | ] 43 | }, 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "config['database']['user']" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "15:24:09 [tools.shell] INFO: Hello, world!\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "logger.info('Hello, world!')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "async def foo():\n", 77 | " await trio.sleep(1)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "await foo()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "trio.run(foo)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "RethinkDB Cursor: [\n", 108 | " {'name': 'Deep Crawl'},\n", 109 | " {'name': 'Broad Crawl'},\n", 110 | "]\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "policy_query = r.table('policy').pluck('name')\n", 116 | "policy_results = run_query(policy_query)\n", 117 | "print_results(policy_results)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3.7 (starbelly)", 131 | "language": "python", 132 | "name": "starbelly" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.7.0" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "starbelly" 3 | version = "2.0.0-dev" 4 | description = "Streaming crawler with a graphical user interface" 5 | authors = ["Mark E. 
Haase "] 6 | license = "MIT" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.7" 10 | aiohttp = "^3.6.2" 11 | aiohttp-socks = "^0.3.9" 12 | beautifulsoup4 = "^4.9.1" 13 | cchardet = "^2.1.6" 14 | feedparser = "^5.2.1" 15 | formasaurus = {url = "https://hyperiongray.s3-us-west-2.amazonaws.com/formasaurus-0.9.0.tar.gz"} 16 | lxml = "^4.5.1" 17 | protobuf = "^3.12.2" 18 | psutil = "^5.7.0" 19 | python-dateutil = "^2.8.1" 20 | python-mimeparse = "^1.6.0" 21 | pytz = "==2020.1" 22 | rethinkdb = {url = "https://hyperiongray.s3-us-west-2.amazonaws.com/rethinkdb-2.4.7%2Bstarbelly.tar.gz"} 23 | robotexclusionrulesparser = "^1.7.1" 24 | trio = "^0.15.1" 25 | trio-asyncio = "^0.11.0" 26 | trio-websocket = "^0.8.0" 27 | w3lib = "1.22.0" 28 | watchdog = "^0.10.2" 29 | yarl = "^1.4.2" 30 | scikit-learn = ">=0.18" # required for formasaurus 31 | scipy = "^1.4.1" # required for formasaurus 32 | sklearn-crfsuite = ">=0.3.1" # required for formasaurus 33 | joblib = "^0.15.1" # required for formasaurus 34 | tldextract = "^2.2.2" # required for formasaurus 35 | 36 | [tool.poetry.dev-dependencies] 37 | coverage = "^5.1" 38 | coveralls = "^2.0.0" 39 | pytest-cov = "^2.9.0" 40 | pytest-mock = "^3.1.0" 41 | sphinxcontrib-trio = "^1.1.2" 42 | sphinx-rtd-theme = "^0.4.3" 43 | pytest-trio = "^0.6.0" 44 | trustme = "^0.6.0" 45 | 46 | [build-system] 47 | requires = ["poetry>=0.12"] 48 | build-backend = "poetry.masonry.api" 49 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | log_level = debug 3 | trio_mode = true 4 | -------------------------------------------------------------------------------- /starbelly/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/starbelly/__init__.py -------------------------------------------------------------------------------- /starbelly/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import signal 5 | import subprocess 6 | import sys 7 | import time 8 | 9 | from watchdog.events import FileSystemEventHandler 10 | from watchdog.observers import Observer 11 | 12 | from .bootstrap import Bootstrap 13 | from .config import get_config, get_path 14 | 15 | 16 | class ProcessWatchdog(FileSystemEventHandler): 17 | ''' Handle watchdog events by restarting a subprocess. ''' 18 | 19 | def __init__(self): 20 | ''' Constructor. ''' 21 | 22 | self._logger = logging.getLogger('watchdog') 23 | self._process = None 24 | 25 | def dispatch(self, event): 26 | ''' Restart the subprocess if a source/config file changed. ''' 27 | 28 | path = event.src_path 29 | file = os.path.basename(path) 30 | descr = '{} was {}'.format(event.src_path, event.event_type) 31 | 32 | if (file.endswith('.py') and not file.startswith('test_')) or \ 33 | file.endswith('.ini'): 34 | self._logger.info('%s (Reloading)', descr) 35 | self.terminate_process() 36 | self.start_process() 37 | 38 | def join(self): 39 | ''' Wait for subprocess to exit. ''' 40 | if self._process is not None: 41 | self._process.wait() 42 | 43 | def start_process(self): 44 | ''' Start the subprocess. ''' 45 | 46 | if self._process is not None: 47 | msg = 'Cannot start subprocess if it is already running.' 
48 | raise RuntimeError(msg) 49 | 50 | time.sleep(1) 51 | args = [sys.executable, '-m', __package__] + sys.argv[1:] 52 | new_env = dict(os.environ) 53 | new_env['WATCHDOG_RUNNING'] = '1' 54 | self._process = subprocess.Popen(args, env=new_env) 55 | 56 | def terminate_process(self): 57 | ''' Terminate the subprocess. ''' 58 | if self._process is not None: 59 | try: 60 | self._process.send_signal(signal.SIGTERM) 61 | self._process.wait() 62 | self._process = None 63 | except ProcessLookupError: 64 | pass # The process already died. 65 | 66 | 67 | class Reloader: 68 | ''' Reloads the subprocess when a source file is modified. ''' 69 | def __init__(self): 70 | ''' Constructor. ''' 71 | self._logger = logging.getLogger('reloader') 72 | self._observer = None 73 | self._running = False 74 | self._watchdog = None 75 | 76 | def run(self): 77 | ''' Run the reloader. ''' 78 | 79 | self._logger.info('Running with reloader...') 80 | self._watchdog = ProcessWatchdog() 81 | self._watchdog.start_process() 82 | 83 | self._observer = Observer() 84 | self._observer.schedule( 85 | self._watchdog, str(get_path('starbelly')), recursive=True) 86 | self._observer.start() 87 | 88 | while True: 89 | time.sleep(1) 90 | 91 | def shutdown(self, signum, _): 92 | ''' Exit the reloader. ''' 93 | signame = signal.Signals(signum).name 94 | self._logger.info('Caught %s (shutting down)', signame) 95 | self._watchdog.terminate_process() 96 | self._observer.stop() 97 | self._observer.join() 98 | sys.exit(0) 99 | 100 | 101 | def configure_logging(log_level, error_log): 102 | ''' Set default format and output stream for logging. ''' 103 | log_format = '%(asctime)s [%(name)s] %(levelname)s: %(message)s' 104 | log_date_format = '%Y-%m-%d %H:%M:%S' 105 | log_formatter = logging.Formatter(log_format, log_date_format) 106 | log_level = getattr(logging, log_level.upper()) 107 | log_handler = logging.StreamHandler(sys.stderr) 108 | log_handler.setFormatter(log_formatter) 109 | log_handler.setLevel(log_level) 110 | logger = logging.getLogger() 111 | logger.addHandler(log_handler) 112 | logger.setLevel(log_level) 113 | if log_level < logging.INFO: 114 | logging.getLogger('watchdog').setLevel(logging.INFO) 115 | logging.getLogger('trio-websocket').setLevel(logging.INFO) 116 | 117 | if error_log is not None: 118 | exc_handler = logging.FileHandler(error_log) 119 | exc_handler.setFormatter(log_formatter) 120 | exc_handler.setLevel(logging.ERROR) 121 | logger.addHandler(exc_handler) 122 | 123 | 124 | def get_args(): 125 | ''' Parse command line arguments. ''' 126 | arg_parser = argparse.ArgumentParser(description='Starbelly') 127 | arg_parser.add_argument( 128 | '--log-level', 129 | default='warning', 130 | metavar='LEVEL', 131 | choices=['debug', 'info', 'warning', 'error', 'critical'], 132 | help='Set logging verbosity (default: warning)' 133 | ) 134 | arg_parser.add_argument( 135 | '--ip', 136 | default='127.0.0.1', 137 | help='The IP address to bind to (default: 127.0.0.1)' 138 | ) 139 | arg_parser.add_argument( 140 | '--port', 141 | type=int, 142 | default=8000, 143 | help='The TCP port to bind to (default: 8000)' 144 | ) 145 | arg_parser.add_argument( 146 | '--reload', 147 | action='store_true', 148 | help='Auto-reload when code or static assets are modified.' 149 | ) 150 | arg_parser.add_argument( 151 | '--error-log', 152 | help='Copy error logs to the specified file.' 153 | ) 154 | return arg_parser.parse_args() 155 | 156 | 157 | def main(): 158 | ''' Set up watchdog or run starbelly. 
''' 159 | args = get_args() 160 | configure_logging(args.log_level, args.error_log) 161 | config = get_config() 162 | 163 | if args.reload and os.getenv('WATCHDOG_RUNNING') is None: 164 | reloader = Reloader() 165 | signal.signal(signal.SIGINT, reloader.shutdown) 166 | signal.signal(signal.SIGTERM, reloader.shutdown) 167 | reloader.run() 168 | else: 169 | bootstrap = Bootstrap(config, args) 170 | bootstrap.run() 171 | 172 | 173 | if __name__ == '__main__': 174 | main() 175 | -------------------------------------------------------------------------------- /starbelly/backoff.py: -------------------------------------------------------------------------------- 1 | import trio 2 | 3 | 4 | class ExponentialBackoff: 5 | ''' An experimental class: this makes it simple to write loops that poll 6 | a resource and back off when the resource is not ready. 7 | 8 | For example, if you are polling the database for some new records, you might 9 | wait 1 second and then try again. If there are still no records, then you 10 | wait 2 seconds before trying again, then 4 seconds, then 8, etc. 11 | 12 | This is written as an async iterator, so you can just loop over it and it 13 | will automatically delay in between loop iterations. 14 | ''' 15 | def __init__(self, min_=0.25, max_=64): 16 | ''' 17 | Constructor. 18 | 19 | :param float min_: The initial delay between loop iterations. 20 | :param float max_: The maximum delay. 21 | ''' 22 | self._backoff = min_ 23 | self._initial = True 24 | self._min = min_ 25 | self._max = max_ 26 | 27 | def __repr__(self): 28 | return '<ExponentialBackoff {}>'.format(self._backoff) 29 | 30 | def __aiter__(self): 31 | ''' This instance is an async iterator. ''' 32 | return self 33 | 34 | async def __anext__(self): 35 | ''' Add a delay in between loop iterations. (No delay for the first 36 | iteration.) ''' 37 | if self._initial: 38 | backoff = 0 39 | self._initial = False 40 | else: 41 | backoff = self._backoff 42 | await trio.sleep(backoff) 43 | return backoff 44 | 45 | def increase(self): 46 | ''' Double the current backoff, but not if it would exceed this 47 | instance's max value. ''' 48 | if self._backoff <= self._max // 2: 49 | self._backoff *= 2 50 | 51 | def decrease(self): 52 | ''' Halve the current backoff, but not if it would fall below this instance's min value. ''' 53 | if self._backoff >= 2 * self._min: 54 | self._backoff //= 2 55 | -------------------------------------------------------------------------------- /starbelly/bootstrap.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import logging 3 | 4 | from rethinkdb import RethinkDB 5 | from rethinkdb.trio_net.net_trio import Connection as RethinkDBTrioConnection 6 | import trio 7 | 8 | from .db import ( 9 | BootstrapDb, 10 | CrawlFrontierDb, 11 | CrawlManagerDb, 12 | CrawlExtractorDb, 13 | CrawlStorageDb, 14 | LoginDb, 15 | ScheduleDb, 16 | ServerDb, 17 | SubscriptionDb, 18 | ) 19 | from .job import CrawlManager, StatsTracker 20 | from .rate_limiter import RateLimiter 21 | from .resource_monitor import ResourceMonitor 22 | from .robots import RobotsTxtManager 23 | from .schedule import Scheduler 24 | from .server import Server 25 | 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | 30 | class Bootstrap: 31 | ''' Main class for bootstrapping the crawler. ''' 32 | def __init__(self, config, args): 33 | ''' 34 | Constructor. 35 | 36 | :param config: Output of config parser. 37 | :param args: Output of argparse.
38 | :param 39 | ''' 40 | self._args = args 41 | self._config = config 42 | 43 | def run(self): 44 | ''' Run the main task on the event loop. ''' 45 | logger.info('Starbelly is starting...') 46 | try: 47 | trio.run(self._main, 48 | restrict_keyboard_interrupt_to_checkpoints=True) 49 | except KeyboardInterrupt: 50 | logger.warning('Quitting due to KeyboardInterrupt') 51 | logger.info('Starbelly has stopped.') 52 | 53 | def _db_pool(self, nursery): 54 | ''' 55 | Create a database connectoin pool. 56 | 57 | :param nursery: A Trio nursery to spawn database connections in. 58 | :returns: A RethinkDB connection pool. 59 | ''' 60 | r = RethinkDB() 61 | r.set_loop_type('trio') 62 | db_config = self._config['database'] 63 | return r.ConnectionPool( 64 | connection_type=RethinkDBTrioConnection, 65 | host=db_config['host'], 66 | port=db_config['port'], 67 | db=db_config['db'], 68 | user=db_config['user'], 69 | password=db_config['password'], 70 | nursery=nursery 71 | ) 72 | 73 | async def _main(self): 74 | ''' 75 | The main task. 76 | 77 | :returns: This function runs until cancelled. 78 | ''' 79 | # Create db pool & objects 80 | async with trio.open_nursery() as nursery: 81 | db_pool = self._db_pool(nursery) 82 | bootstrap_db = BootstrapDb(db_pool) 83 | crawl_db = CrawlManagerDb(db_pool) 84 | extractor_db = CrawlExtractorDb(db_pool) 85 | frontier_db = CrawlFrontierDb(db_pool) 86 | login_db = LoginDb(db_pool) 87 | schedule_db = ScheduleDb(db_pool) 88 | storage_db = CrawlStorageDb(db_pool) 89 | logging.info('Doing startup check...') 90 | await bootstrap_db.startup_check() 91 | 92 | # Create a rate limiter 93 | rate_limiter = RateLimiter(capacity=1_000) 94 | logger.info('Initializing rate limiter...') 95 | rate_limits = await bootstrap_db.get_rate_limits() 96 | for rate_limit in rate_limits: 97 | rate_limiter.set_rate_limit(rate_limit['token'], 98 | rate_limit['delay']) 99 | logger.info('Rate limiter is initialized.') 100 | 101 | # Create a robots.txt manager 102 | robots_txt_manager = RobotsTxtManager(db_pool) 103 | 104 | # Create a tracker for job stats and initialize with jobs that are 105 | # unfinished or recently finishe.d 106 | recent_period = timedelta(hours=24) 107 | recent_dt = datetime.now(tz=timezone.utc) - recent_period 108 | stats_tracker = StatsTracker(recent_period) 109 | dashboard_jobs = await crawl_db.get_dashboard_jobs(recent_dt) 110 | for job_doc in dashboard_jobs: 111 | stats_dict = { 112 | 'id': job_doc['id'], 113 | 'run_state': job_doc['run_state'], 114 | 'name': job_doc['name'], 115 | 'seeds': job_doc['seeds'], 116 | 'tags': job_doc['tags'], 117 | 'started_at': job_doc['started_at'], 118 | 'completed_at': job_doc['completed_at'], 119 | 'item_count': job_doc['item_count'], 120 | 'http_success_count': job_doc['http_success_count'], 121 | 'http_error_count': job_doc['http_error_count'], 122 | 'exception_count': job_doc['exception_count'], 123 | 'http_status_counts': job_doc['http_status_counts'], 124 | } 125 | stats_tracker.add_job(stats_dict) 126 | 127 | # Create a crawl manager 128 | crawl_manager = CrawlManager(rate_limiter, stats_tracker, 129 | robots_txt_manager, crawl_db, frontier_db, extractor_db, 130 | storage_db, login_db) 131 | 132 | # Create a resource monitor: one sample per second and keep 1 minute of 133 | # history. 
134 | resource_monitor = ResourceMonitor(interval=1.0, buffer_size=60, 135 | crawl_resources_fn=crawl_manager.get_resource_usage, 136 | rate_limiter=rate_limiter) 137 | 138 | # Create a scheduler 139 | scheduler = Scheduler(schedule_db, crawl_manager) 140 | 141 | # Create a server 142 | server_db = ServerDb(db_pool) 143 | subscription_db = SubscriptionDb(db_pool) 144 | server = Server(self._args.ip, self._args.port, server_db, 145 | subscription_db, crawl_manager, rate_limiter, resource_monitor, 146 | stats_tracker, scheduler) 147 | 148 | # Run all the components 149 | await nursery.start(crawl_manager.run, name='Crawl Manager') 150 | nursery.start_soon(rate_limiter.run, name='Rate Limiter') 151 | nursery.start_soon(resource_monitor.run, name='Resource Monitor') 152 | nursery.start_soon(scheduler.run, name='Scheduler') 153 | await nursery.start(server.run, name='Server') 154 | -------------------------------------------------------------------------------- /starbelly/captcha.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from uuid import UUID 3 | 4 | import starbelly.starbelly_pb2 5 | 6 | 7 | class CaptchaSolver: 8 | ''' An interface for a CAPTCHA solving service. ''' 9 | def __init__(self, doc): 10 | ''' 11 | Constructor. 12 | 13 | :param dict doc: A database document. 14 | ''' 15 | self.id = doc['id'] 16 | self.name = doc['name'] 17 | self.service_url = doc['service_url'] 18 | self.api_key = doc['api_key'] 19 | self.require_phrase = doc['require_phrase'] 20 | self.case_sensitive = doc['case_sensitive'] 21 | self.characters = doc['characters'] 22 | self.require_math = doc['require_math'] 23 | self.min_length = doc.get('min_length', 0) 24 | self.max_length = doc.get('max_length', 0) 25 | 26 | def get_command(self, img_data): 27 | ''' 28 | Return a JSON API command. 29 | 30 | :param bytes img_data: The image data for the CAPTCHA. 31 | :returns: A command that can be serialized to JSON. 32 | :rtype: dict 33 | ''' 34 | img_b64 = base64.b64encode(img_data).decode('ascii') 35 | 36 | if self.characters == 'ALPHANUMERIC': 37 | numeric = 0 38 | elif self.characters == 'NUMERIC_ONLY': 39 | numeric = 1 40 | elif self.characters == 'ALPHA_ONLY': 41 | numeric = 2 42 | else: 43 | raise Exception('Invalid characters setting: {}'.format( 44 | self.characters)) 45 | 46 | return { 47 | 'clientKey': self.api_key, 48 | 'task': { 49 | 'type': 'ImageToTextTask', 50 | 'body': img_b64, 51 | 'phrase': self.require_phrase, 52 | 'case': self.case_sensitive, 53 | 'numeric': numeric, 54 | 'math': self.require_math, 55 | 'minLength': self.min_length, 56 | 'maxLength': self.max_length, 57 | } 58 | } 59 | 60 | 61 | def captcha_doc_to_pb(doc): 62 | ''' 63 | Convert CAPTCHA solver from database document to protobuf. 64 | 65 | :param dict doc: A database document. 66 | :returns: A protobuf message. 67 | ''' 68 | pb = starbelly.starbelly_pb2.CaptchaSolver() 69 | pb.name = doc['name'] 70 | pb.solver_id = UUID(doc['id']).bytes 71 | pb.created_at = doc['created_at'].isoformat() 72 | pb.updated_at = doc['updated_at'].isoformat() 73 | type_ = doc['type'] 74 | if type_ == 'antigate': 75 | pb.antigate.CopyFrom(_antigate_doc_to_pb(doc)) 76 | else: 77 | raise Exception('Unknown CAPTCHA solver type ({})'.format(type_)) 78 | return pb 79 | 80 | 81 | def _antigate_doc_to_pb(doc): 82 | ''' 83 | Convert Antigate CAPTCHA solver from database doc to protobuf. 84 | 85 | :param dict doc: A database document. 86 | :returns: A protobuf message. 
87 | ''' 88 | pb = starbelly.starbelly_pb2.CaptchaSolverAntigate() 89 | pb.service_url = doc['service_url'] 90 | pb.api_key = doc['api_key'] 91 | pb.require_phrase = doc['require_phrase'] 92 | pb.case_sensitive = doc['case_sensitive'] 93 | pb.characters = starbelly.starbelly_pb2.CaptchaSolverAntigateCharacters \ 94 | .Value(doc['characters']) 95 | pb.require_math = doc['require_math'] 96 | if 'min_length' in doc: 97 | pb.min_length = doc['min_length'] 98 | if 'max_length' in doc: 99 | pb.max_length = doc['max_length'] 100 | return pb 101 | 102 | 103 | def captcha_pb_to_doc(pb): 104 | ''' 105 | Convert CAPTCHA solver from protobuf to database document. 106 | 107 | :param pb: A protobuf message. 108 | :returns: A database document. 109 | :rtype: dict 110 | ''' 111 | if pb.name.strip() == '': 112 | raise Exception('Name is required.') 113 | doc = {'name': pb.name} 114 | if pb.HasField('solver_id'): 115 | doc['id'] = str(UUID(bytes=pb.solver_id)) 116 | type_ = pb.WhichOneof('SolverType') 117 | if type_ == 'antigate': 118 | doc.update(_antigate_pb_to_doc(pb)) 119 | else: 120 | raise Exception('Unknown CAPTCHA solver type ({})'.format(type_)) 121 | return doc 122 | 123 | 124 | def _antigate_pb_to_doc(pb): 125 | ''' 126 | Convert Antigate CAPTCHA solver from database doc to protobuf. 127 | 128 | :param pb: A protobuf message. 129 | :returns: A database document. 130 | :rtype: dict 131 | ''' 132 | antigate = pb.antigate 133 | doc = { 134 | 'service_url': antigate.service_url, 135 | 'api_key': antigate.api_key, 136 | 'require_phrase': antigate.require_phrase, 137 | 'case_sensitive': antigate.case_sensitive, 138 | 'characters': starbelly.starbelly_pb2.CaptchaSolverAntigateCharacters \ 139 | .Name(antigate.characters), 140 | 'require_math': antigate.require_math, 141 | 'type': 'antigate', 142 | } 143 | if antigate.HasField('min_length'): 144 | doc['min_length'] = antigate.min_length 145 | if antigate.HasField('max_length'): 146 | doc['max_length'] = antigate.max_length 147 | return doc 148 | -------------------------------------------------------------------------------- /starbelly/config.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import pathlib 3 | 4 | 5 | _root = pathlib.Path(__file__).resolve().parent.parent 6 | 7 | 8 | def get_path(relpath): 9 | ''' Get absolute path to a project-relative path. ''' 10 | return _root / relpath 11 | 12 | 13 | def get_config(): 14 | ''' 15 | Read the application configuration from the standard configuration files. 16 | 17 | :rtype: ConfigParser 18 | ''' 19 | config_dir = get_path("conf") 20 | config_files = [ 21 | config_dir / "system.ini", 22 | config_dir / "local.ini", 23 | ] 24 | config = configparser.ConfigParser() 25 | config.optionxform = str 26 | config.read(config_files) 27 | return config 28 | -------------------------------------------------------------------------------- /starbelly/extractor.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | 4 | from bs4 import BeautifulSoup 5 | import cchardet 6 | import feedparser 7 | import mimeparse 8 | import trio 9 | import w3lib.encoding 10 | import yarl 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | chardet = lambda s: cchardet.detect(s).get('encoding') 15 | 16 | 17 | class CrawlExtractor: 18 | ''' Extract URLs from crawled items and add them to the frontier table. 
''' 19 | def __init__(self, job_id, db, send_channel, receive_channel, policy, 20 | downloader, robots_txt_manager, old_urls, stats, batch_size=100): 21 | ''' 22 | Constructor. 23 | 24 | :param str job_id: The ID of the job to extract response for. 25 | :param starbelly.db.CrawlExtractorDb db: A database layer. 26 | :param trio.SendChannel send_channel: A channel that sends 27 | DownloadResponse instances. 28 | :param trio.ReceiveChannel receive_channel: A channel that receives 29 | DownloadResponse instances. 30 | :param starbelly.policy.Policy: A policy for computing costs. 31 | :param starbelly.downloader.Downloader: A downloader used for this job. 32 | :param starbelly.robots.RobotsTxtManager: A robots.txt manager. 33 | :param set old_urls: A set of hashed URLs that this crawl has seen before. 34 | These URLs will not be added to the crawl frontier a second time. 35 | :param dict stats: A dictionary of crawl statistics. 36 | :param int batch_size: The maximum size of inserts to do in a single 37 | database query. If more items than this are extracted from a 38 | document, then multiple queries will be issued. 39 | ''' 40 | self._job_id = job_id 41 | self._db = db 42 | self._send_channel = send_channel 43 | self._receive_channel = receive_channel 44 | self._policy = policy 45 | self._downloader = downloader 46 | self._robots_txt_manager = robots_txt_manager 47 | self._old_urls = old_urls 48 | self._stats = stats 49 | self._batch_size = batch_size 50 | 51 | def __repr__(self): 52 | ''' Report crawl job ID. ''' 53 | return ''.format(self._job_id[:8]) 54 | 55 | @property 56 | def old_urls(self): 57 | return self._old_urls 58 | 59 | async def run(self): 60 | ''' 61 | Read responses from extraction channel and add them to the frontier. 62 | 63 | :returns: This function runs until cancelled. 64 | ''' 65 | async for response in self._receive_channel: 66 | try: 67 | if response.is_success: 68 | await self._extract(response) 69 | except Exception: 70 | logger.exception('%r Extractor exception on %r', self, response) 71 | finally: 72 | await self._db.delete_frontier_item(response.frontier_id) 73 | await self._send_channel.send(response) 74 | 75 | async def _extract(self, response): 76 | ''' 77 | Find links in a response body and put them in the frontier. 78 | 79 | :param starbelly.downloader.DownloadReponse: 80 | ''' 81 | logger.debug('%r Extracting links from %s', self, response.url) 82 | extracted_urls = await trio.run_sync_in_worker_thread( 83 | extract_urls, response) 84 | insert_items = list() 85 | 86 | for counter, url in enumerate(extracted_urls): 87 | # Check if the policy allows us to follow this URL. 88 | new_cost = self._policy.url_rules.get_cost(response.cost, url) 89 | exceeds_max_cost = self._policy.limits.exceeds_max_cost(new_cost) 90 | if new_cost <= 0 or exceeds_max_cost: 91 | continue 92 | robots_ok = await self._robots_txt_manager.is_allowed(url, 93 | self._policy, self._downloader) 94 | if not robots_ok: 95 | continue 96 | 97 | # Normalize and hash URL. 98 | url_can = self._policy.url_normalization.normalize(url) 99 | hash_ = hashlib.blake2b(url_can.encode('ascii'), digest_size=16) 100 | url_hash = hash_.digest() 101 | 102 | # If we haven't seen this URL before, it should be added to the 103 | # frontier. 
104 | if url_hash not in self._old_urls: 105 | logger.debug('%r Adding URL %s (cost=%0.2f)', self, url, 106 | new_cost) 107 | insert_items.append({ 108 | 'cost': new_cost, 109 | 'job_id': self._job_id, 110 | 'url': url, 111 | 'in_flight': False, 112 | }) 113 | self._old_urls.add(url_hash) 114 | 115 | # Don't monopolize the event loop: 116 | if counter % self._batch_size == self._batch_size - 1: 117 | await trio.sleep(0) 118 | 119 | # Insert items in batches 120 | start = 0 121 | while start < len(insert_items): 122 | end = min(start + self._batch_size, len(insert_items)) 123 | self._stats['frontier_size'] += end - start 124 | await self._db.insert_frontier_items(insert_items[start:end]) 125 | start = end 126 | 127 | 128 | def extract_urls(response): 129 | ''' 130 | Extract URLs from a response body. 131 | 132 | Any relative URLs found in the response body are converted to absolute URLs 133 | using the original request URL. 134 | 135 | :param starbelly.downloader.DownloadResponse response: A response to 136 | extract URLs from. 137 | :returns: A list of URLs. 138 | :rtype: list[str] 139 | ''' 140 | extracted_urls = list() 141 | 142 | if response.is_success: 143 | base_url = response.url 144 | type_, subtype, _ = mimeparse.parse_mime_type(response.content_type) 145 | 146 | if type_ == 'text' and subtype == 'html' or \ 147 | type_ == 'application' and subtype == 'xhtml+xml': 148 | extracted_urls = _extract_html(response) 149 | elif type_ == 'application' and subtype == 'atom+xml' or \ 150 | type_ == 'application' and subtype == 'rss+xml': 151 | extracted_urls = _extract_feed(response) 152 | else: 153 | raise ValueError('Unsupported MIME in extract_urls(): {} (url={})' 154 | .format(response.content_type, base_url)) 155 | 156 | return extracted_urls 157 | 158 | 159 | def _extract_feed(response): 160 | ''' 161 | Extract links from Atom or RSS feeds. 162 | 163 | :param starbelly.downloader.DownloadResponse response: An Atom/RSS response 164 | to extract URLs from. 165 | :returns: A list of URLs. 166 | :rtype: list[str] 167 | ''' 168 | doc = feedparser.parse(response.body) 169 | return [entry.link for entry in doc.entries] 170 | 171 | 172 | def _extract_html(response): 173 | ''' 174 | Extract links from HTML document tags. 175 | 176 | :param starbelly.downloader.DownloadResponse response: An HTML response to 177 | extract URLs from. 178 | :returns: A list of URLs. 
179 | :rtype: list[str] 180 | ''' 181 | _, html = w3lib.encoding.html_to_unicode( 182 | response.content_type, 183 | response.body, 184 | auto_detect_fun=chardet 185 | ) 186 | 187 | doc = BeautifulSoup(html, 'lxml') 188 | base_tag = doc.head.base 189 | base_url = None 190 | 191 | if base_tag is not None: 192 | base_href = base_tag.get('href') 193 | if base_href is not None: 194 | base_url = yarl.URL(base_href) 195 | 196 | if base_url is None: 197 | base_url = yarl.URL(response.url) 198 | 199 | extracted_urls = list() 200 | 201 | for anchor in doc.find_all('a', href=True): 202 | href = anchor.get('href') 203 | 204 | try: 205 | parsed_href = yarl.URL(href) 206 | except: 207 | logger.exception('Rejecting malformed URL base=%s url=%s', 208 | str(response.url), href) 209 | continue 210 | 211 | absolute_href = base_url.join(parsed_href) 212 | 213 | if absolute_href.scheme in ('http', 'https'): 214 | extracted_urls.append(str(absolute_href)) 215 | 216 | return extracted_urls 217 | -------------------------------------------------------------------------------- /starbelly/frontier.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import logging 3 | 4 | from rethinkdb import RethinkDB 5 | from yarl import URL 6 | 7 | from .backoff import ExponentialBackoff 8 | from .downloader import DownloadRequest 9 | 10 | 11 | r = RethinkDB() 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class FrontierExhaustionError(Exception): 16 | ''' Indicates that the frontier has no items remaining (in-flight or 17 | otherwise). ''' 18 | 19 | 20 | @dataclass 21 | class FrontierItem: 22 | ''' Represents a resource that should be crawled. ''' 23 | frontier_id: bytes 24 | job_id: bytes 25 | url: str 26 | cost: float 27 | 28 | @classmethod 29 | def from_doc(cls, doc): 30 | ''' 31 | Create a frontier item from a database document. 32 | 33 | :param dict doc: A database document. 34 | ''' 35 | return cls(doc['id'], doc['job_id'], doc['url'], doc['cost']) 36 | 37 | 38 | class CrawlFrontier: 39 | ''' Contains the logic for managing a crawl frontier, i.e. the URLs that 40 | have already been crawled and the URLs that are remaining to be crawled. ''' 41 | def __init__(self, job_id, db, send_channel, login_manager, policy, stats): 42 | ''' 43 | Constructor 44 | 45 | :param str job_id: The ID of the job corresponding to this frontier. 46 | :param starbelly.db.CrawlFrontierDb db: A database layer. 47 | :param trio.SendChannel send_channel: This channel is used to send 48 | ``FrontierItem`` instances that need to be downloaded, e.g. to send to the 49 | rate limiter. 50 | :param starbelly.login.LoginManager login_manager: Used when the 51 | frontier sees an unauthenticated domain and needs to log in. 52 | :param starbelly.policy.Policy policy: The policy to use. 53 | :param dict stats: A dictionary of crawl statistics. 54 | ''' 55 | self._job_id = job_id 56 | self._db = db 57 | self._send_channel = send_channel 58 | self._login_manager = login_manager 59 | self._policy = policy 60 | self._authenticated_domains = set() 61 | self._stats = stats 62 | 63 | def __repr__(self): 64 | ''' Include job ID in the repr. ''' 65 | return '<CrawlFrontier job_id={}>'.format(self._job_id[:8]) 66 | 67 | async def run(self): 68 | ''' 69 | This task takes items off the frontier and sends them to the rate 70 | limiter. 71 | 72 | :returns: This function runs until cancelled.
73 | ''' 74 | await self._initialize() 75 | 76 | while True: 77 | frontier_items = await self._get_batch() 78 | for item in frontier_items: 79 | if self._policy.authentication.is_enabled(): 80 | domain = URL(item.url).host 81 | if domain not in self._authenticated_domains: 82 | await self._login_manager.login(domain) 83 | self._authenticated_domains.add(domain) 84 | logger.debug('%r Sending: %r', self, item) 85 | request = DownloadRequest.from_frontier_item(item) 86 | await self._send_channel.send(request) 87 | 88 | async def _initialize(self): 89 | ''' Initialize frontier database documents. ''' 90 | 91 | size = await self._db.get_frontier_size(self._job_id) 92 | logger.info('%r Initialization complete (size=%d)', self, size) 93 | self._stats['frontier_size'] = size 94 | 95 | async def _get_batch(self, size=10): 96 | ''' 97 | Get a batch of items from the frontier table, ordered by ascending cost. 98 | If no items available, poll the database until items become available. 99 | 100 | :param int size: 101 | :returns: A batch of frontier items. 102 | :rtype: list[FrontierItem] 103 | ''' 104 | backoff = ExponentialBackoff(min_=1, max_=16) 105 | async for _ in backoff: 106 | docs = await self._db.get_frontier_batch(self._job_id, size) 107 | if docs: 108 | self._stats['frontier_size'] -= len(docs) 109 | break 110 | else: 111 | if await self._db.any_in_flight(self._job_id): 112 | backoff.increase() 113 | else: 114 | raise FrontierExhaustionError() 115 | 116 | return [FrontierItem.from_doc(doc) for doc in docs] 117 | -------------------------------------------------------------------------------- /starbelly/login.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from functools import partial 3 | import logging 4 | import random 5 | 6 | import aiohttp 7 | import cchardet 8 | import formasaurus 9 | import trio 10 | import trio_asyncio 11 | import w3lib.encoding 12 | from yarl import URL 13 | 14 | from .downloader import DownloadRequest 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | chardet = lambda s: cchardet.detect(s).get("encoding") 19 | 20 | 21 | def get_captcha_image_element(form): 22 | """ 23 | Return the element in an lxml form that contains the CAPTCHA. 24 | 25 | NOTE: This assumes the first image in the form is the CAPTCHA image. If 26 | a form has multiple images, maybe use the etree .sourceline attribute to 27 | figure out which image is closer to the CAPTCHA input? Or crawl through 28 | the element tree to find the image? 29 | 30 | :param form: An lxml form element. 31 | :returns: An lxml image element. 32 | """ 33 | img_el = form.find(".//img") 34 | if img_el is None: 35 | raise Exception("Cannot locate CAPTCHA image") 36 | return img_el 37 | 38 | 39 | def select_login_fields(fields): 40 | """ 41 | Select field having highest probability for class ``field``. 42 | 43 | :param dict fields: Nested dictionary containing label probabilities 44 | for each form element. 
45 | :returns: (username field, password field, captcha field) 46 | :rtype: tuple 47 | """ 48 | username_field = None 49 | username_prob = 0 50 | password_field = None 51 | password_prob = 0 52 | captcha_field = None 53 | captcha_prob = 0 54 | 55 | for field_name, labels in fields.items(): 56 | for label, prob in labels.items(): 57 | if label in ("username", "username or email") and prob > username_prob: 58 | username_field = field_name 59 | username_prob = prob 60 | elif label == "password" and prob > password_prob: 61 | password_field = field_name 62 | password_prob = prob 63 | elif label == "captcha" and prob > captcha_prob: 64 | captcha_field = field_name 65 | captcha_prob = prob 66 | 67 | return username_field, password_field, captcha_field 68 | 69 | 70 | def select_login_form(forms): 71 | """ 72 | Select form having highest probability for login class. 73 | 74 | :param dict forms: Nested dict containing label probabilities for each 75 | form. 76 | :returns: (login form, login meta) 77 | :rtype: tuple 78 | """ 79 | login_form = None 80 | login_meta = None 81 | login_prob = 0 82 | 83 | for form, meta in forms: 84 | for type_, prob in meta["form"].items(): 85 | if type_ == "login" and prob > login_prob: 86 | login_form = form 87 | login_meta = meta 88 | login_prob = prob 89 | 90 | return login_form, login_meta 91 | 92 | 93 | class LoginManager: 94 | def __init__(self, job_id, db, policy, downloader): 95 | """ 96 | Constructor 97 | 98 | :param starbelly.db.LoginDb: A database layer. 99 | """ 100 | self._job_id = job_id 101 | self._db = db 102 | self._policy = policy 103 | self._downloader = downloader 104 | 105 | async def login(self, domain): 106 | """ 107 | Attempt a login for the given domain. 108 | 109 | :param str domain: The domain to log into. 110 | """ 111 | domain_login = await self._db.get_login(domain) 112 | if domain_login is None: 113 | return 114 | 115 | # Ensure login has users 116 | if not domain_login.get("users", []): 117 | logger.warning("No users for login: %s", domain_login) 118 | return 119 | 120 | # Select random user from domain_login 121 | user = random.choice(domain_login["users"]) 122 | masked_pass = user["password"][:2] + "******" 123 | logger.info( 124 | "Attempting login: domain=%s with user=%s password=%s", 125 | domain, 126 | user["username"], 127 | masked_pass, 128 | ) 129 | request = DownloadRequest( 130 | frontier_id=None, 131 | job_id=self._job_id, 132 | method="GET", 133 | url=domain_login["login_url"], 134 | form_data=None, 135 | cost=1.0, 136 | ) 137 | response = await self._downloader.download(request) 138 | if not response.is_success: 139 | logger.error("Login aborted: cannot fetch %s", response.url) 140 | return 141 | try: 142 | action, method, data = await self._get_login_form( 143 | response, user["username"], user["password"] 144 | ) 145 | except Exception as e: 146 | logger.exception("Cannot parse login form: %s", e) 147 | return 148 | logger.info("Login action=%s method=%s data=%r", action, method, data) 149 | request = DownloadRequest( 150 | frontier_id=None, 151 | job_id=self._job_id, 152 | method=method, 153 | url=action, 154 | form_data=data, 155 | cost=1.0, 156 | ) 157 | response = await self._downloader.download(request) 158 | if not response.is_success: 159 | logger.error( 160 | "Login failed action=%s (see downloader log for" " details)", action 161 | ) 162 | 163 | async def _download_captcha_image(self, img_src): 164 | """ 165 | Download and return a CAPTCHA image. 166 | 167 | :param str img_src: The URL to download the image from. 
168 | :rtype bytes: 169 | """ 170 | logger.info("Downloading CAPTCHA image src=%s", img_src) 171 | request = DownloadRequest( 172 | frontier_id=None, 173 | job_id=None, 174 | method="GET", 175 | url=img_src, 176 | form_data=None, 177 | cost=0, 178 | ) 179 | response = await self._downloader.download(request) 180 | 181 | if response.status_code == 200 and response.body is not None: 182 | img_data = response.body 183 | else: 184 | raise Exception("Failed to download CAPTCHA image src={}".format(img_src)) 185 | 186 | return img_data 187 | 188 | async def _get_login_form(self, response, username, password): 189 | """ 190 | Attempt to extract login form action and form data from a response, 191 | substituting the provided ``username`` and ``password`` into the 192 | corresponding fields. Returns the data needed to POST a login request. 193 | 194 | :param starbelly.downloader.DownloadResponse response: 195 | :param str username: The username to log in with. 196 | :param str password: The password to log in with. 197 | :returns: (action, method, fields) 198 | :rtype: tuple 199 | """ 200 | _, html = w3lib.encoding.html_to_unicode( 201 | response.content_type, response.body, auto_detect_fun=chardet 202 | ) 203 | 204 | forms = await trio.run_sync_in_worker_thread( 205 | partial(formasaurus.extract_forms, html, proba=True) 206 | ) 207 | form, meta = select_login_form(forms) 208 | 209 | if form is None: 210 | raise Exception("Can't find login form") 211 | 212 | login_field, password_field, captcha_field = select_login_fields(meta["fields"]) 213 | if login_field is None or password_field is None: 214 | raise Exception("Can't find username/password fields") 215 | 216 | form.fields[login_field] = username 217 | form.fields[password_field] = password 218 | 219 | if captcha_field is not None: 220 | if self._policy.captcha_solver is None: 221 | raise Exception( 222 | "CAPTCHA required for login url={} but there is" 223 | " no CAPTCHA solver available".format(response.url) 224 | ) 225 | 226 | img_el = get_captcha_image_element(form) 227 | img_src = str(URL(response.url).join(URL(img_el.get("src")))) 228 | img_data = await self._download_captcha_image(img_src) 229 | captcha_text = await self._solve_captcha_asyncio(img_data) 230 | form.fields[captcha_field] = captcha_text 231 | 232 | form_action = URL(response.url).join(URL(form.action)) 233 | return form_action, form.method, dict(form.fields) 234 | 235 | @trio_asyncio.aio_as_trio 236 | async def _solve_captcha_asyncio(self, img_data): 237 | """ 238 | Send an image CAPTCHA to an external solver and return the solution. 239 | This function uses aiohttp and therefore must run on the asyncio loop. 240 | 241 | :param bytes img_data: The CAPTCHA image. 242 | :rtype: str 243 | """ 244 | solver = self._policy.captcha_solver 245 | solution = None 246 | task_url = str(URL(solver.service_url).join(URL("createTask"))) 247 | poll_url = str(URL(solver.service_url).join(URL("getTaskResult"))) 248 | 249 | # This doesn't use the downloader object because this is a third party 250 | # and is not the subject of our crawl. 
251 | async with aiohttp.ClientSession() as session: 252 | # Send CAPTCHA task to service 253 | command = solver.get_command(img_data) 254 | async with session.post(task_url, json=command) as response: 255 | result = await response.json() 256 | if result["errorId"] != 0: 257 | raise Exception("CAPTCHA API error {}".format(result["errorId"])) 258 | task_id = result["taskId"] 259 | logger.info("Sent image to CAPTCHA API task_id=%d", task_id) 260 | 261 | # Poll for task completion. (Try 6 times.) 262 | solution = None 263 | for attempt in range(6): 264 | await asyncio.sleep(5) 265 | command = { 266 | "clientKey": solver.api_key, 267 | "taskId": task_id, 268 | } 269 | logger.info( 270 | "Polling for CAPTCHA solution task_id=%d," " attempt=%d", 271 | task_id, 272 | attempt + 1, 273 | ) 274 | async with session.post(poll_url, json=command) as response: 275 | result = await response.json() 276 | if result["errorId"] != 0: 277 | raise Exception( 278 | "CAPTCHA API error {}".format(result["errorId"]) 279 | ) 280 | solution = result["solution"]["text"] 281 | break 282 | 283 | if solution is None: 284 | raise Exception("CAPTCHA API never completed task") 285 | 286 | return solution 287 | -------------------------------------------------------------------------------- /starbelly/resource_monitor.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from datetime import datetime, timezone 3 | import logging 4 | 5 | import psutil 6 | import trio 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class ResourceMonitor: 13 | ''' 14 | Keep track of consumption and usage statistics for various resources. 15 | ''' 16 | def __init__(self, interval, buffer_size, crawl_resources_fn, rate_limiter): 17 | ''' 18 | Constructor. 19 | 20 | :param float interval: The number of seconds to wait between 21 | measurements. 22 | :param int buffer_size: The number of measurements to store in the 23 | internal buffer. 24 | :param callable crawl_resource_fn: A function that will return a dict 25 | of crawl resources. 26 | :param starbelly.rate_limiter.RateLimiter rate_limiter: 27 | ''' 28 | self._interval = interval 29 | self._crawl_resources_fn = crawl_resources_fn 30 | self._rate_limiter = rate_limiter 31 | self._measurements = deque(maxlen=buffer_size) 32 | self._channels = list() 33 | 34 | def get_channel(self, channel_size): 35 | ''' 36 | Get a statistics channel. The resource monitor will send measurements to 37 | this channel until the receive end is closed. Note that if the channel 38 | is full, the resource monitor does not block! It will drop messages 39 | instead. 40 | 41 | :param int channel_size: The size of the channel to create. 42 | :returns: A channel that will receive resource statistics at regular 43 | intervals. 44 | :rtype: trio.ReceiveChannel 45 | ''' 46 | logger.debug('Creating new channel with size=%d', channel_size) 47 | send_channel, recv_channel = trio.open_memory_channel(channel_size) 48 | self._channels.append(send_channel) 49 | return recv_channel 50 | 51 | def history(self, n=None): 52 | ''' 53 | Return the most recent ``n`` measurements. 54 | 55 | :param int n: The number of measurements to retrieve. If ``n`` is None 56 | or there are fewer than ``n`` measurements, return all measurements. 57 | :rtype: list 58 | ''' 59 | # A deque can't be sliced, so we have to do some extra work to return 60 | # the most recent measurements from the end. 
61 | history_iter = iter(self._measurements) 62 | if n is not None: 63 | for _ in range(len(self._measurements) - n): 64 | next(history_iter) 65 | return list(history_iter) 66 | 67 | async def run(self): 68 | ''' 69 | Run the resource monitor. 70 | 71 | :returns: Runs until cancelled. 72 | ''' 73 | next_run = trio.current_time() + self._interval 74 | while True: 75 | measurement = self._measure() 76 | self._measurements.append(measurement) 77 | to_remove = set() 78 | for channel in self._channels: 79 | try: 80 | channel.send_nowait(measurement) 81 | except trio.WouldBlock: 82 | continue 83 | except trio.BrokenResourceError: 84 | to_remove.add(channel) 85 | for channel in to_remove: 86 | logger.debug('Removing closed channel') 87 | self._channels.remove(channel) 88 | sleep_time = next_run - trio.current_time() 89 | while sleep_time < 0: 90 | sleep_time += self._interval 91 | await trio.sleep(sleep_time) 92 | next_run += self._interval 93 | 94 | def _measure(self): 95 | ''' 96 | Record one set of measurements. 97 | 98 | :rtype: dict 99 | ''' 100 | measurement = dict() 101 | measurement['timestamp'] = datetime.now(timezone.utc) 102 | 103 | # CPUs 104 | measurement['cpus'] = psutil.cpu_percent(percpu=True) 105 | 106 | # Memory 107 | vm = psutil.virtual_memory() 108 | measurement['memory_used'] = vm.used 109 | measurement['memory_total'] = vm.total 110 | 111 | # Disks 112 | measurement['disks'] = list() 113 | for partition in psutil.disk_partitions(): 114 | disk = dict() 115 | disk['mount'] = partition.mountpoint 116 | usage = psutil.disk_usage(disk['mount']) 117 | disk['used'] = usage.used 118 | disk['total'] = usage.total 119 | measurement['disks'].append(disk) 120 | 121 | # Networks 122 | measurement['networks'] = list() 123 | for name, nic in psutil.net_io_counters(pernic=True).items(): 124 | net = dict() 125 | net['name'] = name 126 | net['sent'] = nic.bytes_sent 127 | net['received'] = nic.bytes_recv 128 | measurement['networks'].append(net) 129 | 130 | # Crawl Job Resources 131 | measurement['jobs'] = list() 132 | crawl_resources = self._crawl_resources_fn() 133 | for job in crawl_resources['jobs']: 134 | measurement['jobs'].append(job.copy()) 135 | 136 | # Crawl Global Resources 137 | measurement['current_downloads'] = crawl_resources['current_downloads'] 138 | measurement['maximum_downloads'] = crawl_resources['maximum_downloads'] 139 | measurement['rate_limiter'] = self._rate_limiter.item_count 140 | 141 | return measurement 142 | -------------------------------------------------------------------------------- /starbelly/robots.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from datetime import datetime, timezone 3 | import logging 4 | 5 | from robotexclusionrulesparser import RobotExclusionRulesParser 6 | from rethinkdb import RethinkDB 7 | from yarl import URL 8 | import trio 9 | 10 | from .downloader import DownloadRequest 11 | 12 | 13 | r = RethinkDB() 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class RobotsTxtManager: 18 | ''' Store and manage robots.txt files. ''' 19 | def __init__(self, db_pool, max_age=24*60*60, max_cache=1e3): 20 | ''' 21 | Constructor. 22 | 23 | :param db_pool: A DB connection pool. 24 | :param int max_age: The maximum age before a robots.txt is downloaded 25 | again. 26 | :param int max_cache: The maximum number of robots.txt files to cache 27 | in memory. 
28 | ''' 29 | self._db_pool = db_pool 30 | self._events = dict() 31 | self._cache = OrderedDict() 32 | self._max_age = max_age 33 | self._max_cache = max_cache 34 | 35 | async def is_allowed(self, url, policy, downloader): 36 | ''' 37 | Return True if ``url`` is allowed by the applicable robots.txt file. 38 | 39 | This fetches the applicable robots.txt if we don't have a recent copy 40 | of it cached in memory or in the database. The ``policy`` is used if a 41 | robots.txt file needs to be fetched from the network. 42 | 43 | :param str url: Check this URL to see if the robots.txt and accompanying 44 | policy permit access to it. 45 | :param Policy policy: 46 | :param Downloader downloader: 47 | :rtype: bool 48 | ''' 49 | if policy.robots_txt.usage == 'IGNORE': 50 | # No need to fetch robots.txt. 51 | return True 52 | 53 | robots_url = str(URL(url).with_path('robots.txt') 54 | .with_query(None) 55 | .with_fragment(None)) 56 | 57 | # Check if cache has a current copy of robots.txt. 58 | try: 59 | robots = self._cache[robots_url] 60 | if robots.is_older_than(self._max_age): 61 | del self._cache[robots_url] 62 | robots = None 63 | else: 64 | self._cache.move_to_end(robots_url) 65 | except KeyError: 66 | robots = None 67 | 68 | # Do we need to fetch robots into cache? 69 | if robots is None: 70 | try: 71 | # If another task is fetching it, then just wait for that task. 72 | await self._events[robots_url].wait() 73 | robots = self._cache[robots_url] 74 | except KeyError: 75 | # Create a new task to fetch it. 76 | self._events[robots_url] = trio.Event() 77 | robots = await self._get_robots(robots_url, downloader) 78 | event = self._events.pop(robots_url) 79 | event.set() 80 | 81 | # Note: we only check the first user agent. 82 | user_agent = policy.user_agents.get_first_user_agent() 83 | robots_decision = robots.is_allowed(user_agent, url) 84 | if policy.robots_txt.usage == 'OBEY': 85 | return robots_decision 86 | return not robots_decision 87 | 88 | async def _get_robots(self, robots_url, downloader): 89 | ''' 90 | Locate and return a robots.txt file. 91 | 92 | Looks for non-expired robots.txt file first in database then request 93 | from network. Wherever the robots file is found, it is placed into the 94 | cache and then returned. 95 | 96 | If we get a copy from the network, then we also store a copy in the 97 | database. If we cannot get a copy from the network (e.g. 404 error) and 98 | we have a database copy, then we update the database copy's expiration. 99 | If we cannot get a copy from database or network, then we create a 100 | permissive robots.txt and use that instead. 101 | 102 | :param str url: Fetch the file at this URL. 103 | :param Downloader downloader: 104 | :rtype: RobotsTxt 105 | ''' 106 | # Check DB. If not there (or expired), check network. 107 | now = datetime.now(timezone.utc) 108 | robots_doc = await self._get_robots_from_db(robots_url) 109 | 110 | if robots_doc is None or \ 111 | (now - robots_doc['updated_at']).seconds > self._max_age: 112 | robots_file = await self._get_robots_from_net(robots_url, 113 | downloader) 114 | else: 115 | robots_file = None 116 | 117 | if robots_doc is None: 118 | # No local copy: create a new local copy. If robots_file is None, it 119 | # will be treated as a permissive RobotsTxt. 
120 | logger.info('Saving new robots.txt file: %s', robots_url) 121 | robots_doc = { 122 | 'file': robots_file, 123 | 'updated_at': now, 124 | 'url': robots_url, 125 | } 126 | robots = RobotsTxt(robots_doc) 127 | else: 128 | # If we have a network copy, use that to update local copy. 129 | # Otherwise, just update the local copy's timestamp. 130 | robots = RobotsTxt(robots_doc) 131 | logger.info('Updating robots.txt file: %s', robots_url) 132 | if robots_file is not None: 133 | robots_doc['file'] = robots_file 134 | else: 135 | del robots_doc['file'] 136 | 137 | robots_doc['updated_at'] = now 138 | del robots_doc['url'] 139 | 140 | # Upsert robots_docs. 141 | await self._save_robots_to_db(robots_doc) 142 | 143 | # Add to cache before completing the future to avoid race condition. 144 | self._cache[robots_url] = robots 145 | self._cache.move_to_end(robots_url) 146 | if len(self._cache) > self._max_cache: 147 | self._cache.popitem(last=False) 148 | return robots 149 | 150 | async def _get_robots_from_db(self, robots_url): 151 | ''' 152 | Get robots document from the database. 153 | 154 | Returns None if it doesn't exist in the database. 155 | 156 | :param str robots_url: The URL of the robots.txt file. 157 | :returns: A database document. 158 | :rtype: dict 159 | ''' 160 | query = r.table('robots_txt').get_all(robots_url, index='url').nth(0) 161 | 162 | async with self._db_pool.connection() as conn: 163 | try: 164 | db_robots = await query.run(conn) 165 | except r.ReqlNonExistenceError: 166 | db_robots = None 167 | 168 | return db_robots 169 | 170 | async def _get_robots_from_net(self, robots_url, downloader): 171 | ''' 172 | Get robots.txt file from the network. 173 | 174 | Returns None if the file cannot be fetched (e.g. 404 error). 175 | 176 | :param str robots_url: Fetch the robots.txt file at this URL. 177 | :param Downloader downloader: 178 | :returns: Contents of robots.txt file or None if it couldn't be 179 | downloaded. 180 | :rtype: str 181 | ''' 182 | 183 | logger.info('Fetching robots.txt: %s', robots_url) 184 | request = DownloadRequest(frontier_id=None, job_id=None, method='GET', 185 | url=robots_url, form_data=None, cost=0) 186 | response = await downloader.download(request, skip_mime=True) 187 | 188 | if response.status_code == 200 and response.body is not None: 189 | # There are no invalid byte sequences in latin1 encoding, so this 190 | # should always succeed. 191 | robots_file = response.body.decode('latin1') 192 | else: 193 | robots_file = None 194 | 195 | return robots_file 196 | 197 | async def _save_robots_to_db(self, robots_doc): 198 | async with self._db_pool.connection() as conn: 199 | await ( 200 | r.table('robots_txt') 201 | .insert(robots_doc, conflict='update') 202 | .run(conn) 203 | ) 204 | 205 | 206 | class RobotsTxt: 207 | ''' 208 | Wrapper around robots.txt parser that adds the date the file was fetched. 209 | 210 | If the ``robots_file`` is None or cannot be parsed, then it's treated as a 211 | highly permissive robots.txt. 212 | ''' 213 | def __init__(self, robots_doc): 214 | ''' Initialize from database document representation. ''' 215 | self._updated_at = robots_doc['updated_at'] 216 | self._robots = RobotExclusionRulesParser() 217 | 218 | if robots_doc['file'] is not None: 219 | # The parser never throws an exception, it just ignores things that 220 | # it doesn't understand. 221 | self._robots.parse(robots_doc['file']) 222 | 223 | def is_allowed(self, user_agent, url): 224 | ''' 225 | Return True if ``url`` is allowed by this robots.txt file. 
226 | 227 | :param str user_agent: The user agent that want to access the URL. 228 | :param str url: The URL that the user agent wants to access. 229 | :rtype: bool 230 | ''' 231 | return self._robots.is_allowed(user_agent, url) 232 | 233 | def is_older_than(self, age): 234 | ''' 235 | Return True if this robots file is older than ``age``. 236 | 237 | :param datetime age: A timezone-aware datetime. 238 | :rtype: bool 239 | ''' 240 | return (datetime.now(timezone.utc) - self._updated_at).seconds >= age 241 | -------------------------------------------------------------------------------- /starbelly/server/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from datetime import datetime, timedelta 3 | from functools import partial 4 | import gzip 5 | import inspect 6 | import logging 7 | import operator 8 | from time import time 9 | from urllib.parse import urlparse 10 | 11 | import dateutil.parser 12 | from dateutil.tz import tzlocal 13 | from google.protobuf.message import DecodeError 14 | import rethinkdb as r 15 | from rethinkdb.errors import ReqlNonExistenceError 16 | from trio_websocket import ConnectionClosed, serve_websocket 17 | import trio 18 | 19 | from starbelly.subscription import SubscriptionManager 20 | from starbelly.starbelly_pb2 import Request, Response, ServerMessage 21 | 22 | # Define API handler decorator before importing API implementations, because 23 | # those implementations use this decorator to register themselves with the 24 | # server. 25 | _handlers = dict() 26 | def api_handler(handler): 27 | ''' 28 | This decorator registers a function as a callable command through the 29 | API server. 30 | ''' 31 | _handlers[handler.__name__] = handler 32 | return handler 33 | 34 | 35 | class InvalidRequestException(Exception): 36 | ''' Indicates a request is invalid. ''' 37 | 38 | 39 | # pylint: disable=cyclic-import, wrong-import-position 40 | from .captcha import * 41 | from .job import * 42 | from .login import * 43 | from .policy import * 44 | from .rate_limit import * 45 | from .schedule import * 46 | from .subscription import * 47 | from .system import * 48 | 49 | 50 | logger = logging.getLogger(__name__) 51 | 52 | 53 | class Server: 54 | ''' Handles websocket connections from clients and command dispatching. ''' 55 | 56 | def __init__(self, host, port, server_db, subscription_db, crawl_manager, 57 | rate_limiter, resource_monitor, stats_tracker, scheduler): 58 | ''' 59 | Constructor 60 | 61 | :param str host: The hostname to serve on. 62 | :param int port: The port to serve on, or zero to automatically pick a 63 | port. 
64 | :param starbelly.db.ServerDb server_db: 65 | :param starbelly.db.SubscriptionDb subscription_db: 66 | :param starbelly.job.CrawlManager crawl_manager: 67 | :param starbelly.rate_limiter.RateLimiter rate_limiter: 68 | :param starbelly.resource_monitor.ResourceMonitor resource_monitor: 69 | :param starbelly.job.StatsTracker stats_tracker: 70 | :param starbelly.schedule.Scheduler scheduler: 71 | ''' 72 | self._host = host 73 | self._port = port 74 | self._server_db = server_db 75 | self._subscription_db = subscription_db 76 | self._crawl_manager = crawl_manager 77 | self._rate_limiter = rate_limiter 78 | self._resource_monitor = resource_monitor 79 | self._stats_tracker = stats_tracker 80 | self._scheduler = scheduler 81 | 82 | @property 83 | def port(self): 84 | return self._port 85 | 86 | async def run(self, *, task_status=trio.TASK_STATUS_IGNORED): 87 | ''' 88 | Run the websocket server. 89 | 90 | To ensure that the server is ready, call ``await 91 | nursery.start(server.run)``. 92 | 93 | :returns: Runs until cancelled. 94 | ''' 95 | logger.info('Starting server on %s:%d', self._host, self._port) 96 | async with trio.open_nursery() as nursery: 97 | serve_fn = partial(serve_websocket, self._handle_connection, 98 | self._host, self._port, ssl_context=None, 99 | handler_nursery=nursery) 100 | server = await nursery.start(serve_fn, name='Connection Listener') 101 | self._port = server.port 102 | task_status.started() 103 | logger.info('Server stopped') 104 | 105 | async def _handle_connection(self, request): 106 | ''' 107 | Handle an incoming connection. 108 | 109 | :param request: A WebSocket connection request. 110 | ''' 111 | headers = dict(request.headers) 112 | ws = await request.accept() 113 | client = '{}:{}'.format(ws.remote.address, ws.remote.port) 114 | logger.info('Connection opened: client=%s path=%s', client, ws.path) 115 | connection = Connection(client, ws, self._server_db, 116 | self._subscription_db, self._crawl_manager, self._rate_limiter, 117 | self._resource_monitor, self._stats_tracker, self._scheduler) 118 | await connection.run() 119 | 120 | 121 | class Connection: 122 | def __init__(self, client, ws, server_db, subscription_db, crawl_manager, 123 | rate_limiter, resource_monitor, stats_tracker, scheduler): 124 | ''' 125 | Constructor. 126 | 127 | :param str client: Description of the client (IP address and port) that 128 | opened this connection. 129 | :param trio_websocket.WebSocketConnection ws: A websocket connection. 130 | :param starbelly.db.ServerDb server_db: A database layer. 131 | :param starbelly.db.SubscriptionDb subscription_db: A database layer. 132 | :param starbelly.job.CrawlManager crawl_manager: A crawl manager. 133 | :param starbelly.rate_limiter.RateLimiter rate_limiter: A rate limiter. 134 | :param starbelly.resource_monitor.ResourceMonitor resource_monitor: A 135 | resource monitor. 136 | :param starbelly.schedule.Scheduler scheduler: A scheduler. 137 | :param starbelly.job.StatsTracker stats_tracker: A stats tracker. 138 | (The subscription manager for this connection is created internally 139 | when ``run()`` starts.)
140 | ''' 141 | self._client = client 142 | self._ws = ws 143 | self._server_db = server_db 144 | self._subscription_db = subscription_db 145 | self._crawl_manager = crawl_manager 146 | self._rate_limiter = rate_limiter 147 | self._resource_monitor = resource_monitor 148 | self._scheduler = scheduler 149 | self._subscription_db = subscription_db 150 | self._nursery = None 151 | self._stats_tracker = stats_tracker 152 | self._subscription_manager = None 153 | 154 | async def run(self): 155 | ''' 156 | Run the connection: read requests and send responses. 157 | 158 | This opens an internal nursery in case background tasks, like 159 | subscriptions, need to be started. 160 | 161 | :returns: This runs until the connection is closed. 162 | ''' 163 | try: 164 | async with trio.open_nursery() as nursery: 165 | self._nursery = nursery 166 | self._subscription_manager = SubscriptionManager( 167 | self._subscription_db, nursery, self._ws) 168 | while True: 169 | request_data = await self._ws.get_message() 170 | nursery.start_soon(self._handle_request, request_data, 171 | name='Request Handler') 172 | except ConnectionClosed: 173 | logger.info('Connection closed for %s', self._client) 174 | except: 175 | logger.exception('Connection exception') 176 | finally: 177 | await self._ws.aclose() 178 | 179 | async def _handle_request(self, request_data): 180 | ''' 181 | Handle a single API request. 182 | 183 | :param request: A protobuf request object. 184 | ''' 185 | start = trio.current_time() 186 | message = ServerMessage() 187 | message.response.is_success = False 188 | request = None 189 | 190 | try: 191 | # Prepare response. 192 | request = Request.FromString(request_data) 193 | message.response.request_id = request.request_id 194 | 195 | # Find an appropriate handler. 196 | command_name = request.WhichOneof('Command') 197 | if command_name is None: 198 | raise InvalidRequestException('No command specified') 199 | command = getattr(request, command_name) 200 | try: 201 | handler = _handlers[command_name] 202 | except KeyError: 203 | raise InvalidRequestException('Invalid command name: {}' 204 | .format(command_name)) from None 205 | 206 | # Inject dependencies into argument list, then call the handler. 
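            # For illustration only (hypothetical handler, not part of this
            # module): a handler declares its dependencies purely by parameter
            # name, e.g.
            #
            #     @api_handler
            #     async def get_widget(command, response, server_db):
            #         ...
            #
            # would be called with the decoded protobuf command, the response
            # message to populate, and the ServerDb layer, matched by the
            # name-based lookup below.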
207 | argspec = inspect.getfullargspec(handler) 208 | args = list() 209 | for var in argspec[0]: 210 | if var == 'command': 211 | args.append(command) 212 | elif var == 'crawl_manager': 213 | args.append(self._crawl_manager) 214 | elif var == 'nursery': 215 | args.append(self._nursery) 216 | elif var == 'rate_limiter': 217 | args.append(self._rate_limiter) 218 | elif var == 'resource_monitor': 219 | args.append(self._resource_monitor) 220 | elif var == 'response': 221 | args.append(message.response) 222 | elif var == 'scheduler': 223 | args.append(self._scheduler) 224 | elif var == 'server_db': 225 | args.append(self._server_db) 226 | elif var == 'subscription_manager': 227 | args.append(self._subscription_manager) 228 | elif var == 'stats_tracker': 229 | args.append(self._stats_tracker) 230 | elif var == 'websocket': 231 | args.append(self._ws) 232 | else: 233 | raise Exception('Unknown dependency "{}" in handler {}()' 234 | .format(var, command_name)) 235 | 236 | await handler(*args) 237 | message.response.is_success = True 238 | elapsed = trio.current_time() - start 239 | logger.info('Request OK %s %s %0.3fs', self._client, 240 | command_name, elapsed) 241 | except DecodeError: 242 | # Failure to decode a protobuf message means that the connection 243 | # is severely damaged; raise to the nursery so we can close the 244 | # entire connection. 245 | raise 246 | except InvalidRequestException as ire: 247 | error_message = str(ire) 248 | logger.error('Request ERROR %s %s (%s)', command_name, 249 | self._client, error_message) 250 | message.response.error_message = error_message 251 | except: 252 | logger.exception('Exception while handling request:\n%r', 253 | request) 254 | message.response.error_message = 'A server exception occurred' 255 | 256 | message_data = message.SerializeToString() 257 | await self._ws.send_message(message_data) 258 | -------------------------------------------------------------------------------- /starbelly/server/captcha.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | import logging 3 | from uuid import UUID 4 | 5 | from . import api_handler, InvalidRequestException 6 | from ..captcha import captcha_doc_to_pb, captcha_pb_to_doc 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @api_handler 13 | async def delete_captcha_solver(command, server_db): 14 | ''' Delete a a CAPTCHA solver. ''' 15 | solver_id = str(UUID(bytes=command.solver_id)) 16 | try: 17 | await server_db.delete_captcha_solver(solver_id) 18 | except ValueError as ve: 19 | raise InvalidRequestException(str(ve)) from None 20 | 21 | 22 | @api_handler 23 | async def get_captcha_solver(command, response, server_db): 24 | ''' Get a CAPTCHA solver. ''' 25 | solver_id = str(UUID(bytes=command.solver_id)) 26 | doc = await server_db.get_captcha_solver(solver_id) 27 | 28 | if doc is None: 29 | raise InvalidRequestException('No CAPTCHA solver found for that ID') 30 | 31 | response.solver.CopyFrom(captcha_doc_to_pb(doc)) 32 | 33 | 34 | @api_handler 35 | async def list_captcha_solvers(command, response, server_db): 36 | ''' Return a list of CAPTCHA solvers. 
''' 37 | limit = command.page.limit 38 | offset = command.page.offset 39 | count, docs = await server_db.list_captcha_solvers(limit, offset) 40 | 41 | for doc in docs: 42 | solver = response.list_captcha_solvers.solvers.add() 43 | solver.CopyFrom(captcha_doc_to_pb(doc)) 44 | 45 | response.list_captcha_solvers.total = count 46 | 47 | 48 | @api_handler 49 | async def set_captcha_solver(command, response, server_db): 50 | ''' Create or update CAPTCHA solver. ''' 51 | now = datetime.now(timezone.utc) 52 | doc = captcha_pb_to_doc(command.solver) 53 | new_id = await server_db.set_captcha_solver(doc, now) 54 | if new_id: 55 | response.new_solver.solver_id = UUID(new_id).bytes 56 | -------------------------------------------------------------------------------- /starbelly/server/job.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import logging 3 | from uuid import UUID 4 | from yarl import URL 5 | 6 | import dateutil.parser 7 | 8 | from . import api_handler, InvalidRequestException 9 | from ..policy import Policy 10 | from ..starbelly_pb2 import JobRunState as PbRunState 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | @api_handler 17 | async def delete_job(command, server_db, stats_tracker): 18 | """ Delete a job. """ 19 | job_id = str(UUID(bytes=command.job_id)) 20 | await server_db.delete_job(job_id) 21 | stats_tracker.delete_job(job_id) 22 | 23 | 24 | @api_handler 25 | async def get_job(command, response, server_db): 26 | """ Get status for a single job. """ 27 | job_id = str(UUID(bytes=command.job_id)) 28 | job_doc = await server_db.get_job(job_id) 29 | if not job_doc: 30 | raise InvalidRequestException(f"No job exists with ID={job_id}") 31 | 32 | job = response.job 33 | job.job_id = UUID(job_doc["id"]).bytes 34 | for seed in job_doc["seeds"]: 35 | job.seeds.append(seed) 36 | for tag in job_doc["tags"]: 37 | job.tags.append(tag) 38 | Policy.convert_doc_to_pb(job_doc["policy"], job.policy) 39 | job.name = job_doc["name"] 40 | job.item_count = job_doc["item_count"] 41 | job.http_success_count = job_doc["http_success_count"] 42 | job.http_error_count = job_doc["http_error_count"] 43 | job.exception_count = job_doc["exception_count"] 44 | job.started_at = job_doc["started_at"].isoformat() 45 | if job_doc["completed_at"] is not None: 46 | job.completed_at = job_doc["completed_at"].isoformat() 47 | run_state = job_doc["run_state"].upper() 48 | job.run_state = PbRunState.Value(run_state) 49 | http_status_counts = job_doc["http_status_counts"] 50 | for status_code, count in http_status_counts.items(): 51 | job.http_status_counts[int(status_code)] = count 52 | 53 | 54 | @api_handler 55 | async def get_job_items(command, response, server_db): 56 | """ Get a page of items (crawl responses) from a job. 
""" 57 | job_id = str(UUID(bytes=command.job_id)) 58 | limit = command.page.limit 59 | offset = command.page.offset 60 | count, items = await server_db.get_job_items( 61 | job_id, 62 | limit, 63 | offset, 64 | command.include_success, 65 | command.include_error, 66 | command.include_exception, 67 | ) 68 | response.list_items.total = count 69 | compression_ok = command.compression_ok 70 | for item_doc in items: 71 | item = response.list_items.items.add() 72 | 73 | if item_doc["join"] is None: 74 | item.is_compressed = False 75 | elif item_doc["join"]["is_compressed"] and not compression_ok: 76 | item.body = gzip.decompress(item_doc["join"]["body"]) 77 | item.is_compressed = False 78 | else: 79 | item.body = item_doc["join"]["body"] 80 | item.is_compressed = item_doc["join"]["is_compressed"] 81 | if "content_type" in item_doc: 82 | item.content_type = item_doc["content_type"] 83 | if "exception" in item_doc: 84 | item.exception = item_doc["exception"] 85 | if "status_code" in item_doc: 86 | item.status_code = item_doc["status_code"] 87 | header_iter = iter(item_doc.get("headers", [])) 88 | for key in header_iter: 89 | value = next(header_iter) 90 | header = item.headers.add() 91 | header.key = key 92 | header.value = value 93 | item.cost = item_doc["cost"] 94 | item.job_id = UUID(item_doc["job_id"]).bytes 95 | item.completed_at = item_doc["completed_at"].isoformat() 96 | item.started_at = item_doc["started_at"].isoformat() 97 | item.duration = item_doc["duration"] 98 | item.url = item_doc["url"] 99 | item.url_can = item_doc["canonical_url"] 100 | item.is_success = item_doc["is_success"] 101 | 102 | 103 | @api_handler 104 | async def list_jobs(command, response, server_db): 105 | """ Return a list of jobs. """ 106 | limit = command.page.limit 107 | offset = command.page.offset 108 | if command.HasField("started_after"): 109 | started_after = dateutil.parser.parse(command.started_after) 110 | else: 111 | started_after = None 112 | tag = command.tag if command.HasField("tag") else None 113 | schedule_id = ( 114 | str(UUID(bytes=command.schedule_id)) 115 | if command.HasField("schedule_id") 116 | else None 117 | ) 118 | count, jobs = await server_db.list_jobs( 119 | limit, offset, started_after, tag, schedule_id 120 | ) 121 | response.list_jobs.total = count 122 | 123 | for job_doc in jobs: 124 | job = response.list_jobs.jobs.add() 125 | job.job_id = UUID(job_doc["id"]).bytes 126 | job.name = job_doc["name"] 127 | for seed in job_doc["seeds"]: 128 | job.seeds.append(seed) 129 | for tag in job_doc["tags"]: 130 | job.tags.append(tag) 131 | job.item_count = job_doc["item_count"] 132 | job.http_success_count = job_doc["http_success_count"] 133 | job.http_error_count = job_doc["http_error_count"] 134 | job.exception_count = job_doc["exception_count"] 135 | job.started_at = job_doc["started_at"].isoformat() 136 | if job_doc["completed_at"] is not None: 137 | job.completed_at = job_doc["completed_at"].isoformat() 138 | run_state = job_doc["run_state"].upper() 139 | job.run_state = PbRunState.Value(run_state) 140 | http_status_counts = job_doc["http_status_counts"] 141 | for status_code, count in http_status_counts.items(): 142 | job.http_status_counts[int(status_code)] = count 143 | 144 | 145 | @api_handler 146 | async def set_job(command, crawl_manager, response): 147 | """ Create or update job metadata. """ 148 | if command.HasField("job_id"): 149 | # Update run state of existing job. 
150 | job_id = str(UUID(bytes=command.job_id)) 151 | if command.HasField("run_state"): 152 | run_state = command.run_state 153 | if run_state == PbRunState.Value("CANCELLED"): 154 | await crawl_manager.cancel_job(job_id) 155 | elif run_state == PbRunState.Value("PAUSED"): 156 | await crawl_manager.pause_job(job_id) 157 | elif run_state == PbRunState.Value("RUNNING"): 158 | await crawl_manager.resume_job(job_id) 159 | else: 160 | raise InvalidRequestException( 161 | f"Not allowed to set job run state: {run_state}" 162 | ) 163 | else: 164 | # Create new job. 165 | if not command.policy_id: 166 | raise InvalidRequestException('"policy_id" is required') 167 | if not command.seeds: 168 | raise InvalidRequestException('"seeds" is required') 169 | name = command.name 170 | policy_id = str(UUID(bytes=command.policy_id)) 171 | seeds = [s.strip() for s in command.seeds] 172 | tags = [t.strip() for t in command.tags] 173 | 174 | if name.strip() == "": 175 | url = URL(seeds[0]) 176 | name = url.host 177 | if len(seeds) > 1: 178 | name += "& {} more".format(len(seeds) - 1) 179 | 180 | job_id = await crawl_manager.start_job(name, seeds, tags, policy_id) 181 | response.new_job.job_id = UUID(job_id).bytes 182 | -------------------------------------------------------------------------------- /starbelly/server/login.py: -------------------------------------------------------------------------------- 1 | from . import api_handler, InvalidRequestException 2 | 3 | 4 | @api_handler 5 | async def delete_domain_login(command, server_db): 6 | ''' Delete a domain login and all of its users. ''' 7 | await server_db.delete_domain_login(command.domain) 8 | 9 | 10 | @api_handler 11 | async def get_domain_login(command, response, server_db): 12 | ''' Get a domain login. ''' 13 | domain = command.domain 14 | domain_login = await server_db.get_domain_login(domain) 15 | if domain_login is None: 16 | raise InvalidRequestException('No domain credentials found for' 17 | ' domain={}'.format(domain)) 18 | response.domain_login.domain = domain_login['domain'] 19 | response.domain_login.login_url = domain_login['login_url'] 20 | if domain_login['login_test'] is not None: 21 | response.domain_login.login_test = domain_login['login_test'] 22 | 23 | for user in domain_login['users']: 24 | dl_user = response.domain_login.users.add() 25 | dl_user.username = user['username'] 26 | dl_user.password = user['password'] 27 | dl_user.working = user['working'] 28 | 29 | 30 | @api_handler 31 | async def list_domain_logins(command, response, server_db): 32 | ''' Return a list of domain logins. ''' 33 | limit = command.page.limit 34 | offset = command.page.offset 35 | count, docs = await server_db.list_domain_logins(limit, offset) 36 | response.list_domain_logins.total = count 37 | for doc in docs: 38 | dl = response.list_domain_logins.logins.add() 39 | dl.domain = doc['domain'] 40 | dl.login_url = doc['login_url'] 41 | if doc['login_test'] is not None: 42 | dl.login_test = doc['login_test'] 43 | for user_doc in doc['users']: 44 | user = dl.users.add() 45 | user.username = user_doc['username'] 46 | user.password = user_doc['password'] 47 | user.working = user_doc['working'] 48 | 49 | 50 | @api_handler 51 | async def set_domain_login(command, server_db): 52 | ''' Create or update a domain login. 
''' 53 | domain_login = command.login 54 | 55 | if not domain_login.HasField('domain'): 56 | raise InvalidRequestException('domain is required.') 57 | 58 | domain = domain_login.domain 59 | doc = await server_db.get_domain_login(domain) 60 | if doc is None: 61 | if not domain_login.HasField('login_url'): 62 | raise InvalidRequestException('login_url is required to' 63 | ' create a domain login.') 64 | doc = { 65 | 'domain': domain, 66 | 'login_url': domain_login.login_url, 67 | 'login_test': None, 68 | } 69 | 70 | if domain_login.HasField('login_url'): 71 | doc['login_url'] = domain_login.login_url 72 | 73 | if domain_login.HasField('login_test'): 74 | doc['login_test'] = domain_login.login_test 75 | 76 | doc['users'] = list() 77 | 78 | for user in domain_login.users: 79 | doc['users'].append({ 80 | 'username': user.username, 81 | 'password': user.password, 82 | 'working': user.working, 83 | }) 84 | 85 | await server_db.set_domain_login(doc) 86 | -------------------------------------------------------------------------------- /starbelly/server/policy.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from uuid import UUID 3 | 4 | from . import api_handler 5 | from ..policy import Policy 6 | from ..version import __version__ 7 | 8 | 9 | @api_handler 10 | async def delete_policy(command, server_db): 11 | ''' Delete a policy. ''' 12 | policy_id = str(UUID(bytes=command.policy_id)) 13 | await server_db.delete_policy(policy_id) 14 | 15 | 16 | @api_handler 17 | async def get_policy(command, response, server_db): 18 | ''' Get a single policy. ''' 19 | policy_id = str(UUID(bytes=command.policy_id)) 20 | policy_doc = await server_db.get_policy(policy_id) 21 | Policy.convert_doc_to_pb(policy_doc, response.policy) 22 | 23 | 24 | @api_handler 25 | async def list_policies(command, response, server_db): 26 | ''' Get a list of policies. ''' 27 | limit = command.page.limit 28 | offset = command.page.offset 29 | count, docs = await server_db.list_policies(limit, offset) 30 | response.list_policies.total = count 31 | 32 | for policy_doc in docs: 33 | policy = response.list_policies.policies.add() 34 | policy.policy_id = UUID(policy_doc['id']).bytes 35 | policy.name = policy_doc['name'] 36 | policy.created_at = policy_doc['created_at'].isoformat() 37 | policy.updated_at = policy_doc['updated_at'].isoformat() 38 | 39 | return response 40 | 41 | 42 | @api_handler 43 | async def set_policy(command, response, server_db): 44 | ''' 45 | Create or update a single policy. 46 | 47 | If the policy ID is set, then update the corresponding policy. 48 | Otherwise, create a new policy. 49 | ''' 50 | policy_doc = Policy.convert_pb_to_doc(command.policy) 51 | # Validate policy by trying to instantiate a Policy object, which will 52 | # raise an exception if the policy is invalid. 53 | Policy(policy_doc, version=__version__, 54 | seeds=['http://test1.com', 'http://test2.org']) 55 | now = datetime.now(timezone.utc) 56 | new_id = await server_db.set_policy(policy_doc, now) 57 | if new_id is not None: 58 | response.new_policy.policy_id = UUID(new_id).bytes 59 | -------------------------------------------------------------------------------- /starbelly/server/rate_limit.py: -------------------------------------------------------------------------------- 1 | from . 
import api_handler, InvalidRequestException 2 | from ..rate_limiter import get_domain_token, GLOBAL_RATE_LIMIT_TOKEN 3 | 4 | 5 | @api_handler 6 | async def list_rate_limits(command, response, server_db): 7 | ''' Get a page of rate limits. ''' 8 | limit = command.page.limit 9 | offset = command.page.offset 10 | count, rate_limits = await server_db.list_rate_limits(limit, offset) 11 | response.list_rate_limits.total = count 12 | 13 | for rate_limit in rate_limits: 14 | rl = response.list_rate_limits.rate_limits.add() 15 | rl.name = rate_limit['name'] 16 | rl.token = rate_limit['token'] 17 | rl.delay = rate_limit['delay'] 18 | if rl.name.startswith('domain:'): 19 | rl.domain = rl.name.split(':')[1] 20 | 21 | 22 | @api_handler 23 | async def set_rate_limit(command, rate_limiter, server_db): 24 | ''' Set a rate limit. ''' 25 | delay = command.delay if command.HasField('delay') else None 26 | 27 | if command.HasField('domain'): 28 | # Set a specific rate limit. 29 | domain = command.domain 30 | token = get_domain_token(domain) 31 | name = 'domain:{}'.format(domain) 32 | else: 33 | # Set global rate limit. 34 | if delay is None: 35 | raise InvalidRequestException( 36 | 'Cannot delete the global rate limit.') 37 | token = GLOBAL_RATE_LIMIT_TOKEN 38 | name = 'Global Rate Limit' 39 | 40 | await server_db.set_rate_limit(name, token, delay) 41 | if delay is None: 42 | rate_limiter.delete_rate_limit(token) 43 | else: 44 | rate_limiter.set_rate_limit(token, delay) 45 | -------------------------------------------------------------------------------- /starbelly/server/schedule.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from uuid import UUID 3 | 4 | from . import api_handler 5 | from ..schedule import Schedule 6 | from ..starbelly_pb2 import JobRunState as PbRunState 7 | 8 | 9 | @api_handler 10 | async def delete_schedule(command, scheduler, server_db): 11 | ''' Delete a job schedule. ''' 12 | schedule_id = str(UUID(bytes=command.schedule_id)) 13 | await server_db.delete_schedule(schedule_id) 14 | scheduler.remove_schedule(schedule_id) 15 | 16 | 17 | @api_handler 18 | async def get_schedule(command, response, server_db): 19 | ''' Get metadata for a job schedule. ''' 20 | schedule_id = str(UUID(bytes=command.schedule_id)) 21 | doc = await server_db.get_schedule(schedule_id) 22 | if doc is None: 23 | response.is_success = False 24 | response.error_message = f'No schedule exists with ID={schedule_id}' 25 | else: 26 | pb = response.schedule 27 | Schedule.from_doc(doc).to_pb(pb) 28 | 29 | 30 | @api_handler 31 | async def list_schedules(command, response, server_db): 32 | ''' Return a list of job schedules. ''' 33 | limit = command.page.limit 34 | offset = command.page.offset 35 | count, schedules = await server_db.list_schedules(limit, offset) 36 | response.list_schedules.total = count 37 | for doc in schedules: 38 | pb = response.list_schedules.schedules.add() 39 | Schedule.from_doc(doc).to_pb(pb) 40 | 41 | 42 | @api_handler 43 | async def list_schedule_jobs(command, response, server_db): 44 | ''' Return a list of job schedules. 
''' 45 | schedule_id = str(UUID(bytes=command.schedule_id)) 46 | limit = command.page.limit 47 | offset = command.page.offset 48 | count, jobs = await server_db.list_schedule_jobs(schedule_id, limit, offset) 49 | response.list_schedule_jobs.total = count 50 | for job_doc in jobs: 51 | job = response.list_schedule_jobs.jobs.add() 52 | job.job_id = UUID(job_doc['id']).bytes 53 | job.name = job_doc['name'] 54 | for seed in job_doc['seeds']: 55 | job.seeds.append(seed) 56 | for tag in job_doc['tags']: 57 | job.tags.append(tag) 58 | job.item_count = job_doc['item_count'] 59 | job.http_success_count = job_doc['http_success_count'] 60 | job.http_error_count = job_doc['http_error_count'] 61 | job.exception_count = job_doc['exception_count'] 62 | job.started_at = job_doc['started_at'].isoformat() 63 | if job_doc['completed_at'] is not None: 64 | job.completed_at = job_doc['completed_at'].isoformat() 65 | run_state = job_doc['run_state'].upper() 66 | job.run_state = PbRunState.Value(run_state) 67 | http_status_counts = job_doc['http_status_counts'] 68 | for status_code, count in http_status_counts.items(): 69 | job.http_status_counts[int(status_code)] = count 70 | 71 | 72 | @api_handler 73 | async def set_schedule(command, response, scheduler, server_db): 74 | ''' Create or update job schedule metadata. ''' 75 | doc = Schedule.from_pb(command.schedule).to_doc() 76 | now = datetime.now(timezone.utc) 77 | schedule_id = await server_db.set_schedule(doc, now) 78 | if schedule_id: 79 | response.new_schedule.schedule_id = UUID(schedule_id).bytes 80 | else: 81 | schedule_id = str(UUID(bytes=command.schedule.schedule_id)) 82 | scheduler.remove_schedule(schedule_id) 83 | if command.schedule.enabled: 84 | schedule_doc = await server_db.get_schedule(schedule_id) 85 | job_docs = await server_db.list_schedule_jobs(schedule_id, limit=1, 86 | offset=0) 87 | try: 88 | latest_job_doc = job_docs[0] 89 | except IndexError: 90 | latest_job_doc = None 91 | scheduler.add_schedule(schedule_doc, latest_job_doc) 92 | -------------------------------------------------------------------------------- /starbelly/server/subscription.py: -------------------------------------------------------------------------------- 1 | from uuid import UUID 2 | 3 | import trio.hazmat 4 | 5 | from . import api_handler 6 | 7 | 8 | @api_handler 9 | async def subscribe_job_sync(command, crawl_manager, response, 10 | subscription_manager): 11 | ''' Handle the subscribe crawl items command. ''' 12 | job_id = str(UUID(bytes=command.job_id)) 13 | compression_ok = command.compression_ok 14 | job_state_recv = crawl_manager.get_job_state_channel() 15 | sync_token = command.sync_token if command.HasField('sync_token') else None 16 | sub_id = subscription_manager.subscribe_job_sync(job_id, compression_ok, 17 | job_state_recv, sync_token) 18 | response.new_subscription.subscription_id = sub_id 19 | 20 | 21 | @api_handler 22 | async def subscribe_job_status(command, response, subscription_manager, 23 | stats_tracker): 24 | ''' Handle the subscribe crawl status command. ''' 25 | sub_id = subscription_manager.subscribe_job_status(stats_tracker, 26 | command.min_interval) 27 | response.new_subscription.subscription_id = sub_id 28 | 29 | 30 | @api_handler 31 | async def subscribe_resource_monitor(command, response, resource_monitor, 32 | subscription_manager): 33 | ''' Handle the subscribe resource monitor command. 
''' 34 | sub_id = subscription_manager.subscribe_resource_monitor(resource_monitor, 35 | command.history) 36 | response.new_subscription.subscription_id = sub_id 37 | 38 | 39 | @api_handler 40 | async def subscribe_task_monitor(command, response, subscription_manager): 41 | ''' Handle the subscribe task monitor command. ''' 42 | root_task = trio.hazmat.current_root_task() 43 | sub_id = subscription_manager.subscribe_task_monitor(command.period, 44 | root_task) 45 | response.new_subscription.subscription_id = sub_id 46 | 47 | 48 | @api_handler 49 | async def unsubscribe(command, subscription_manager): 50 | ''' Handle an unsubscribe command. ''' 51 | sub_id = command.subscription_id 52 | subscription_manager.cancel_subscription(sub_id) 53 | -------------------------------------------------------------------------------- /starbelly/server/system.py: -------------------------------------------------------------------------------- 1 | import cProfile 2 | import operator 3 | import pstats 4 | 5 | import trio 6 | 7 | from . import api_handler, InvalidRequestException 8 | 9 | 10 | @api_handler 11 | async def performance_profile(command, response): 12 | ''' Run CPU profiler. ''' 13 | profile = cProfile.Profile() 14 | profile.enable() 15 | await trio.sleep(command.duration) 16 | profile.disable() 17 | 18 | # pstats sorting only works when you use pstats printing... so we have 19 | # to build our own data structure in order to sort it. 20 | pr_stats = pstats.Stats(profile) 21 | stats = list() 22 | for key, value in pr_stats.stats.items(): 23 | stats.append({ 24 | 'file': key[0], 25 | 'line_number': key[1], 26 | 'function': key[2], 27 | 'calls': value[0], 28 | 'non_recursive_calls': value[1], 29 | 'total_time': value[2], 30 | 'cumulative_time': value[3], 31 | }) 32 | 33 | try: 34 | stats.sort(key=operator.itemgetter(command.sort_by), reverse=True) 35 | except KeyError: 36 | raise InvalidRequestException('Invalid sort key: {}' 37 | .format(command.sort_by)) 38 | 39 | response.performance_profile.total_calls = pr_stats.total_calls 40 | response.performance_profile.total_time = pr_stats.total_tt 41 | 42 | for stat in stats[:command.top_n]: 43 | function = response.performance_profile.functions.add() 44 | function.file = stat['file'] 45 | function.line_number = stat['line_number'] 46 | function.function = stat['function'] 47 | function.calls = stat['calls'] 48 | function.non_recursive_calls = stat['non_recursive_calls'] 49 | function.total_time = stat['total_time'] 50 | function.cumulative_time = stat['cumulative_time'] 51 | -------------------------------------------------------------------------------- /starbelly/storage.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import hashlib 3 | import logging 4 | import functools 5 | 6 | import mimeparse 7 | import trio 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def should_compress_body(response): 14 | ''' 15 | Returns true if the response body should be compressed. 16 | 17 | This logic can be amended over time to add additional MIME types that 18 | should be compressed. 
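For example, with the rules below any ``text/*`` body and ``application/json`` or ``application/pdf`` bodies are compressed, while other types (e.g. images) are stored uncompressed.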
19 | 20 | :param starbelly.downloader.DownloadResponse response: 21 | ''' 22 | should_compress = False 23 | type_, subtype, _ = mimeparse.parse_mime_type(response.content_type) 24 | if type_ == 'text': 25 | should_compress = True 26 | elif type_ == 'application' and subtype in ('json', 'pdf'): 27 | should_compress = True 28 | return should_compress 29 | 30 | 31 | class CrawlStorage: 32 | ''' This class stores crawl items in the database. ''' 33 | def __init__(self, job_id, db, send_channel, receive_channel, policy, 34 | sequence): 35 | ''' 36 | Constructor 37 | 38 | :param str job_id: The job to store items for. 39 | :param starbelly.db.CrawlStorageDb db: Database layer. 40 | :param starbelly.policy.Policy policy: A policy to use for determining which 41 | responses to save. 42 | :param sequence: An iterator that returns a sequence number for each 43 | item to be saved. 44 | ''' 45 | self._job_id = job_id 46 | self._db = db 47 | self._send_channel = send_channel 48 | self._receive_channel = receive_channel 49 | self._policy = policy 50 | self._sequence = sequence 51 | 52 | def __repr__(self): 53 | ''' Put job ID in repr. ''' 54 | return '<CrawlStorage job_id={}>'.format(self._job_id[:8]) 55 | 56 | async def run(self): 57 | ''' 58 | Read items from the receive channel and save them to the database. 59 | 60 | :returns: This function runs until cancelled. 61 | ''' 62 | async for response in self._receive_channel: 63 | await self._save_response(response) 64 | await self._db.update_job_stats(self._job_id, response) 65 | await self._send_channel.send(response) 66 | 67 | async def _save_response(self, response): 68 | ''' 69 | Save a response to the database. 70 | 71 | :param starbelly.downloader.DownloadResponse response: 72 | ''' 73 | response_doc = { 74 | 'completed_at': response.completed_at, 75 | 'cost': response.cost, 76 | 'duration': response.duration, 77 | 'job_id': self._job_id, 78 | 'started_at': response.started_at, 79 | 'url': response.url.human_repr(), 80 | 'canonical_url': response.canonical_url, 81 | } 82 | 83 | if response.exception is None: 84 | response_doc['completed_at'] = response.completed_at 85 | response_doc['content_type'] = response.content_type 86 | response_doc['is_success'] = response.status_code // 100 == 2 87 | response_doc['status_code'] = response.status_code 88 | compress_body = should_compress_body(response) 89 | 90 | headers = list() 91 | for key, value in response.headers.items(): 92 | headers.append(key.upper()) 93 | headers.append(value) 94 | response_doc['headers'] = headers 95 | 96 | body_hash = hashlib.blake2b(response.body, digest_size=16).digest() 97 | if compress_body: 98 | body = await trio.run_sync_in_worker_thread(functools.partial( 99 | gzip.compress, response.body, compresslevel=6)) 100 | else: 101 | body = response.body 102 | 103 | response_doc['body_id'] = body_hash 104 | response_body_doc = { 105 | 'id': body_hash, 106 | 'body': body, 107 | 'is_compressed': compress_body, 108 | } 109 | else: 110 | response_doc['exception'] = response.exception 111 | response_doc['is_success'] = False 112 | response_body_doc = None 113 | 114 | response_doc['sequence'] = next(self._sequence) 115 | await self._db.save_response(response_doc, response_body_doc) 116 | -------------------------------------------------------------------------------- /starbelly/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.0.0-dev' 2 | -------------------------------------------------------------------------------- /tests/__init__.py:
-------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from functools import wraps 3 | import pathlib 4 | from os.path import dirname 5 | from sys import path 6 | from unittest.mock import Mock 7 | 8 | import pytest 9 | import trio 10 | import trio_asyncio 11 | 12 | 13 | # Add this project to the Python path. 14 | path.append(dirname(dirname(__file__))) 15 | 16 | 17 | @contextmanager 18 | def assert_min_elapsed(seconds): 19 | ''' 20 | Fail the test if the execution of a block takes less than ``seconds``. 21 | ''' 22 | start = trio.current_time() 23 | yield 24 | elapsed = trio.current_time() - start 25 | assert elapsed >= seconds, 'Completed in under {} seconds'.format(seconds) 26 | 27 | 28 | @contextmanager 29 | def assert_max_elapsed(seconds): 30 | ''' 31 | Fail the test if the execution of a block takes longer than ``seconds``. 32 | ''' 33 | try: 34 | with trio.fail_after(seconds): 35 | yield 36 | except trio.TooSlowError: 37 | pytest.fail('Failed to complete within {} seconds'.format(seconds)) 38 | 39 | 40 | @contextmanager 41 | def assert_elapsed(seconds, delta=0.1): 42 | ''' 43 | Fail the test if the execution of a block takes more than seconds+delta time 44 | or less than seconds-delta time. 45 | ''' 46 | with assert_min_elapsed(seconds-delta), assert_max_elapsed(seconds+delta): 47 | yield 48 | 49 | 50 | class AsyncMock: 51 | ''' A mock that acts like an async def function. ''' 52 | def __init__(self, return_value=None, return_values=None, raises=None, 53 | side_effect=None): 54 | self._raises = None 55 | self._side_effect = None 56 | self._return_value = None 57 | self._index = None 58 | self._call_count = 0 59 | self._call_args = None 60 | self._call_kwargs = None 61 | 62 | if raises: 63 | self._raises = raises 64 | elif return_values: 65 | self._return_value = return_values 66 | self._index = 0 67 | elif side_effect: 68 | self._side_effect=side_effect 69 | else: 70 | self._return_value = return_value 71 | 72 | @property 73 | def call_args(self): 74 | return self._call_args 75 | 76 | @property 77 | def call_kwargs(self): 78 | return self._call_kwargs 79 | 80 | @property 81 | def called(self): 82 | return self._call_count > 0 83 | 84 | @property 85 | def call_count(self): 86 | return self._call_count 87 | 88 | async def __call__(self, *args, **kwargs): 89 | self._call_args = args 90 | self._call_kwargs = kwargs 91 | self._call_count += 1 92 | if self._raises: 93 | raise(self._raises) 94 | elif self._side_effect: 95 | return await self._side_effect(*args, **kwargs) 96 | elif self._index is not None: 97 | return_index = self._index 98 | self._index += 1 99 | return self._return_value[return_index] 100 | else: 101 | return self._return_value 102 | 103 | 104 | async def async_iter(iter): 105 | ''' 106 | Convert a synchronous iterable into an async iterator. 107 | 108 | :param iterable iter: 109 | ''' 110 | for item in iter: 111 | await trio.sleep(0) 112 | yield item 113 | 114 | 115 | @pytest.fixture 116 | async def asyncio_loop(): 117 | ''' Open an asyncio loop. Useful for things like aiohttp.CookieJar that 118 | require a global loop. ''' 119 | async with trio_asyncio.open_loop() as loop: 120 | yield loop 121 | 122 | 123 | class fail_after: 124 | ''' This decorator fails if the runtime of the decorated function (as 125 | measured by the Trio clock) exceeds the specified value. 
''' 126 | def __init__(self, seconds): 127 | self._seconds = seconds 128 | 129 | def __call__(self, fn): 130 | @wraps(fn) 131 | async def wrapper(*args, **kwargs): 132 | with trio.move_on_after(self._seconds) as cancel_scope: 133 | await fn(*args, **kwargs) 134 | if cancel_scope.cancelled_caught: 135 | pytest.fail('Test runtime exceeded the maximum {} seconds' 136 | .format(self._seconds)) 137 | return wrapper 138 | -------------------------------------------------------------------------------- /tests/test_async_mock.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from . import AsyncMock 4 | 5 | 6 | async def test_async_mock_no_return(): 7 | foo = AsyncMock() 8 | assert await foo() is None 9 | 10 | 11 | async def test_async_mock_single_return(): 12 | foo = AsyncMock(return_value=1) 13 | assert await foo() == 1 14 | assert await foo() == 1 15 | 16 | 17 | async def test_async_mock_multiple_returns(): 18 | foo = AsyncMock(return_values=(1,2)) 19 | assert await foo() == 1 20 | assert await foo() == 2 21 | 22 | 23 | async def test_async_mock_raises(): 24 | foo = AsyncMock(raises=Exception) 25 | with pytest.raises(Exception): 26 | await foo() 27 | -------------------------------------------------------------------------------- /tests/test_backoff.py: -------------------------------------------------------------------------------- 1 | from . import assert_elapsed 2 | from starbelly.backoff import ExponentialBackoff 3 | 4 | 5 | async def test_backoff_no_change(autojump_clock): 6 | ''' Backoff starts at 1, so 3 iterations takes ~2 seconds. ''' 7 | with assert_elapsed(2): 8 | loop_count = 0 9 | async for _ in ExponentialBackoff(min_=1, max_=64): 10 | loop_count += 1 11 | if loop_count == 3: 12 | break 13 | 14 | 15 | async def test_backoff_increase(autojump_clock): 16 | ''' Increase backoff on each loop. Backoffs should be equal to 1, 2, 4, 17 | 8, 16, 16, but the first value is skipped, so the total is ~46 seconds. ''' 18 | with assert_elapsed(seconds=46): 19 | loop_count = 0 20 | backoff = ExponentialBackoff(min_=1, max_=16) 21 | async for n in backoff: 22 | backoff.increase() 23 | loop_count += 1 24 | if loop_count == 6: break 25 | 26 | 27 | async def test_backoff_returns_value(autojump_clock): 28 | ''' Backoff returns the current value. Increase up to max and then decrease 29 | back to starting point. 
''' 30 | backoff = ExponentialBackoff(min_=1, max_=8) 31 | assert await backoff.__anext__() == 0 32 | assert await backoff.__anext__() == 1 33 | backoff.increase() 34 | assert await backoff.__anext__() == 2 35 | backoff.increase() 36 | assert await backoff.__anext__() == 4 37 | backoff.increase() 38 | assert await backoff.__anext__() == 8 39 | backoff.increase() 40 | assert await backoff.__anext__() == 8 41 | backoff.decrease() 42 | assert await backoff.__anext__() == 4 43 | backoff.decrease() 44 | assert await backoff.__anext__() == 2 45 | backoff.decrease() 46 | assert await backoff.__anext__() == 1 47 | backoff.decrease() 48 | assert await backoff.__anext__() == 1 49 | -------------------------------------------------------------------------------- /tests/test_captcha.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from uuid import UUID 3 | 4 | import pytest 5 | 6 | from starbelly.captcha import ( 7 | CaptchaSolver, 8 | captcha_doc_to_pb, 9 | captcha_pb_to_doc, 10 | ) 11 | from starbelly.starbelly_pb2 import CaptchaSolverAntigateCharacters 12 | 13 | 14 | def test_captcha_command(): 15 | captcha_doc = { 16 | 'id': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', 17 | 'name': 'Captcha #1', 18 | 'service_url': 'https://captcha.example/service.php', 19 | 'api_key': 'FAKE-API-KEY', 20 | 'require_phrase': False, 21 | 'case_sensitive': True, 22 | 'characters': 'ALPHANUMERIC', 23 | 'require_math': False, 24 | } 25 | solver = CaptchaSolver(captcha_doc) 26 | img_data = b'\x01\x02\x03\x04' 27 | command = solver.get_command(img_data) 28 | assert command['clientKey'] == 'FAKE-API-KEY' 29 | assert command['task']['type'] == 'ImageToTextTask' 30 | assert command['task']['body'] == 'AQIDBA==' # Base64 of img_data 31 | assert not command['task']['phrase'] 32 | assert command['task']['case'] 33 | assert command['task']['numeric'] == 0 34 | assert not command['task']['math'] 35 | assert command['task']['minLength'] == 0 36 | assert command['task']['maxLength'] == 0 37 | 38 | 39 | def test_captcha_doc_to_pb(): 40 | captcha_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa') 41 | captcha_doc = { 42 | 'id': str(captcha_id), 43 | 'type': 'antigate', 44 | 'created_at': datetime(2019, 1, 26, 15, 30, 0, tzinfo=timezone.utc), 45 | 'updated_at': datetime(2019, 1, 26, 15, 35, 0, tzinfo=timezone.utc), 46 | 'name': 'Captcha #1', 47 | 'service_url': 'https://captcha.example/service.php', 48 | 'api_key': 'FAKE-API-KEY', 49 | 'require_phrase': False, 50 | 'case_sensitive': True, 51 | 'characters': 'ALPHANUMERIC', 52 | 'require_math': False, 53 | } 54 | pb_captcha = captcha_doc_to_pb(captcha_doc) 55 | assert pb_captcha.name == 'Captcha #1' 56 | assert pb_captcha.solver_id == captcha_id.bytes 57 | assert pb_captcha.created_at == '2019-01-26T15:30:00+00:00' 58 | assert pb_captcha.updated_at == '2019-01-26T15:35:00+00:00' 59 | assert pb_captcha.antigate.service_url == \ 60 | 'https://captcha.example/service.php' 61 | assert pb_captcha.antigate.api_key == 'FAKE-API-KEY' 62 | assert not pb_captcha.antigate.require_phrase 63 | assert pb_captcha.antigate.case_sensitive 64 | assert pb_captcha.antigate.characters == \ 65 | CaptchaSolverAntigateCharacters.Value('ALPHANUMERIC') 66 | assert not pb_captcha.antigate.require_math 67 | 68 | captcha_doc = captcha_pb_to_doc(pb_captcha) 69 | assert captcha_doc['id'] == str(captcha_id) 70 | assert captcha_doc['name'] == 'Captcha #1' 71 | assert captcha_doc['type'] == 'antigate' 72 | assert captcha_doc['service_url'] 
== 'https://captcha.example/service.php' 73 | assert captcha_doc['api_key'] == 'FAKE-API-KEY' 74 | assert captcha_doc['require_phrase'] == False 75 | assert captcha_doc['case_sensitive'] == True 76 | assert captcha_doc['characters'] == 'ALPHANUMERIC' 77 | assert captcha_doc['require_math'] == False 78 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import starbelly.config 4 | 5 | 6 | LOCAL_INI = '''[database] 7 | host = starbelly-host 8 | db = starbelly-db 9 | user = starbelly-app 10 | password = normalpass 11 | super_user = starbelly-admin 12 | super_password = superpass''' 13 | 14 | 15 | SYSTEM_INI = '''[database] 16 | host = 17 | port = 28015 18 | db = 19 | user = 20 | password = 21 | super_user = 22 | super_password = 23 | 24 | [rate_limiter] 25 | capacity = 10000''' 26 | 27 | 28 | def test_get_config(tmp_path): 29 | # Hack: modify the module's private _root variable to point at our temp 30 | # directory. 31 | starbelly.config._root = tmp_path 32 | 33 | # Create temp configuration files. 34 | config_dir = tmp_path / 'conf' 35 | config_dir.mkdir() 36 | 37 | with (config_dir / 'local.ini').open('w') as f: 38 | f.write(LOCAL_INI) 39 | 40 | with (config_dir / 'system.ini').open('w') as f: 41 | f.write(SYSTEM_INI) 42 | 43 | # Read configuration. 44 | config = starbelly.config.get_config() 45 | import logging 46 | logging.debug('secrions %r', config.sections()) 47 | db = config['database'] 48 | rl = config['rate_limiter'] 49 | 50 | assert db['host'] == 'starbelly-host' 51 | assert db['port'] == '28015' 52 | assert db['db'] == 'starbelly-db' 53 | assert rl['capacity'] == '10000' 54 | -------------------------------------------------------------------------------- /tests/test_crawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/tests/test_crawl.py -------------------------------------------------------------------------------- /tests/test_frontier.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from unittest.mock import Mock 3 | 4 | import pytest 5 | import trio 6 | 7 | from . 
import AsyncMock 8 | from starbelly.frontier import ( 9 | CrawlFrontier, 10 | FrontierItem, 11 | FrontierExhaustionError, 12 | ) 13 | from starbelly.policy import Policy 14 | 15 | 16 | def make_policy(): 17 | created_at = datetime(2018,12,31,13,47,00) 18 | policy_doc = { 19 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 20 | 'name': 'Test', 21 | 'created_at': created_at, 22 | 'updated_at': created_at, 23 | 'authentication': { 24 | 'enabled': True, 25 | }, 26 | 'limits': { 27 | 'max_cost': 10, 28 | 'max_duration': 3600, 29 | 'max_items': 10_000, 30 | }, 31 | 'mime_type_rules': [ 32 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True}, 33 | {'save': False}, 34 | ], 35 | 'proxy_rules': [], 36 | 'robots_txt': { 37 | 'usage': 'IGNORE', 38 | }, 39 | 'url_normalization': { 40 | 'enabled': True, 41 | 'strip_parameters': ['b'], 42 | }, 43 | 'url_rules': [ 44 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES', 45 | 'pattern': '^https?://({SEED_DOMAINS})/'}, 46 | {'action': 'MULTIPLY', 'amount': 0}, 47 | ], 48 | 'user_agents': [ 49 | {'name': 'Test User Agent'} 50 | ] 51 | } 52 | return Policy(policy_doc, '1.0.0', ['https://frontier.example']) 53 | 54 | 55 | async def test_frontier_exhaustion(nursery): 56 | # Set up test fixtures 57 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 58 | db = Mock() 59 | db.any_in_flight = AsyncMock() 60 | db.get_frontier_batch = AsyncMock(return_value=list()) 61 | db.get_frontier_size = AsyncMock(return_value=5) 62 | send_channel, recv_channel = trio.open_memory_channel(0) 63 | login_manager = Mock() 64 | login_manager.login = AsyncMock() 65 | policy = make_policy() 66 | stats = dict() 67 | frontier = CrawlFrontier(job_id, db, send_channel, login_manager, policy, 68 | stats) 69 | 70 | # This test has an empty frontier, so it should raise an exhaustion error 71 | # in its run() method. 72 | with pytest.raises(FrontierExhaustionError): 73 | await frontier.run() 74 | 75 | 76 | async def test_frontier_batches(autojump_clock, nursery): 77 | # Set up test fixtures 78 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 79 | db = Mock() 80 | db.any_in_flight = AsyncMock() 81 | batch1 = [{ 82 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 83 | 'cost': 1.0, 84 | 'job_id': job_id, 85 | 'url': 'https://frontier.example/1', 86 | 'in_flight': False, 87 | },{ 88 | 'id': 'cccccccc-cccc-cccc-cccc-cccccccccccc', 89 | 'cost': 2.0, 90 | 'job_id': job_id, 91 | 'url': 'https://frontier.example/2', 92 | 'in_flight': False, 93 | }] 94 | batch2 = [{ 95 | 'id': 'dddddddd-dddd-dddd-dddd-dddddddddddd', 96 | 'cost': 3.0, 97 | 'job_id': job_id, 98 | 'url': 'https://frontier.example/3', 99 | 'in_flight': False, 100 | }] 101 | db.get_frontier_batch = AsyncMock(return_values=(batch1, batch2, [])) 102 | db.get_frontier_size = AsyncMock(return_value=5) 103 | send_channel, recv_channel = trio.open_memory_channel(0) 104 | login_manager = Mock() 105 | login_manager.login = AsyncMock() 106 | policy = make_policy() 107 | stats = dict() 108 | frontier = CrawlFrontier(job_id, db, send_channel, login_manager, policy, 109 | stats) 110 | assert repr(frontier) == '' 111 | nursery.start_soon(frontier.run) 112 | 113 | # Wait for the first item from the frontier. It should trigger the login 114 | # manager to log in to this domain, and also check the robots.txt to see if 115 | # the item is allowed. 
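    # (Note: with the custom AsyncMock in tests/__init__.py, ``call_args`` is
    # the plain tuple of positional arguments, so ``call_args[0]`` below is the
    # domain string itself. The mocked get_frontier_batch yields batch1, then
    # batch2, then an empty list, so items arrive in that order.)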
116 | item1 = await recv_channel.receive() 117 | assert login_manager.login.call_count == 1 118 | assert login_manager.login.call_args[0] == 'frontier.example' 119 | assert str(item1.url) == 'https://frontier.example/1' 120 | assert item1.cost == 1.0 121 | assert item1.job_id == job_id 122 | assert item1.frontier_id == 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb' 123 | -------------------------------------------------------------------------------- /tests/test_job.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | import logging 3 | import pickle 4 | from unittest.mock import Mock 5 | 6 | import pytest 7 | import trio 8 | 9 | from . import AsyncMock, asyncio_loop, fail_after 10 | from starbelly.frontier import FrontierExhaustionError 11 | from starbelly.job import ( 12 | PipelineTerminator, 13 | RunState, 14 | StatsTracker, 15 | CrawlManager, 16 | ) 17 | 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def make_policy_doc(): 23 | created_at = datetime(2019, 1, 1, 12, 0, 0, tzinfo=timezone.utc) 24 | return { 25 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 26 | 'name': 'Test Policy', 27 | 'created_at': created_at, 28 | 'updated_at': created_at, 29 | 'authentication': {'enabled': True}, 30 | 'limits': { 31 | 'max_cost': 10, 32 | 'max_duration': 3600, 33 | 'max_items': 10_000, 34 | }, 35 | 'mime_type_rules': [ 36 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True}, 37 | {'save': False}, 38 | ], 39 | 'proxy_rules': [], 40 | 'robots_txt': { 41 | 'usage': 'IGNORE', 42 | }, 43 | 'url_normalization': { 44 | 'enabled': True, 45 | 'strip_parameters': ['PHPSESSID'], 46 | }, 47 | 'url_rules': [ 48 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES', 49 | 'pattern': '^https?://({SEED_DOMAINS})/'}, 50 | {'action': 'MULTIPLY', 'amount': 0}, 51 | ], 52 | 'user_agents': [ 53 | {'name': 'Test User Agent'} 54 | ] 55 | } 56 | 57 | 58 | @fail_after(3) 59 | async def test_start_job(asyncio_loop, nursery): 60 | # Set up fixtures 61 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 62 | policy_id = 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb' 63 | rate_limiter = Mock() 64 | rate_limiter.remove_job = AsyncMock() 65 | stats_tracker = StatsTracker(timedelta(seconds=60)) 66 | robots_txt_manager = Mock() 67 | manager_db = Mock() 68 | manager_db.clear_frontier = AsyncMock() 69 | manager_db.create_job = AsyncMock(job_id) 70 | manager_db.finish_job = AsyncMock() 71 | manager_db.get_max_sequence = AsyncMock(100) 72 | manager_db.get_policy = AsyncMock(make_policy_doc()) 73 | manager_db.run_job = AsyncMock() 74 | frontier_db = Mock() 75 | frontier_db.any_in_flight = AsyncMock(False) 76 | frontier_db.get_frontier_batch = AsyncMock({}) 77 | frontier_db.get_frontier_size = AsyncMock(0) 78 | frontier_db.run = AsyncMock() 79 | extractor_db = Mock() 80 | storage_db = Mock() 81 | login_db = Mock() 82 | crawl_manager = CrawlManager(rate_limiter, stats_tracker, 83 | robots_txt_manager, manager_db, frontier_db, extractor_db, storage_db, 84 | login_db) 85 | 86 | # Run the crawl manager and start a new job 87 | await nursery.start(crawl_manager.run) 88 | await crawl_manager.start_job('Test Job', ['https://seed.example'], 89 | ['tag1'], policy_id) 90 | 91 | # Wait for the crawler to tell us that the job is running. 
92 | recv_channel = crawl_manager.get_job_state_channel() 93 | state_event = await recv_channel.receive() 94 | assert state_event.run_state == RunState.RUNNING 95 | 96 | resources = crawl_manager.get_resource_usage() 97 | assert resources['maximum_downloads'] == 20 98 | assert resources['current_downloads'] == 0 99 | assert resources['jobs'][0]['id'] == job_id 100 | assert resources['jobs'][0]['name'] == 'Test Job' 101 | assert resources['jobs'][0]['current_downloads'] == 0 102 | 103 | # The job has an empty frontier, so it will quit immediately after starting. 104 | # Wait for the completed job state. 105 | state_event = await recv_channel.receive() 106 | assert state_event.run_state == RunState.COMPLETED 107 | 108 | # Make sure the manager interacted with other objects correctly. 109 | assert manager_db.clear_frontier.call_args[0] == job_id 110 | assert manager_db.finish_job.call_args[0] == job_id 111 | assert manager_db.finish_job.call_args[1] == RunState.COMPLETED 112 | assert manager_db.get_policy.call_args[0] == policy_id 113 | assert manager_db.run_job.call_args[0] == job_id 114 | assert frontier_db.get_frontier_batch.call_args[0] == job_id 115 | 116 | stats = stats_tracker.snapshot() 117 | assert stats[0]['id'] == job_id 118 | assert stats[0]['name'] == 'Test Job' 119 | assert stats[0]['run_state'] == RunState.COMPLETED 120 | assert stats[0]['seeds'] == ['https://seed.example'] 121 | assert stats[0]['tags'] == ['tag1'] 122 | 123 | 124 | @fail_after(3) 125 | async def test_pause_resume_cancel(asyncio_loop, nursery): 126 | # Set up fixtures 127 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 128 | created_at = datetime(2019, 1, 1, 12, 0, 0, tzinfo=timezone.utc) 129 | job_doc = { 130 | 'id': job_id, 131 | 'name': 'Test Job', 132 | 'seeds': ['https://seed1.example', 'https://seed2.example'], 133 | 'tags': [], 134 | 'run_state': RunState.PAUSED, 135 | 'old_urls': b'\x80\x03cbuiltins\nset\nq\x00]q\x01C\x10\xad\xb6\x93\x9b' 136 | b'\xac\x92\xd8\xfd\xc0\x8dJ\x94^\x8d\xe5~q\x02a\x85q\x03Rq' 137 | b'\x04.', 138 | 'started_at': created_at, 139 | 'completed_at': None, 140 | 'duration': None, 141 | 'item_count': 0, 142 | 'http_success_count': 0, 143 | 'http_error_count': 0, 144 | 'exception_count': 0, 145 | 'http_status_counts': {}, 146 | 'schedule_id': 'cccccccc-cccc-cccc-cccc-cccccccccccc', 147 | 'policy': { 148 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 149 | 'name': 'Test Policy', 150 | 'created_at': created_at, 151 | 'updated_at': created_at, 152 | 'authentication': { 153 | 'enabled': False, 154 | }, 155 | 'captcha_solver_id': None, 156 | 'limits': { 157 | 'max_cost': 10, 158 | 'max_duration': 3600, 159 | 'max_items': 10_000, 160 | }, 161 | 'mime_type_rules': [ 162 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True}, 163 | {'save': False}, 164 | ], 165 | 'proxy_rules': [], 166 | 'robots_txt': { 167 | 'usage': 'IGNORE', 168 | }, 169 | 'url_normalization': { 170 | 'enabled': True, 171 | 'strip_parameters': [], 172 | }, 173 | 'url_rules': [ 174 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES', 175 | 'pattern': '^https?://({SEED_DOMAINS})/'}, 176 | {'action': 'MULTIPLY', 'amount': 0}, 177 | ], 178 | 'user_agents': [ 179 | {'name': 'Test User Agent'} 180 | ], 181 | }, 182 | } 183 | 184 | rate_limiter = Mock() 185 | rate_limiter.remove_job = AsyncMock() 186 | stats_tracker = StatsTracker(timedelta(seconds=60)) 187 | robots_txt_manager = Mock() 188 | manager_db = Mock() 189 | manager_db.clear_frontier = AsyncMock() 190 | manager_db.create_job = AsyncMock(job_id) 191 | 
manager_db.finish_job = AsyncMock() 192 | manager_db.get_max_sequence = AsyncMock(100) 193 | manager_db.get_policy = AsyncMock(make_policy_doc()) 194 | manager_db.resume_job = AsyncMock(job_doc) 195 | manager_db.pause_job = AsyncMock() 196 | manager_db.run_job = AsyncMock() 197 | frontier_db = Mock() 198 | frontier_db.any_in_flight = AsyncMock(True) 199 | frontier_db.get_frontier_batch = AsyncMock({}) 200 | frontier_db.get_frontier_size = AsyncMock(0) 201 | frontier_db.run = AsyncMock() 202 | extractor_db = Mock() 203 | storage_db = Mock() 204 | login_db = Mock() 205 | crawl_manager = CrawlManager(rate_limiter, stats_tracker, 206 | robots_txt_manager, manager_db, frontier_db, extractor_db, storage_db, 207 | login_db) 208 | 209 | # Run the crawl manager and start a new job 210 | await nursery.start(crawl_manager.run) 211 | await crawl_manager.start_job(job_doc['name'], job_doc['seeds'], 212 | job_doc['tags'], job_doc['policy']['id']) 213 | 214 | # Wait for the crawler to tell us that the job is running. 215 | recv_channel = crawl_manager.get_job_state_channel() 216 | state_event = await recv_channel.receive() 217 | assert state_event.run_state == RunState.RUNNING 218 | 219 | # Now pause and wait for the paused event. 220 | await crawl_manager.pause_job(job_id) 221 | state_event = await recv_channel.receive() 222 | assert state_event.run_state == RunState.PAUSED 223 | assert manager_db.pause_job.call_args[0] == job_id 224 | # There are two "old URLs": the seed URLs. 225 | assert len(pickle.loads(manager_db.pause_job.call_args[1])) == 2 226 | assert stats_tracker.snapshot()[0]['run_state'] == RunState.PAUSED 227 | 228 | # Now resume and wait for the running event. 229 | await crawl_manager.resume_job(job_id) 230 | state_event = await recv_channel.receive() 231 | assert state_event.run_state == RunState.RUNNING 232 | assert manager_db.resume_job.call_args[0] == job_id 233 | 234 | # Now cancel and wait for the cancelled event 235 | await crawl_manager.cancel_job(job_id) 236 | state_event = await recv_channel.receive() 237 | assert state_event.run_state == RunState.CANCELLED 238 | assert manager_db.finish_job.call_args[0] == job_id 239 | assert manager_db.finish_job.call_args[1] == RunState.CANCELLED 240 | -------------------------------------------------------------------------------- /tests/test_login.py: -------------------------------------------------------------------------------- 1 | from base64 import b64decode 2 | from datetime import datetime, timezone 3 | from functools import partial 4 | from unittest.mock import Mock 5 | 6 | from aiohttp import CookieJar 7 | import trio 8 | from yarl import URL 9 | 10 | from . 
import asyncio_loop, AsyncMock 11 | from starbelly.captcha import CaptchaSolver 12 | from starbelly.downloader import Downloader, DownloadResponse 13 | from starbelly.login import LoginManager 14 | from starbelly.policy import Policy 15 | 16 | 17 | def make_policy(captcha_port=80): 18 | policy_doc = { 19 | 'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f7', 20 | 'name': 'Test', 21 | 'created_at': datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc), 22 | 'updated_at': datetime(2019, 1, 28, 14, 26, 0, tzinfo=timezone.utc), 23 | 'authentication': { 24 | 'enabled': False, 25 | }, 26 | 'captcha_solver': { 27 | 'id': '01b60eeb-2ac9-4f41-9b0c-47dcbcf637f8', 28 | 'name': 'Example CAPTCHA', 29 | 'service_url': 'http://127.0.0.1:{}'.format(captcha_port), 30 | 'api_key': None, 31 | 'require_phrase': False, 32 | 'case_sensitive': True, 33 | 'characters': 'ALPHANUMERIC', 34 | 'require_math': False, 35 | }, 36 | 'limits': { 37 | 'max_cost': 10, 38 | 'max_duration': 3600, 39 | 'max_items': 10_000, 40 | }, 41 | 'mime_type_rules': [ 42 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True}, 43 | {'save': False}, 44 | ], 45 | 'proxy_rules': [], 46 | 'robots_txt': { 47 | 'usage': 'IGNORE', 48 | }, 49 | 'url_normalization': { 50 | 'enabled': True, 51 | 'strip_parameters': [], 52 | }, 53 | 'url_rules': [ 54 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES', 55 | 'pattern': '^https?://({SEED_DOMAINS})/'}, 56 | {'action': 'MULTIPLY', 'amount': 0}, 57 | ], 58 | 'user_agents': [ 59 | {'name': 'Test User Agent'} 60 | ] 61 | } 62 | return Policy(policy_doc, '1.0.0', ['https://login.example']) 63 | 64 | 65 | async def test_login_form(): 66 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 67 | db = Mock() 68 | login = { 69 | 'domain': 'login.example', 70 | 'login_url': 'https://login.example/index', 71 | 'users': [{'username': 'john', 'password': 'fake'}] 72 | } 73 | db.get_login = AsyncMock(return_value=login) 74 | policy = make_policy() 75 | downloader = Mock() 76 | html1 = \ 77 | b''' 78 | Login Test 79 | 80 |
81 | 82 | 83 | 84 | 85 | ''' 86 | response1 = DownloadResponse( 87 | frontier_id='bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 88 | cost=1.0, 89 | url='https://login.example', 90 | canonical_url='https://login.example', 91 | content_type='text/html', 92 | body=html1, 93 | started_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc), 94 | completed_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc), 95 | exception=None, 96 | status_code=200, 97 | headers=dict() 98 | ) 99 | response2 = DownloadResponse( 100 | frontier_id='cccccccc-cccc-cccc-cccc-cccccccccccc', 101 | cost=1.0, 102 | url='https://login.example', 103 | canonical_url='https://login.example', 104 | content_type='text/html', 105 | body=None, 106 | started_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc), 107 | completed_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc), 108 | exception=None, 109 | status_code=200, 110 | headers=dict() 111 | ) 112 | downloader.download = AsyncMock(return_values=(response1, response2)) 113 | login_manager = LoginManager(job_id, db, policy, downloader) 114 | await login_manager.login('login.example') 115 | assert downloader.download.call_count == 2 116 | request = downloader.download.call_args[0] 117 | assert request.method == 'POST' 118 | assert str(request.url) == 'https://login.example/login' 119 | assert request.form_data['username'] == 'john' 120 | assert request.form_data['password'] == 'fake' 121 | 122 | 123 | async def test_login_with_captcha(asyncio_loop, mocker, nursery): 124 | # Create a CAPTCHA server 125 | conn_count = 0 126 | async def handler(stream): 127 | nonlocal conn_count 128 | if conn_count == 0: 129 | request = await stream.receive_some(4096) 130 | assert request.startswith(b'POST /createTask HTTP/1.1\r\n') 131 | await stream.send_all( 132 | b'HTTP/1.1 200 OK\r\n' 133 | b'Content-type: application/json\r\n' 134 | b'\r\n' 135 | b'{"errorId": 0, "taskId": 28278116}\r\n' 136 | b'\r\n' 137 | ) 138 | else: 139 | request = await stream.receive_some(4096) 140 | assert request.startswith(b'POST /getTaskResult HTTP/1.1\r\n') 141 | await stream.send_all( 142 | b'HTTP/1.1 200 OK\r\n' 143 | b'Content-type: application/json\r\n' 144 | b'\r\n' 145 | b'{"errorId": 0, "taskId": 28278116, "status": "ready",\r\n' 146 | b' "solution": {"text": "ABCD1234"}}\r\n' 147 | b'\r\n' 148 | ) 149 | conn_count += 1 150 | serve_tcp = partial(trio.serve_tcp, handler, port=0, host='127.0.0.1') 151 | http_server = await nursery.start(serve_tcp) 152 | addr, port = http_server[0].socket.getsockname() 153 | 154 | # Patch out asyncio sleep so that the test completes quickly. 
155 | sleep_mock = mocker.patch('asyncio.sleep', new=AsyncMock()) 156 | 157 | # Create test fixtures 158 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 159 | db = Mock() 160 | login = { 161 | 'domain': 'login.example', 162 | 'login_url': 'https://login.example/index', 163 | 'users': [{'username': 'john', 'password': 'fake'}] 164 | } 165 | db.get_login = AsyncMock(return_value=login) 166 | policy = make_policy(port) 167 | downloader = Mock() 168 | html1 = \ 169 | b''' 170 | Login Test 171 | 172 | 173 | 174 | 175 | CAPTCHA 176 | 177 | 178 | 179 | ''' 180 | response1 = DownloadResponse( 181 | frontier_id='bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 182 | cost=1.0, 183 | url='https://login.example/index', 184 | canonical_url='https://login.example/index', 185 | content_type='text/html', 186 | body=html1, 187 | started_at=datetime(2019, 2, 1, 10, 2, 0, tzinfo=timezone.utc), 188 | completed_at=datetime(2019, 2, 1, 10, 2, 1, tzinfo=timezone.utc), 189 | exception=None, 190 | status_code=200, 191 | headers=dict() 192 | ) 193 | img_data = b64decode('iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJ' 194 | 'AAAACklEQVR4nGMAAQAABQABDQottAAAAABJRU5ErkJggg==') 195 | response2 = DownloadResponse( 196 | frontier_id='cccccccc-cccc-cccc-cccc-cccccccccccc', 197 | cost=1.0, 198 | url='https://login.example/get-captcha', 199 | canonical_url='https://login.example/get-captcha', 200 | content_type='image/png', 201 | body=img_data, 202 | started_at=datetime(2019, 2, 1, 10, 2, 4, tzinfo=timezone.utc), 203 | completed_at=datetime(2019, 2, 1, 10, 2, 5, tzinfo=timezone.utc), 204 | exception=None, 205 | status_code=200, 206 | headers=dict() 207 | ) 208 | response3 = DownloadResponse( 209 | frontier_id='cccccccc-cccc-cccc-cccc-cccccccccccc', 210 | cost=1.0, 211 | url='https://login.example/login', 212 | canonical_url='https://login.example/login', 213 | content_type='text/plain', 214 | body=b'200 OK', 215 | started_at=datetime(2019, 2, 1, 10, 2, 2, tzinfo=timezone.utc), 216 | completed_at=datetime(2019, 2, 1, 10, 2, 3, tzinfo=timezone.utc), 217 | exception=None, 218 | status_code=200, 219 | headers=dict() 220 | ) 221 | downloader.download = AsyncMock(return_values=(response1, response2, 222 | response3)) 223 | 224 | # Run the test 225 | login_manager = LoginManager(job_id, db, policy, downloader) 226 | await login_manager.login('login.example') 227 | assert downloader.download.call_count == 3 228 | request = downloader.download.call_args[0] 229 | assert request.method == 'POST' 230 | assert str(request.url) == 'https://login.example/login' 231 | assert request.form_data['username'] == 'john' 232 | assert request.form_data['password'] == 'fake' 233 | assert request.form_data['captcha'] == 'ABCD1234' 234 | -------------------------------------------------------------------------------- /tests/test_rate_limiter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import trio 3 | 4 | from . import assert_min_elapsed, assert_max_elapsed 5 | from starbelly.rate_limiter import ( 6 | Expiry, 7 | get_domain_token, 8 | GLOBAL_RATE_LIMIT_TOKEN, 9 | RateLimiter, 10 | ) 11 | from starbelly.downloader import DownloadRequest 12 | 13 | 14 | def make_request(job_id, url): 15 | ''' Make a download request object. 
''' 16 | return DownloadRequest( 17 | frontier_id='aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', 18 | job_id=job_id, 19 | method='GET', 20 | url=url, 21 | form_data=None, 22 | cost=1.0 23 | ) 24 | 25 | 26 | def test_expiry_repr(): 27 | token = b'\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff' 28 | expiry = Expiry(time=1.5, token=token) 29 | assert repr(expiry) == \ 30 | 'Expiry(time=1.500, token=00112233445566778899aabbccddeeff)' 31 | 32 | 33 | def test_compare_expiry_to_expiry(): 34 | token = b'\x01' * 16 35 | expiry1 = Expiry(time=1, token=token) 36 | expiry2 = Expiry(time=1, token=token) 37 | expiry3 = Expiry(time=2, token=token) 38 | assert expiry1 == expiry2 39 | assert expiry2 < expiry3 40 | 41 | 42 | def test_compare_expiry_to_float(): 43 | token = b'\x01' * 16 44 | expiry1 = Expiry(time=2, token=token) 45 | assert expiry1 > 1.5 46 | assert 1.5 < expiry1 47 | assert expiry1 == 2.0 48 | 49 | 50 | async def test_one_request(nursery): 51 | job_id = '123e4567-e89b-12d3-a456-426655440001' 52 | rl = RateLimiter(1) 53 | request_send = rl.get_request_channel() 54 | reset_send = rl.get_reset_channel() 55 | job_recv = rl.add_job(job_id) 56 | assert rl.job_count == 1 57 | assert rl.item_count == 0 58 | nursery.start_soon(rl.run) 59 | request = make_request(job_id, 'http://domain.example') 60 | await request_send.send(request) 61 | job_request = await job_recv.receive() 62 | assert job_request is request 63 | 64 | 65 | async def test_two_requests_different_domains(nursery): 66 | ''' 67 | The requests are for separate domains, so the rate limiter will emit both 68 | requests without delay. 69 | ''' 70 | job_id = '123e4567-e89b-12d3-a456-426655440001' 71 | rl = RateLimiter(2) 72 | rl.set_rate_limit(GLOBAL_RATE_LIMIT_TOKEN, 10) 73 | request_send = rl.get_request_channel() 74 | reset_send = rl.get_reset_channel() 75 | job_recv = rl.add_job(job_id) 76 | nursery.start_soon(rl.run) 77 | request1 = make_request(job_id, 'http://domain1.example') 78 | request2 = make_request(job_id, 'http://domain2.example') 79 | await request_send.send(request1) 80 | await request_send.send(request2) 81 | with assert_max_elapsed(seconds=1): 82 | job_request1 = await job_recv.receive() 83 | job_request2 = await job_recv.receive() 84 | assert job_request1 is request1 85 | assert job_request2 is request2 86 | 87 | 88 | async def test_two_requests_same_domain(autojump_clock, nursery): 89 | ''' 90 | The requests are for the same domain, so the rate limiter will impose a 91 | 10 second delay between the reset of the first request and issuing the 92 | second request. 
93 | ''' 94 | job_id = '123e4567-e89b-12d3-a456-426655440001' 95 | rl = RateLimiter(2) 96 | rl.set_rate_limit(GLOBAL_RATE_LIMIT_TOKEN, 10) 97 | request_send = rl.get_request_channel() 98 | reset_send = rl.get_reset_channel() 99 | job_recv = rl.add_job(job_id) 100 | nursery.start_soon(rl.run) 101 | request1 = make_request(job_id, 'http://domain.example/1') 102 | request2 = make_request(job_id, 'http://domain.example/2') 103 | await request_send.send(request1) 104 | await request_send.send(request2) 105 | job_request1 = await job_recv.receive() 106 | with assert_min_elapsed(seconds=10): 107 | await reset_send.send(job_request1.url) 108 | job_request2 = await job_recv.receive() 109 | assert job_request1 is request1 110 | assert job_request2 is request2 111 | 112 | 113 | async def test_rate_limiter_over_capacity(autojump_clock, nursery): 114 | ''' 115 | The rate limiter will be over capacity when the 3rd item is added and will 116 | block for 5 seconds until ``remove_one_request()`` reads one item from the 117 | rate limiter. 118 | ''' 119 | job_id = '123e4567-e89b-12d3-a456-426655440001' 120 | rl = RateLimiter(2) 121 | rl.set_rate_limit(GLOBAL_RATE_LIMIT_TOKEN, 10) 122 | request_send = rl.get_request_channel() 123 | reset_send = rl.get_reset_channel() 124 | job_recv = rl.add_job(job_id) 125 | nursery.start_soon(rl.run) 126 | request1 = make_request(job_id, 'http://domain1.example') 127 | request2 = make_request(job_id, 'http://domain2.example') 128 | request3 = make_request(job_id, 'http://domain3.example') 129 | await request_send.send(request1) 130 | await request_send.send(request2) 131 | 132 | async def read_one_request(when): 133 | await trio.sleep(when) 134 | await job_recv.receive() 135 | 136 | with assert_min_elapsed(seconds=5): 137 | async with trio.open_nursery() as inner: 138 | inner.start_soon(read_one_request, 5) 139 | await request_send.send(request3) 140 | 141 | 142 | async def test_token_limit_supercedes_global_limit(autojump_clock, nursery): 143 | ''' 144 | If a limit is set on a domain token, that rate limit is used, otherwise the 145 | global rate limit is used. 146 | ''' 147 | job_id = '123e4567-e89b-12d3-a456-426655440001' 148 | rl = RateLimiter(2) 149 | request_send = rl.get_request_channel() 150 | reset_send = rl.get_reset_channel() 151 | token = get_domain_token('domain.example') 152 | rl.set_rate_limit(token, 2) 153 | rl.set_rate_limit(GLOBAL_RATE_LIMIT_TOKEN, 10) 154 | job_recv = rl.add_job(job_id) 155 | nursery.start_soon(rl.run) 156 | 157 | # These two requests should take ~2 seconds due to the domain rate limit. 158 | with assert_max_elapsed(seconds=2.5): 159 | request1 = make_request(job_id, 'http://domain.example/1') 160 | request2 = make_request(job_id, 'http://domain.example/2') 161 | await request_send.send(request1) 162 | await request_send.send(request2) 163 | await job_recv.receive() 164 | await reset_send.send(request1.url) 165 | await job_recv.receive() 166 | await reset_send.send(request2.url) 167 | 168 | # Now if we delete the domain rate limit, the next two requests should take 169 | # 10 seconds due to the global rate limit. 
170 | await trio.sleep(2) 171 | rl.delete_rate_limit(token) 172 | with assert_min_elapsed(seconds=10): 173 | request3 = make_request(job_id, 'http://domain.example/3') 174 | request4 = make_request(job_id, 'http://domain.example/4') 175 | await request_send.send(request3) 176 | await request_send.send(request4) 177 | await job_recv.receive() 178 | await reset_send.send(request3.url) 179 | await job_recv.receive() 180 | await reset_send.send(request4.url) 181 | 182 | # Deleting a non-existent token has no effect: 183 | rl.delete_rate_limit(token) 184 | 185 | 186 | async def test_skip_expired_limit_if_nothing_pending(autojump_clock, nursery): 187 | ''' The rate limit for domain1 will expire before the rate limit for 188 | domain2, but since domain1 has no pending requests, it will wait for domain2 189 | to become available again. ''' 190 | job_id = '123e4567-e89b-12d3-a456-426655440001' 191 | rl = RateLimiter(2) 192 | request_send = rl.get_request_channel() 193 | reset_send = rl.get_reset_channel() 194 | token1 = get_domain_token('domain1.example') 195 | token2 = get_domain_token('domain2.example') 196 | rl.set_rate_limit(token1, 1) 197 | rl.set_rate_limit(token2, 2) 198 | job_recv = rl.add_job(job_id) 199 | nursery.start_soon(rl.run) 200 | request1 = make_request(job_id, 'http://domain1.example') 201 | request2a = make_request(job_id, 'http://domain2.example/a') 202 | request2b = make_request(job_id, 'http://domain2.example/b') 203 | await request_send.send(request1) 204 | await request_send.send(request2a) 205 | await request_send.send(request2b) 206 | with assert_min_elapsed(seconds=2): 207 | job_request1 = await job_recv.receive() 208 | await reset_send.send(request1.url) 209 | job_request2a = await job_recv.receive() 210 | await reset_send.send(request2a.url) 211 | job_request2b = await job_recv.receive() 212 | await reset_send.send(request2b.url) 213 | assert job_request1 is request1 214 | assert job_request2a is request2a 215 | assert job_request2b is request2b 216 | 217 | 218 | async def test_push_after_get(autojump_clock, nursery): 219 | ''' If a job is waiting for a request but nothing is pending, then the rate 220 | limiter will wait until it receives a request. ''' 221 | job_id = '123e4567-e89b-12d3-a456-426655440001' 222 | rl = RateLimiter(2) 223 | rl.set_rate_limit(GLOBAL_RATE_LIMIT_TOKEN, 10) 224 | request_send = rl.get_request_channel() 225 | reset_send = rl.get_reset_channel() 226 | job_recv = rl.add_job(job_id) 227 | nursery.start_soon(rl.run) 228 | request = make_request(job_id, 'http://domain.example') 229 | 230 | async def wait_to_send(): 231 | await trio.sleep(2) 232 | await request_send.send(request) 233 | 234 | with assert_min_elapsed(seconds=2): 235 | nursery.start_soon(wait_to_send) 236 | job_request = await job_recv.receive() 237 | assert job_request is request 238 | 239 | -------------------------------------------------------------------------------- /tests/test_resource_monitor.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from datetime import datetime, timezone 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | import trio 7 | 8 | from . 
import assert_min_elapsed, assert_max_elapsed 9 | from starbelly.rate_limiter import RateLimiter 10 | from starbelly.resource_monitor import ResourceMonitor 11 | 12 | 13 | @pytest.fixture 14 | def rate_limiter(): 15 | return RateLimiter(1) 16 | 17 | 18 | async def test_history(autojump_clock, nursery, rate_limiter): 19 | ''' 20 | Set interval to 2 seconds and run for 11 seconds. This should produce 21 | 6 measurements. 22 | 23 | Note: this test doesn't mock out psutil, so it also ensures that we are 24 | consuming the psutil API correctly. 25 | ''' 26 | job1_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 27 | crawl_resources_fn = lambda: { 28 | 'current_downloads': 3, 29 | 'maximum_downloads': 10, 30 | 'jobs': [{ 31 | 'id': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', 32 | 'name': 'Test Job', 33 | 'current_downloads': 3, 34 | }], 35 | } 36 | rm = ResourceMonitor(interval=2, buffer_size=300, 37 | crawl_resources_fn=crawl_resources_fn, rate_limiter=rate_limiter) 38 | nursery.start_soon(rm.run) 39 | await trio.sleep(11) 40 | history1 = list(rm.history()) 41 | assert len(history1) == 6 42 | # We should also be able to get a subset of history 43 | history2 = list(rm.history(3)) 44 | assert len(history2) == 3 45 | assert history1[0]['timestamp'] < history2[0]['timestamp'] 46 | 47 | 48 | async def test_measurement(autojump_clock, nursery, mocker, 49 | rate_limiter): 50 | ''' Mock out inputs and check that the resource monitor formats the data 51 | correctly. ''' 52 | # Set up patches 53 | Consumed = namedtuple('Memory', 'used total') 54 | Mount = namedtuple('Disk', 'mountpoint') 55 | Nic = namedtuple('Nic', 'bytes_sent bytes_recv') 56 | psutil_cpu_percent = mocker.patch('psutil.cpu_percent') 57 | psutil_cpu_percent.return_value = [12.3, 45.6] 58 | ps_util_virtual_memory = mocker.patch('psutil.virtual_memory') 59 | ps_util_virtual_memory.return_value = Consumed(100_000, 200_000) 60 | psutil_disk_partitions = mocker.patch('psutil.disk_partitions') 61 | psutil_disk_partitions.return_value = [Mount('/'), Mount('/mnt/external')] 62 | psutil_disk_usage = mocker.patch('psutil.disk_usage') 63 | psutil_disk_usage.return_value = Consumed(300_000, 400_000) 64 | psutil_net_io_counters = mocker.patch('psutil.net_io_counters') 65 | psutil_net_io_counters.return_value = { 66 | 'eth0': Nic(100, 200), 67 | 'eth1': Nic(300, 400), 68 | } 69 | 70 | # The crawl resources can be instantiated right here; no mocking required. 
71 | job1_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' 72 | job2_id = 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb' 73 | crawl_resources_fn = lambda: { 74 | 'current_downloads': 5, 75 | 'maximum_downloads': 10, 76 | 'jobs': [{ 77 | 'id': job1_id, 78 | 'name': 'Test Job 1', 79 | 'current_downloads': 3, 80 | },{ 81 | 'id': job2_id, 82 | 'name': 'Test Job 2', 83 | 'current_downloads': 2, 84 | }], 85 | } 86 | 87 | # Run the resource monitor 88 | rm = ResourceMonitor(interval=1, buffer_size=300, 89 | crawl_resources_fn=crawl_resources_fn, rate_limiter=rate_limiter) 90 | rm_recv = rm.get_channel(channel_size=5) 91 | nursery.start_soon(rm.run) 92 | 93 | # Read one measurement: 94 | measurement = await rm_recv.receive() 95 | assert measurement['cpus'] == [12.3, 45.6] 96 | assert measurement['memory_used'] == 100_000 97 | assert measurement['memory_total'] == 200_000 98 | assert len(measurement['disks']) == 2 99 | assert measurement['disks'][0]['mount'] == '/' 100 | assert measurement['disks'][0]['used'] == 300_000 101 | assert measurement['disks'][0]['total'] == 400_000 102 | assert measurement['disks'][1]['mount'] == '/mnt/external' 103 | assert measurement['disks'][1]['used'] == 300_000 104 | assert measurement['disks'][1]['total'] == 400_000 105 | assert len(measurement['networks']) == 2 106 | assert measurement['networks'][0]['name'] == 'eth0' 107 | assert measurement['networks'][0]['sent'] == 100 108 | assert measurement['networks'][0]['received'] == 200 109 | assert measurement['networks'][1]['name'] == 'eth1' 110 | assert measurement['networks'][1]['sent'] == 300 111 | assert measurement['networks'][1]['received'] == 400 112 | assert measurement['jobs'][0]['id'] == job1_id 113 | assert measurement['jobs'][0]['name'] == 'Test Job 1' 114 | assert measurement['jobs'][0]['current_downloads'] == 3 115 | assert measurement['jobs'][1]['id'] == job2_id 116 | assert measurement['jobs'][1]['name'] == 'Test Job 2' 117 | assert measurement['jobs'][1]['current_downloads'] == 2 118 | assert measurement['current_downloads'] == 5 119 | assert measurement['maximum_downloads'] == 10 120 | assert measurement['rate_limiter'] == 0 121 | 122 | 123 | async def test_slow_channel(autojump_clock, nursery, rate_limiter): 124 | ''' If there are two subscribers to the resource monitor and one is slow, it 125 | will not prevent delivery to the other subscriber. ''' 126 | crawl_resources_fn = lambda: { 127 | 'current_downloads': 3, 128 | 'maximum_downloads': 10, 129 | 'jobs': [{ 130 | 'id': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', 131 | 'name': 'Test Job', 132 | 'current_downloads': 3, 133 | }], 134 | } 135 | 136 | rm = ResourceMonitor(interval=1, buffer_size=300, 137 | crawl_resources_fn=crawl_resources_fn, rate_limiter=rate_limiter) 138 | slow_recv = rm.get_channel(channel_size=1) 139 | fast_recv = rm.get_channel(channel_size=1) 140 | nursery.start_soon(rm.run) 141 | # The fast reader gets one measurement per second even though the slow 142 | # reader is blocked. 143 | with assert_min_elapsed(3), assert_max_elapsed(4): 144 | for _ in range(4): 145 | await fast_recv.receive() 146 | # Now we close the slow reader and make sure the fast reader still gets 147 | # measurements. 
148 | await slow_recv.aclose() 149 | with assert_min_elapsed(3), assert_max_elapsed(4): 150 | for _ in range(3): 151 | await fast_recv.receive() 152 | 153 | -------------------------------------------------------------------------------- /tests/test_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from itertools import count 3 | from unittest.mock import Mock 4 | 5 | from yarl import URL 6 | import trio 7 | 8 | from . import AsyncMock 9 | from starbelly.downloader import DownloadResponse 10 | from starbelly.policy import Policy 11 | from starbelly.storage import CrawlStorage 12 | 13 | 14 | def make_policy(): 15 | ''' Make a sample policy. ''' 16 | dt = datetime(2018,12,31,13,47,00) 17 | doc = { 18 | 'id': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 19 | 'name': 'Test', 20 | 'created_at': dt, 21 | 'updated_at': dt, 22 | 'authentication': { 23 | 'enabled': False, 24 | }, 25 | 'limits': { 26 | 'max_cost': 10, 27 | 'max_duration': 3600, 28 | 'max_items': 10_000, 29 | }, 30 | 'mime_type_rules': [ 31 | {'match': 'MATCHES', 'pattern': '^text/', 'save': True}, 32 | {'save': False}, 33 | ], 34 | 'proxy_rules': [], 35 | 'robots_txt': { 36 | 'usage': 'IGNORE', 37 | }, 38 | 'url_normalization': { 39 | 'enabled': True, 40 | 'strip_parameters': [], 41 | }, 42 | 'url_rules': [ 43 | {'action': 'ADD', 'amount': 1, 'match': 'MATCHES', 44 | 'pattern': '^https?://({SEED_DOMAINS})/'}, 45 | {'action': 'MULTIPLY', 'amount': 0}, 46 | ], 47 | 'user_agents': [ 48 | {'name': 'Test User Agent'} 49 | ] 50 | } 51 | return Policy(doc, '1.0.0', ['https://seeds.example']) 52 | 53 | 54 | async def test_storage(nursery): 55 | job_id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaa' 56 | db = Mock() 57 | db.save_response = AsyncMock() 58 | db.update_job_stats = AsyncMock() 59 | test_send, storage_receive = trio.open_memory_channel(0) 60 | storage_send, test_receive = trio.open_memory_channel(0) 61 | policy = make_policy() 62 | sequence = count(start=100) 63 | storage = CrawlStorage(job_id, db, storage_send, storage_receive, policy, 64 | sequence) 65 | assert repr(storage) == '' 66 | nursery.start_soon(storage.run) 67 | started_at = datetime(2019, 1, 1, 12, 0, 0, tzinfo=timezone.utc) 68 | completed_at = datetime(2019, 1, 1, 12, 0, 3, tzinfo=timezone.utc) 69 | response = DownloadResponse( 70 | frontier_id='cccccccc-cccc-cccc-cccc-cccccccccccc', 71 | cost=1.0, 72 | url='https://storage.example/', 73 | canonical_url='https://storage.example/', 74 | content_type='text/plain', 75 | body=b'Hello, world!', 76 | started_at=started_at, 77 | completed_at=completed_at, 78 | exception=None, 79 | status_code=200, 80 | headers={'Server': 'FooServe 1.0'} 81 | ) 82 | response.duration = 3.0 83 | await test_send.send(response) 84 | response2 = await test_receive.receive() 85 | assert response is response2 86 | assert db.save_response.call_count == 1 87 | save_response_args = db.save_response.call_args 88 | assert save_response_args[0] == { 89 | 'sequence': 100, 90 | 'job_id': job_id, 91 | 'body_id': b'\x9b\xbb\xb7A\x0f\xa6FJ\x1aj!i\x19\x17\x94U', 92 | 'url': 'https://storage.example/', 93 | 'canonical_url': 'https://storage.example/', 94 | 'content_type': 'text/plain', 95 | 'cost': 1.0, 96 | 'duration': 3.0, 97 | 'headers': ['SERVER', 'FooServe 1.0'], 98 | 'is_success': True, 99 | 'status_code': 200, 100 | 'started_at': started_at, 101 | 'completed_at': completed_at, 102 | } 103 | # Note that the gzip'ed body is non deterministic, so we only check body ID 104 | # 
and is_compressed fields. 105 | assert save_response_args[1]['id'] == \ 106 | b'\x9b\xbb\xb7A\x0f\xa6FJ\x1aj!i\x19\x17\x94U' 107 | assert save_response_args[1]['is_compressed'] 108 | assert db.update_job_stats.call_count == 1 109 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HyperionGray/starbelly/163b7e48d8816c3ce77fed0e01eeaf9705c4919c/tools/__init__.py -------------------------------------------------------------------------------- /tools/clear.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clears data from the database. Only intended for developers who want to clear 3 | their environment and start from scratch. 4 | ''' 5 | import logging 6 | import os 7 | import sys 8 | 9 | from rethinkdb import RethinkDB 10 | import trio 11 | 12 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 13 | 14 | from starbelly.config import get_config 15 | 16 | 17 | r = RethinkDB() 18 | r.set_loop_type('trio') 19 | logging.basicConfig(level=logging.INFO) 20 | logger = logging.getLogger('clear.py') 21 | 22 | 23 | async def clear(conn, table): 24 | logger.info('Clearing table %s', table) 25 | query = r.table(table) 26 | 27 | if table == 'policy': 28 | # Exclude built-in policies. 29 | query = query.filter( 30 | (r.row['name'] != 'Broad Crawl') & 31 | (r.row['name'] != 'Deep Crawl') 32 | ) 33 | elif table == 'rate_limit': 34 | # Exclude global rate limit. 35 | query = query.filter(r.row['type'] != 'global') 36 | 37 | await query.delete().run(conn) 38 | 39 | 40 | async def main(): 41 | db_config = get_config()['database'] 42 | async with trio.open_nursery() as nursery: 43 | conn = await r.connect( 44 | host=db_config['host'], 45 | port=db_config['port'], 46 | db=db_config['db'], 47 | user=db_config['user'], 48 | password=db_config['password'], 49 | nursery=nursery 50 | ) 51 | await clear(conn, 'captcha_solver') 52 | await clear(conn, 'domain_login') 53 | await clear(conn, 'frontier') 54 | await clear(conn, 'job') 55 | await clear(conn, 'job_schedule') 56 | await clear(conn, 'policy') 57 | await clear(conn, 'rate_limit') 58 | await clear(conn, 'response') 59 | await clear(conn, 'response_body') 60 | await clear(conn, 'robots_txt') 61 | await conn.close() 62 | 63 | 64 | if __name__ == '__main__': 65 | trio.run(main) 66 | -------------------------------------------------------------------------------- /tools/delete_dangling_bodies.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Response bodies are deduplicated and stored separate from response headers. When 3 | crawls are deleted, the response bodies remain behind. This script finds 4 | "dangling" response bodies—i.e. bodies that don't match to any existing 5 | response—and deletes them. 6 | 7 | This is a slow process, since it involves looking at every single response body. 
8 | ''' 9 | import logging 10 | import os 11 | import sys 12 | 13 | from rethinkdb import RethinkDB 14 | 15 | import trio 16 | 17 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 18 | 19 | from starbelly.config import get_config 20 | 21 | 22 | r = RethinkDB() 23 | r.set_loop_type('trio') 24 | logging.basicConfig(level=logging.INFO) 25 | logger = logging.getLogger('clear.py') 26 | 27 | 28 | async def clear_dangling_response_bodies(conn): 29 | ''' 30 | Response bodies are deduplicated and stored in a separate table from 31 | response metadata. When a job is deleted, only the response metadata is 32 | removed. This method finds response bodies that are dangling (not 33 | referred to by any existing response) and removes them. 34 | 35 | Note: this is a very slow operation because it has to iterate over all 36 | existing response bodies. This should only be run periodically. 37 | 38 | :param db_pool: A RethinkDB connection pool. 39 | ''' 40 | def dangling(body): 41 | responses = r.table('response').get_all(body['id'], index='body_id') 42 | return responses.count().eq(0) 43 | 44 | await ( 45 | r.table('response_body') 46 | .order_by('id') 47 | .filter(dangling) 48 | .delete() 49 | .run(conn) 50 | ) 51 | 52 | 53 | async def main(): 54 | db_config = get_config()['database'] 55 | async with trio.open_nursery() as nursery: 56 | conn = await r.connect( 57 | host=db_config['host'], 58 | port=db_config['port'], 59 | db=db_config['db'], 60 | user=db_config['user'], 61 | password=db_config['password'], 62 | nursery=nursery 63 | ) 64 | await clear_dangling_response_bodies(conn) 65 | 66 | 67 | if __name__ == '__main__': 68 | trio.run(main) 69 | -------------------------------------------------------------------------------- /tools/shell.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A Python REPL for Starbelly. 3 | 4 | This "shell" imports useful modules and sets up the application 5 | configuration, database pool, and other useful features. This shell 6 | is intended for use with Python's interactive flag, i.e.: 7 | 8 | $ python3 -im tools.shell 9 | >>> config['database']['user'] 10 | 'starbelly-app' 11 | 12 | You can also load this in Jupyter Notebook by running this in the first cell: 13 | 14 | from tools.shell import * 15 | 16 | The shell is handy for development and debugging in order to execute 17 | sections of Starbelly without running the entire server. 18 | ''' 19 | 20 | import functools 21 | import logging 22 | import os 23 | import sys 24 | 25 | from IPython.terminal.embed import InteractiveShellEmbed 26 | from rethinkdb import RethinkDB 27 | import trio 28 | 29 | sys.path.append(os.path.dirname(os.path.dirname(__file__))) 30 | 31 | import starbelly.config 32 | from starbelly.version import __version__ 33 | 34 | 35 | # Globals exposed in the shell: 36 | r = RethinkDB() 37 | r.set_loop_type('trio') 38 | logger = None 39 | config = None 40 | 41 | 42 | def run_query(query, super_user=False): 43 | ''' Run ``query`` on RethinkDB and return result. 
''' 44 | async def async_query(): 45 | db_config = config['database'] 46 | kwargs = { 47 | 'host': db_config['host'], 48 | 'port': db_config['port'], 49 | 'db': db_config['db'], 50 | 'user': db_config['user'], 51 | 'password': db_config['password'], 52 | } 53 | if super_user: 54 | kwargs['user'] = db_config['super_user'] 55 | kwargs['password'] = db_config['super_password'] 56 | async with trio.open_nursery() as nursery: 57 | kwargs['nursery'] = nursery 58 | connect_db = functools.partial(r.connect, **kwargs) 59 | conn = await connect_db() 60 | try: 61 | result = await query.run(conn) 62 | finally: 63 | await conn.close() 64 | return result 65 | 66 | return trio.run(async_query) 67 | 68 | 69 | def list_results(results): 70 | ''' 71 | Convert query results to list, even if the query resulted in a cursor. ''' 72 | async def async_list_results(): 73 | if isinstance(results, list): 74 | return results.copy() 75 | elif isinstance(results, r.Cursor): 76 | l = list() 77 | async with results: 78 | async for item in results: 79 | l.append(item) 80 | return l 81 | else: 82 | type_ = type(results) 83 | logger.error(f'RethinkDB UNKNOWN TYPE: {type_}') 84 | return None 85 | 86 | return trio.run(async_list_results) 87 | 88 | 89 | def print_results(results): 90 | ''' 91 | Pretty print RethinkDB query results. 92 | 93 | This method correctly handles different types of results, such as a 94 | cursor, list, etc. 95 | ''' 96 | async def async_print_results(): 97 | MAX_ITEMS = 100 98 | INDENT = ' ' 99 | if isinstance(results, list): 100 | len_ = len(results) 101 | print(f'RethinkDB List (len={len_}): [') 102 | for item in results[:MAX_ITEMS]: 103 | print(f'{INDENT}{item},') 104 | if len_ > MAX_ITEMS: 105 | print(f'{INDENT}...') 106 | print(']') 107 | elif isinstance(results, r.Cursor): 108 | print('RethinkDB Cursor: [') 109 | item_count = 0 110 | try: 111 | async for item in results: 112 | if item_count > MAX_ITEMS: 113 | print(f'{INDENT}...') 114 | print(f'{INDENT}{item},') 115 | item_count += 1 116 | finally: 117 | await results.close() 118 | print(']') 119 | else: 120 | type_ = type(results) 121 | logger.error(f'RethinkDB UNKNOWN TYPE: {type_}') 122 | 123 | trio.run(async_print_results) 124 | 125 | 126 | def setup(): 127 | ''' Set up configuration and logging. ''' 128 | global config, logger 129 | log_format = '%(asctime)s [%(name)s] %(levelname)s: %(message)s' 130 | log_date_format = '%H:%M:%S' 131 | log_formatter = logging.Formatter(log_format, log_date_format) 132 | log_handler = logging.StreamHandler(sys.stdout) 133 | log_handler.setFormatter(log_formatter) 134 | logger = logging.getLogger('tools.shell') 135 | logger.addHandler(log_handler) 136 | logger.setLevel(logging.INFO) 137 | config = starbelly.config.get_config() 138 | 139 | 140 | def main(): 141 | ''' Run IPython shell. ''' 142 | ipy_shell = InteractiveShellEmbed( 143 | banner1=f'IPython Shell: Starbelly v{__version__}') 144 | ipy_shell.magic('autoawait trio') 145 | ipy_shell() 146 | 147 | 148 | setup() 149 | if __name__ == '__main__': 150 | main() 151 | else: 152 | print(f'Starbelly v{__version__} Shell') 153 | 154 | --------------------------------------------------------------------------------
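
The shell module above is easiest to follow with a concrete session. The sketch below is hypothetical: it assumes a running RethinkDB instance that matches the settings in conf/local.ini, and it borrows the 'policy' table name from tools/clear.py. coerce_to('array') is used so that run_query() hands back a plain list rather than a cursor bound to the connection that run_query() closes before returning, which keeps the result safe to pass to print_results().

    $ python3 -im tools.shell
    >>> # Count the stored crawl policies.
    >>> run_query(r.table('policy').count())
    >>> # Fetch a few policy summaries as a plain list and pretty-print them.
    >>> policies = run_query(r.table('policy').pluck('id', 'name').coerce_to('array'))
    >>> print_results(policies)

For queries that require elevated privileges, run_query() also accepts super_user=True, which switches to the super_user credentials from the database configuration.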