├── front ├── src │ ├── boot │ │ └── .gitkeep │ ├── css │ │ ├── app.scss │ │ └── quasar.variables.scss │ ├── App.vue │ ├── components │ │ ├── ScheduleChip.vue │ │ ├── ScraperJobStatusChip.vue │ │ ├── PriorityChip.vue │ │ ├── ScraperCard.vue │ │ ├── TaskLogs.vue │ │ ├── ScraperIdeComponent.vue │ │ └── ScraperJobs.vue │ ├── pages │ │ ├── NewScraperPage.vue │ │ ├── ScraperPage.vue │ │ ├── ErrorNotFound.vue │ │ ├── ScrapersPage.vue │ │ └── ScraperIde.vue │ ├── router │ │ ├── routes.js │ │ └── index.js │ ├── assets │ │ └── logo.svg │ ├── api.js │ └── layouts │ │ └── MainLayout.vue ├── .npmrc ├── .eslintignore ├── public │ └── icons │ │ └── favicon.png ├── .editorconfig ├── .vscode │ ├── extensions.json │ └── settings.json ├── .gitignore ├── README.md ├── index.html ├── jsconfig.json ├── postcss.config.js ├── package.json ├── quasar.config.js └── .eslintrc.js ├── sneakpeek ├── static │ ├── .gitkeep │ └── docs │ │ └── .gitkeep ├── session_loggers │ ├── base.py │ ├── redis_logger.py │ └── file_logger.py ├── middleware │ ├── parser.py │ ├── base.py │ ├── proxy_middleware.py │ ├── user_agent_injecter_middleware.py │ ├── requests_logging_middleware.py │ ├── robots_txt_middleware.py │ └── rate_limiter_middleware.py ├── scraper │ ├── task_handler.py │ ├── ephemeral_scraper_task_handler.py │ ├── dynamic_scraper_handler.py │ ├── in_memory_storage.py │ ├── redis_storage.py │ ├── tests │ │ ├── test_dynamic_scraper_handler.py │ │ └── test_scraper_storage.py │ └── runner.py ├── scheduler │ ├── redis_lease_storage.py │ ├── in_memory_lease_storage.py │ ├── tests │ │ └── test_lease_storage.py │ └── model.py ├── queue │ ├── tasks.py │ ├── tests │ │ ├── test_queue_storage.py │ │ ├── test_queue.py │ │ └── test_consumer.py │ ├── in_memory_storage.py │ ├── queue.py │ ├── redis_storage.py │ └── consumer.py ├── logging.py ├── tests │ └── test_metrics.py └── metrics.py ├── .gitattributes ├── .flake8 ├── .gitignore ├── .coveragerc ├── .vscode ├── extensions.json ├── settings.json └── launch.json ├── docs ├── middleware │ ├── index.rst │ ├── requests_logging_middleware.rst │ ├── robots_txt_middleware.rst │ ├── proxy_middleware.rst │ ├── user_agent_injecter_middleware.rst │ ├── rate_limiter_middleware.rst │ └── new_middleware.rst ├── Makefile ├── make.bat ├── index.rst ├── api.rst ├── deployment.rst ├── local_debugging.rst ├── conf.py ├── design.rst └── quick_start.rst ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ └── ci.yml ├── CONTRIBUTING.md ├── LICENCE ├── README.md ├── Makefile └── pyproject.toml /front/src/boot/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sneakpeek/static/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sneakpeek/static/docs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | sneakpeek/static/** linguist-documentation -------------------------------------------------------------------------------- /front/src/css/app.scss: -------------------------------------------------------------------------------- 1 | // app global css in SCSS form 2 | 
-------------------------------------------------------------------------------- /front/.npmrc: -------------------------------------------------------------------------------- 1 | # pnpm-related options 2 | shamefully-hoist=true 3 | strict-peer-dependencies=false 4 | -------------------------------------------------------------------------------- /front/.eslintignore: -------------------------------------------------------------------------------- 1 | /dist 2 | /src-capacitor 3 | /src-cordova 4 | /.quasar 5 | /node_modules 6 | .eslintrc.js 7 | -------------------------------------------------------------------------------- /front/public/icons/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flulemon/sneakpeek/HEAD/front/public/icons/favicon.png -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, W503 3 | exclude = 4 | __pycache__, 5 | .eggs, 6 | .git, 7 | .tox, 8 | .nox, 9 | build, 10 | dist, 11 | src/test/python_tests/test_data -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/* 2 | .venv 3 | *.install.stamp 4 | dist 5 | .dist 6 | **/.pytest_cache/* 7 | .pytest_cache/ 8 | .coverage 9 | htmlcov 10 | demo 11 | coverage.xml 12 | !/**/.gitkeep 13 | logs -------------------------------------------------------------------------------- /front/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /front/src/App.vue: -------------------------------------------------------------------------------- 1 | 4 | 5 | 12 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | @abstractmethod 4 | @abc.abstractmethod 5 | raise AssertionError 6 | raise NotImplementedError 7 | if __name__ == .__main__.: 8 | @entrypoint.method() 9 | pragma: no cover 10 | def __repr__ 11 | if self.debug: 12 | if settings.DEBUG 13 | if 0: 14 | class .*\bProtocol\): 15 | logger. 
-------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "esbenp.prettier-vscode", 5 | "editorconfig.editorconfig", 6 | "vue.volar", 7 | "wayou.vscode-todo-highlight" 8 | ], 9 | "unwantedRecommendations": [ 10 | "octref.vetur", 11 | "hookyqr.beautify", 12 | "dbaeumer.jshint", 13 | "ms-vscode.vscode-typescript-tslint-plugin" 14 | ] 15 | } -------------------------------------------------------------------------------- /front/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "esbenp.prettier-vscode", 5 | "editorconfig.editorconfig", 6 | "vue.volar", 7 | "wayou.vscode-todo-highlight" 8 | ], 9 | "unwantedRecommendations": [ 10 | "octref.vetur", 11 | "hookyqr.beautify", 12 | "dbaeumer.jshint", 13 | "ms-vscode.vscode-typescript-tslint-plugin" 14 | ] 15 | } -------------------------------------------------------------------------------- /front/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.bracketPairColorization.enabled": true, 3 | "editor.guides.bracketPairs": true, 4 | "editor.formatOnSave": true, 5 | "editor.defaultFormatter": "esbenp.prettier-vscode", 6 | "editor.codeActionsOnSave": [ 7 | "source.fixAll.eslint" 8 | ], 9 | "eslint.validate": [ 10 | "javascript", 11 | "javascriptreact", 12 | "typescript", 13 | "vue" 14 | ] 15 | } -------------------------------------------------------------------------------- /front/src/components/ScheduleChip.vue: -------------------------------------------------------------------------------- 1 | 4 | 21 | -------------------------------------------------------------------------------- /front/src/pages/NewScraperPage.vue: -------------------------------------------------------------------------------- 1 | 10 | 11 | 19 | -------------------------------------------------------------------------------- /front/src/pages/ScraperPage.vue: -------------------------------------------------------------------------------- 1 | 8 | 9 | 19 | -------------------------------------------------------------------------------- /front/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .thumbs.db 3 | node_modules 4 | .yarn 5 | # Quasar core related directories 6 | .quasar 7 | /dist 8 | 9 | # Cordova related directories and files 10 | /src-cordova/node_modules 11 | /src-cordova/platforms 12 | /src-cordova/plugins 13 | /src-cordova/www 14 | 15 | # Capacitor related directories and files 16 | /src-capacitor/www 17 | /src-capacitor/node_modules 18 | 19 | # Log files 20 | npm-debug.log* 21 | yarn-debug.log* 22 | yarn-error.log* 23 | 24 | # Editor directories and files 25 | .idea 26 | *.suo 27 | *.ntvs* 28 | *.njsproj 29 | *.sln 30 | -------------------------------------------------------------------------------- /docs/middleware/index.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Middleware 3 | ################# 4 | 5 | **Sneakpeek** allows you to run arbitrary code before the request and after the response has been recieved. 6 | This can be helpful if you have some common logic you want to use in your scrapers. 
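For illustration, a minimal custom middleware could subclass ``BaseMiddleware`` (defined in ``sneakpeek/middleware/base.py``) and override ``on_request``; the class name and the injected header in this sketch are hypothetical, not part of the library:

.. code-block:: python3

    from typing import Any

    from sneakpeek.middleware.base import BaseMiddleware
    from sneakpeek.scraper.model import Request


    class StaticHeaderMiddleware(BaseMiddleware):
        """Illustrative middleware that adds a fixed header to every outgoing request."""

        @property
        def name(self) -> str:
            return "static_header"

        async def on_request(self, request: Request, config: Any | None) -> Request:
            # Runs before the request is sent: inject a header if it's missing
            if not request.headers:
                request.headers = {}
            request.headers.setdefault("X-Scraped-By", "sneakpeek")
            return request

The same pattern applies to ``on_response`` for post-processing responses after they have been received.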
7 | 8 | There are some plugins that are already implemented: 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | 13 | 14 | rate_limiter_middleware 15 | robots_txt_middleware 16 | user_agent_injecter_middleware 17 | proxy_middleware 18 | requests_logging_middleware 19 | new_middleware 20 | -------------------------------------------------------------------------------- /front/src/pages/ErrorNotFound.vue: -------------------------------------------------------------------------------- 1 | 16 | 17 | 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: enhancement 6 | assignees: flulemon 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /front/README.md: -------------------------------------------------------------------------------- 1 | # Sneakpeek (sneakpeek-front) 2 | 3 | A toolbox to create scrapers 4 | 5 | ## Install the dependencies 6 | ```bash 7 | yarn 8 | # or 9 | npm install 10 | ``` 11 | 12 | ### Start the app in development mode (hot-code reloading, error reporting, etc.) 13 | ```bash 14 | quasar dev 15 | ``` 16 | 17 | 18 | ### Lint the files 19 | ```bash 20 | yarn lint 21 | # or 22 | npm run lint 23 | ``` 24 | 25 | 26 | ### Format the files 27 | ```bash 28 | yarn format 29 | # or 30 | npm run format 31 | ``` 32 | 33 | 34 | 35 | ### Build the app for production 36 | ```bash 37 | quasar build 38 | ``` 39 | 40 | ### Customize the configuration 41 | See [Configuring quasar.config.js](https://v2.quasar.dev/quasar-cli-vite/quasar-config-js). 
42 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: flulemon 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment (please complete the following information):** 23 | - OS: [e.g. iOS] 24 | - Python version [e.g. 3.10] 25 | - Package Version [e.g. 0.1.4] 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /front/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | <%= productName %> 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /front/jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": ".", 4 | "paths": { 5 | "src/*": [ 6 | "src/*" 7 | ], 8 | "app/*": [ 9 | "*" 10 | ], 11 | "components/*": [ 12 | "src/components/*" 13 | ], 14 | "layouts/*": [ 15 | "src/layouts/*" 16 | ], 17 | "pages/*": [ 18 | "src/pages/*" 19 | ], 20 | "assets/*": [ 21 | "src/assets/*" 22 | ], 23 | "boot/*": [ 24 | "src/boot/*" 25 | ], 26 | "stores/*": [ 27 | "src/stores/*" 28 | ], 29 | "vue$": [ 30 | "node_modules/vue/dist/vue.runtime.esm-bundler.js" 31 | ] 32 | } 33 | }, 34 | "exclude": [ 35 | "dist", 36 | ".quasar", 37 | "node_modules" 38 | ] 39 | } -------------------------------------------------------------------------------- /front/src/components/ScraperJobStatusChip.vue: -------------------------------------------------------------------------------- 1 | 4 | 33 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.linting.flake8Enabled": true, 4 | "python.linting.enabled": true, 5 | "editor.formatOnSave": true, 6 | "editor.codeActionsOnSave": { 7 | "source.organizeImports": true, 8 | "source.fixAll.eslint": true 9 | }, 10 | "python.testing.pytestArgs": [], 11 | "python.testing.unittestEnabled": false, 12 | "python.testing.pytestEnabled": true, 13 | "eslint.validate": ["javascript", "javascriptreact", "typescript", "vue"], 14 | "editor.bracketPairColorization.enabled": true, 15 | "editor.guides.bracketPairs": true, 16 | "editor.defaultFormatter": "esbenp.prettier-vscode", 17 | "typescript.tsdk": "node_modules/typescript/lib", 18 | "[xml]": { 19 | "editor.defaultFormatter": "redhat.vscode-xml" 20 | }, 21 | "esbonio.sphinx.confDir": "" 22 | } 23 | -------------------------------------------------------------------------------- /front/src/router/routes.js: -------------------------------------------------------------------------------- 1 | 2 | const routes = [ 3 | { 4 | name: 'Homepage', 5 | path: '/', 6 | component: () => import('layouts/MainLayout.vue'), 7 | children: [ 
8 | { name: 'ScrapersPage', path: '', component: () => import('src/pages/ScrapersPage.vue') }, 9 | { name: 'NewScraperPage', path: 'new', component: () => import('src/pages/NewScraperPage.vue') }, 10 | { name: 'ScraperIde', path: 'ide', component: () => import('src/pages/ScraperIde.vue') }, 11 | { name: 'ScraperPage', path: 'scraper/:id', component: () => import('src/pages/ScraperPage.vue'), props: true }, 12 | ] 13 | }, 14 | 15 | // Always leave this as last one, 16 | // but you can also remove it 17 | { 18 | path: '/:catchAll(.*)*', 19 | component: () => import('pages/ErrorNotFound.vue') 20 | } 21 | ] 22 | 23 | export default routes 24 | -------------------------------------------------------------------------------- /front/src/css/quasar.variables.scss: -------------------------------------------------------------------------------- 1 | // Quasar SCSS (& Sass) Variables 2 | // -------------------------------------------------- 3 | // To customize the look and feel of this app, you can override 4 | // the Sass/SCSS variables found in Quasar's source Sass/SCSS files. 5 | 6 | // Check documentation for full list of Quasar variables 7 | 8 | // Your own variables (that are declared here) and Quasar's own 9 | // ones will be available out of the box in your .vue/.scss/.sass files 10 | 11 | // It's highly recommended to change the default colors 12 | // to match your app's branding. 13 | // Tip: Use the "Theme Builder" on Quasar's documentation website. 14 | 15 | $primary : #1976d2; 16 | $secondary : #c2c2c2; 17 | $accent : #9C27B0; 18 | 19 | $dark : #3b3535; 20 | $dark-page : #121212; 21 | 22 | $positive : #37994e; 23 | $negative : #c22b3c; 24 | $info : #add5de; 25 | $warning : #ff9b29; 26 | -------------------------------------------------------------------------------- /front/src/components/PriorityChip.vue: -------------------------------------------------------------------------------- 1 | 4 | 36 | -------------------------------------------------------------------------------- /front/postcss.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | // https://github.com/michael-ciniawsky/postcss-load-config 3 | 4 | module.exports = { 5 | plugins: [ 6 | // https://github.com/postcss/autoprefixer 7 | require('autoprefixer')({ 8 | overrideBrowserslist: [ 9 | 'last 4 Chrome versions', 10 | 'last 4 Firefox versions', 11 | 'last 4 Edge versions', 12 | 'last 4 Safari versions', 13 | 'last 4 Android versions', 14 | 'last 4 ChromeAndroid versions', 15 | 'last 4 FirefoxAndroid versions', 16 | 'last 4 iOS versions' 17 | ] 18 | }) 19 | 20 | // https://github.com/elchininet/postcss-rtlcss 21 | // If you want to support RTL css, then 22 | // 1. yarn/npm install postcss-rtlcss 23 | // 2. optionally set quasar.config.js > framework > lang to an RTL language 24 | // 3. uncomment the following line: 25 | // require('postcss-rtlcss') 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /sneakpeek/session_loggers/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC, abstractmethod 3 | from typing import Any, List 4 | 5 | from pydantic import BaseModel 6 | 7 | FIELDS_TO_LOG = [ 8 | "levelname", 9 | "msg", 10 | "filename", 11 | "lineno", 12 | "name", 13 | "funcName", 14 | "task_id", 15 | "task_name", 16 | "task_handler", 17 | "asctime", 18 | "headers", 19 | "kwargs", 20 | "request", 21 | "response", 22 | ] 23 | 24 | 25 | def get_fields_to_log(record: logging.LogRecord) -> dict[str, Any]: 26 | return { 27 | field: value 28 | for field in FIELDS_TO_LOG 29 | if (value := getattr(record, field, None)) is not None 30 | } 31 | 32 | 33 | class LogLine(BaseModel): 34 | id: str 35 | data: dict[str, Any] 36 | 37 | 38 | class SessionLogger(ABC, logging.Handler): 39 | @abstractmethod 40 | async def read( 41 | self, 42 | task_id: str, 43 | last_log_line_id: str | None = None, 44 | max_lines: int = 100, 45 | ) -> List[LogLine]: 46 | ... 47 | -------------------------------------------------------------------------------- /front/src/router/index.js: -------------------------------------------------------------------------------- 1 | import { route } from 'quasar/wrappers' 2 | import { createRouter, createMemoryHistory, createWebHistory, createWebHashHistory } from 'vue-router' 3 | import routes from './routes' 4 | 5 | /* 6 | * If not building with SSR mode, you can 7 | * directly export the Router instantiation; 8 | * 9 | * The function below can be async too; either use 10 | * async/await or return a Promise which resolves 11 | * with the Router instance. 12 | */ 13 | 14 | export default route(function (/* { store, ssrContext } */) { 15 | const createHistory = process.env.SERVER 16 | ? createMemoryHistory 17 | : (process.env.VUE_ROUTER_MODE === 'history' ? createWebHistory : createWebHashHistory) 18 | 19 | const Router = createRouter({ 20 | scrollBehavior: () => ({ left: 0, top: 0 }), 21 | routes, 22 | 23 | // Leave this as is and make changes in quasar.conf.js instead! 
24 | // quasar.conf.js -> build -> vueRouterMode 25 | // quasar.conf.js -> build -> publicPath 26 | history: createHistory(process.env.VUE_ROUTER_BASE) 27 | }) 28 | 29 | return Router 30 | }) 31 | -------------------------------------------------------------------------------- /sneakpeek/middleware/parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | 4 | from sneakpeek.middleware.base import BaseMiddleware 5 | 6 | 7 | @dataclass 8 | class RegexMatch: 9 | """Regex match""" 10 | 11 | full_match: str #: Full regular expression match 12 | groups: dict[str, str] #: Regular expression group matches 13 | 14 | 15 | class ParserMiddleware(BaseMiddleware): 16 | """Parser middleware provides parsing utilities""" 17 | 18 | @property 19 | def name(self) -> str: 20 | return "parser" 21 | 22 | def regex( 23 | self, 24 | text: str, 25 | pattern: str, 26 | flags: re.RegexFlag = re.UNICODE | re.MULTILINE | re.IGNORECASE, 27 | ) -> list[RegexMatch]: 28 | """Find matches in the text using regular expression 29 | 30 | Args: 31 | text (str): Text to search in 32 | pattern (str): Regular expression 33 | flags (re.RegexFlag, optional): Regular expression flags. Defaults to re.UNICODE | re.MULTILINE | re.IGNORECASE. 34 | 35 | Returns: 36 | list[RegexMatch]: Matches found in the text 37 | """ 38 | return [ 39 | RegexMatch(full_match=match.group(0), groups=match.groupdict()) 40 | for match in re.finditer(pattern, text, flags) 41 | ] 42 | -------------------------------------------------------------------------------- /front/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sneakpeek-front", 3 | "version": "0.2.2", 4 | "description": "A toolbox to create scrapers", 5 | "productName": "Sneakpeek", 6 | "author": "Dan Yazovsky ", 7 | "private": true, 8 | "scripts": { 9 | "lint": "eslint --ext .js,.vue ./", 10 | "format": "prettier --write \"**/*.{js,vue,scss,html,md,json}\" --ignore-path .gitignore", 11 | "test": "echo \"No test specified\" && exit 0", 12 | "dev": "quasar dev", 13 | "build": "quasar build" 14 | }, 15 | "dependencies": { 16 | "@quasar/extras": "^1.0.0", 17 | "axios": "^1.3.5", 18 | "json-editor-vue": "^0.10.5", 19 | "monaco-editor-vue3": "^0.1.6", 20 | "monaco-editor-webpack-plugin": "^7.0.1", 21 | "quasar": "^2.6.0", 22 | "vanilla-jsoneditor": "^0.16.1", 23 | "vscode-ws-jsonrpc": "^3.0.0", 24 | "vue": "^3.0.0", 25 | "vue-router": "^4.0.0" 26 | }, 27 | "devDependencies": { 28 | "@quasar/app-vite": "^1.0.0", 29 | "autoprefixer": "^10.4.2", 30 | "eslint": "^8.10.0", 31 | "eslint-config-prettier": "^8.1.0", 32 | "eslint-plugin-vue": "^9.0.0", 33 | "postcss": "^8.4.14", 34 | "prettier": "^2.5.1" 35 | }, 36 | "engines": { 37 | "node": "^18 || ^16 || ^14.19", 38 | "npm": ">= 6.13.4", 39 | "yarn": ">= 1.21.1" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /sneakpeek/scraper/task_handler.py: -------------------------------------------------------------------------------- 1 | from sneakpeek.queue.model import Task, TaskHandlerABC 2 | from sneakpeek.scraper.model import ( 3 | SCRAPER_PERIODIC_TASK_HANDLER_NAME, 4 | Scraper, 5 | ScraperHandler, 6 | ScraperRunnerABC, 7 | ScraperStorageABC, 8 | UnknownScraperHandlerError, 9 | ) 10 | 11 | 12 | class ScraperTaskHandler(TaskHandlerABC): 13 | def __init__( 14 | self, 15 | scraper_handlers: list[ScraperHandler], 16 | runner: ScraperRunnerABC, 17 | storage: 
ScraperStorageABC, 18 | ) -> None: 19 | self.scraper_handlers = {handler.name: handler for handler in scraper_handlers} 20 | self.runner = runner 21 | self.storage = storage 22 | 23 | def name(self) -> int: 24 | return SCRAPER_PERIODIC_TASK_HANDLER_NAME 25 | 26 | async def process(self, task: Task) -> str: 27 | scraper = await self.storage.get_scraper(task.task_name) 28 | handler = self._get_handler(scraper) 29 | return await self.runner.run(handler, scraper) 30 | 31 | def _get_handler(self, scraper: Scraper) -> ScraperHandler: 32 | if scraper.handler not in self.scraper_handlers: 33 | raise UnknownScraperHandlerError( 34 | f"Unknown scraper handler '{scraper.handler}'" 35 | ) 36 | return self.scraper_handlers[scraper.handler] 37 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | 1. File an issue to notify the maintainers about what you're working on. 4 | 2. Fork the repo, develop and test your code changes, add docs. 5 | 3. Make sure that your commit messages clearly describe the changes. 6 | 4. Send a pull request. 7 | 8 | ## File an Issue 9 | 10 | Use the issue tracker to start the discussion. It is possible that someone 11 | else is already working on your idea, your approach is not quite right, or that 12 | the functionality exists already. The ticket you file in the issue tracker will 13 | be used to hash that all out. 14 | 15 | ## Running tests, building package and docs 16 | 17 | Use the issue tracker to start the discussion. It is possible that someone 18 | else is already working on your idea, your approach is not quite right, or that 19 | the functionality exists already. The ticket you file in the issue tracker will 20 | be used to hash that all out. 21 | 22 | ## Make the Pull Request 23 | 24 | Once you have made all your changes, tests, and updated the documentation, run the tests and build the package: 25 | 26 | ``` 27 | make test 28 | make build 29 | ``` 30 | 31 | Once everything succeeds make a pull request to move everything back into the main branch of the 32 | `repository`. 33 | 34 | Be sure to reference the original issue in the pull request. 35 | Expect some back-and-forth with regards to style and compliance of these 36 | rules. 37 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Overview 3 | ################# 4 | 5 | **Sneakpeek** - is a platform to author, schedule and monitor scrapers in an easy, fast and extensible way. 6 | It's the best choice for scrapers that have some specific complex scraping logic that needs 7 | to be run on a constant basis. 8 | 9 | Key features 10 | ############ 11 | 12 | - Horizontally scalable 13 | - Robust scraper scheduler and priority task queue 14 | - Multiple storage implementations to persist scrapers' configs, tasks, logs, etc. 15 | - JSON RPC API to manage the platform programmatically 16 | - Useful UI to manage all of your scrapers 17 | - Scraper IDE to enable you developing scrapers right in your browser 18 | - Easily extendable via middleware 19 | 20 | Demo 21 | #### 22 | 23 | [Here's a demo project](https://github.com/flulemon/sneakpeek-demo) which uses **Sneakpeek** framework. 24 | 25 | You can also run the demo using Docker: 26 | 27 | .. 
code-block:: bash 28 | 29 | docker run -it --rm -p 8080:8080 flulemon/sneakpeek-demo 30 | 31 | 32 | Once it has started head over to http://localhost:8080 to play around with it. 33 | 34 | Table of contents 35 | ================== 36 | 37 | .. toctree:: 38 | :maxdepth: 2 39 | 40 | self 41 | quick_start 42 | local_debugging 43 | design 44 | deployment 45 | middleware/index 46 | api 47 | 48 | Indices 49 | ================== 50 | * :ref:`genindex` 51 | * :ref:`modindex` 52 | * :ref:`search` -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | API 3 | ################# 4 | 5 | .. automodule:: sneakpeek.server 6 | .. automodule:: sneakpeek.queue.model 7 | .. automodule:: sneakpeek.scheduler.model 8 | .. automodule:: sneakpeek.scraper.model 9 | .. automodule:: sneakpeek.queue.queue 10 | .. automodule:: sneakpeek.queue.consumer 11 | .. automodule:: sneakpeek.queue.in_memory_storage 12 | .. automodule:: sneakpeek.queue.redis_storage 13 | .. automodule:: sneakpeek.queue.tasks 14 | .. automodule:: sneakpeek.scheduler.scheduler 15 | .. automodule:: sneakpeek.scheduler.in_memory_lease_storage 16 | .. automodule:: sneakpeek.scheduler.redis_lease_storage 17 | .. automodule:: sneakpeek.scraper.context 18 | .. automodule:: sneakpeek.scraper.runner 19 | .. automodule:: sneakpeek.scraper.task_handler 20 | .. automodule:: sneakpeek.scraper.redis_storage 21 | .. automodule:: sneakpeek.scraper.in_memory_storage 22 | .. automodule:: sneakpeek.scraper.dynamic_scraper_handler 23 | .. automodule:: sneakpeek.middleware.base 24 | .. automodule:: sneakpeek.middleware.parser 25 | .. automodule:: sneakpeek.middleware.proxy_middleware 26 | .. automodule:: sneakpeek.middleware.rate_limiter_middleware 27 | .. automodule:: sneakpeek.middleware.requests_logging_middleware 28 | .. automodule:: sneakpeek.middleware.robots_txt_middleware 29 | .. automodule:: sneakpeek.middleware.user_agent_injecter_middleware 30 | .. automodule:: sneakpeek.api 31 | .. automodule:: sneakpeek.logging 32 | .. automodule:: sneakpeek.metrics -------------------------------------------------------------------------------- /docs/deployment.rst: -------------------------------------------------------------------------------- 1 | ################## 2 | Deployment options 3 | ################## 4 | 5 | There are multiple options how you can deploy your scrapers depending on your requirements: 6 | 7 | ============================= 8 | One replica that does it all 9 | ============================= 10 | 11 | This is a good option if: 12 | 13 | * you can tolerate some downtime 14 | * you don't need to host thousands of scrapers that can be dynamically changed by users 15 | * you don't care if you lose the information about the scraper jobs 16 | 17 | In this case all you need to do is to: 18 | 19 | * define a list of scrapers in the code (just like in the :doc:`tutorial `) 20 | * use in-memory storage 21 | 22 | ====================== 23 | Using external storage 24 | ====================== 25 | 26 | If you use some external storage (e.g. 
redis or RDBMS) for jobs queue and lease storage you'll be able: 27 | 28 | * to scale workers horizontally until queue, storage or scheduler becomes a bottleneck 29 | * to have a secondary replicas for the scheduler, so when primary dies for some reason there are fallback options 30 | 31 | If you also use the external storage as a scrapers storage you'll be able to dynamically 32 | add, delete and update scrapers via UI or JsonRPC API. 33 | 34 | Note that each **Sneakpeek** server by default runs worker, scheduler and API services, but 35 | it's possible to run only one role at the time, therefore you'll be able to scale 36 | services independently. 37 | 38 | -------------------------------------------------------------------------------- /sneakpeek/middleware/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from traceback import format_exc 3 | from typing import Any, Coroutine, Type, TypeVar 4 | 5 | from aiohttp import ClientResponse 6 | from pydantic import BaseModel 7 | from typing_extensions import override 8 | 9 | from sneakpeek.scraper.model import Middleware, MiddlewareConfig, Request 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | _TBaseModel = TypeVar("_TBaseModel", bound=BaseModel) 14 | 15 | 16 | def parse_config_from_obj( 17 | config: Any | None, 18 | plugin_name: str, 19 | config_type: Type[_TBaseModel], 20 | default_config: _TBaseModel, 21 | ) -> _TBaseModel: 22 | if not config: 23 | return default_config 24 | try: 25 | return config_type.parse_obj(config) 26 | except Exception as e: 27 | logger.warn(f"Failed to parse config for plugin '{plugin_name}': {e}") 28 | logger.debug(f"Traceback: {format_exc()}") 29 | return default_config 30 | 31 | 32 | class BaseMiddleware(Middleware): 33 | @property 34 | def name(self) -> str: 35 | return "proxy" 36 | 37 | @override 38 | async def on_request( 39 | self, 40 | request: Request, 41 | config: Any | None, 42 | ) -> Request: 43 | return request 44 | 45 | async def on_response( 46 | self, 47 | request: Request, 48 | response: ClientResponse, 49 | config: MiddlewareConfig | None = None, 50 | ) -> Coroutine[Any, Any, ClientResponse]: 51 | return response 52 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023, Daniil Iazovskii 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.1", 6 | "configurations": [ 7 | { 8 | "name": "python", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": true, 14 | "env": { 15 | "PYTHONPATH": "${workspaceFolder}" 16 | } 17 | }, 18 | { 19 | "name": "Run all", 20 | "type": "python", 21 | "request": "launch", 22 | "program": "${workspaceFolder}/sneakpeek/app.py", 23 | "args": ["--api", "--scheduler", "--worker"], 24 | "console": "integratedTerminal", 25 | "justMyCode": true, 26 | "env": { 27 | "PYTHONPATH": "${workspaceFolder}" 28 | } 29 | }, 30 | { 31 | "name": "Run demo", 32 | "type": "python", 33 | "request": "launch", 34 | "program": "${workspaceFolder}/demo/app.py", 35 | "console": "integratedTerminal", 36 | "justMyCode": true, 37 | "env": { 38 | "PYTHONPATH": "${workspaceFolder}" 39 | } 40 | }, 41 | { 42 | "name": "Run demo (local handler)", 43 | "type": "python", 44 | "request": "launch", 45 | "program": "${workspaceFolder}/demo/demo_scraper.py", 46 | "console": "integratedTerminal", 47 | "justMyCode": true, 48 | "env": { 49 | "PYTHONPATH": "${workspaceFolder}" 50 | } 51 | } 52 | ] 53 | } 54 | -------------------------------------------------------------------------------- /sneakpeek/scraper/ephemeral_scraper_task_handler.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from sneakpeek.queue.model import Task, TaskHandlerABC 4 | from sneakpeek.scraper.model import ( 5 | EPHEMERAL_SCRAPER_TASK_HANDLER_NAME, 6 | ScraperConfig, 7 | ScraperHandler, 8 | ScraperRunnerABC, 9 | UnknownScraperHandlerError, 10 | ) 11 | 12 | 13 | class EphemeralScraperTask(BaseModel): 14 | scraper_handler: str 15 | scraper_config: ScraperConfig 16 | scraper_state: str | None = None 17 | 18 | 19 | class EphemeralScraperTaskHandler(TaskHandlerABC): 20 | def __init__( 21 | self, 22 | scraper_handlers: list[ScraperHandler], 23 | runner: ScraperRunnerABC, 24 | ) -> None: 25 | self.scraper_handlers = {handler.name: handler for handler in scraper_handlers} 26 | self.runner = runner 27 | 28 | def name(self) -> int: 29 | return EPHEMERAL_SCRAPER_TASK_HANDLER_NAME 30 | 31 | async def process(self, task: Task) -> str: 32 | config = EphemeralScraperTask.parse_raw(task.payload) 33 | handler = self._get_handler(config.scraper_handler) 34 | return await self.runner.run_ephemeral( 35 | handler, 36 | config.scraper_config, 37 | config.scraper_state, 38 | ) 39 | 40 | def _get_handler(self, scraper_handler: str) -> ScraperHandler: 41 | if scraper_handler not in self.scraper_handlers: 
42 | raise UnknownScraperHandlerError( 43 | f"Unknown scraper handler '{scraper_handler}'" 44 | ) 45 | return self.scraper_handlers[scraper_handler] 46 | -------------------------------------------------------------------------------- /docs/local_debugging.rst: -------------------------------------------------------------------------------- 1 | ################################ 2 | Local handler debugging 3 | ################################ 4 | 5 | You can easily test handler without running full-featured server. Here's how you can do that for the `DemoScraper` that we have developed in the :doc:`tutorial `. 6 | 7 | Add import in the beginning of the file: 8 | 9 | .. code-block:: python3 10 | 11 | from sneakpeek.scraper.runner import ScraperRunner 12 | 13 | 14 | And add the following lines to the end of the file: 15 | 16 | 17 | .. code-block:: python3 18 | 19 | 20 | async def main(): 21 | result = await ScraperRunner.debug_handler( 22 | DemoScraper(), 23 | config=ScraperConfig( 24 | params=DemoScraperParams( 25 | start_url="https://www.ycombinator.com/", 26 | max_pages=20, 27 | ).dict(), 28 | ), 29 | middlewares=[ 30 | RequestsLoggingMiddleware(), 31 | ], 32 | ) 33 | logging.info(f"Finished scraper with result: {result}") 34 | 35 | if __name__ == "__main__": 36 | asyncio.run(main()) 37 | 38 | 39 | For the argument `ScraperRunner.debug_handler` takes: 40 | 41 | 1. An instance of your scraper handler 42 | 2. Scraper config 43 | 3. **[Optional]** Middleware that will be used in the handler (:doc:`see full list of the middleware here `) 44 | 45 | Now you can run you handler as an ordinary Python script. Given it's in `demo_scraper.py` file you can use: 46 | 47 | .. code-block:: bash 48 | 49 | python3 demo_scraper.py 50 | -------------------------------------------------------------------------------- /docs/middleware/requests_logging_middleware.rst: -------------------------------------------------------------------------------- 1 | ############################## 2 | Requests logging middleware 3 | ############################## 4 | 5 | Requests logging middleware logs all requests being made and received responses. 6 | 7 | Configuration of the middleware is defined in :py:class:`RequestsLoggingMiddlewareConfig `. 8 | 9 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 10 | 11 | .. code-block:: python3 12 | 13 | from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware, RequestsLoggingMiddlewareConfig 14 | 15 | server = SneakpeekServer.create( 16 | ... 17 | middleware=[ 18 | RequestsLoggingMiddleware( 19 | RequestsLoggingMiddlewareConfig( 20 | log_request=True, 21 | log_response=True, 22 | ) 23 | ) 24 | ], 25 | ) 26 | 27 | 28 | How to override middleware settings for a given scraper: 29 | 30 | .. code-block:: python3 31 | 32 | from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddlewareConfig 33 | 34 | scraper = Scraper( 35 | ... 36 | config=ScraperConfig( 37 | ... 
38 | middleware={ 39 | "requests_logging": RequestsLoggingMiddlewareConfig( 40 | log_request=True, 41 | log_response=False, 42 | ) 43 | } 44 | ), 45 | ) 46 | -------------------------------------------------------------------------------- /docs/middleware/robots_txt_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Robots.txt 3 | ######################### 4 | 5 | Robots.txt middleware can log and optionally block requests if they are disallowed by website robots.txt. 6 | If robots.txt is unavailable (e.g. request returns 5xx code) all requests will be allowed. 7 | 8 | Configuration of the middleware is defined in :py:class:`RobotsTxtMiddlewareConfig `. 9 | 10 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 11 | 12 | .. code-block:: python3 13 | 14 | from sneakpeek.middleware.robots_txt_middleware import RobotsTxtMiddleware, RobotsTxtMiddlewareConfig 15 | 16 | server = SneakpeekServer.create( 17 | ... 18 | middleware=[ 19 | ProxyMiddleware( 20 | ProxyMiddlewareConfig( 21 | violation_strategy = RobotsTxtViolationStrategy.THROW, 22 | ) 23 | ) 24 | ], 25 | ) 26 | 27 | 28 | How to override middleware settings for a given scraper: 29 | 30 | .. code-block:: python3 31 | 32 | from aiohttp import BasicAuth 33 | from sneakpeek.middleware.robots_txt_middleware import RobotsTxtMiddlewareConfig 34 | 35 | scraper = Scraper( 36 | ... 37 | config=ScraperConfig( 38 | ... 39 | middleware={ 40 | "robots_txt": ProxyMiddlewareConfig( 41 | violation_strategy = RobotsTxtViolationStrategy.LOG, 42 | ) 43 | } 44 | ), 45 | ) 46 | -------------------------------------------------------------------------------- /docs/middleware/proxy_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Proxy middleware 3 | ######################### 4 | 5 | Proxy middleware automatically sets proxy arguments for all HTTP requests. 6 | Configuration of the middleware is defined in :py:class:`ProxyMiddlewareConfig `. 7 | 8 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 9 | 10 | .. code-block:: python3 11 | 12 | from aiohttp import BasicAuth 13 | from sneakpeek.middleware.proxy_middleware import ProxyMiddleware, ProxyMiddlewareConfig 14 | 15 | server = SneakpeekServer.create( 16 | ... 17 | middleware=[ 18 | ProxyMiddleware( 19 | ProxyMiddlewareConfig( 20 | proxy = "http://example.proxy.com:3128", 21 | proxy_auth = BasicAuth(login="mylogin", password="securepassword"), 22 | ) 23 | ) 24 | ], 25 | ) 26 | 27 | 28 | How to override middleware settings for a given scraper: 29 | 30 | .. code-block:: python3 31 | 32 | from aiohttp import BasicAuth 33 | from sneakpeek.middleware.proxy_middleware import ProxyMiddlewareConfig 34 | 35 | scraper = Scraper( 36 | ... 37 | config=ScraperConfig( 38 | ... 
39 | middleware={ 40 | "proxy": ProxyMiddlewareConfig( 41 | proxy = "http://example.proxy.com:3128", 42 | proxy_auth = BasicAuth(login="mylogin", password="securepassword"), 43 | ) 44 | } 45 | ), 46 | ) 47 | -------------------------------------------------------------------------------- /docs/middleware/user_agent_injecter_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | User Agent injector 3 | ######################### 4 | 5 | This middleware automatically adds ``User-Agent`` header if it's not present. 6 | It uses `fake-useragent `_ in order to generate fake real world user agents. 7 | 8 | Configuration of the middleware is defined in :py:class:`UserAgentInjecterMiddlewareConfig `. 9 | 10 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 11 | 12 | .. code-block:: python3 13 | 14 | from sneakpeek.middleware.user_agent_injecter_middleware import UserAgentInjecterMiddleware, UserAgentInjecterMiddlewareConfig 15 | 16 | server = SneakpeekServer.create( 17 | ... 18 | middleware=[ 19 | UserAgentInjecterMiddleware( 20 | UserAgentInjecterMiddlewareConfig( 21 | use_external_data = True, 22 | browsers = ["chrome", "firefox"], 23 | ) 24 | ) 25 | ], 26 | ) 27 | 28 | 29 | How to override middleware settings for a given scraper: 30 | 31 | .. code-block:: python3 32 | 33 | from sneakpeek.middleware.user_agent_injecter_middleware import UserAgentInjecterMiddlewareConfig 34 | 35 | scraper = Scraper( 36 | ... 37 | config=ScraperConfig( 38 | ... 39 | middleware={ 40 | "user_agent_injecter": UserAgentInjecterMiddlewareConfig( 41 | use_external_data = False, 42 | browsers = ["chrome", "firefox"], 43 | ) 44 | } 45 | ), 46 | ) 47 | -------------------------------------------------------------------------------- /front/quasar.config.js: -------------------------------------------------------------------------------- 1 | const { configure } = require('quasar/wrappers'); 2 | const MonacoWebpackPlugin = require('monaco-editor-webpack-plugin'); 3 | 4 | module.exports = configure(function (ctx) { 5 | return { 6 | eslint: { 7 | warnings: true, 8 | errors: true 9 | }, 10 | boot: [ 11 | ], 12 | css: [ 13 | 'app.scss' 14 | ], 15 | extras: [ 16 | 'fontawesome-v6', 17 | 'roboto-font', 18 | 'material-icons', 19 | ], 20 | build: { 21 | target: { 22 | browser: [ 'es2019', 'edge88', 'firefox78', 'chrome87', 'safari13.1' ], 23 | node: 'node16' 24 | }, 25 | distDir: '../sneakpeek/static/ui/', 26 | vueRouterMode: 'hash', 27 | env: { 28 | JSONRPC_ENDPOINT: ctx.dev ? 
'http://localhost:8080/api/v1/jsonrpc' : '/api/v1/jsonrpc', 29 | }, 30 | chainWebpack: config => { 31 | config.plugin('monaco-editor').use(MonacoWebpackPlugin, [ 32 | { 33 | languages: ['python', 'javascript', 'html', 'xml'] 34 | } 35 | ]) 36 | } 37 | }, 38 | devServer: { 39 | open: true 40 | }, 41 | framework: { 42 | config: { 43 | dark: "auto", 44 | notify: { 45 | position: "bottom" 46 | } 47 | }, 48 | plugins: [ 49 | "Notify", 50 | "SessionStorage", 51 | ] 52 | }, 53 | ssr: { 54 | pwa: false, 55 | prodPort: 3000, 56 | middlewares: [ 57 | 'render' 58 | ] 59 | }, 60 | pwa: { 61 | workboxMode: 'generateSW', 62 | injectPwaMetaTags: true, 63 | swFilename: 'sw.js', 64 | manifestFilename: 'manifest.json', 65 | useCredentialsForManifestTag: false, 66 | }, 67 | capacitor: { 68 | hideSplashscreen: true 69 | }, 70 | } 71 | }); 72 | -------------------------------------------------------------------------------- /sneakpeek/middleware/proxy_middleware.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from aiohttp import BasicAuth 4 | from fake_useragent import UserAgent 5 | from pydantic import BaseModel 6 | from typing_extensions import override 7 | from yarl import URL 8 | 9 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 10 | from sneakpeek.scraper.model import Request 11 | 12 | 13 | class ProxyMiddlewareConfig(BaseModel): 14 | """Proxy middleware config""" 15 | 16 | proxy: str | URL | None = None #: Proxy URL 17 | proxy_auth: BasicAuth | None = None #: Proxy authentication info to use 18 | 19 | class Config: 20 | arbitrary_types_allowed = True 21 | 22 | 23 | class ProxyMiddleware(BaseMiddleware): 24 | """Proxy middleware automatically sets proxy arguments for all HTTP requests.""" 25 | 26 | def __init__(self, default_config: ProxyMiddlewareConfig | None = None) -> None: 27 | self._default_config = default_config or ProxyMiddlewareConfig() 28 | self._user_agents = UserAgent( 29 | use_external_data=self._default_config.use_external_data, 30 | browsers=self._default_config.browsers, 31 | ) 32 | 33 | @property 34 | def name(self) -> str: 35 | return "proxy" 36 | 37 | @override 38 | async def on_request( 39 | self, 40 | request: Request, 41 | config: Any | None, 42 | ) -> Request: 43 | config = parse_config_from_obj( 44 | config, 45 | self.name, 46 | ProxyMiddlewareConfig, 47 | self._default_config, 48 | ) 49 | if not request.kwargs: 50 | request.kwargs = {} 51 | if config.proxy: 52 | request.kwargs["proxy"] = config.proxy 53 | if config.proxy_auth: 54 | request.kwargs["proxy_auth"] = config.proxy_auth 55 | return request 56 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: push 4 | 5 | jobs: 6 | ci: 7 | name: Build and publish Python pacakage to PyPI 8 | runs-on: "ubuntu-latest" 9 | permissions: 10 | id-token: write 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | python-version: ["3.10"] 15 | poetry-version: ["1.4.2"] 16 | node-version: ["18.16.0"] 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Set up Poetry 26 | uses: abatilo/actions-poetry@v2 27 | with: 28 | poetry-version: ${{ matrix.poetry-version }} 29 | 30 | - name: Set Node.js 31 | uses: actions/setup-node@v3 32 | with: 33 | 
node-version: ${{ matrix.node-version }} 34 | 35 | - name: Run install 36 | run: make install 37 | 38 | - name: Run tests 39 | run: make test 40 | 41 | - name: Tests coverage 42 | run: make coverage 43 | 44 | - name: Upload coverage reports to Codecov 45 | uses: codecov/codecov-action@v3 46 | with: 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | files: ./coverage.xml 49 | verbose: true 50 | 51 | - name: Build package 52 | run: make build 53 | 54 | - name: Publish package to Test PyPI 55 | if: startsWith(github.ref, 'refs/tags') 56 | uses: pypa/gh-action-pypi-publish@release/v1 57 | with: 58 | repository-url: https://test.pypi.org/legacy/ 59 | skip-existing: true 60 | 61 | - name: Publish package to PyPI 62 | if: startsWith(github.ref, 'refs/tags') 63 | uses: pypa/gh-action-pypi-publish@release/v1 64 | with: 65 | password: ${{ secrets.PYPI_API_TOKEN }} 66 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/redis_lease_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from redis.asyncio import Redis 4 | 5 | from sneakpeek.metrics import count_invocations, measure_latency 6 | from sneakpeek.scheduler.model import Lease, LeaseStorageABC 7 | 8 | 9 | class RedisLeaseStorage(LeaseStorageABC): 10 | """Redis storage for leases. Should only be used for development purposes""" 11 | 12 | def __init__(self, redis: Redis) -> None: 13 | """ 14 | Args: 15 | redis (Redis): Async redis client 16 | """ 17 | self._redis = redis 18 | 19 | @count_invocations(subsystem="storage") 20 | @measure_latency(subsystem="storage") 21 | async def maybe_acquire_lease( 22 | self, 23 | lease_name: str, 24 | owner_id: str, 25 | acquire_for: timedelta, 26 | ) -> Lease | None: 27 | lease_key = f"lease:{lease_name}" 28 | existing_lease = await self._redis.get(lease_key) 29 | result = None 30 | if not existing_lease or existing_lease.decode() == owner_id: 31 | result = await self._redis.set( 32 | f"lease:{lease_name}", 33 | owner_id, 34 | ex=acquire_for, 35 | ) 36 | return ( 37 | Lease( 38 | name=lease_name, 39 | owner_id=owner_id, 40 | acquired=datetime.utcnow(), 41 | acquired_until=datetime.utcnow() + acquire_for, 42 | ) 43 | if result 44 | else None 45 | ) 46 | 47 | @count_invocations(subsystem="storage") 48 | @measure_latency(subsystem="storage") 49 | async def release_lease(self, lease_name: str, owner_id: str) -> None: 50 | lease_owner = await self._redis.get(f"lease:{lease_name}") 51 | if lease_owner == owner_id: 52 | await self._redis.delete(f"lease:{lease_name}") 53 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | 16 | print(os.path.abspath("..")) 17 | sys.path.insert(0, os.path.abspath("..")) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "Sneakpeek" 23 | copyright = "2023, Dan Yazovsky" 24 | author = "Dan Yazovsky" 25 | version = "0.2" 26 | release = "0.2.2" 27 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.napoleon"] 28 | templates_path = ["_templates"] 29 | language = "en" 30 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 31 | html_static_path = ["_static"] 32 | autoclass_content = "both" 33 | html_theme = "sphinx_rtd_theme" 34 | html_theme_options = { 35 | "analytics_id": "G-3EW8JNTBHC", 36 | "logo_only": False, 37 | "display_version": True, 38 | "prev_next_buttons_location": "bottom", 39 | "style_external_links": False, 40 | "vcs_pageview_mode": "display_github", 41 | "collapse_navigation": False, 42 | "sticky_navigation": True, 43 | "navigation_depth": 4, 44 | "includehidden": True, 45 | "titles_only": True, 46 | } 47 | github_url = "https://github.com/flulemon/sneakpeek" 48 | highlight_language = "python3" 49 | pygments_style = "sphinx" 50 | 51 | autodoc_default_options = { 52 | "members": True, 53 | "show-inheritance": True, 54 | } 55 | autodoc_typehints = "both" 56 | -------------------------------------------------------------------------------- /sneakpeek/queue/tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from sneakpeek.queue.model import QueueABC, Task, TaskHandlerABC, TaskPriority 4 | from sneakpeek.scheduler.model import ( 5 | PeriodicTask, 6 | StaticPeriodicTasksStorage, 7 | TaskSchedule, 8 | generate_id, 9 | ) 10 | 11 | KILL_DEAD_TASKS_TASK_NAME = "internal::queue::kill_dead_tasks" 12 | DELETE_OLD_TASKS_TASK_NAME = "internal::queue::delete_old_tasks" 13 | 14 | 15 | class KillDeadTasksHandler(TaskHandlerABC): 16 | def __init__(self, queue: QueueABC) -> None: 17 | self.queue = queue 18 | 19 | def name(self) -> int: 20 | return KILL_DEAD_TASKS_TASK_NAME 21 | 22 | async def process(self, task: Task) -> str: 23 | killed = await self.queue.kill_dead_tasks() 24 | return json.dumps( 25 | { 26 | "success": True, 27 | "killed": [item.id for item in killed], 28 | }, 29 | indent=4, 30 | ) 31 | 32 | 33 | class DeleteOldTasksHandler(TaskHandlerABC): 34 | def __init__(self, queue: QueueABC) -> None: 35 | self.queue = queue 36 | 37 | def name(self) -> int: 38 | return DELETE_OLD_TASKS_TASK_NAME 39 | 40 | async def process(self, task: Task) -> str: 41 | await self.queue.delete_old_tasks() 42 | return json.dumps({"success": True}, indent=4) 43 | 44 | 45 | queue_periodic_tasks = StaticPeriodicTasksStorage( 46 | tasks=[ 47 | PeriodicTask( 48 | id=generate_id(), 49 | name=KILL_DEAD_TASKS_TASK_NAME, 50 | handler=KILL_DEAD_TASKS_TASK_NAME, 51 | priority=TaskPriority.NORMAL, 52 | payload="", 53 | schedule=TaskSchedule.EVERY_HOUR, 54 | ), 55 | PeriodicTask( 56 | id=generate_id(), 57 | name=DELETE_OLD_TASKS_TASK_NAME, 58 | handler=DELETE_OLD_TASKS_TASK_NAME, 59 | priority=TaskPriority.NORMAL, 60 | payload="", 61 | schedule=TaskSchedule.EVERY_HOUR, 62 | ), 63 | ] 64 | ) 65 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/in_memory_lease_storage.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import Lock 3 | from datetime import datetime, timedelta 4 | 5 | from 
sneakpeek.metrics import count_invocations, measure_latency 6 | from sneakpeek.scheduler.model import Lease, LeaseStorageABC 7 | 8 | 9 | class InMemoryLeaseStorage(LeaseStorageABC): 10 | """In memory storage for leases. Should only be used for development purposes""" 11 | 12 | def __init__(self) -> None: 13 | self._logger = logging.getLogger(__name__) 14 | self._lock = Lock() 15 | self._leases: dict[str, Lease] = {} 16 | 17 | def _can_acquire_lease(self, lease_name: str, owner_id: str) -> bool: 18 | existing_lease = self._leases.get(lease_name) 19 | return ( 20 | not existing_lease 21 | or existing_lease.acquired_until < datetime.utcnow() 22 | or existing_lease.owner_id == owner_id 23 | ) 24 | 25 | @count_invocations(subsystem="storage") 26 | @measure_latency(subsystem="storage") 27 | async def maybe_acquire_lease( 28 | self, 29 | lease_name: str, 30 | owner_id: str, 31 | acquire_for: timedelta, 32 | ) -> Lease | None: 33 | async with self._lock: 34 | if self._can_acquire_lease(lease_name, owner_id): 35 | self._leases[lease_name] = Lease( 36 | name=lease_name, 37 | owner_id=owner_id, 38 | acquired=datetime.utcnow(), 39 | acquired_until=datetime.utcnow() + acquire_for, 40 | ) 41 | return self._leases[lease_name] 42 | return None 43 | 44 | @count_invocations(subsystem="storage") 45 | @measure_latency(subsystem="storage") 46 | async def release_lease(self, lease_name: str, owner_id: str) -> None: 47 | async with self._lock: 48 | if lease_name not in self._leases: 49 | return 50 | if self._can_acquire_lease(lease_name, owner_id): 51 | del self._leases[lease_name] 52 | -------------------------------------------------------------------------------- /sneakpeek/middleware/user_agent_injecter_middleware.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fake_useragent import UserAgent 4 | from pydantic import BaseModel 5 | from typing_extensions import override 6 | 7 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 8 | from sneakpeek.scraper.model import Request 9 | 10 | 11 | class UserAgentInjecterMiddlewareConfig(BaseModel): 12 | """Middleware configuration""" 13 | 14 | #: Whether to use external data as a fallback 15 | use_external_data: bool = True 16 | 17 | #: List of browsers which are used to generate user agents 18 | browsers: list[str] = ["chrome", "edge", "firefox", "safari", "opera"] 19 | 20 | 21 | class UserAgentInjecterMiddleware(BaseMiddleware): 22 | """ 23 | This middleware automatically adds ``User-Agent`` header if it's not present. 24 | It uses `fake-useragent `_ in order to generate fake real world user agents. 
25 | """ 26 | 27 | def __init__( 28 | self, default_config: UserAgentInjecterMiddlewareConfig | None = None 29 | ) -> None: 30 | self._default_config = default_config or UserAgentInjecterMiddlewareConfig() 31 | self._user_agents = UserAgent( 32 | use_external_data=self._default_config.use_external_data, 33 | browsers=self._default_config.browsers, 34 | ) 35 | 36 | @property 37 | def name(self) -> str: 38 | return "user_agent_injecter" 39 | 40 | @override 41 | async def on_request( 42 | self, 43 | request: Request, 44 | config: Any | None, 45 | ) -> Request: 46 | config = parse_config_from_obj( 47 | config, 48 | self.name, 49 | UserAgentInjecterMiddlewareConfig, 50 | self._default_config, 51 | ) 52 | if (request.headers or {}).get("User-Agent"): 53 | return request 54 | if not request.headers: 55 | request.headers = {} 56 | request.headers["User-Agent"] = self._user_agents.random 57 | return request 58 | -------------------------------------------------------------------------------- /sneakpeek/scraper/dynamic_scraper_handler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import json 3 | from typing import Any, Awaitable, Callable, Mapping 4 | 5 | from pydantic import BaseModel 6 | from typing_extensions import override 7 | 8 | from sneakpeek.scraper.model import ScraperContextABC, ScraperHandler 9 | 10 | 11 | class DynamicScraperParams(BaseModel): 12 | source_code: str 13 | args: list[Any] | None = None 14 | kwargs: Mapping[str, Any] | None = None 15 | 16 | 17 | class DynamicScraperHandler(ScraperHandler): 18 | @property 19 | def name(self) -> str: 20 | return "dynamic_scraper" 21 | 22 | def compile(self, source_code: str) -> Callable[..., Awaitable[None]]: 23 | bytecode = compile(source=source_code, filename="", mode="exec") 24 | session_globals = {} 25 | exec(bytecode, session_globals) 26 | if "context" in session_globals: 27 | raise SyntaxError("`context` is a reserved keyword") 28 | if "handler" not in session_globals: 29 | raise SyntaxError("Expected source code to define a `handler` function") 30 | handler = session_globals["handler"] 31 | if not inspect.iscoroutinefunction(handler): 32 | raise SyntaxError("Expected `handler` to be a function") 33 | if handler.__code__.co_argcount == 0: 34 | raise SyntaxError( 35 | "Expected `handler` to have at least one argument: `context: ScraperContext`" 36 | ) 37 | return handler 38 | 39 | @override 40 | async def run(self, context: ScraperContextABC) -> str: 41 | params = DynamicScraperParams.parse_obj(context.params) 42 | handler = self.compile(params.source_code) 43 | result = await handler(context, *(params.args or []), **(params.kwargs or {})) 44 | if result is None: 45 | return "No result was returned" 46 | if isinstance(result, str): 47 | return result 48 | try: 49 | return json.dumps(result, indent=4) 50 | except TypeError as ex: 51 | return f"Failed to serialize result with error: {ex}" 52 | -------------------------------------------------------------------------------- /front/src/components/ScraperCard.vue: -------------------------------------------------------------------------------- 1 | 27 | 28 | 70 | -------------------------------------------------------------------------------- /sneakpeek/session_loggers/redis_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import AbstractEventLoop 3 | from copy import copy 4 | from dataclasses import dataclass 5 | from datetime import datetime, timedelta 
6 | from typing import Any 7 | 8 | from redis.asyncio import Redis 9 | 10 | from sneakpeek.session_loggers.base import SessionLogger, get_fields_to_log 11 | 12 | MAX_BUFFER_AGE = timedelta(seconds=5) 13 | 14 | 15 | @dataclass 16 | class _LogRecord: 17 | task_id: str 18 | data: Any 19 | 20 | 21 | class RedisLoggerHandler(SessionLogger): 22 | def __init__( 23 | self, 24 | redis: Redis, 25 | loop: AbstractEventLoop | None = None, 26 | max_buffer_size: int = 10, 27 | max_buffer_age: timedelta = MAX_BUFFER_AGE, 28 | ) -> None: 29 | super().__init__() 30 | self.redis = redis 31 | self.loop = loop 32 | self.max_buffer_size = max_buffer_size 33 | self.max_buffer_age = max_buffer_age 34 | self.buffer: list[_LogRecord] = [] 35 | self.last_flush = datetime.min 36 | 37 | async def _write_to_log(self, messages: list[_LogRecord]) -> None: 38 | for message in messages: 39 | await self.redis.xadd(name=message.task_id, fields=message.data) 40 | 41 | def flush(self): 42 | """ 43 | Flushes the stream. 44 | """ 45 | if not self.buffer: 46 | return 47 | if ( 48 | len(self.buffer) < self.max_buffer_size 49 | and datetime.utcnow() - self.last_flush < self.max_buffer_age 50 | ): 51 | return 52 | self.acquire() 53 | try: 54 | self.loop.create_task(self._write_to_log(copy(self.buffer))) 55 | finally: 56 | self.buffer.clear() 57 | self.release() 58 | 59 | def emit(self, record: logging.LogRecord) -> None: 60 | if not getattr(record, "task_id", None): 61 | return 62 | 63 | self.buffer.append( 64 | _LogRecord( 65 | task_id=record.task_id, 66 | data=get_fields_to_log(record), 67 | ) 68 | ) 69 | self.flush() 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sneakpeek 2 | 3 | ![CI](https://github.com/flulemon/sneakpeek/actions/workflows/ci.yml/badge.svg) 4 | [![PyPI version](https://badge.fury.io/py/sneakpeek-py.svg)](https://badge.fury.io/py/sneakpeek-py) 5 | [![Downloads](https://static.pepy.tech/badge/sneakpeek-py)](https://pepy.tech/project/sneakpeek-py) 6 | [![Documentation Status](https://readthedocs.org/projects/sneakpeek-py/badge/?version=latest)](https://sneakpeek-py.readthedocs.io/en/latest/?badge=latest) 7 | [![codecov](https://codecov.io/gh/flulemon/sneakpeek/branch/main/graph/badge.svg?token=7h45P8qHRG)](https://codecov.io/gh/flulemon/sneakpeek) 8 | 9 | **Sneakpeek** is a platform to author, schedule and monitor scrapers in an easy, fast and extensible way. 10 | It's the best choice for scrapers with complex scraping logic that needs 11 | to run on a regular basis. 12 | 13 | ## Key features 14 | 15 | - Horizontally scalable 16 | - Robust scraper scheduler and priority task queue 17 | - Multiple storage implementations to persist scrapers' configs, tasks, logs, etc. 18 | - JSON RPC API to manage the platform programmatically 19 | - Useful UI to manage all of your scrapers 20 | - Scraper IDE that lets you develop scrapers right in your browser 21 | - Easily extendable via middleware 22 | 23 | ## Demo 24 | 25 | [Here's a demo project](https://github.com/flulemon/sneakpeek-demo) which uses the **Sneakpeek** framework. 26 | 27 | You can also run the demo using Docker: 28 | 29 | ```bash 30 | docker run -it --rm -p 8080:8080 flulemon/sneakpeek-demo 31 | ``` 32 | 33 | Once it has started head over to http://localhost:8080 to play around with it.
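Under the hood a scraper is just a handler class. Here is a minimal, illustrative sketch based on the `ScraperHandler` and `ScraperContext` interfaces from `sneakpeek.scraper`; the handler name and URL below are placeholders, and the exact object returned by `context.get` is not shown here:

```python
from sneakpeek.scraper.context import ScraperContext
from sneakpeek.scraper.model import ScraperHandler


class DemoScraper(ScraperHandler):
    # Unique handler name that scraper configs refer to
    @property
    def name(self) -> str:
        return "demo_scraper"

    # Invoked by the runner; the context wraps HTTP calls and middleware
    async def run(self, context: ScraperContext) -> str:
        response = await context.get("https://example.com")  # placeholder URL
        # Whatever string is returned is stored as the task result
        return f"fetched {response}"
```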
34 | 35 | ## Documentation 36 | 37 | For the full documentation please visit [sneakpeek-py.readthedocs.io](https://sneakpeek-py.readthedocs.io/en/latest/) 38 | 39 | ## Contributing 40 | 41 | Please take a look at our [contributing](https://github.com/flulemon/sneakpeek/blob/main/CONTRIBUTING.md) guidelines if you're interested in helping! 42 | 43 | ## Future plans 44 | 45 | - Headful and headless browser engines middleware (Selenium and Playwright) 46 | - SQL and AmazonDB storage implementation 47 | - Advanced monitoring for the scrapers' health 48 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/tests/test_lease_storage.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | 4 | import pytest 5 | from fakeredis.aioredis import FakeRedis 6 | 7 | from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage 8 | from sneakpeek.scheduler.model import LeaseStorageABC 9 | from sneakpeek.scheduler.redis_lease_storage import RedisLeaseStorage 10 | 11 | NON_EXISTENT_SCRAPER_ID = 10001 12 | 13 | 14 | @pytest.fixture 15 | def in_memory_storage() -> LeaseStorageABC: 16 | return InMemoryLeaseStorage() 17 | 18 | 19 | @pytest.fixture 20 | def redis_storage() -> LeaseStorageABC: 21 | return RedisLeaseStorage(FakeRedis()) 22 | 23 | 24 | @pytest.fixture( 25 | params=[ 26 | pytest.lazy_fixture(in_memory_storage.__name__), 27 | pytest.lazy_fixture(redis_storage.__name__), 28 | ] 29 | ) 30 | def storage(request) -> LeaseStorageABC: 31 | yield request.param 32 | 33 | 34 | @pytest.mark.asyncio 35 | async def test_lease(storage: LeaseStorageABC): 36 | lease_name_1 = "test_lease_1" 37 | lease_name_2 = "test_lease_2" 38 | owner_1 = "owner_id_1" 39 | owner_2 = "owner_id_2" 40 | owner_1_acquire_until = timedelta(seconds=1) 41 | owner_2_acquire_until = timedelta(seconds=5) 42 | 43 | # initial acquire 44 | assert ( 45 | await storage.maybe_acquire_lease(lease_name_1, owner_1, owner_1_acquire_until) 46 | is not None 47 | ) 48 | # another lease can be acquired 49 | assert ( 50 | await storage.maybe_acquire_lease(lease_name_2, owner_2, owner_2_acquire_until) 51 | is not None 52 | ) 53 | # lock is acquired so no one can acquire 54 | assert ( 55 | await storage.maybe_acquire_lease(lease_name_1, owner_2, owner_2_acquire_until) 56 | is None 57 | ) 58 | # owner can re-acquire 59 | assert ( 60 | await storage.maybe_acquire_lease(lease_name_1, owner_1, owner_1_acquire_until) 61 | is not None 62 | ) 63 | 64 | # lock expires and can be acuired 65 | await asyncio.sleep(1) 66 | assert ( 67 | await storage.maybe_acquire_lease(lease_name_1, owner_2, owner_2_acquire_until) 68 | is not None 69 | ) 70 | -------------------------------------------------------------------------------- /docs/middleware/rate_limiter_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Rate limiter 3 | ######################### 4 | 5 | Rate limiter implements `leaky bucket algorithm `_ 6 | to limit number of requests made to the hosts. If the request is rate limited it can either 7 | raise an exception or wait until the request won't be limited anymore. 8 | 9 | Configuration of the middleware is defined in :py:class:`RateLimiterMiddlewareConfig `. 10 | 11 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 12 | 13 | .. 
code-block:: python3 14 | 15 | from sneakpeek.middleware.rate_limiter_middleware import RateLimiterMiddleware, RateLimiterMiddlewareConfig 16 | 17 | server = SneakpeekServer.create( 18 | ... 19 | middleware=[ 20 | RateLimiterMiddleware( 21 | RateLimiterMiddlewareConfig( 22 | # maximum number of requests in a given time window 23 | max_requests = 60, 24 | # wait until request won't be rate limited 25 | rate_limited_strategy = RateLimitedStrategy.WAIT, 26 | # only 60 requests per host are allowed within 1 minute 27 | time_window = timedelta(minutes=1), 28 | ) 29 | ) 30 | ], 31 | ) 32 | 33 | 34 | How to override middleware settings for a given scraper: 35 | 36 | .. code-block:: python3 37 | 38 | from sneakpeek.middleware.rate_limiter_middleware import RateLimiterMiddlewareConfig 39 | 40 | scraper = Scraper( 41 | ... 42 | config=ScraperConfig( 43 | ... 44 | middleware={ 45 | "rate_limiter": RateLimiterMiddlewareConfig( 46 | # maximum number of requests in a given time window 47 | max_requests = 120, 48 | # throw RateLimiterException if request is rate limited 49 | rate_limited_strategy = RateLimitedStrategy.THROW, 50 | # only 120 requests per host are allowed within 1 minute 51 | time_window = timedelta(minutes=1), 52 | ) 53 | } 54 | ), 55 | ) 56 | -------------------------------------------------------------------------------- /front/src/components/TaskLogs.vue: -------------------------------------------------------------------------------- 1 | 7 | 80 | 91 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NAME := sneakpeek 2 | PY_INSTALL_STAMP := .py.install.stamp 3 | JS_INSTALL_STAMP := .js.install.stamp 4 | POETRY := $(shell command -v poetry 2> /dev/null) 5 | YARN := $(shell command -v yarn 2> /dev/null) 6 | ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 7 | 8 | .DEFAULT_GOAL := help 9 | 10 | 11 | .PHONY: help 12 | help: ##Show this help. 13 | @fgrep -h "##" $(MAKEFILE_LIST) | sed -e 's/\(\:.*\#\#\)/\:\ /' | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//' 14 | 15 | install-py: $(PY_INSTALL_STAMP) ##Install python dependencies (Poetry is required) 16 | $(PY_INSTALL_STAMP): pyproject.toml poetry.lock 17 | @if [ -z $(POETRY) ]; then echo "Poetry could not be found. See https://python-poetry.org/docs/"; exit 2; fi 18 | $(POETRY) --version 19 | $(POETRY) install --all-extras --with remotesettings,taskcluster --no-ansi --no-interaction --verbose 20 | touch $(PY_INSTALL_STAMP) 21 | 22 | install-js: $(JS_INSTALL_STAMP) ##Install JS dependencies (Yarn is required) 23 | $(JS_INSTALL_STAMP): front/package.json front/yarn.lock 24 | @if [ -z $(YARN) ]; then echo "YARN could not be found.
See https://yarnpkg.com/"; exit 2; fi 25 | $(YARN) --version 26 | cd $(ROOT_DIR)/front; $(YARN) install 27 | touch $(JS_INSTALL_STAMP) 28 | 29 | install: install-py install-js ##Install all dependencies 30 | 31 | gen-requirements: $(PY_INSTALL_STAMP) 32 | $(POETRY) export --without-hashes --format=requirements.txt > requirements.txt 33 | 34 | .PHONY: test 35 | test: $(PY_INSTALL_STAMP) ##Run tests 36 | $(POETRY) run pytest -n 20 37 | 38 | .PHONY: coverage 39 | coverage: $(PY_INSTALL_STAMP) ##Run tests with coverage report 40 | $(POETRY) run pytest --cov=sneakpeek sneakpeek --cov-fail-under=85 --cov-report term-missing --cov-report html --cov-report xml 41 | 42 | build-ui: ##Build frontend 43 | $(YARN) --cwd $(ROOT_DIR)/front/ quasar build 44 | 45 | build-docs: $(PY_INSTALL_STAMP) ##Build documentation 46 | rm -rf $(ROOT_DIR)/sneakpeek/static/docs/ 47 | mkdir -p $(ROOT_DIR)/sneakpeek/static/docs/ 48 | $(POETRY) run sphinx-build $(ROOT_DIR)/docs $(ROOT_DIR)/sneakpeek/static/docs/ 49 | 50 | build-py: ##Build Python package 51 | $(POETRY) build 52 | 53 | build: build-ui build-docs build-py ##Build everything 54 | 55 | .PHONY: clean 56 | clean: ##Cleanup 57 | find . -type d -name "__pycache__" | xargs rm -rf {}; 58 | find . -type d -name ".pytest_cache" | xargs rm -rf {}; 59 | rm -rf $(PY_INSTALL_STAMP) $(JS_INSTALL_STAMP) .coverage .mypy_cache -------------------------------------------------------------------------------- /front/src/assets/logo.svg: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 8 | 10 | 12 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /front/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // https://eslint.org/docs/user-guide/configuring#configuration-cascading-and-hierarchy 3 | // This option interrupts the configuration hierarchy at this file 4 | // Remove this if you have a higher level ESLint config file (it usually happens in monorepos) 5 | root: true, 6 | 7 | parserOptions: { 8 | ecmaVersion: '2021', // Allows for the parsing of modern ECMAScript features 9 | }, 10 | 11 | env: { 12 | node: true, 13 | browser: true, 14 | 'vue/setup-compiler-macros': true 15 | }, 16 | 17 | // Rules order is important, please avoid shuffling them 18 | extends: [ 19 | // Base ESLint recommended rules 20 | // 'eslint:recommended', 21 | 22 | // Uncomment any of the lines below to choose desired strictness, 23 | // but leave only one uncommented! 24 | // See https://eslint.vuejs.org/rules/#available-rules 25 | 'plugin:vue/vue3-essential', // Priority A: Essential (Error Prevention) 26 | // 'plugin:vue/vue3-strongly-recommended', // Priority B: Strongly Recommended (Improving Readability) 27 | // 'plugin:vue/vue3-recommended', // Priority C: Recommended (Minimizing Arbitrary Choices and Cognitive Overhead) 28 | 29 | // https://github.com/prettier/eslint-config-prettier#installation 30 | // usage with Prettier, provided by 'eslint-config-prettier'.
31 | 'prettier' 32 | ], 33 | 34 | plugins: [ 35 | // https://eslint.vuejs.org/user-guide/#why-doesn-t-it-work-on-vue-files 36 | // required to lint *.vue files 37 | 'vue', 38 | 39 | // https://github.com/typescript-eslint/typescript-eslint/issues/389#issuecomment-509292674 40 | // Prettier has not been included as plugin to avoid performance impact 41 | // add it as an extension for your IDE 42 | 43 | ], 44 | 45 | globals: { 46 | ga: 'readonly', // Google Analytics 47 | cordova: 'readonly', 48 | __statics: 'readonly', 49 | __QUASAR_SSR__: 'readonly', 50 | __QUASAR_SSR_SERVER__: 'readonly', 51 | __QUASAR_SSR_CLIENT__: 'readonly', 52 | __QUASAR_SSR_PWA__: 'readonly', 53 | process: 'readonly', 54 | Capacitor: 'readonly', 55 | chrome: 'readonly' 56 | }, 57 | 58 | // add your custom rules here 59 | rules: { 60 | 61 | 'prefer-promise-reject-errors': 'off', 62 | 63 | // allow debugger during development only 64 | 'no-debugger': process.env.NODE_ENV === 'production' ? 'error' : 'off' 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sneakpeek-py" 3 | packages = [{ include = "sneakpeek" }] 4 | version = "0.2.2" 5 | description = "Sneakpeek is a framework that helps to quickly and conviniently develop scrapers. It's the best choice for scrapers that have some specific complex scraping logic that needs to be run on a constant basis." 6 | authors = ["Dan Yazovsky "] 7 | maintainers = ["Dan Yazovsky "] 8 | repository = "https://github.com/flulemon/sneakpeek" 9 | documentation = "https://sneakpeek-py.readthedocs.io/en/latest/" 10 | homepage = "https://github.com/flulemon/sneakpeek" 11 | license = "BSD-3-Clause" 12 | readme = "README.md" 13 | classifiers = [ 14 | "Operating System :: OS Independent", 15 | "Development Status :: 2 - Pre-Alpha", 16 | "License :: OSI Approved :: BSD License", 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Intended Audience :: Developers", 23 | "Framework :: FastAPI", 24 | "Framework :: Pydantic", 25 | "Topic :: Software Development :: Libraries :: Application Frameworks", 26 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search" 27 | ] 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.10" 31 | pydantic = "^1.10.7" 32 | fastapi = "^0.95.0" 33 | fastapi-jsonrpc = "^2.4.1" 34 | redis = "^4.5.4" 35 | apscheduler = "^3.10.1" 36 | aiohttp = "^3.8.4" 37 | uvicorn = "^0.21.1" 38 | cachetools = "^5.3.0" 39 | prometheus-client = "^0.16.0" 40 | fake-useragent = "^1.1.3" 41 | Sphinx = { version = "4.2.0", optional = true } 42 | sphinx-rtd-theme = { version = "1.0.0", optional = true } 43 | sphinxcontrib-napoleon = { version = "0.7", optional = true } 44 | yarl = "^1.9.1" 45 | 46 | [tool.poetry.group.dev.dependencies] 47 | pytest = "^7.2.2" 48 | fakeredis = "2.11.0" 49 | black = "^23.3.0" 50 | pytest-lazy-fixture = "^0.6.3" 51 | pytest-asyncio = "^0.21.0" 52 | pytest-cov = "^4.0.0" 53 | aioresponses = "^0.7.4" 54 | pytest-xdist = "^3.3.0" 55 | 56 | [build-system] 57 | requires = ["poetry-core"] 58 | build-backend = "poetry.core.masonry.api" 59 | 60 | [tool.pytest.ini_options] 61 | log_cli = true 62 | log_cli_level = "INFO" 63 | log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" 64 | 
log_cli_date_format = "%Y-%m-%d %H:%M:%S" 65 | 66 | [tool.poetry.extras] 67 | docs = ["Sphinx", "sphinx-rtd-theme", "sphinxcontrib-napoleon"] -------------------------------------------------------------------------------- /sneakpeek/middleware/requests_logging_middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | import aiohttp 5 | from pydantic import BaseModel 6 | from typing_extensions import override 7 | 8 | from sneakpeek.middleware.base import parse_config_from_obj 9 | from sneakpeek.scraper.model import Middleware, Request 10 | 11 | 12 | class RequestsLoggingMiddlewareConfig(BaseModel): 13 | """Requests logging middleware config""" 14 | 15 | log_request: bool = True #: Whether to log the request being made 16 | log_response: bool = True #: Whether to log the received response 17 | 18 | 19 | class RequestsLoggingMiddleware(Middleware): 20 | """Requests logging middleware logs all requests being made and received responses.""" 21 | 22 | def __init__( 23 | self, default_config: RequestsLoggingMiddlewareConfig | None = None 24 | ) -> None: 25 | self._default_config = default_config or RequestsLoggingMiddlewareConfig() 26 | self._logger = logging.getLogger(__name__) 27 | 28 | @property 29 | def name(self) -> str: 30 | return "requests_logging" 31 | 32 | @override 33 | async def on_request( 34 | self, 35 | request: Request, 36 | config: Any | None, 37 | ) -> Request: 38 | config = parse_config_from_obj( 39 | config, 40 | self.name, 41 | RequestsLoggingMiddlewareConfig, 42 | self._default_config, 43 | ) 44 | if config.log_request: 45 | self._logger.info( 46 | f"{request.method.upper()} {request.url}", 47 | extra={ 48 | "headers": request.headers, 49 | "kwargs": request.kwargs, 50 | }, 51 | ) 52 | return request 53 | 54 | @override 55 | async def on_response( 56 | self, 57 | request: Request, 58 | response: aiohttp.ClientResponse, 59 | config: Any | None, 60 | ) -> aiohttp.ClientResponse: 61 | config = parse_config_from_obj( 62 | config, 63 | self.name, 64 | RequestsLoggingMiddlewareConfig, 65 | self._default_config, 66 | ) 67 | if config.log_response: 68 | response_body = await response.text() 69 | self._logger.info( 70 | f"{request.method.upper()} {request.url} - {response.status}", 71 | extra={ 72 | "headers": request.headers, 73 | "kwargs": request.kwargs, 74 | "response": response_body, 75 | }, 76 | ) 77 | return response 78 | -------------------------------------------------------------------------------- /sneakpeek/scraper/in_memory_storage.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | from uuid import uuid4 4 | 5 | from typing_extensions import override 6 | 7 | from sneakpeek.scraper.model import ( 8 | CreateScraperRequest, 9 | Scraper, 10 | ScraperId, 11 | ScraperNotFoundError, 12 | ScraperStorageABC, 13 | StorageIsReadOnlyError, 14 | ) 15 | 16 | 17 | class InMemoryScraperStorage(ScraperStorageABC): 18 | def __init__( 19 | self, 20 | initial_scrapers: list[Scraper] | None = None, 21 | is_read_only: bool = False, 22 | ) -> None: 23 | self.read_only = is_read_only 24 | self.scrapers: dict[ScraperId, Scraper] = { 25 | scraper.id: scraper for scraper in initial_scrapers or [] 26 | } 27 | self.lock = asyncio.Lock() 28 | 29 | @override 30 | def is_read_only(self) -> bool: 31 | return self.read_only 32 | 33 | @override 34 | async def create_scraper(self, request: CreateScraperRequest) -> Scraper: 35 |
if self.read_only: 36 | raise StorageIsReadOnlyError() 37 | async with self.lock: 38 | id = str(uuid4()) 39 | self.scrapers[id] = Scraper( 40 | id=id, 41 | name=request.name, 42 | handler=request.handler, 43 | schedule=request.schedule, 44 | schedule_crontab=request.schedule_crontab, 45 | config=request.config, 46 | priority=request.priority, 47 | timeout=( 48 | timedelta(seconds=request.timeout_seconds) 49 | if request.timeout_seconds 50 | else None 51 | ), 52 | ) 53 | return self.scrapers[id] 54 | 55 | @override 56 | async def update_scraper(self, scraper: Scraper) -> Scraper: 57 | if self.read_only: 58 | raise StorageIsReadOnlyError() 59 | async with self.lock: 60 | if scraper.id not in self.scrapers: 61 | raise ScraperNotFoundError() 62 | self.scrapers[scraper.id] = scraper 63 | return scraper 64 | 65 | @override 66 | async def delete_scraper(self, id: ScraperId) -> Scraper: 67 | if self.read_only: 68 | raise StorageIsReadOnlyError() 69 | async with self.lock: 70 | if id not in self.scrapers: 71 | raise ScraperNotFoundError() 72 | return self.scrapers.pop(id) 73 | 74 | @override 75 | async def get_scraper(self, id: ScraperId) -> Scraper: 76 | if id not in self.scrapers: 77 | raise ScraperNotFoundError() 78 | return self.scrapers[id] 79 | 80 | @override 81 | async def get_scrapers(self) -> list[Scraper]: 82 | return list(self.scrapers.values()) 83 | -------------------------------------------------------------------------------- /sneakpeek/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import contextmanager 3 | from contextvars import ContextVar 4 | 5 | from sneakpeek.queue.model import Task 6 | 7 | ctx_task = ContextVar("scraper_job") 8 | 9 | 10 | @contextmanager 11 | def task_context(task: Task) -> None: 12 | """ 13 | Initialize scraper job logging context which automatically adds 14 | scraper and scraper job IDs to the logging metadata 15 | 16 | Args: 17 | scraper_job (ScraperJob): Scraper job definition 18 | """ 19 | try: 20 | token = ctx_task.set(task) 21 | yield 22 | finally: 23 | ctx_task.reset(token) 24 | 25 | 26 | class TaskContextInjectingFilter(logging.Filter): 27 | """ 28 | Scraper context filter which automatically injects 29 | scraper and scraper job IDs to the logging metadata. 30 | 31 | Example of usage: 32 | 33 | .. 
code-block:: python3 34 | 35 | logger = logging.getLogger() 36 | handler = logging.StreamHandler() 37 | handler.addFilter(ScraperContextInjectingFilter()) 38 | logger.addHandler(handler) 39 | """ 40 | 41 | def filter(self, record: logging.LogRecord) -> bool: 42 | """Injects task metadata into log record: 43 | 44 | * ``task_id`` - Task ID 45 | * ``task_name`` - Task name 46 | * ``task_handler`` - Task handler 47 | 48 | Args: 49 | record (logging.LogRecord): Log record to inject metadata into 50 | 51 | Returns: 52 | bool: Always True 53 | """ 54 | task: Task | None = ctx_task.get(None) 55 | record.task_id = task.id if task else "" 56 | record.task_name = task.task_name if task else "" 57 | record.task_handler = task.task_handler if task else "" 58 | return True 59 | 60 | 61 | def configure_logging( 62 | level: int = logging.INFO, 63 | session_logger_handler: logging.Handler | None = None, 64 | ): 65 | """ 66 | Helper function to configure logging: 67 | 68 | * Adds console logger to the root logger 69 | * Adds scraper context injector filter to the console logger 70 | * Configures console formatting to use scraper metadata 71 | 72 | Args: 73 | level (int, optional): Minimum logging level. Defaults to logging.INFO. 74 | """ 75 | logger = logging.getLogger() 76 | handler = logging.StreamHandler() 77 | handler.setFormatter( 78 | logging.Formatter( 79 | "%(asctime)s][%(levelname)s][%(name)s:%(lineno)d]%(task_handler)s:%(task_name)s:%(task_id)s - %(message)s" 80 | ) 81 | ) 82 | handler.addFilter(TaskContextInjectingFilter()) 83 | logger.addHandler(handler) 84 | if session_logger_handler: 85 | logger.addHandler(session_logger_handler) 86 | logger.setLevel(level) 87 | logging.getLogger("apscheduler.executors.default").setLevel(logging.WARNING) 88 | -------------------------------------------------------------------------------- /front/src/api.js: -------------------------------------------------------------------------------- 1 | import { SessionStorage } from 'quasar'; 2 | 3 | function rpc(method, params) { 4 | return fetch( 5 | process.env.JSONRPC_ENDPOINT || "/api/v1/jsonrpc", 6 | { 7 | method: "POST", 8 | headers: { 9 | "Content-Type": "application/json", 10 | }, 11 | body: JSON.stringify({ 12 | jsonrpc: "2.0", 13 | id: 0, 14 | method: method, 15 | params: params, 16 | }) 17 | } 18 | ).then(response => { 19 | if (response.ok) { 20 | return response.json(); 21 | } else { 22 | throw Error(response.statusText); 23 | } 24 | }).then(data => { 25 | if (data.error) { 26 | throw Error(data.error.message); 27 | } 28 | return data.result; 29 | }); 30 | } 31 | 32 | export function getScrapers() { 33 | return rpc("get_scrapers", {}); 34 | } 35 | 36 | export function getScraper(id) { 37 | return rpc("get_scraper", {id: id}); 38 | } 39 | 40 | export function getScraperJobs(id) { 41 | return rpc("get_task_instances", {task_name: id}); 42 | } 43 | 44 | export function getTask(id) { 45 | return rpc("get_task_instance", {task_id: id}); 46 | } 47 | 48 | export function getTaskLogs(id, last_log_line_id, max_lines) { 49 | return rpc( 50 | "get_task_logs", 51 | { 52 | task_id: id, 53 | last_log_line_id: last_log_line_id, 54 | max_lines: max_lines 55 | } 56 | ); 57 | } 58 | 59 | export function getScraperHandlers() { 60 | return rpc("get_scraper_handlers", {}); 61 | } 62 | 63 | export function getSchedules() { 64 | return rpc("get_schedules", {}); 65 | } 66 | 67 | export function getPriorities() { 68 | return rpc("get_priorities", {}); 69 | } 70 | 71 | export function enqueueScraper(id) { 72 | return 
rpc("enqueue_scraper", {scraper_id: id, priority: 0}); 73 | } 74 | export function createScraper(scraper) { 75 | return rpc("create_scraper", {scraper: scraper}); 76 | } 77 | 78 | 79 | export function updateScraper(scraper) { 80 | return rpc("update_scraper", {scraper: scraper}); 81 | } 82 | 83 | export function deleteScraper(id) { 84 | return rpc("delete_scraper", {id: id}); 85 | } 86 | 87 | export function isReadOnly() { 88 | const value = SessionStorage.getItem("is_storage_read_only"); 89 | if (value != null) return Promise.resolve(value); 90 | return rpc("is_read_only", {}) 91 | .then(result => { 92 | SessionStorage.set("is_storage_read_only", result); 93 | return result; 94 | }); 95 | } 96 | 97 | export function runEphemeralScraperTask(config, handler, state, priority) { 98 | return rpc( 99 | "run_ephemeral", 100 | { 101 | task: { 102 | scraper_config: config, 103 | scraper_handler: handler, 104 | scraper_state: state, 105 | }, 106 | priority: priority, 107 | } 108 | ); 109 | } 110 | -------------------------------------------------------------------------------- /sneakpeek/scraper/redis_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from uuid import uuid4 3 | 4 | from redis.asyncio import Redis 5 | from typing_extensions import override 6 | 7 | from sneakpeek.scraper.model import ( 8 | CreateScraperRequest, 9 | Scraper, 10 | ScraperId, 11 | ScraperNotFoundError, 12 | ScraperStorageABC, 13 | StorageIsReadOnlyError, 14 | ) 15 | 16 | _SCRAPER_KEY_PREFIX = "scraper:" 17 | 18 | 19 | class RedisScraperStorage(ScraperStorageABC): 20 | def __init__(self, redis: Redis, is_read_only: bool = False) -> None: 21 | self.redis = redis 22 | self.read_only = is_read_only 23 | 24 | def _get_scraper_key(self, id: ScraperId) -> str: 25 | return f"{_SCRAPER_KEY_PREFIX}{id}" 26 | 27 | @override 28 | def is_read_only(self) -> bool: 29 | return self.read_only 30 | 31 | @override 32 | async def create_scraper(self, request: CreateScraperRequest) -> Scraper: 33 | if self.read_only: 34 | raise StorageIsReadOnlyError() 35 | scraper = Scraper( 36 | id=str(uuid4()), 37 | name=request.name, 38 | handler=request.handler, 39 | schedule=request.schedule, 40 | schedule_crontab=request.schedule_crontab, 41 | config=request.config, 42 | priority=request.priority, 43 | timeout=( 44 | timedelta(seconds=request.timeout_seconds) 45 | if request.timeout_seconds 46 | else None 47 | ), 48 | ) 49 | await self.redis.set(self._get_scraper_key(scraper.id), scraper.json()) 50 | return scraper 51 | 52 | @override 53 | async def update_scraper(self, scraper: Scraper) -> Scraper: 54 | if self.read_only: 55 | raise StorageIsReadOnlyError() 56 | if not await self.redis.exists(self._get_scraper_key(scraper.id)): 57 | raise ScraperNotFoundError() 58 | await self.redis.set(self._get_scraper_key(scraper.id), scraper.json()) 59 | return scraper 60 | 61 | @override 62 | async def delete_scraper(self, id: ScraperId) -> Scraper: 63 | if self.read_only: 64 | raise StorageIsReadOnlyError() 65 | scraper = await self.redis.getdel(self._get_scraper_key(id)) 66 | if not scraper: 67 | raise ScraperNotFoundError() 68 | return Scraper.parse_raw(scraper) 69 | 70 | @override 71 | async def get_scraper(self, id: ScraperId) -> Scraper: 72 | scraper = await self.redis.get(self._get_scraper_key(id)) 73 | if scraper is None: 74 | raise ScraperNotFoundError() 75 | return Scraper.parse_raw(scraper) 76 | 77 | @override 78 | async def get_scrapers(self) -> list[Scraper]: 79 | keys 
= [ 80 | key.decode() 81 | async for key in self.redis.scan_iter(f"{_SCRAPER_KEY_PREFIX}*") 82 | ] 83 | return sorted( 84 | (Scraper.parse_raw(scraper) for scraper in await self.redis.mget(keys)), 85 | key=lambda x: x.id, 86 | ) 87 | -------------------------------------------------------------------------------- /sneakpeek/queue/tests/test_queue_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import pytest 4 | from fakeredis.aioredis import FakeRedis 5 | 6 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage 7 | from sneakpeek.queue.model import QueueStorageABC, Task, TaskPriority, TaskStatus 8 | from sneakpeek.queue.redis_storage import RedisQueueStorage 9 | 10 | 11 | @pytest.fixture 12 | def in_memory_storage() -> QueueStorageABC: 13 | yield InMemoryQueueStorage() 14 | 15 | 16 | @pytest.fixture 17 | def redis_storage() -> QueueStorageABC: 18 | yield RedisQueueStorage(FakeRedis()) 19 | 20 | 21 | @pytest.fixture( 22 | params=[ 23 | pytest.lazy_fixture(in_memory_storage.__name__), 24 | pytest.lazy_fixture(redis_storage.__name__), 25 | ] 26 | ) 27 | def storage(request) -> QueueStorageABC: 28 | yield request.param 29 | 30 | 31 | @pytest.mark.asyncio 32 | async def test_storage_crud(storage: QueueStorageABC) -> None: 33 | task = Task( 34 | id=0, 35 | task_name=f"{test_storage_crud.__name__}:task_name", 36 | task_handler=f"{test_storage_crud.__name__}:task_handler", 37 | status=TaskStatus.PENDING, 38 | created_at=datetime.utcnow(), 39 | priority=TaskPriority.NORMAL, 40 | payload=f"{test_storage_crud.__name__}:payload", 41 | timeout=timedelta(seconds=1), 42 | ) 43 | # Create task 44 | enqueued = await storage.enqueue_task(task) 45 | assert enqueued.id > 0 46 | assert enqueued.task_name == task.task_name 47 | 48 | # Get task 49 | all_tasks = await storage.get_tasks() 50 | assert all_tasks == [enqueued] 51 | task_name_instances = await storage.get_task_instances(task.task_name) 52 | assert task_name_instances == [enqueued] 53 | actual_task = await storage.get_task_instance(enqueued.id) 54 | assert enqueued == actual_task 55 | 56 | # Update task 57 | enqueued.last_active_at = datetime(year=1, month=10, day=1) 58 | updated = await storage.update_task(enqueued) 59 | assert updated.id == enqueued.id 60 | assert enqueued.last_active_at == updated.last_active_at 61 | 62 | # Queue len 63 | assert await storage.get_queue_len() == 1 64 | 65 | # Dequeue 66 | dequeued = await storage.dequeue_task() 67 | assert dequeued.id == enqueued.id 68 | 69 | 70 | @pytest.mark.asyncio 71 | async def test_delete_old_items(storage: QueueStorageABC) -> None: 72 | keep_last = 2 73 | total_tasks = 4 74 | tasks = [ 75 | Task( 76 | id=0, 77 | task_name=f"{test_delete_old_items.__name__}:task_name", 78 | task_handler=f"{test_delete_old_items.__name__}:task_handler", 79 | status=TaskStatus.PENDING, 80 | created_at=datetime.utcnow(), 81 | priority=TaskPriority.NORMAL, 82 | payload=f"{test_delete_old_items.__name__}:payload:{i}", 83 | timeout=timedelta(seconds=1), 84 | ) 85 | for i in range(total_tasks) 86 | ] 87 | enqueued_tasks = [await storage.enqueue_task(task) for task in tasks] 88 | 89 | await storage.delete_old_tasks(keep_last) 90 | actual_left_tasks = await storage.get_tasks() 91 | assert sorted(actual_left_tasks, key=lambda x: x.id) == sorted( 92 | enqueued_tasks[keep_last:], key=lambda x: x.id 93 | ) 94 | -------------------------------------------------------------------------------- 
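To make the scraper storage interface above concrete, here is a brief, illustrative usage sketch for `RedisScraperStorage`. The constructor and the `CreateScraperRequest` fields mirror the files above; the Redis connection settings, scraper name, handler name and crontab string are assumptions made only for the example:

```python
import asyncio

from redis.asyncio import Redis

from sneakpeek.scheduler.model import TaskSchedule
from sneakpeek.scraper.model import CreateScraperRequest, ScraperConfig
from sneakpeek.scraper.redis_storage import RedisScraperStorage


async def main() -> None:
    # Assumed local Redis instance; any redis.asyncio client works here
    storage = RedisScraperStorage(Redis(host="localhost", port=6379))
    created = await storage.create_scraper(
        CreateScraperRequest(
            name="demo-scraper",
            handler="demo_scraper",        # must match a registered handler name
            schedule=TaskSchedule.CRONTAB,
            schedule_crontab="0 * * * *",  # illustrative: run hourly
            config=ScraperConfig(),
        )
    )
    print(await storage.get_scraper(created.id))


asyncio.run(main())
```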
/front/src/pages/ScrapersPage.vue: -------------------------------------------------------------------------------- 1 | 53 | 54 | 96 | -------------------------------------------------------------------------------- /sneakpeek/scraper/tests/test_dynamic_scraper_handler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from unittest.mock import AsyncMock, call 3 | 4 | import pytest 5 | 6 | from sneakpeek.scraper.dynamic_scraper_handler import ( 7 | DynamicScraperHandler, 8 | DynamicScraperParams, 9 | ) 10 | 11 | 12 | class FakeScraperContext: 13 | def __init__(self, params: DynamicScraperParams) -> None: 14 | self.params = params.dict() 15 | self.get_mock = AsyncMock() 16 | 17 | async def get(self, url: str) -> str: 18 | return await self.get_mock(url) 19 | 20 | 21 | @pytest.fixture 22 | def handler() -> DynamicScraperHandler: 23 | yield DynamicScraperHandler() 24 | 25 | 26 | SOURCE_CODE_NO_HANDLER_DEFINED = """ 27 | from sneakpeek.scraper.context import ScraperContext 28 | 29 | async def handler_not_defined(context: ScraperContext) -> str: 30 | return "1" 31 | """ 32 | 33 | 34 | def test_Given_SourceCodeHasNoHandlerDefined_When_Compiled_Then_SyntaxErrorIsThrown( 35 | handler: DynamicScraperHandler, 36 | ) -> None: 37 | with pytest.raises(SyntaxError): 38 | handler.compile(SOURCE_CODE_NO_HANDLER_DEFINED) 39 | 40 | 41 | SOURCE_CODE_HANDLER_NOT_ASYNC = """ 42 | from sneakpeek.scraper.context import ScraperContext 43 | 44 | def handler(context: ScraperContext) -> str: 45 | return "1" 46 | """ 47 | 48 | 49 | def test_Given_SourceCodeWithSyncHandler_When_Compiled_Then_SyntaxErrorIsThrown( 50 | handler: DynamicScraperHandler, 51 | ) -> None: 52 | with pytest.raises(SyntaxError): 53 | handler.compile(SOURCE_CODE_HANDLER_NOT_ASYNC) 54 | 55 | 56 | SOURCE_CODE_HANDLER_OBJECT = """ 57 | handler = 1 58 | """ 59 | SOURCE_CODE_HANDLER_CLASS = """ 60 | class handler: 61 | pass 62 | """ 63 | 64 | 65 | def test_Given_SourceCodeWithHandlerNotFunction_When_Compiled_Then_SyntaxErrorIsThrown( 66 | handler: DynamicScraperHandler, 67 | ) -> None: 68 | with pytest.raises(SyntaxError): 69 | handler.compile(SOURCE_CODE_HANDLER_OBJECT) 70 | with pytest.raises(SyntaxError): 71 | handler.compile(SOURCE_CODE_HANDLER_CLASS) 72 | 73 | 74 | SOURCE_CODE_HANDLER_NO_ARGS = """ 75 | async def handler(): 76 | return "1" 77 | """ 78 | 79 | 80 | def test_Given_SourceCodeWithHandleWithNoArgs_When_Compiled_Then_SyntaxErrorIsThrown( 81 | handler: DynamicScraperHandler, 82 | ) -> None: 83 | with pytest.raises(SyntaxError): 84 | handler.compile(SOURCE_CODE_HANDLER_NO_ARGS) 85 | 86 | 87 | SOURCE_CODE_COMPILES = """ 88 | from sneakpeek.scraper.context import ScraperContext 89 | 90 | async def handler(ctx: ScraperContext) -> str: 91 | return "1" 92 | """ 93 | 94 | 95 | def test_Given_SourceCode_When_Compiled_Then_AsyncFunctionIsReturned( 96 | handler: DynamicScraperHandler, 97 | ) -> None: 98 | func = handler.compile(SOURCE_CODE_COMPILES) 99 | assert inspect.iscoroutinefunction(func) 100 | assert func.__code__.co_argcount == 1 101 | 102 | 103 | CUSTOM_SOURCE_CODE = """ 104 | from sneakpeek.scraper.context import ScraperContext 105 | 106 | async def handler(ctx: ScraperContext, param1: str, param2: str = "test2", result="123"): 107 | for param in [param1, param2]: 108 | await ctx.get(param) 109 | return result 110 | """ 111 | 112 | 113 | @pytest.mark.asyncio 114 | async def test_Given_CustomCode_When_RanByHandler_Then_ContextIsCalled( 115 | handler: DynamicScraperHandler, 116 | ) 
-> None: 117 | context = FakeScraperContext( 118 | DynamicScraperParams( 119 | source_code=CUSTOM_SOURCE_CODE, 120 | args=["url1"], 121 | kwargs={"param2": "url2", "result": "some_result"}, 122 | ), 123 | ) 124 | result = await handler.run(context) 125 | assert result == "some_result" 126 | context.get_mock.assert_has_awaits( 127 | [ 128 | call("url1"), 129 | call("url2"), 130 | ] 131 | ) 132 | -------------------------------------------------------------------------------- /sneakpeek/queue/in_memory_storage.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections import defaultdict 3 | from itertools import count 4 | from typing import Iterator 5 | 6 | from typing_extensions import override 7 | 8 | from sneakpeek.metrics import count_invocations, measure_latency 9 | from sneakpeek.queue.model import QueueStorageABC, Task, TaskNotFoundError 10 | 11 | SCORE_PRIORITY_BIT_OFFSET = 32 12 | 13 | 14 | class InMemoryQueueStorage(QueueStorageABC): 15 | """In memory queue storage (should only be used for development purposes)""" 16 | 17 | def __init__(self) -> None: 18 | """ 19 | Args: 20 | redis (Redis): Async redis client 21 | """ 22 | self._id_generator: Iterator[int] = count(1) 23 | self._queue = asyncio.PriorityQueue() 24 | self._tasks: dict[str, set[int]] = defaultdict(set) 25 | self._task_instances: dict[int, Task] = {} 26 | self._lock = asyncio.Lock() 27 | 28 | async def _generate_id(self) -> int: 29 | return next(self._id_generator) 30 | 31 | def _get_task_score(self, task: Task) -> int: 32 | return (task.priority.value << SCORE_PRIORITY_BIT_OFFSET) + task.id 33 | 34 | @count_invocations(subsystem="storage") 35 | @measure_latency(subsystem="storage") 36 | @override 37 | async def get_tasks(self) -> list[Task]: 38 | return sorted(self._task_instances.values(), key=lambda x: x.id) 39 | 40 | @count_invocations(subsystem="storage") 41 | @measure_latency(subsystem="storage") 42 | @override 43 | async def get_task_instances(self, task_name: str) -> list[Task]: 44 | return sorted( 45 | [ 46 | self._task_instances[task_id] 47 | for task_id in self._tasks.get(task_name, []) 48 | ], 49 | key=lambda x: x.id, 50 | reverse=True, 51 | ) 52 | 53 | @count_invocations(subsystem="storage") 54 | @measure_latency(subsystem="storage") 55 | @override 56 | async def get_task_instance(self, id: int) -> Task: 57 | if id not in self._task_instances: 58 | raise TaskNotFoundError() 59 | return self._task_instances[id] 60 | 61 | @count_invocations(subsystem="storage") 62 | @measure_latency(subsystem="storage") 63 | @override 64 | async def enqueue_task(self, task: Task) -> Task: 65 | task.id = await self._generate_id() 66 | self._tasks[task.task_name].add(task.id) 67 | self._task_instances[task.id] = task 68 | await self._queue.put((self._get_task_score(task), task.id)) 69 | return task 70 | 71 | @count_invocations(subsystem="storage") 72 | @measure_latency(subsystem="storage") 73 | @override 74 | async def update_task(self, task: Task) -> Task: 75 | if task.id not in self._task_instances: 76 | raise TaskNotFoundError() 77 | self._task_instances[task.id] = task 78 | return task 79 | 80 | @count_invocations(subsystem="storage") 81 | @measure_latency(subsystem="storage") 82 | @override 83 | async def dequeue_task(self) -> Task | None: 84 | try: 85 | _, task_id = self._queue.get_nowait() 86 | return await self.get_task_instance(task_id) 87 | except asyncio.QueueEmpty: 88 | return None 89 | 90 | @count_invocations(subsystem="storage") 91 | 
@measure_latency(subsystem="storage") 92 | @override 93 | async def delete_old_tasks(self, keep_last: int = 50) -> None: 94 | for task_name, task_ids in self._tasks.items(): 95 | for task_id in sorted(task_ids, reverse=True)[keep_last:]: 96 | self._task_instances.pop(task_id) 97 | self._tasks[task_name].remove(task_id) 98 | 99 | @count_invocations(subsystem="storage") 100 | @measure_latency(subsystem="storage") 101 | @override 102 | async def get_queue_len(self) -> int: 103 | return self._queue.qsize() 104 | -------------------------------------------------------------------------------- /docs/design.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Design 3 | ################# 4 | 5 | .. contents:: Table of contents 6 | 7 | **Sneakpeek** has 6 core components: 8 | 9 | * Scrapers storage - stores list of scrapers and its metadata. 10 | * Tasks queue - populated by the scheduler or user and is consumed by the queue consumers 11 | * Lease storage - stores lease (global lock) for scheduler, to make sure there's only 1 active scheduler at all times. 12 | * Scheduler - schedules periodic tasks using scrapers in the storage 13 | * Consumer - consumes tasks queue and executes tasks logic (e.g. scraper logic) 14 | * API - provides JsonRPC API for interacting with the system 15 | 16 | All of the components are run by the :py:class:`SneakpeekServer `. 17 | 18 | ================ 19 | Scrapers Storage 20 | ================ 21 | 22 | Scraper storage interface is defined in :py:class:`sneakpeek.scraper.model.ScraperStorageABC`. 23 | 24 | * :py:class:`InMemoryScraperStorage ` - in-memory storage. Should either be used in **development** environment or if the list of scrapers is static and wouldn't be changed. 25 | * :py:class:`RedisScraperStorage ` - redis storage. 26 | 27 | ================ 28 | Tasks queue 29 | ================ 30 | 31 | Tasks queue consists of three components: 32 | * :py:class:`Storage ` - tasks storage 33 | * :py:class:`Storage ` - queue implementation 34 | * :py:class:`Storage ` - queue consumer implementation 35 | 36 | Currently there 2 storage implementations: 37 | 38 | * :py:class:`InMemoryQueueStorage ` - in-memory storage. Should only be used in **development** environment. 39 | * :py:class:`RedisQueueStorage ` - redis storage. 40 | 41 | ================ 42 | Lease storage 43 | ================ 44 | 45 | Lease storage is used by scheduler to ensure that at any point of time there's no more 46 | than 1 active scheduler instance which can enqueue scraper jobs. This disallows concurrent 47 | execution of the scraper. 48 | 49 | Lease storage interface is defined in :py:class:`LeaseStorageABC `. 50 | 51 | Currently there 2 storage implementations: 52 | 53 | * :py:class:`InMemoryLeaseStorage ` - in-memory storage. Should only be used in **development** environment. 54 | * :py:class:`RedisLeaseStorage ` - redis storage. 55 | 56 | ================ 57 | Scheduler 58 | ================ 59 | 60 | :py:class:`Scheduler ` is responsible for: 61 | 62 | * scheduling scrapers based on their configuration. 63 | * finding scraper jobs that haven't sent a heartbeat for a while and mark them as dead 64 | * cleaning up jobs queue from old historical scraper jobs 65 | * exporting metrics on number of pending jobs in the queue 66 | 67 | As for now there's only one implementation :py:class:`Scheduler ` 68 | that uses `APScheduler `_. 
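To make the lease mechanics concrete, here is a simplified, illustrative sketch of a lease-guarded scheduling loop built on the ``LeaseStorageABC`` interface described above. This is not the actual ``Scheduler`` implementation; the lease name and timings are arbitrary:

.. code-block:: python3

    import asyncio
    from datetime import timedelta
    from uuid import uuid4

    from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage

    async def scheduler_loop() -> None:
        storage = InMemoryLeaseStorage()
        owner_id = str(uuid4())
        while True:
            lease = await storage.maybe_acquire_lease(
                "scheduler", owner_id, acquire_for=timedelta(seconds=30)
            )
            if lease is not None:
                ...  # this replica holds the lease: enqueue due scraper jobs here
            await asyncio.sleep(10)  # renew well before the lease expires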
69 | 70 | ================ 71 | Queue consumer 72 | ================ 73 | 74 | Consumer constantly tries to dequeue a job and executes dequeued jobs. 75 | As for now there's only one implementation :py:class:`Consumer `. 76 | 77 | 78 | ================ 79 | API 80 | ================ 81 | 82 | Sneakpeek implements: 83 | 84 | * JsonRPC to programmatically interact with the system, it exposes following methods (available at ``/api/v1/jsonrpc``): 85 | * CRUD methods to add, modify and delete scrapers 86 | * Get list of scraper's jobs 87 | * Enqueue scraper jobs 88 | * UI that allows you to interact with the system 89 | * Swagger documentation (available at ``/api``) 90 | * Copy of this documentation (available at ``/docs``) 91 | -------------------------------------------------------------------------------- /sneakpeek/scraper/tests/test_scraper_storage.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid4 2 | 3 | import pytest 4 | from fakeredis.aioredis import FakeRedis 5 | 6 | from sneakpeek.scheduler.model import TaskSchedule 7 | from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage 8 | from sneakpeek.scraper.model import ( 9 | CreateScraperRequest, 10 | Scraper, 11 | ScraperConfig, 12 | ScraperNotFoundError, 13 | ScraperStorageABC, 14 | ) 15 | from sneakpeek.scraper.redis_storage import RedisScraperStorage 16 | 17 | 18 | @pytest.fixture 19 | def in_memory_storage() -> ScraperStorageABC: 20 | yield InMemoryScraperStorage() 21 | 22 | 23 | @pytest.fixture 24 | def redis_storage() -> ScraperStorageABC: 25 | yield RedisScraperStorage(FakeRedis()) 26 | 27 | 28 | @pytest.fixture( 29 | params=[ 30 | pytest.lazy_fixture(in_memory_storage.__name__), 31 | pytest.lazy_fixture(redis_storage.__name__), 32 | ] 33 | ) 34 | def storage(request) -> ScraperStorageABC: 35 | yield request.param 36 | 37 | 38 | def _get_create_scraper_request(name: str) -> CreateScraperRequest: 39 | return CreateScraperRequest( 40 | name=name, 41 | schedule=TaskSchedule.CRONTAB, 42 | schedule_crontab=f"schedule_{name}", 43 | handler=f"handler_{name}", 44 | config=ScraperConfig(), 45 | ) 46 | 47 | 48 | @pytest.mark.asyncio 49 | async def test_read_after_write(storage: ScraperStorageABC): 50 | expected = _get_create_scraper_request("test_read_after_write") 51 | created = await storage.create_scraper(expected) 52 | assert created.id is not None, "Expected storage to create a scraper" 53 | assert created.name == expected.name 54 | assert created.schedule == expected.schedule 55 | assert created.schedule_crontab == expected.schedule_crontab 56 | assert created.handler == expected.handler 57 | assert created.config == expected.config 58 | actual = await storage.get_scraper(created.id) 59 | assert actual == created 60 | created.name = f"{created.name}_updated" 61 | actual = await storage.update_scraper(created) 62 | actual = await storage.get_scraper(created.id) 63 | assert actual == created 64 | 65 | 66 | @pytest.mark.asyncio 67 | async def test_get_scrapers(storage: ScraperStorageABC): 68 | expected = [ 69 | _get_create_scraper_request(f"test_get_scrapers_{i}") for i in range(1, 10) 70 | ] 71 | for item in expected: 72 | await storage.create_scraper(item) 73 | 74 | actual = await storage.get_scrapers() 75 | assert {item.name for item in actual} == {item.name for item in expected} 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_read_non_existent_scraper_throws(storage: ScraperStorageABC): 80 | with pytest.raises(ScraperNotFoundError): 81 | await 
storage.get_scraper(uuid4()) 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_update_non_existent_scraper_throws(storage: ScraperStorageABC): 86 | with pytest.raises(ScraperNotFoundError): 87 | await storage.update_scraper( 88 | Scraper( 89 | id=str(uuid4()), 90 | name="test_update_non_existent_scraper_throws", 91 | schedule=TaskSchedule.CRONTAB, 92 | schedule_crontab="schedule_test_update_non_existent_scraper_throws", 93 | handler="handler_test_update_non_existent_scraper_throws", 94 | config=ScraperConfig(), 95 | ) 96 | ) 97 | 98 | 99 | @pytest.mark.asyncio 100 | async def test_delete_non_existent_scraper_throws(storage: ScraperStorageABC): 101 | with pytest.raises(ScraperNotFoundError): 102 | await storage.delete_scraper(uuid4()) 103 | 104 | 105 | @pytest.mark.asyncio 106 | async def test_delete_scraper(storage: ScraperStorageABC): 107 | created = await storage.create_scraper( 108 | _get_create_scraper_request("test_delete_scraper") 109 | ) 110 | actual = await storage.get_scraper(created.id) 111 | assert created == actual 112 | deleted = await storage.delete_scraper(actual.id) 113 | assert deleted == actual 114 | with pytest.raises(ScraperNotFoundError): 115 | await storage.get_scraper(actual.id) 116 | -------------------------------------------------------------------------------- /sneakpeek/session_loggers/file_logger.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import itertools 3 | import json 4 | import logging 5 | import os 6 | import pathlib 7 | from asyncio import AbstractEventLoop 8 | from collections import defaultdict 9 | from concurrent.futures import ThreadPoolExecutor 10 | from datetime import datetime, timedelta 11 | from threading import Lock 12 | from traceback import format_exc 13 | from typing import Any, List 14 | 15 | from sneakpeek.session_loggers.base import LogLine, SessionLogger, get_fields_to_log 16 | 17 | MAX_BUFFER_AGE = timedelta(seconds=5) 18 | 19 | 20 | class Encoder(json.JSONEncoder): 21 | def default(self, obj): 22 | if isinstance(obj, set): 23 | return list(obj) 24 | return json.JSONEncoder.default(self, obj) 25 | 26 | 27 | class FileLoggerHandler(SessionLogger): 28 | def __init__( 29 | self, 30 | directory: str, 31 | loop: AbstractEventLoop | None = None, 32 | max_buffer_size: int = 10, 33 | max_buffer_age: timedelta = MAX_BUFFER_AGE, 34 | max_log_files_to_keep: int = 1000, 35 | ) -> None: 36 | super().__init__() 37 | self.dir = directory 38 | self.loop = loop or asyncio.get_event_loop() 39 | self.max_buffer_size = max_buffer_size 40 | self.max_buffer_age = max_buffer_age 41 | self.buffer: dict[str, Any] = defaultdict(list) 42 | self.last_flush = datetime.min 43 | self.executor = ThreadPoolExecutor(max_workers=10) 44 | self.max_log_files_to_keep = max_log_files_to_keep 45 | self._lock = Lock() 46 | 47 | def _cleanup(self): 48 | if not os.path.exists(self.dir): 49 | return 50 | with self._lock: 51 | with os.scandir(self.dir) as it: 52 | log_files = sorted( 53 | [entry for entry in it if entry.is_file()], 54 | key=lambda x: x.stat().st_mtime, 55 | ) 56 | for file in log_files[: -self.max_log_files_to_keep]: 57 | os.remove(file.path) 58 | 59 | def flush(self): 60 | """ 61 | Flushes the stream. 
62 | """ 63 | with self.lock: 64 | self._cleanup() 65 | try: 66 | pathlib.Path(self.dir).mkdir(parents=True, exist_ok=True) 67 | for group, messages in self.buffer.items(): 68 | with open( 69 | os.path.join(self.dir, f"task_{group}.log"), mode="a" 70 | ) as f: 71 | f.writelines( 72 | [f"{json.dumps(m, cls=Encoder)}\n" for m in messages] 73 | ) 74 | except Exception: 75 | print(format_exc()) 76 | self.buffer.clear() 77 | 78 | def emit(self, record: logging.LogRecord) -> None: 79 | if not getattr(record, "task_id"): 80 | return 81 | 82 | self.buffer[record.task_id].append(get_fields_to_log(record)) 83 | with self._lock: 84 | if ( 85 | len(self.buffer) > self.max_buffer_size 86 | or datetime.utcnow() - self.last_flush > self.max_buffer_age 87 | ): 88 | self.loop.run_in_executor(self.executor, self.flush) 89 | 90 | async def read( 91 | self, 92 | task_id: str, 93 | last_log_line_id: str | None = None, 94 | max_lines: int = 100, 95 | ) -> List[dict[str, Any]]: 96 | path = os.path.join(self.dir, f"task_{task_id}.log") 97 | if not os.path.exists(path): 98 | return [] 99 | last_log_line_id = int(last_log_line_id) if last_log_line_id else 0 100 | 101 | with open(path, "r") as f: 102 | return [ 103 | LogLine( 104 | id=last_log_line_id + line_num + 1, 105 | data=json.loads(line), 106 | ) 107 | for line_num, line in enumerate( 108 | itertools.islice( 109 | f, 110 | last_log_line_id, 111 | last_log_line_id + max_lines, 112 | ) 113 | ) 114 | ] 115 | -------------------------------------------------------------------------------- /front/src/pages/ScraperIde.vue: -------------------------------------------------------------------------------- 1 | 23 | 115 | 120 | -------------------------------------------------------------------------------- /sneakpeek/scraper/runner.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from uuid import uuid4 4 | 5 | from sneakpeek.metrics import count_invocations 6 | from sneakpeek.scheduler.model import TaskSchedule 7 | from sneakpeek.scraper.context import ScraperContext 8 | from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage 9 | from sneakpeek.scraper.model import ( 10 | Middleware, 11 | Scraper, 12 | ScraperConfig, 13 | ScraperHandler, 14 | ScraperRunnerABC, 15 | ScraperStorageABC, 16 | ) 17 | 18 | 19 | class ScraperRunner(ScraperRunnerABC): 20 | """Default scraper runner implementation that is meant to be used in the Sneakpeek server""" 21 | 22 | def __init__( 23 | self, 24 | scraper_storage: ScraperStorageABC, 25 | middlewares: list[Middleware] | None = None, 26 | loop: asyncio.AbstractEventLoop | None = None, 27 | ) -> None: 28 | """ 29 | Args: 30 | handlers (list[ScraperHandler]): List of handlers that implement scraper logic 31 | scrapers_storage (ScrapersStorage): Sneakpeek scrapers storage implementation 32 | jobs_storage (ScraperJobsStorage): Sneakpeek jobs storage implementation 33 | middlewares (list[Middleware] | None, optional): List of middleware that will be used by scraper runner. Defaults to None. 
34 |         """
35 |         self.logger = logging.getLogger(__name__)
36 |         self.scraper_storage = scraper_storage
37 |         self.middlewares = middlewares
38 | 
39 |     @staticmethod
40 |     async def debug_handler(
41 |         handler: ScraperHandler,
42 |         config: ScraperConfig | None = None,
43 |         state: str | None = None,
44 |         middlewares: list[Middleware] | None = None,
45 |     ) -> str:
46 |         scraper = Scraper(
47 |             id=str(uuid4()),
48 |             name="test_handler",
49 |             handler=handler.name,
50 |             schedule=TaskSchedule.INACTIVE,
51 |             config=config,
52 |             state=state,
53 |         )
54 |         return await ScraperRunner(
55 |             InMemoryScraperStorage([scraper]),
56 |             middlewares=middlewares,
57 |         ).run(handler, scraper)
58 | 
59 |     @count_invocations(subsystem="scraper_runner")
60 |     async def run_ephemeral(
61 |         self,
62 |         handler: ScraperHandler,
63 |         config: ScraperConfig | None = None,
64 |         state: str | None = None,
65 |     ) -> str | None:
66 |         self.logger.info(f"Running ephemeral scraper with {handler.name}")
67 | 
68 |         context = ScraperContext(
69 |             config,
70 |             self.middlewares,
71 |             scraper_state=state,
72 |         )
73 |         try:
74 |             await context.start_session()
75 |             result = await handler.run(context)
76 |             self.logger.info(
77 |                 f"Successfully executed ephemeral scraper with {handler.name}: {result}"
78 |             )
79 |             return result
80 |         except Exception:
81 |             self.logger.exception(
82 |                 f"Failed to run ephemeral scraper with {handler.name}"
83 |             )
84 |             raise
85 |         finally:
86 |             await context.close()
87 | 
88 |     @count_invocations(subsystem="scraper_runner")
89 |     async def run(self, handler: ScraperHandler, scraper: Scraper) -> str:
90 |         self.logger.info(f"Running scraper {scraper.handler}::{scraper.name}")
91 | 
92 |         if handler.name != scraper.handler:
93 |             self.logger.warning(
94 |                 f"Provided handler's name ({handler.name}) doesn't match scraper handler name ({scraper.handler})"
95 |             )
96 | 
97 |         async def _update_scraper_state(state: str) -> Scraper:
98 |             scraper.state = state
99 |             return await self.scraper_storage.update_scraper(scraper)
100 | 
101 |         context = ScraperContext(
102 |             scraper.config,
103 |             self.middlewares,
104 |             scraper_state=scraper.state,
105 |             update_scraper_state_func=_update_scraper_state,
106 |         )
107 |         try:
108 |             await context.start_session()
109 |             result = await handler.run(context)
110 |             self.logger.info(
111 |                 f"Successfully executed scraper {scraper.handler}::{scraper.name}: {result}"
112 |             )
113 |             return result
114 |         except Exception:
115 |             self.logger.exception(
116 |                 f"Failed to run scraper {scraper.handler}::{scraper.name}"
117 |             )
118 |             raise
119 |         finally:
120 |             await context.close()
121 | 
-------------------------------------------------------------------------------- /sneakpeek/tests/test_metrics.py: --------------------------------------------------------------------------------
1 | import pytest
2 | from prometheus_client import REGISTRY
3 | 
4 | from sneakpeek.metrics import count_invocations, measure_latency
5 | 
6 | SUBSYSTEM = "test"
7 | 
8 | exception_to_raise = ValueError()
9 | exception_to_raise_name = ValueError.__name__
10 | 
11 | 
12 | @count_invocations(SUBSYSTEM)
13 | @measure_latency(SUBSYSTEM)
14 | async def async_test_fn(fail: bool = False):
15 |     if fail:
16 |         raise exception_to_raise
17 |     return 1
18 | 
19 | 
20 | @count_invocations(SUBSYSTEM)
21 | @measure_latency(SUBSYSTEM)
22 | def sync_test_fn(fail: bool = False):
23 |     if fail:
24 |         raise exception_to_raise
25 |     return 1
26 | 
27 | 
28 | latency_labels_sync = {
29 |     "subsystem": SUBSYSTEM,
30 |     "method": sync_test_fn.__name__,
31 | }
32 | latency_labels_async = {
33 | 
"subsystem": SUBSYSTEM, 34 | "method": async_test_fn.__name__, 35 | } 36 | 37 | 38 | def invocation_labels_sync(type: str, error: str = ""): 39 | return { 40 | "subsystem": SUBSYSTEM, 41 | "method": sync_test_fn.__name__, 42 | "type": type, 43 | "error": error, 44 | } 45 | 46 | 47 | def invocation_labels_async(type: str, error: str = ""): 48 | return { 49 | "subsystem": SUBSYSTEM, 50 | "method": async_test_fn.__name__, 51 | "type": type, 52 | "error": error, 53 | } 54 | 55 | 56 | @pytest.mark.asyncio 57 | async def test_measure_latency_async(): 58 | before = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_async) 59 | await async_test_fn() 60 | after = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_async) 61 | assert after - (before or 0) == 1 62 | 63 | 64 | @pytest.mark.asyncio 65 | async def test_measure_latency_sync(): 66 | before = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_sync) 67 | sync_test_fn() 68 | after = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_sync) 69 | assert after - (before or 0) == 1 70 | 71 | 72 | @pytest.mark.asyncio 73 | async def test_count_invocations_async(): 74 | before_total = REGISTRY.get_sample_value( 75 | "sneakpeek_invocations_total", 76 | invocation_labels_async("total"), 77 | ) 78 | before_success = REGISTRY.get_sample_value( 79 | "sneakpeek_invocations_total", 80 | invocation_labels_async("success"), 81 | ) 82 | before_error = REGISTRY.get_sample_value( 83 | "sneakpeek_invocations_total", 84 | invocation_labels_async("error", exception_to_raise_name), 85 | ) 86 | await async_test_fn(fail=False) 87 | with pytest.raises(type(exception_to_raise)): 88 | await async_test_fn(fail=True) 89 | 90 | after_total = REGISTRY.get_sample_value( 91 | "sneakpeek_invocations_total", 92 | invocation_labels_async("total"), 93 | ) 94 | after_success = REGISTRY.get_sample_value( 95 | "sneakpeek_invocations_total", 96 | invocation_labels_async("success"), 97 | ) 98 | after_error = REGISTRY.get_sample_value( 99 | "sneakpeek_invocations_total", 100 | invocation_labels_async("error", exception_to_raise_name), 101 | ) 102 | assert after_total - (before_total or 0) == 2 103 | assert after_success - (before_success or 0) == 1 104 | assert after_error - (before_error or 0) == 1 105 | 106 | 107 | def test_count_invocations_sync(): 108 | before_total = REGISTRY.get_sample_value( 109 | "sneakpeek_invocations_total", 110 | invocation_labels_sync("total"), 111 | ) 112 | before_success = REGISTRY.get_sample_value( 113 | "sneakpeek_invocations_total", 114 | invocation_labels_sync("success"), 115 | ) 116 | before_error = REGISTRY.get_sample_value( 117 | "sneakpeek_invocations_total", 118 | invocation_labels_sync("error", exception_to_raise_name), 119 | ) 120 | sync_test_fn(fail=False) 121 | with pytest.raises(type(exception_to_raise)): 122 | sync_test_fn(fail=True) 123 | 124 | after_total = REGISTRY.get_sample_value( 125 | "sneakpeek_invocations_total", 126 | invocation_labels_sync("total"), 127 | ) 128 | after_success = REGISTRY.get_sample_value( 129 | "sneakpeek_invocations_total", 130 | invocation_labels_sync("success"), 131 | ) 132 | after_error = REGISTRY.get_sample_value( 133 | "sneakpeek_invocations_total", 134 | invocation_labels_sync("error", exception_to_raise_name), 135 | ) 136 | assert after_total - (before_total or 0) == 2 137 | assert after_success - (before_success or 0) == 1 138 | assert after_error - (before_error or 0) == 1 139 | 
-------------------------------------------------------------------------------- /sneakpeek/middleware/robots_txt_middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from datetime import timedelta 4 | from enum import Enum, auto 5 | from traceback import format_exc 6 | from typing import Any 7 | from urllib.parse import urlparse 8 | from urllib.robotparser import RobotFileParser 9 | 10 | import aiohttp 11 | from cachetools import TTLCache 12 | from pydantic import BaseModel 13 | from typing_extensions import override 14 | 15 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 16 | from sneakpeek.scraper.model import Request 17 | 18 | 19 | class RobotsTxtViolationException(Exception): 20 | """Exception which is raised if request is disallowed by website robots.txt""" 21 | 22 | pass 23 | 24 | 25 | class RobotsTxtViolationStrategy(Enum): 26 | """What to do if the request is disallowed by website robots.txt""" 27 | 28 | LOG = auto() #: Only log violation 29 | THROW = auto() #: Raise an exception on vioalation 30 | 31 | 32 | class RobotsTxtMiddlewareConfig(BaseModel): 33 | """robots.txt middleware configuration""" 34 | 35 | violation_strategy: RobotsTxtViolationStrategy = RobotsTxtViolationStrategy.LOG 36 | 37 | 38 | class RobotsTxtMiddleware(BaseMiddleware): 39 | """Robots.txt middleware can log and optionally block requests if they are disallowed by website robots.txt.""" 40 | 41 | def __init__(self, default_config: RobotsTxtMiddlewareConfig | None = None) -> None: 42 | self._default_config = default_config or RobotsTxtMiddlewareConfig() 43 | self._logger = logging.getLogger(__name__) 44 | self._cache = TTLCache( 45 | maxsize=sys.maxsize, 46 | ttl=timedelta(hours=1).total_seconds(), 47 | ) 48 | 49 | @property 50 | def name(self) -> str: 51 | return "robots_txt" 52 | 53 | def _extract_host(self, url: str) -> str: 54 | return urlparse(url).hostname.replace("www.", "") 55 | 56 | async def _get_robots_txt_by_url(self, url: str) -> RobotFileParser | None: 57 | async with aiohttp.ClientSession() as session: 58 | response = await session.get(url) 59 | if response.status != 200: 60 | return None 61 | contents = await response.text() 62 | rfp = RobotFileParser() 63 | rfp.parse(contents.split("\n")) 64 | return rfp 65 | 66 | async def _load_robots_txt(self, host: str) -> RobotFileParser | None: 67 | if cached := self._cache.get(host): 68 | return cached 69 | for scheme in ["http", "https"]: 70 | for host_prefix in ["", "www."]: 71 | try: 72 | robots_txt = await self._get_robots_txt_by_url( 73 | f"{scheme}://{host_prefix}{host}/robots.txt" 74 | ) 75 | self._cache[host] = robots_txt 76 | if robots_txt: 77 | return robots_txt 78 | except Exception as e: 79 | self._logger.error(f"Failed to get robots.txt for {host}: {e}") 80 | self._logger.debug( 81 | f"Failed to get robots.txt for {host}. Traceback: {format_exc()}" 82 | ) 83 | return None 84 | 85 | @override 86 | async def on_request( 87 | self, 88 | request: Request, 89 | config: Any | None, 90 | ) -> Request: 91 | config = parse_config_from_obj( 92 | config, 93 | self.name, 94 | RobotsTxtMiddlewareConfig, 95 | self._default_config, 96 | ) 97 | host = self._extract_host(request.url) 98 | robots_txt = await self._load_robots_txt(host) 99 | if not robots_txt: 100 | self._logger.debug( 101 | f"No robots.txt was retrieved for {request.url}. 
Defaulting to allow" 102 | ) 103 | return request 104 | 105 | user_agent = (request.headers or {}).get("User-Agent") 106 | if not user_agent: 107 | self._logger.debug( 108 | f"User-Agent is empty while requesting {request.url}. Defaulting to '*'" 109 | ) 110 | user_agent = "*" 111 | 112 | if not robots_txt.can_fetch(user_agent, request.url): 113 | error_message = f"robots.txt prohibits requesting {request.url}" 114 | if config.violation_strategy == RobotsTxtViolationStrategy.THROW: 115 | raise RobotsTxtViolationException(error_message) 116 | self._logger.error( 117 | f"{error_message}. Proceeding because strategy is {config.violation_strategy}" 118 | ) 119 | 120 | return request 121 | -------------------------------------------------------------------------------- /sneakpeek/queue/queue.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime, timedelta 3 | 4 | from typing_extensions import override 5 | 6 | from sneakpeek.metrics import count_invocations, measure_latency 7 | from sneakpeek.queue.model import ( 8 | EnqueueTaskRequest, 9 | QueueABC, 10 | QueueStorageABC, 11 | Task, 12 | TaskHasActiveRunError, 13 | TaskPingFinishedError, 14 | TaskPingNotStartedError, 15 | TaskStatus, 16 | ) 17 | 18 | DEFAULT_DEAD_TIMEOUT = timedelta(minutes=5) 19 | 20 | 21 | class Queue(QueueABC): 22 | """Queue implementation""" 23 | 24 | def __init__( 25 | self, 26 | storage: QueueStorageABC, 27 | dead_task_timeout: timedelta = DEFAULT_DEAD_TIMEOUT, 28 | ) -> None: 29 | self.logger = logging.getLogger(__name__) 30 | self.storage = storage 31 | self.dead_task_timeout = dead_task_timeout 32 | 33 | @count_invocations(subsystem="queue") 34 | @measure_latency(subsystem="queue") 35 | @override 36 | async def enqueue(self, request: EnqueueTaskRequest) -> Task: 37 | existing_tasks = await self.storage.get_task_instances(request.task_name) 38 | if any( 39 | t 40 | for t in existing_tasks 41 | if t.status in (TaskStatus.STARTED, TaskStatus.PENDING) 42 | ): 43 | raise TaskHasActiveRunError() 44 | task = Task( 45 | id=0, 46 | task_name=request.task_name, 47 | task_handler=request.task_handler, 48 | status=TaskStatus.PENDING, 49 | created_at=datetime.utcnow(), 50 | payload=request.payload, 51 | priority=request.priority, 52 | timeout=request.timeout, 53 | ) 54 | return await self.storage.enqueue_task(task) 55 | 56 | @count_invocations(subsystem="queue") 57 | @measure_latency(subsystem="queue") 58 | @override 59 | async def dequeue(self) -> Task | None: 60 | return await self.storage.dequeue_task() 61 | 62 | @count_invocations(subsystem="queue") 63 | @measure_latency(subsystem="queue") 64 | @override 65 | async def get_queue_len(self) -> int: 66 | return await self.storage.get_queue_len() 67 | 68 | @count_invocations(subsystem="queue") 69 | @measure_latency(subsystem="queue") 70 | @override 71 | async def ping_task(self, id: int) -> Task: 72 | task = await self.storage.get_task_instance(id) 73 | if task.status == TaskStatus.PENDING: 74 | raise TaskPingNotStartedError() 75 | if task.status != TaskStatus.STARTED: 76 | raise TaskPingFinishedError() 77 | task.last_active_at = datetime.utcnow() 78 | return await self.storage.update_task(task) 79 | 80 | @count_invocations(subsystem="queue") 81 | @measure_latency(subsystem="queue") 82 | @override 83 | async def kill_dead_tasks(self) -> list[Task]: 84 | tasks = await self.storage.get_tasks() 85 | killed = [] 86 | for task in tasks: 87 | if self._is_task_dead(task): 88 | task.status = TaskStatus.DEAD 
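                # the task was started but hasn't pinged within dead_task_timeout;
                # mark it finished so it no longer counts as an active run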
89 |                 task.finished_at = datetime.utcnow()
90 |                 killed.append(await self.storage.update_task(task))
91 |         return killed
92 | 
93 |     def _is_task_dead(self, task: Task) -> bool:
94 |         if task.status != TaskStatus.STARTED:
95 |             return False
96 |         activity_timestamps = [
97 |             task.last_active_at,
98 |             task.started_at,
99 |             task.created_at,
100 |         ]
101 |         for ts in activity_timestamps:
102 |             if ts and datetime.utcnow() - ts > self.dead_task_timeout:
103 |                 return True
104 |         return False
105 | 
106 |     @count_invocations(subsystem="queue")
107 |     @measure_latency(subsystem="queue")
108 |     @override
109 |     async def delete_old_tasks(self, keep_last: int = 50) -> None:
110 |         await self.storage.delete_old_tasks(keep_last)
111 | 
112 |     @count_invocations(subsystem="queue")
113 |     @measure_latency(subsystem="queue")
114 |     @override
115 |     async def update_task(self, task: Task) -> Task:
116 |         return await self.storage.update_task(task)
117 | 
118 |     @count_invocations(subsystem="queue")
119 |     @measure_latency(subsystem="queue")
120 |     @override
121 |     async def get_task_instances(self, task_name: str) -> list[Task]:
122 |         return await self.storage.get_task_instances(task_name)
123 | 
124 |     @count_invocations(subsystem="queue")
125 |     @measure_latency(subsystem="queue")
126 |     @override
127 |     async def get_task_instance(self, task_id: int) -> Task:
128 |         return await self.storage.get_task_instance(task_id)
129 | 
-------------------------------------------------------------------------------- /sneakpeek/queue/tests/test_queue.py: --------------------------------------------------------------------------------
1 | import asyncio
2 | 
3 | import pytest
4 | from fakeredis.aioredis import FakeRedis
5 | 
6 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage
7 | from sneakpeek.queue.model import (
8 |     EnqueueTaskRequest,
9 |     QueueABC,
10 |     QueueStorageABC,
11 |     TaskHasActiveRunError,
12 |     TaskPriority,
13 | )
14 | from sneakpeek.queue.queue import Queue
15 | from sneakpeek.queue.redis_storage import RedisQueueStorage
16 | 
17 | 
18 | @pytest.fixture
19 | def in_memory_storage() -> QueueStorageABC:
20 |     yield InMemoryQueueStorage()
21 | 
22 | 
23 | @pytest.fixture
24 | def redis_storage() -> QueueStorageABC:
25 |     yield RedisQueueStorage(FakeRedis())
26 | 
27 | 
28 | @pytest.fixture(
29 |     params=[
30 |         pytest.lazy_fixture(in_memory_storage.__name__),
31 |         pytest.lazy_fixture(redis_storage.__name__),
32 |     ]
33 | )
34 | def queue_storage(request) -> QueueStorageABC:
35 |     yield request.param
36 | 
37 | 
38 | @pytest.fixture
39 | def queue(queue_storage: QueueStorageABC) -> QueueABC:
40 |     yield Queue(queue_storage)
41 | 
42 | 
43 | @pytest.mark.asyncio
44 | async def test_enqueue_dequeue(queue: Queue):
45 |     request = EnqueueTaskRequest(
46 |         task_name=test_enqueue_dequeue.__name__ + ":name",
47 |         task_handler=test_enqueue_dequeue.__name__ + ":type",
48 |         priority=TaskPriority.HIGH,
49 |         payload=test_enqueue_dequeue.__name__ + ":payload",
50 |     )
51 |     enqueued = await queue.enqueue(request)
52 |     assert enqueued.id is not None
53 |     assert enqueued.task_name == request.task_name
54 |     assert enqueued.task_handler == request.task_handler
55 |     assert enqueued.priority == request.priority
56 |     assert enqueued.payload == request.payload
57 |     dequeued = await queue.dequeue()
58 |     assert dequeued is not None
59 |     assert dequeued.id == enqueued.id
60 |     assert dequeued.task_name == request.task_name
61 |     assert dequeued.task_handler == request.task_handler
62 |     assert dequeued.priority == request.priority
63 |     assert dequeued.payload == request.payload
64 | 65 | 66 | @pytest.mark.asyncio 67 | async def test_double_enqueue_forbidden(queue: Queue): 68 | request = EnqueueTaskRequest( 69 | task_name=test_double_enqueue_forbidden.__name__ + ":name", 70 | task_handler=test_double_enqueue_forbidden.__name__ + ":type", 71 | priority=TaskPriority.HIGH, 72 | payload=test_double_enqueue_forbidden.__name__ + ":payload", 73 | ) 74 | enqueued = await queue.enqueue(request) 75 | assert enqueued.id is not None 76 | assert enqueued.task_name == request.task_name 77 | with pytest.raises(TaskHasActiveRunError): 78 | await queue.enqueue(request) 79 | 80 | 81 | @pytest.mark.asyncio 82 | async def test_enqueue_count_equals_dequeue_count(queue: Queue): 83 | requests = [ 84 | EnqueueTaskRequest( 85 | task_name=f"{test_enqueue_count_equals_dequeue_count.__name__}:name:{i}", 86 | task_handler=f"{test_enqueue_count_equals_dequeue_count.__name__}:type:{i}", 87 | priority=TaskPriority.HIGH, 88 | payload=f"{test_enqueue_count_equals_dequeue_count.__name__}:payload:{i}", 89 | ) 90 | for i in range(100) 91 | ] 92 | enqueued_tasks = await asyncio.gather( 93 | *{queue.enqueue(request) for request in requests} 94 | ) 95 | assert len(enqueued_tasks) == len(requests) 96 | assert {request.task_name for request in requests} == { 97 | task.task_name for task in enqueued_tasks 98 | } 99 | 100 | dequeued = [] 101 | while task := await queue.dequeue(): 102 | dequeued.append(task) 103 | assert len(dequeued) == len(requests) 104 | assert {request.task_name for request in requests} == { 105 | task.task_name for task in dequeued 106 | } 107 | 108 | 109 | @pytest.mark.asyncio 110 | async def test_scraper_priority_queue_dequeue_order(queue: Queue): 111 | def get_enqueue_request(priority: TaskPriority): 112 | return EnqueueTaskRequest( 113 | task_name=f"{test_scraper_priority_queue_dequeue_order.__name__}:name:{priority}", 114 | task_handler=f"{test_scraper_priority_queue_dequeue_order.__name__}:type:{priority}", 115 | payload=f"{test_scraper_priority_queue_dequeue_order.__name__}:payload:{priority}", 116 | priority=priority, 117 | ) 118 | 119 | requests = [ 120 | get_enqueue_request(TaskPriority.NORMAL), 121 | get_enqueue_request(TaskPriority.HIGH), 122 | get_enqueue_request(TaskPriority.UTMOST), 123 | ] 124 | for request in requests: 125 | await queue.enqueue(request) 126 | 127 | dequeued = [] 128 | while task := await queue.dequeue(): 129 | dequeued.append(task.priority) 130 | assert dequeued == [TaskPriority.UTMOST, TaskPriority.HIGH, TaskPriority.NORMAL] 131 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/model.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from abc import ABC, abstractmethod 3 | from datetime import datetime, timedelta 4 | from enum import Enum 5 | from uuid import uuid4 6 | 7 | from pydantic import BaseModel 8 | 9 | from sneakpeek.queue.model import TaskPriority 10 | 11 | PeriodicTaskId = str 12 | 13 | 14 | def generate_id() -> PeriodicTaskId: 15 | return str(uuid4()) 16 | 17 | 18 | class TaskSchedule(str, Enum): 19 | """ 20 | Periodic task schedule options. 
Note that it's disallowed to have 2 concurrent 21 | task, so if there's an active task new one won't be scheduled 22 | """ 23 | 24 | INACTIVE = "inactive" #: Scraper won't be automatically scheduled 25 | EVERY_SECOND = "every_second" #: Scraper will be scheduled every second 26 | EVERY_MINUTE = "every_minute" #: Scraper will be scheduled every minute 27 | EVERY_HOUR = "every_hour" #: Scraper will be scheduled every hour 28 | EVERY_DAY = "every_day" #: Scraper will be scheduled every day 29 | EVERY_WEEK = "every_week" #: Scraper will be scheduled every week 30 | EVERY_MONTH = "every_month" #: Scraper will be scheduled every month 31 | CRONTAB = "crontab" #: Specify crontab when scraper should be scheduled 32 | 33 | 34 | class PeriodicTask(BaseModel): 35 | id: PeriodicTaskId #: Task unique ID 36 | name: str #: Task name - used to disallow concurrent execution of the task and to defined unique series of tasks 37 | handler: str #: Task handler name 38 | priority: TaskPriority #: Task priority 39 | payload: str #: Serialized task payload 40 | schedule: TaskSchedule #: Task Schedule 41 | schedule_crontab: str | None = None #: Task schedule crontab 42 | timeout: timedelta | None = None #: Task timeout 43 | 44 | 45 | class Lease(BaseModel): 46 | """Global lease metadata""" 47 | 48 | name: str #: Lease name (resource name to be locked) 49 | owner_id: str #: ID of the acquirer (should be the same if you already have the lease and want to prolong it) 50 | acquired: datetime #: Time when the lease was acquired 51 | acquired_until: datetime #: Time until the lease is acquired 52 | 53 | 54 | class LeaseStorageABC(ABC): 55 | """Global lease storage abstract class""" 56 | 57 | @abstractmethod 58 | async def maybe_acquire_lease( 59 | self, 60 | lease_name: str, 61 | owner_id: str, 62 | acquire_for: timedelta, 63 | ) -> Lease | None: 64 | """Try to acquire lease (global lock). 65 | 66 | Args: 67 | lease_name (str): Lease name (resource name to be locked) 68 | owner_id (str): ID of the acquirer (should be the same if you already have the lease and want to prolong it) 69 | acquire_for (timedelta): For how long lease will be acquired 70 | 71 | Returns: 72 | Lease | None: Lease metadata if it was acquired, None otherwise 73 | """ 74 | ... 75 | 76 | @abstractmethod 77 | async def release_lease(self, lease_name: str, owner_id: str) -> None: 78 | """Release lease (global lock) 79 | 80 | Args: 81 | lease_name (str): Lease name (resource name to be unlocked) 82 | owner_id (str): ID of the acquirer 83 | """ 84 | ... 85 | 86 | 87 | class PeriodicTasksStorageABC(ABC): 88 | @abstractmethod 89 | async def get_periodic_tasks(self) -> list[PeriodicTask]: 90 | ... 91 | 92 | 93 | class StaticPeriodicTasksStorage(PeriodicTasksStorageABC): 94 | def __init__(self, tasks: list[PeriodicTask]) -> None: 95 | self.tasks = tasks 96 | 97 | async def get_periodic_tasks(self) -> list[PeriodicTask]: 98 | return self.tasks 99 | 100 | 101 | class MultiPeriodicTasksStorage(PeriodicTasksStorageABC): 102 | def __init__(self, storages: list[PeriodicTasksStorageABC]) -> None: 103 | self.storages = storages 104 | 105 | async def get_periodic_tasks(self) -> list[PeriodicTask]: 106 | return sum( 107 | await asyncio.gather( 108 | *[storage.get_periodic_tasks() for storage in self.storages] 109 | ), 110 | [], 111 | ) 112 | 113 | 114 | class SchedulerABC(ABC): 115 | @abstractmethod 116 | async def enqueue_task( 117 | self, 118 | task_id: PeriodicTaskId, 119 | priority: TaskPriority, 120 | ) -> None: 121 | ... 
122 | 123 | @abstractmethod 124 | async def start_scheduling_task(self, task: PeriodicTask) -> None: 125 | ... 126 | 127 | @abstractmethod 128 | async def stop_scheduling_task(self, task: PeriodicTask) -> None: 129 | ... 130 | 131 | @abstractmethod 132 | async def update_tasks(self) -> None: 133 | ... 134 | 135 | @abstractmethod 136 | async def start(self) -> None: 137 | ... 138 | 139 | @abstractmethod 140 | async def stop(self) -> None: 141 | ... 142 | -------------------------------------------------------------------------------- /sneakpeek/queue/tests/test_consumer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | from unittest.mock import AsyncMock 4 | 5 | import pytest 6 | from fakeredis.aioredis import FakeRedis 7 | 8 | from sneakpeek.queue.consumer import Consumer 9 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage 10 | from sneakpeek.queue.model import ( 11 | EnqueueTaskRequest, 12 | QueueABC, 13 | QueueStorageABC, 14 | Task, 15 | TaskHandlerABC, 16 | TaskPriority, 17 | TaskStatus, 18 | ) 19 | from sneakpeek.queue.queue import Queue 20 | from sneakpeek.queue.redis_storage import RedisQueueStorage 21 | 22 | TEST_HANDLER_NAME = "test_handler" 23 | PING_DELAY = timedelta(milliseconds=1) 24 | 25 | 26 | class TestTaskHandler(TaskHandlerABC): 27 | def __init__(self) -> None: 28 | self.process_mock = AsyncMock() 29 | 30 | def name(self): 31 | return TEST_HANDLER_NAME 32 | 33 | async def process(self, task: Task) -> str: 34 | await self.process_mock(task.id) 35 | return task.task_name 36 | 37 | 38 | @pytest.fixture 39 | def in_memory_storage() -> QueueStorageABC: 40 | yield InMemoryQueueStorage() 41 | 42 | 43 | @pytest.fixture 44 | def redis_storage() -> QueueStorageABC: 45 | yield RedisQueueStorage(FakeRedis()) 46 | 47 | 48 | @pytest.fixture( 49 | params=[ 50 | pytest.lazy_fixture(in_memory_storage.__name__), 51 | pytest.lazy_fixture(redis_storage.__name__), 52 | ] 53 | ) 54 | def queue_storage(request) -> QueueStorageABC: 55 | yield request.param 56 | 57 | 58 | @pytest.fixture 59 | def queue(queue_storage: QueueStorageABC) -> QueueABC: 60 | yield Queue(queue_storage) 61 | 62 | 63 | @pytest.fixture 64 | def handler() -> TaskHandlerABC: 65 | yield TestTaskHandler() 66 | 67 | 68 | @pytest.fixture 69 | def consumer(queue: QueueABC, handler: TaskHandlerABC) -> Consumer: 70 | yield Consumer(queue, [handler], ping_delay=PING_DELAY) 71 | 72 | 73 | async def _wait_task_in_finished_state(queue: QueueABC, task: Task, timeout: timedelta): 74 | async def wait(task: Task): 75 | while True: 76 | task = await queue.storage.get_task_instance(task.id) 77 | if task.status not in (TaskStatus.STARTED, TaskStatus.PENDING): 78 | return 79 | await asyncio.sleep(PING_DELAY.total_seconds()) 80 | 81 | await asyncio.wait_for(wait(task), timeout=timeout.total_seconds()) 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_task_dequeues_and_succeeds( 86 | consumer: Consumer, 87 | queue: Queue, 88 | handler: TaskHandlerABC, 89 | ): 90 | request = EnqueueTaskRequest( 91 | task_name="test_task", 92 | task_handler=TEST_HANDLER_NAME, 93 | priority=TaskPriority.NORMAL, 94 | payload="payload", 95 | ) 96 | task = await queue.enqueue(request) 97 | assert await consumer.consume() 98 | await _wait_task_in_finished_state(queue, task, timedelta(seconds=2)) 99 | assert await queue.get_queue_len() == 0 100 | task = await queue.storage.get_task_instance(task.id) 101 | assert task.status == TaskStatus.SUCCEEDED 102 | 
assert task.result == task.task_name 103 | assert handler.process_mock.awaited_once_with(task.id) 104 | 105 | 106 | @pytest.mark.asyncio 107 | async def test_dequeues_and_fails( 108 | consumer: Consumer, 109 | queue: Queue, 110 | handler: TaskHandlerABC, 111 | ): 112 | handler.process_mock.side_effect = Exception() 113 | request = EnqueueTaskRequest( 114 | task_name="test_task", 115 | task_handler=TEST_HANDLER_NAME, 116 | priority=TaskPriority.NORMAL, 117 | payload="payload", 118 | ) 119 | task = await queue.enqueue(request) 120 | assert await consumer.consume() 121 | await _wait_task_in_finished_state(queue, task, timedelta(seconds=2)) 122 | assert await queue.get_queue_len() == 0 123 | task = await queue.storage.get_task_instance(task.id) 124 | assert task.status == TaskStatus.FAILED 125 | assert handler.process_mock.awaited_once_with(task.id) 126 | 127 | 128 | @pytest.mark.asyncio 129 | async def test_dequeues_and_times_out( 130 | consumer: Consumer, 131 | queue: Queue, 132 | handler: TaskHandlerABC, 133 | ): 134 | handler.process_mock.side_effect = asyncio.sleep(10) 135 | request = EnqueueTaskRequest( 136 | task_name="test_task", 137 | task_handler=TEST_HANDLER_NAME, 138 | priority=TaskPriority.NORMAL, 139 | payload="payload", 140 | timeout=timedelta(milliseconds=10), 141 | ) 142 | task = await queue.enqueue(request) 143 | assert await consumer.consume() 144 | await _wait_task_in_finished_state(queue, task, timedelta(seconds=2)) 145 | assert await queue.get_queue_len() == 0 146 | task = await queue.storage.get_task_instance(task.id) 147 | assert task.status == TaskStatus.FAILED 148 | assert handler.process_mock.awaited_once_with(task.id) 149 | -------------------------------------------------------------------------------- /sneakpeek/queue/redis_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from redis.asyncio import Redis 4 | from typing_extensions import override 5 | 6 | from sneakpeek.metrics import count_invocations, measure_latency 7 | from sneakpeek.queue.model import QueueStorageABC, Task, TaskNotFoundError 8 | 9 | DEFAULT_TASK_TTL = timedelta(days=7) 10 | SCORE_PRIORITY_BIT_OFFSET = 32 11 | 12 | 13 | class RedisQueueStorage(QueueStorageABC): 14 | """ 15 | Redis queue storage. Queue has two components: priority queue 16 | implemented by sorted set (ZADD and ZPOPMIN) and key (task name) 17 | values (set of task instances) set 18 | """ 19 | 20 | def __init__(self, redis: Redis, task_ttl: timedelta = DEFAULT_TASK_TTL) -> None: 21 | """ 22 | 23 | Args: 24 | redis (Redis): Async redis client 25 | task_ttl (timedelta): TTL of the task record in the redis. Defaults to 7 days. 
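 
        Tasks are serialized into ``task::<id>`` keys, grouped per task name in
        ``task_name::<name>`` sets, and ordered for dequeuing via the
        ``internal::queue`` sorted set.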
26 |         """
27 |         self._redis = redis
28 |         self._queue_set_name = "internal::queue"
29 |         self._task_ttl = task_ttl
30 | 
31 |     async def _generate_id(self) -> int:
32 |         return await self._redis.incr("internal::id_counter")
33 | 
34 |     def _get_task_key(self, task_id: int) -> str:
35 |         return f"task::{task_id}"
36 | 
37 |     def _get_task_name_key(self, task_name: str) -> str:
38 |         return f"task_name::{task_name}"
39 | 
40 |     def _get_task_name_from_key(self, key: str) -> str:
41 |         return key.replace("task_name::", "", 1)
42 | 
43 |     def _get_task_score(self, task: Task) -> int:
44 |         # Values in redis sorted sets with the same score are stored lexicographically
45 |         # So in order for a queue to be ordered by priority then by the ID
46 |         # we can define score as (priority << SCORE_PRIORITY_BIT_OFFSET) + id
47 |         return (task.priority << SCORE_PRIORITY_BIT_OFFSET) + task.id
48 | 
49 |     @count_invocations(subsystem="storage")
50 |     @measure_latency(subsystem="storage")
51 |     @override
52 |     async def get_tasks(self) -> list[Task]:
53 |         tasks = []
54 |         async for key in self._redis.scan_iter("task_name::*"):
55 |             tasks += await self.get_task_instances(
56 |                 self._get_task_name_from_key(key.decode())
57 |             )
58 |         return sorted(tasks, key=lambda x: x.id, reverse=True)
59 | 
60 |     @count_invocations(subsystem="storage")
61 |     @measure_latency(subsystem="storage")
62 |     @override
63 |     async def get_task_instances(self, task_name: str) -> list[Task]:
64 |         task_keys = await self._redis.smembers(self._get_task_name_key(task_name))
65 |         return sorted(
66 |             [Task.parse_raw(task) for task in await self._redis.mget(task_keys)],
67 |             key=lambda x: x.id,
68 |             reverse=True,
69 |         )
70 | 
71 |     @count_invocations(subsystem="storage")
72 |     @measure_latency(subsystem="storage")
73 |     @override
74 |     async def get_task_instance(self, id: int) -> Task:
75 |         task = await self._redis.get(self._get_task_key(id))
76 |         if task is None:
77 |             raise TaskNotFoundError()
78 |         return Task.parse_raw(task)
79 | 
80 |     @count_invocations(subsystem="storage")
81 |     @measure_latency(subsystem="storage")
82 |     @override
83 |     async def enqueue_task(self, task: Task) -> Task:
84 |         task.id = await self._generate_id()
85 |         task_key = self._get_task_key(task.id)
86 |         pipe = self._redis.pipeline()
87 |         pipe.set(task_key, task.json(), ex=self._task_ttl)
88 |         pipe.sadd(self._get_task_name_key(task.task_name), task_key)
89 |         pipe.zadd(self._queue_set_name, {task_key: self._get_task_score(task)})
90 |         await pipe.execute()
91 |         return task
92 | 
93 |     @count_invocations(subsystem="storage")
94 |     @measure_latency(subsystem="storage")
95 |     @override
96 |     async def update_task(self, task: Task) -> Task:
97 |         task_key = self._get_task_key(task.id)
98 |         await self._redis.set(task_key, task.json(), ex=self._task_ttl, xx=True)
99 |         return task
100 | 
101 |     @count_invocations(subsystem="storage")
102 |     @measure_latency(subsystem="storage")
103 |     @override
104 |     async def dequeue_task(self) -> Task | None:
105 |         tasks = await self._redis.zpopmin(self._queue_set_name)
106 |         if not tasks:
107 |             return None
108 |         task_key, _ = tasks[0]
109 |         task = await self._redis.get(task_key)
110 |         if task is None:
111 |             raise TaskNotFoundError()
112 |         return Task.parse_raw(task)
113 | 
114 |     @count_invocations(subsystem="storage")
115 |     @measure_latency(subsystem="storage")
116 |     @override
117 |     async def delete_old_tasks(self, keep_last: int = 50) -> None:
118 |         async for key in self._redis.scan_iter("task_name::*"):
119 |             task_instances = sorted(
120 |                 await self.get_task_instances(
121 |                     self._get_task_name_from_key(key.decode())
122 |                 ),
123 |                 key=lambda x: x.id,
124 |                 reverse=True,
125 |             )
126 |             for task in task_instances[keep_last:]:
127 |                 task_key = self._get_task_key(task.id)
128 |                 pipe = self._redis.pipeline()
129 | 
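                # delete the task record and drop it from its task-name set in a single pipeline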
pipe.delete(task_key) 130 | pipe.srem(key, task_key) 131 | await pipe.execute() 132 | 133 | @count_invocations(subsystem="storage") 134 | @measure_latency(subsystem="storage") 135 | @override 136 | async def get_queue_len(self) -> int: 137 | return await self._redis.zcount(self._queue_set_name, 0, "+inf") 138 | -------------------------------------------------------------------------------- /sneakpeek/middleware/rate_limiter_middleware.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from asyncio import Lock 4 | from datetime import datetime, timedelta 5 | from enum import Enum, auto 6 | from random import randint 7 | from typing import Any 8 | from urllib.parse import urlparse 9 | 10 | from cachetools.func import ttl_cache 11 | from pydantic import BaseModel, validator 12 | 13 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 14 | from sneakpeek.scraper.model import Request 15 | 16 | DEFAULT_BUCKET_TIME_WINDOW = timedelta(minutes=1) 17 | 18 | 19 | def rate_limited_delay_jitter() -> timedelta: 20 | return timedelta(milliseconds=randint(0, 500)) 21 | 22 | 23 | class _LeakyBucket: 24 | def __init__( 25 | self, size: int, time_window: timedelta = DEFAULT_BUCKET_TIME_WINDOW 26 | ) -> None: 27 | self.size = size 28 | self.time_window = time_window 29 | self.queue: list[datetime] = [] 30 | self.lock = Lock() 31 | 32 | def last_used(self) -> datetime | None: 33 | if not self.queue: 34 | return None 35 | return self.queue[0] 36 | 37 | async def add(self) -> datetime | None: 38 | async with self.lock: 39 | now = datetime.utcnow() 40 | while self.queue and self.queue[0] <= now - self.time_window: 41 | self.queue.pop(0) 42 | if not self.size: 43 | raise ValueError("Queue size is 0") 44 | if len(self.queue) >= self.size: 45 | return self.queue[0] + self.time_window 46 | 47 | self.queue.append(now) 48 | return None 49 | 50 | 51 | class RateLimitedException(Exception): 52 | """Request is rate limited because too many requests were made to the host""" 53 | 54 | pass 55 | 56 | 57 | class RateLimitedStrategy(Enum): 58 | """What to do if the request is rate limited""" 59 | 60 | THROW = auto() #: Throw an exception 61 | WAIT = auto() #: Wait until request is no longer rate limited 62 | 63 | 64 | class RateLimiterMiddlewareConfig(BaseModel): 65 | """Rate limiter middleware configuration""" 66 | 67 | #: Maximum number of allowed requests per host within time window 68 | max_requests: int = 60 69 | #: What to do if the request is rate limited 70 | rate_limited_strategy: RateLimitedStrategy = RateLimitedStrategy.WAIT 71 | #: Time window to aggregate requests 72 | time_window: timedelta = DEFAULT_BUCKET_TIME_WINDOW 73 | 74 | @validator("max_requests") 75 | def check_max_requests(cls, v: int) -> int: 76 | if v <= 0: 77 | raise ValueError( 78 | f"`max_requests` must be a positive integer. Received: {v}" 79 | ) 80 | return v 81 | 82 | def __hash__(self): 83 | return hash( 84 | ( 85 | self.max_requests, 86 | self.rate_limited_strategy, 87 | self.time_window, 88 | ) 89 | ) 90 | 91 | 92 | class RateLimiterMiddleware(BaseMiddleware): 93 | """ 94 | Rate limiter implements `leaky bucket algorithm `_ 95 | to limit number of requests made to the hosts. If the request is rate limited it can either 96 | raise an exception or wait until the request won't be limited anymore. 
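    A minimal configuration sketch (the names come from the config model defined above; treat it as illustrative rather than canonical):

    .. code-block:: python3

        RateLimiterMiddleware(
            RateLimiterMiddlewareConfig(
                max_requests=60,
                time_window=timedelta(minutes=1),
                rate_limited_strategy=RateLimitedStrategy.THROW,
            )
        )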
97 | """ 98 | 99 | def __init__( 100 | self, default_config: RateLimiterMiddlewareConfig | None = None 101 | ) -> None: 102 | self._default_config = default_config or RateLimiterMiddlewareConfig() 103 | self._logger = logging.getLogger(__name__) 104 | 105 | @property 106 | def name(self) -> str: 107 | return "rate_limiter" 108 | 109 | def _extract_key(self, url: str) -> str: 110 | return urlparse(url).hostname 111 | 112 | @ttl_cache(maxsize=None, ttl=timedelta(minutes=5).total_seconds()) 113 | def _get_bucket( 114 | self, key: str, config: RateLimiterMiddlewareConfig 115 | ) -> _LeakyBucket: 116 | return _LeakyBucket( 117 | size=config.max_requests, 118 | time_window=config.time_window, 119 | ) 120 | 121 | async def _wait_for_admission( 122 | self, 123 | url: str, 124 | config: RateLimiterMiddlewareConfig, 125 | ) -> None: 126 | key = self._extract_key(url) 127 | bucket = self._get_bucket(key, config) 128 | while True: 129 | next_attempt_dt = await bucket.add() 130 | if not next_attempt_dt: 131 | return 132 | error_message = ( 133 | f"Rate limited request to '{url}' because there were " 134 | f"more than {bucket.size} calls in the last {int(bucket.time_window.total_seconds())}s " 135 | f"to the domain {key}. " 136 | f"Next available call will be permitted at {next_attempt_dt}." 137 | ) 138 | 139 | if config.rate_limited_strategy == RateLimitedStrategy.THROW: 140 | raise RateLimitedException(error_message) 141 | self._logger.info(error_message) 142 | attempt_delay = next_attempt_dt - datetime.utcnow() 143 | attempt_delay += rate_limited_delay_jitter() 144 | await asyncio.sleep(attempt_delay.total_seconds()) 145 | 146 | async def on_request( 147 | self, 148 | request: Request, 149 | config: Any | None, 150 | ) -> Request: 151 | config = parse_config_from_obj( 152 | config, 153 | self.name, 154 | RateLimiterMiddlewareConfig, 155 | self._default_config, 156 | ) 157 | await self._wait_for_admission(request.url, config) 158 | return request 159 | -------------------------------------------------------------------------------- /sneakpeek/metrics.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from functools import wraps 3 | from typing import Any 4 | 5 | from prometheus_client import Counter, Gauge, Histogram 6 | 7 | invocations_counter = Counter( 8 | name="invocations", 9 | documentation="Methods invocations counter", 10 | namespace="sneakpeek", 11 | labelnames=["subsystem", "method", "type", "error"], 12 | ) 13 | latency_histogram = Histogram( 14 | name="latency", 15 | documentation="Time spent processing method", 16 | namespace="sneakpeek", 17 | labelnames=["subsystem", "method"], 18 | ) 19 | delay_histogram = Histogram( 20 | name="delay", 21 | documentation="Execution and scheduling delay", 22 | namespace="sneakpeek", 23 | labelnames=["type"], 24 | ) 25 | replicas_gauge = Gauge( 26 | name="replicas", 27 | documentation="Number of active subsytem replicas", 28 | namespace="sneakpeek", 29 | labelnames=["type"], 30 | ) 31 | 32 | 33 | def _get_full_class_name(obj: Any) -> str: 34 | module = obj.__class__.__module__ 35 | if module is None or module == str.__class__.__module__: 36 | return obj.__class__.__name__ 37 | return module + "." + obj.__class__.__name__ 38 | 39 | 40 | def measure_latency(subsystem: str): 41 | """ 42 | Decorator for measuring latency of the function (works for both sync and async functions). 43 | 44 | .. code-block:: python3 45 | 46 | @measure_latency(subsytem="my subsystem") 47 | def my_awesome_func(): 48 | ... 
49 | 50 | 51 | This will export following Prometheus histogram metric: 52 | 53 | 54 | .. code-block:: 55 | 56 | sneakpeek_latency{subsystem="my subsystem", method="my_awesome_func"} 57 | 58 | Args: 59 | subsystem (str): Subsystem name to be used in the metric annotation 60 | """ 61 | 62 | def wrapper(func): 63 | @wraps(func) 64 | def sync_wrapper(*args, **kwargs): 65 | with latency_histogram.labels( 66 | subsystem=subsystem, method=func.__name__ 67 | ).time(): 68 | return func(*args, **kwargs) 69 | 70 | @wraps(func) 71 | async def async_wrapper(*args, **kwargs): 72 | with latency_histogram.labels( 73 | subsystem=subsystem, method=func.__name__ 74 | ).time(): 75 | return await func(*args, **kwargs) 76 | 77 | return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper 78 | 79 | return wrapper 80 | 81 | 82 | def count_invocations(subsystem: str): 83 | """ 84 | Decorator for measuring number of function invocations (works for both sync and async functions). 85 | 86 | .. code-block:: python3 87 | 88 | @count_invocations(subsytem="my subsystem") 89 | def my_awesome_func(): 90 | ... 91 | 92 | 93 | This will export following Prometheus counter metrics: 94 | 95 | 96 | .. code-block:: 97 | 98 | # Total number of invocations 99 | sneakpeek_invocations{subsystem="my subsystem", method="my_awesome_func", type="total", error=""} 100 | # Total number of successful invocations (ones that haven't thrown an exception) 101 | sneakpeek_invocations{subsystem="my subsystem", method="my_awesome_func", type="success", error=""} 102 | # Total number of failed invocations (ones that have thrown an exception) 103 | sneakpeek_invocations{subsystem="my subsystem", method="my_awesome_func", type="error", error=""} 104 | 105 | Args: 106 | subsystem (str): Subsystem name to be used in the metric annotation 107 | """ 108 | 109 | def wrapper(func): 110 | @wraps(func) 111 | def sync_wrapper(*args, **kwargs): 112 | invocations_counter.labels( 113 | subsystem=subsystem, 114 | method=func.__name__, 115 | type="total", 116 | error="", 117 | ).inc() 118 | try: 119 | result = func(*args, **kwargs) 120 | invocations_counter.labels( 121 | subsystem=subsystem, 122 | method=func.__name__, 123 | type="success", 124 | error="", 125 | ).inc() 126 | return result 127 | except Exception as e: 128 | invocations_counter.labels( 129 | subsystem=subsystem, 130 | method=func.__name__, 131 | type="error", 132 | error=_get_full_class_name(e), 133 | ).inc() 134 | raise 135 | 136 | @wraps(func) 137 | async def async_wrapper(*args, **kwargs): 138 | invocations_counter.labels( 139 | subsystem=subsystem, 140 | method=func.__name__, 141 | type="total", 142 | error="", 143 | ).inc() 144 | try: 145 | result = await func(*args, **kwargs) 146 | invocations_counter.labels( 147 | subsystem=subsystem, 148 | method=func.__name__, 149 | type="success", 150 | error="", 151 | ).inc() 152 | return result 153 | except Exception as e: 154 | invocations_counter.labels( 155 | subsystem=subsystem, 156 | method=func.__name__, 157 | type="error", 158 | error=_get_full_class_name(e), 159 | ).inc() 160 | raise 161 | 162 | return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper 163 | 164 | return wrapper 165 | -------------------------------------------------------------------------------- /docs/quick_start.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Quick start 3 | ################# 4 | 5 | So you want to create a new scraper, first you need to make sure you have 
installed **Sneakpeek**: 6 | 7 | .. code-block:: bash 8 | 9 | pip install sneakpeek-py 10 | 11 | The next step would be implementing scraper logic (or so called scraper handler): 12 | 13 | .. code-block:: python3 14 | 15 | # file: demo_scraper.py 16 | 17 | import json 18 | import logging 19 | 20 | from pydantic import BaseModel 21 | 22 | from sneakpeek.scraper.model import ScraperContextABC, ScraperHandler 23 | 24 | 25 | # This defines model of handler parameters that are defined 26 | # in the scraper config and then passed to the handler 27 | class DemoScraperParams(BaseModel): 28 | url: str 29 | 30 | # This is a class which actually implements logic 31 | # Note that you need to inherit the implementation from 32 | # the `sneakpeek.scraper_handler.ScraperHandler` 33 | class DemoScraper(ScraperHandler): 34 | # You can have any dependencies you want and pass them 35 | # in the server configuration 36 | def __init__(self) -> None: 37 | self._logger = logging.getLogger(__name__) 38 | 39 | # Each handler must define its name so it later 40 | # can be referenced in scrapers' configuration 41 | @property 42 | def name(self) -> str: 43 | return "demo_scraper" 44 | 45 | # Some example function that processes the response 46 | # and extracts valuable information 47 | async def process_page(self, response: str): 48 | ... 49 | 50 | # This function is called by the worker to execute the logic 51 | # The only argument that is passed is `sneakpeek.scraper_context.ScraperContext` 52 | # It implements basic async HTTP client and also provides parameters 53 | # that are defined in the scraper config 54 | async def run(self, context: ScraperContextABC) -> str: 55 | params = DemoScraperParams.parse_obj(context.params) 56 | # Perform GET request to the URL defined in the scraper config 57 | response = await context.get(params.url) 58 | response_body = await response.text() 59 | 60 | # Perform some business logic on a response 61 | result = await self.process_page(response_body) 62 | 63 | # Return meaningful job summary - must return a string 64 | return json.dumps({ 65 | "processed_urls": 1, 66 | "found_results": len(result), 67 | }) 68 | 69 | 70 | Now that we have some scraper logic, let's make it run periodically. 71 | To do so let's configure **SneakpeekServer**: 72 | 73 | .. 
code-block:: python3 74 | 75 | # file: main.py 76 | 77 | import random 78 | from uuid import uuid4 79 | 80 | from demo.demo_scraper import DemoScraper 81 | from sneakpeek.logging import configure_logging 82 | from sneakpeek.middleware.parser import ParserMiddleware 83 | from sneakpeek.middleware.rate_limiter_middleware import ( 84 | RateLimiterMiddleware, 85 | RateLimiterMiddlewareConfig, 86 | ) 87 | from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware 88 | from sneakpeek.middleware.robots_txt_middleware import RobotsTxtMiddleware 89 | from sneakpeek.middleware.user_agent_injecter_middleware import ( 90 | UserAgentInjecterMiddleware, 91 | UserAgentInjecterMiddlewareConfig, 92 | ) 93 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage 94 | from sneakpeek.queue.model import TaskPriority 95 | from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage 96 | from sneakpeek.scheduler.model import TaskSchedule 97 | from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage 98 | from sneakpeek.scraper.model import Scraper 99 | from sneakpeek.server import SneakpeekServer 100 | 101 | 102 | def get_server(urls: list[str], is_read_only: bool) -> SneakpeekServer: 103 | handler = DemoScraper() 104 | return SneakpeekServer.create( 105 | handlers=[handler], 106 | scraper_storage=InMemoryScraperStorage([ 107 | Scraper( 108 | id=str(uuid4()), 109 | name=f"Demo Scraper", 110 | schedule=TaskSchedule.EVERY_MINUTE, 111 | handler=handler.name, 112 | config=ScraperConfig(params={"start_url": "http://example.com"}), 113 | schedule_priority=TaskPriority.NORMAL, 114 | ) 115 | ]), 116 | queue_storage=InMemoryQueueStorage(), 117 | lease_storage=InMemoryLeaseStorage(), 118 | middlewares=[ 119 | RequestsLoggingMiddleware(), 120 | RobotsTxtMiddleware(), 121 | RateLimiterMiddleware(RateLimiterMiddlewareConfig(max_rpm=60)), 122 | UserAgentInjecterMiddleware( 123 | UserAgentInjecterMiddlewareConfig(use_external_data=False) 124 | ), 125 | ParserMiddleware(), 126 | ], 127 | ) 128 | 129 | 130 | def main(): 131 | args = parser.parse_args() 132 | server = get_server(args.urls, args.read_only) 133 | configure_logging() 134 | server.serve() 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | 140 | 141 | 142 | Now, the only thing is left is to actually run the server: 143 | 144 | .. code-block:: bash 145 | 146 | python3 run main.py 147 | 148 | That's it! Now you can open http://localhost:8080 and explore the UI to see 149 | how you scraper is being automatically scheduled and executed. 150 | -------------------------------------------------------------------------------- /front/src/components/ScraperIdeComponent.vue: -------------------------------------------------------------------------------- 1 | 34 | 181 | 192 | -------------------------------------------------------------------------------- /docs/middleware/new_middleware.rst: -------------------------------------------------------------------------------- 1 | ################################ 2 | Implementing your own middleware 3 | ################################ 4 | 5 | The interface for middleware is defined in :py:class:`Middleware `. 6 | There are 3 ways how middleware can be used: 7 | 1. Perform custom logic before request is processed (implement `on_request` method) 8 | 2. Perform custom logic before response is returned to the scraper logic (implement `on_response` method) 9 | 3. 
Provide some additional functionality a for the scraper implementation - scraper can call any middleware method using :py:class:`ScraperContext `. Each middleware is added as an attribute to the passed context, so you can call it like :code:`context..(...)` 10 | 11 | 12 | ===================================== 13 | Middleware implementation example 14 | ===================================== 15 | 16 | ----------------------- 17 | On request middleware 18 | ----------------------- 19 | Each request is wrapped in the :py:class:`Request ` class 20 | and you can modify its parameters before it's dispatched, here's the schema: 21 | 22 | .. code-block:: python3 23 | 24 | @dataclass 25 | class Request: 26 | method: HttpMethod 27 | url: str 28 | headers: HttpHeaders | None = None 29 | kwargs: dict[str, Any] | None = None 30 | 31 | Here's the example of the middleware which logs each request URL: 32 | 33 | .. code-block:: python3 34 | 35 | import logging 36 | from typing import Any 37 | 38 | import aiohttp 39 | from pydantic import BaseModel 40 | 41 | from sneakpeek.middlewares.utils import parse_config_from_obj 42 | from sneakpeek.scraper.model import Middleware, Request 43 | 44 | 45 | # Each middleware can be configured, its configuration can be 46 | # set globally for all requests or it can be overriden for 47 | # specific scrapers 48 | class MyLoggingMiddlewareConfig(BaseModel): 49 | some_param: str = "defaul value" 50 | 51 | class MyMiddleware(BeforeRequestMiddleware): 52 | """Middleware description""" 53 | 54 | def __init__(self, default_config: MyLoggingMiddlewareConfig | None = None) -> None: 55 | self._default_config = default_config or MyLoggingMiddlewareConfig() 56 | self._logger = logging.getLogger(__name__) 57 | 58 | # The name property is mandatory, it's used in scraper config to override 59 | # middleware configuration for the given scraper 60 | @property 61 | def name(self) -> str: 62 | return "my_middleware" 63 | 64 | async def on_request(self, request: Request, config: Any | None) -> Request: 65 | # This converts freeform dictionary into a typed config (it's optional) 66 | config = parse_config_from_obj( 67 | config, 68 | self.name, 69 | MyLoggingMiddlewareConfig, 70 | self._default_config, 71 | ) 72 | self._logger.info(f"Making {request.method.upper()} to {request.url}. {config.some_param}") 73 | return request 74 | 75 | 76 | 77 | ----------------------- 78 | On response middleware 79 | ----------------------- 80 | 81 | On response method recieves both request and response objects. Response is `aiohttp.ClientResponse `_ object. 82 | 83 | 84 | Here's the example of the middleware which logs each response body: 85 | 86 | .. 
code-block:: python3 87 | 88 | import logging 89 | from typing import Any 90 | 91 | import aiohttp 92 | from pydantic import BaseModel 93 | 94 | from sneakpeek.middleware.base import parse_config_from_obj 95 | from sneakpeek.scraper.model import Middleware, Request 96 | 97 | 98 | # Each middleware can be configured, its configuration can be 99 | # set globally for all requests or it can be overriden for 100 | # specific scrapers 101 | class MyLoggingMiddlewareConfig(BaseModel): 102 | some_param: str = "defaul value" 103 | 104 | 105 | class MyOnResponseMiddleware(Middleware): 106 | """Middleware description""" 107 | 108 | def __init__(self, default_config: MyLoggingMiddlewareConfig | None = None) -> None: 109 | self._default_config = default_config or MyLoggingMiddlewareConfig() 110 | self._logger = logging.getLogger(__name__) 111 | 112 | # The name property is mandatory, it's used in scraper config to override 113 | # middleware configuration for the given scraper 114 | @property 115 | def name(self) -> str: 116 | return "my_middleware" 117 | 118 | async def on_response( 119 | self, 120 | request: Request, 121 | response: aiohttp.ClientResponse, 122 | config: Any | None, 123 | ) -> aiohttp.ClientResponse: 124 | config = parse_config_from_obj( 125 | config, 126 | self.name, 127 | MyLoggingMiddlewareConfig, 128 | self._default_config, 129 | ) 130 | response_body = await response.text() 131 | self._logger.info(f"Made {request.method.upper()} request to {request.url} - received: status={response.status} body={response_body}") 132 | return response 133 | 134 | ------------------------ 135 | Functional middleware 136 | ------------------------ 137 | 138 | If the middleware doesn't need to interact with the request or response you can derive it 139 | from :py:class:`BaseMiddleware `, so that both 140 | `on_request` and `on_response` method are implemented as pass-through. 141 | 142 | Here's an example of such implementation 143 | 144 | .. 
code-block:: python3 145 | 146 | import logging 147 | from typing import Any 148 | 149 | from sneakpeek.middleware.base import parse_config_from_obj, BaseMiddleware 150 | 151 | 152 | class MyFunctionalMiddleware(BaseMiddleware): 153 | """Middleware description""" 154 | 155 | def __init__(self) -> None: 156 | self._logger = logging.getLogger(__name__) 157 | 158 | # The name property is mandatory, it's used in scraper config to override 159 | # middleware configuration for the given scraper 160 | @property 161 | def name(self) -> str: 162 | return "my_middleware" 163 | 164 | # This function will be available for scrapers by using 165 | # `context.my_middleware.custom_funct(some_arg)` 166 | def custom_func(self, arg1: Any) -> Any: 167 | return do_something(arg1) 168 | 169 | -------------------------------------------------------------------------------- /front/src/layouts/MainLayout.vue: -------------------------------------------------------------------------------- 1 | 102 | 103 | 120 | -------------------------------------------------------------------------------- /front/src/components/ScraperJobs.vue: -------------------------------------------------------------------------------- 1 | 70 | 71 | 174 | 184 | -------------------------------------------------------------------------------- /sneakpeek/queue/consumer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from datetime import datetime, timedelta 4 | from traceback import format_exc 5 | 6 | from prometheus_client import Counter 7 | 8 | from sneakpeek.logging import task_context 9 | from sneakpeek.metrics import ( 10 | count_invocations, 11 | delay_histogram, 12 | measure_latency, 13 | replicas_gauge, 14 | ) 15 | from sneakpeek.queue.model import ( 16 | QueueABC, 17 | Task, 18 | TaskHandlerABC, 19 | TaskPingFinishedError, 20 | TaskStatus, 21 | TaskTimedOut, 22 | UnknownTaskHandlerError, 23 | ) 24 | 25 | POLL_DELAY = timedelta(milliseconds=100) 26 | TASK_PING_DELAY = timedelta(seconds=1) 27 | 28 | 29 | task_executed = Counter( 30 | name="task_executed", 31 | documentation="Tasks executed", 32 | namespace="sneakpeek", 33 | labelnames=["handler", "name", "status"], 34 | ) 35 | 36 | 37 | class Consumer: 38 | """ 39 | Generic queue consumer implementation 40 | """ 41 | 42 | def __init__( 43 | self, 44 | queue: QueueABC, 45 | handlers: list[TaskHandlerABC], 46 | loop: asyncio.AbstractEventLoop | None = None, 47 | max_concurrency: int = 50, 48 | poll_delay: timedelta = POLL_DELAY, 49 | ping_delay: timedelta = TASK_PING_DELAY, 50 | ) -> None: 51 | """ 52 | Args: 53 | queue (QueueABC): Queue implementation 54 | handlers (list[TaskHandlerABC]): List of the task handlers 55 | loop (asyncio.AbstractEventLoop | None, optional): asyncio loop. Defaults to asyncio.get_event_loop(). 56 | max_concurrency (int, optional): Maximum number of concurrent tasks that a consumer can handle. Defaults to 50. 57 | poll_delay (timedelta, optional): Delay between queue polling in case there are no items in the queue. Defaults to POLL_DELAY. 58 | ping_delay (timedelta, optional): Task heartbeat frequency. Defaults to TASK_PING_DELAY. 
59 | """ 60 | self.logger = logging.getLogger(__name__) 61 | self.queue = queue 62 | self.handlers = {handler.name(): handler for handler in handlers} 63 | self.max_concurrency = max_concurrency 64 | self.active: set[asyncio.Task] = set() 65 | self.loop = loop or asyncio.get_event_loop() 66 | self.ping_delay = ping_delay.total_seconds() 67 | self.poll_delay = poll_delay.total_seconds() 68 | self.running = False 69 | self.cycle_task: asyncio.Task | None = None 70 | 71 | async def _handle_task(self, handler: TaskHandlerABC, task: Task) -> str: 72 | with task_context(task): 73 | return await handler.process(task) 74 | 75 | @count_invocations(subsystem="consumer") 76 | async def process_task(self, task: Task) -> None: 77 | """Process dequeued task 78 | 79 | Args: 80 | task (Task): Dequeued Task 81 | 82 | Raises: 83 | UnknownTaskHandlerError: Raised when there's no handler for given task type 84 | TaskTimedOut: Raised when a task has exceeded maximum process time 85 | """ 86 | delay_histogram.labels(type="time_spent_in_queue").observe( 87 | (datetime.utcnow() - task.created_at).total_seconds() 88 | ) 89 | handler_task: asyncio.Task | None = None 90 | self.logger.info(f"Executing task id={task.id}") 91 | try: 92 | task.started_at = datetime.utcnow() 93 | task.status = TaskStatus.STARTED 94 | task = await self.queue.update_task(task) 95 | 96 | if task.task_handler not in self.handlers: 97 | raise UnknownTaskHandlerError(task.task_handler) 98 | handler = self.handlers[task.task_handler] 99 | handler_task = self.loop.create_task(self._handle_task(handler, task)) 100 | 101 | while not handler_task.done(): 102 | if task.timeout and datetime.utcnow() - task.started_at > task.timeout: 103 | raise TaskTimedOut() 104 | task = await self.queue.ping_task(task.id) 105 | await asyncio.sleep(self.ping_delay) 106 | 107 | result = handler_task.result() 108 | task.finished_at = datetime.utcnow() 109 | task.status = TaskStatus.SUCCEEDED 110 | task.result = result 111 | self.logger.info(f"Successfully executed task id={task.id}") 112 | except TaskPingFinishedError: 113 | if handler_task and not handler_task.done(): 114 | handler_task.cancel() 115 | self.logger.exception(f"Seems like task {task.id} was killed") 116 | except Exception: 117 | if handler_task and not handler_task.done(): 118 | handler_task.cancel() 119 | self.logger.exception(f"Failed to execute {task.id}") 120 | task.finished_at = datetime.utcnow() 121 | task.status = TaskStatus.FAILED 122 | task.result = format_exc() 123 | finally: 124 | try: 125 | task = await self.queue.update_task(task) 126 | task_executed.labels( 127 | handler=task.task_handler, 128 | name=task.task_name, 129 | status=task.status.name.lower(), 130 | ) 131 | except Exception: 132 | self.logger.exception(f"Failed to update task {task.id}") 133 | 134 | @measure_latency(subsystem="consumer") 135 | @count_invocations(subsystem="consumer") 136 | async def consume(self) -> bool: 137 | """Consume from the queue 138 | 139 | Returns: 140 | bool: True if anything has been consumed, False otherwise 141 | """ 142 | replicas_gauge.labels(type="active_tasks").set(len(self.active)) 143 | if len(self.active) >= self.max_concurrency: 144 | self.logger.debug( 145 | f"Not dequeuing any tasks because worker has reached max concurrency," 146 | f" there are {len(self.active)} of active tasks" 147 | ) 148 | return False 149 | 150 | dequeued = await self.queue.dequeue() 151 | if not dequeued: 152 | self.logger.debug("No pending tasks in the queue") 153 | return False 154 | 155 | 
self.logger.info(f"Dequeued a task id={dequeued.id}") 156 | task_handle = self.loop.create_task(self.process_task(dequeued)) 157 | self.active.add(task_handle) 158 | task_handle.add_done_callback(self.active.discard) 159 | return True 160 | 161 | async def _cycle(self): 162 | while self.running: 163 | if not await self.consume(): 164 | await asyncio.sleep(self.poll_delay) 165 | 166 | def start(self): 167 | """Start consuming from the queue""" 168 | self.running = True 169 | self.cycle_task = self.loop.create_task(self._cycle()) 170 | 171 | def stop(self): 172 | """Stop consuming from the queue""" 173 | self.running = False 174 | if self.cycle_task: 175 | self.cycle_task.cancel() 176 | --------------------------------------------------------------------------------