├── front ├── src │ ├── boot │ │ └── .gitkeep │ ├── css │ │ ├── app.scss │ │ └── quasar.variables.scss │ ├── App.vue │ ├── components │ │ ├── ScheduleChip.vue │ │ ├── ScraperJobStatusChip.vue │ │ ├── PriorityChip.vue │ │ ├── ScraperCard.vue │ │ ├── TaskLogs.vue │ │ ├── ScraperIdeComponent.vue │ │ └── ScraperJobs.vue │ ├── pages │ │ ├── NewScraperPage.vue │ │ ├── ScraperPage.vue │ │ ├── ErrorNotFound.vue │ │ ├── ScrapersPage.vue │ │ └── ScraperIde.vue │ ├── router │ │ ├── routes.js │ │ └── index.js │ ├── assets │ │ └── logo.svg │ ├── api.js │ └── layouts │ │ └── MainLayout.vue ├── .npmrc ├── .eslintignore ├── public │ └── icons │ │ └── favicon.png ├── .editorconfig ├── .vscode │ ├── extensions.json │ └── settings.json ├── .gitignore ├── README.md ├── index.html ├── jsconfig.json ├── postcss.config.js ├── package.json ├── quasar.config.js └── .eslintrc.js ├── sneakpeek ├── static │ ├── .gitkeep │ └── docs │ │ └── .gitkeep ├── session_loggers │ ├── base.py │ ├── redis_logger.py │ └── file_logger.py ├── middleware │ ├── parser.py │ ├── base.py │ ├── proxy_middleware.py │ ├── user_agent_injecter_middleware.py │ ├── requests_logging_middleware.py │ ├── robots_txt_middleware.py │ └── rate_limiter_middleware.py ├── scraper │ ├── task_handler.py │ ├── ephemeral_scraper_task_handler.py │ ├── dynamic_scraper_handler.py │ ├── in_memory_storage.py │ ├── redis_storage.py │ ├── tests │ │ ├── test_dynamic_scraper_handler.py │ │ └── test_scraper_storage.py │ └── runner.py ├── scheduler │ ├── redis_lease_storage.py │ ├── in_memory_lease_storage.py │ ├── tests │ │ └── test_lease_storage.py │ └── model.py ├── queue │ ├── tasks.py │ ├── tests │ │ ├── test_queue_storage.py │ │ ├── test_queue.py │ │ └── test_consumer.py │ ├── in_memory_storage.py │ ├── queue.py │ ├── redis_storage.py │ └── consumer.py ├── logging.py ├── tests │ └── test_metrics.py └── metrics.py ├── .gitattributes ├── .flake8 ├── .gitignore ├── .coveragerc ├── .vscode ├── extensions.json ├── settings.json └── launch.json ├── docs ├── middleware │ ├── index.rst │ ├── requests_logging_middleware.rst │ ├── robots_txt_middleware.rst │ ├── proxy_middleware.rst │ ├── user_agent_injecter_middleware.rst │ ├── rate_limiter_middleware.rst │ └── new_middleware.rst ├── Makefile ├── make.bat ├── index.rst ├── api.rst ├── deployment.rst ├── local_debugging.rst ├── conf.py ├── design.rst └── quick_start.rst ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md └── workflows │ └── ci.yml ├── CONTRIBUTING.md ├── LICENCE ├── README.md ├── Makefile └── pyproject.toml /front/src/boot/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sneakpeek/static/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sneakpeek/static/docs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | sneakpeek/static/** linguist-documentation -------------------------------------------------------------------------------- /front/src/css/app.scss: -------------------------------------------------------------------------------- 1 | // app global css in SCSS form 2 | 
-------------------------------------------------------------------------------- /front/.npmrc: -------------------------------------------------------------------------------- 1 | # pnpm-related options 2 | shamefully-hoist=true 3 | strict-peer-dependencies=false 4 | -------------------------------------------------------------------------------- /front/.eslintignore: -------------------------------------------------------------------------------- 1 | /dist 2 | /src-capacitor 3 | /src-cordova 4 | /.quasar 5 | /node_modules 6 | .eslintrc.js 7 | -------------------------------------------------------------------------------- /front/public/icons/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flulemon/sneakpeek/HEAD/front/public/icons/favicon.png -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, W503 3 | exclude = 4 | __pycache__, 5 | .eggs, 6 | .git, 7 | .tox, 8 | .nox, 9 | build, 10 | dist, 11 | src/test/python_tests/test_data -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/* 2 | .venv 3 | *.install.stamp 4 | dist 5 | .dist 6 | **/.pytest_cache/* 7 | .pytest_cache/ 8 | .coverage 9 | htmlcov 10 | demo 11 | coverage.xml 12 | !/**/.gitkeep 13 | logs -------------------------------------------------------------------------------- /front/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_style = space 6 | indent_size = 2 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /front/src/App.vue: -------------------------------------------------------------------------------- 1 | 4 | 5 | 12 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | @abstractmethod 4 | @abc.abstractmethod 5 | raise AssertionError 6 | raise NotImplementedError 7 | if __name__ == .__main__.: 8 | @entrypoint.method() 9 | pragma: no cover 10 | def __repr__ 11 | if self.debug: 12 | if settings.DEBUG 13 | if 0: 14 | class .*\bProtocol\): 15 | logger. 
-------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "esbenp.prettier-vscode", 5 | "editorconfig.editorconfig", 6 | "vue.volar", 7 | "wayou.vscode-todo-highlight" 8 | ], 9 | "unwantedRecommendations": [ 10 | "octref.vetur", 11 | "hookyqr.beautify", 12 | "dbaeumer.jshint", 13 | "ms-vscode.vscode-typescript-tslint-plugin" 14 | ] 15 | } -------------------------------------------------------------------------------- /front/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "dbaeumer.vscode-eslint", 4 | "esbenp.prettier-vscode", 5 | "editorconfig.editorconfig", 6 | "vue.volar", 7 | "wayou.vscode-todo-highlight" 8 | ], 9 | "unwantedRecommendations": [ 10 | "octref.vetur", 11 | "hookyqr.beautify", 12 | "dbaeumer.jshint", 13 | "ms-vscode.vscode-typescript-tslint-plugin" 14 | ] 15 | } -------------------------------------------------------------------------------- /front/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.bracketPairColorization.enabled": true, 3 | "editor.guides.bracketPairs": true, 4 | "editor.formatOnSave": true, 5 | "editor.defaultFormatter": "esbenp.prettier-vscode", 6 | "editor.codeActionsOnSave": [ 7 | "source.fixAll.eslint" 8 | ], 9 | "eslint.validate": [ 10 | "javascript", 11 | "javascriptreact", 12 | "typescript", 13 | "vue" 14 | ] 15 | } -------------------------------------------------------------------------------- /front/src/components/ScheduleChip.vue: -------------------------------------------------------------------------------- 1 | 4 | 21 | -------------------------------------------------------------------------------- /front/src/pages/NewScraperPage.vue: -------------------------------------------------------------------------------- 1 | 10 | 11 | 19 | -------------------------------------------------------------------------------- /front/src/pages/ScraperPage.vue: -------------------------------------------------------------------------------- 1 | 8 | 9 | 19 | -------------------------------------------------------------------------------- /front/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .thumbs.db 3 | node_modules 4 | .yarn 5 | # Quasar core related directories 6 | .quasar 7 | /dist 8 | 9 | # Cordova related directories and files 10 | /src-cordova/node_modules 11 | /src-cordova/platforms 12 | /src-cordova/plugins 13 | /src-cordova/www 14 | 15 | # Capacitor related directories and files 16 | /src-capacitor/www 17 | /src-capacitor/node_modules 18 | 19 | # Log files 20 | npm-debug.log* 21 | yarn-debug.log* 22 | yarn-error.log* 23 | 24 | # Editor directories and files 25 | .idea 26 | *.suo 27 | *.ntvs* 28 | *.njsproj 29 | *.sln 30 | -------------------------------------------------------------------------------- /docs/middleware/index.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Middleware 3 | ################# 4 | 5 | **Sneakpeek** allows you to run arbitrary code before the request and after the response has been recieved. 6 | This can be helpful if you have some common logic you want to use in your scrapers. 
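For illustration, a minimal custom middleware could subclass ``BaseMiddleware`` (defined in ``sneakpeek/middleware/base.py``) and override ``on_request``; the class name and the injected header in this sketch are hypothetical, not part of the library:

.. code-block:: python3

    from typing import Any

    from sneakpeek.middleware.base import BaseMiddleware
    from sneakpeek.scraper.model import Request


    class StaticHeaderMiddleware(BaseMiddleware):
        """Illustrative middleware that adds a fixed header to every outgoing request."""

        @property
        def name(self) -> str:
            return "static_header"

        async def on_request(self, request: Request, config: Any | None) -> Request:
            # Runs before the request is sent: inject a header if it's missing
            if not request.headers:
                request.headers = {}
            request.headers.setdefault("X-Scraped-By", "sneakpeek")
            return request

The same pattern applies to ``on_response`` for post-processing responses after they have been received.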
7 | 8 | There are some plugins that are already implemented: 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | 13 | 14 | rate_limiter_middleware 15 | robots_txt_middleware 16 | user_agent_injecter_middleware 17 | proxy_middleware 18 | requests_logging_middleware 19 | new_middleware 20 | -------------------------------------------------------------------------------- /front/src/pages/ErrorNotFound.vue: -------------------------------------------------------------------------------- 1 | 16 | 17 | 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: enhancement 6 | assignees: flulemon 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /front/README.md: -------------------------------------------------------------------------------- 1 | # Sneakpeek (sneakpeek-front) 2 | 3 | A toolbox to create scrapers 4 | 5 | ## Install the dependencies 6 | ```bash 7 | yarn 8 | # or 9 | npm install 10 | ``` 11 | 12 | ### Start the app in development mode (hot-code reloading, error reporting, etc.) 13 | ```bash 14 | quasar dev 15 | ``` 16 | 17 | 18 | ### Lint the files 19 | ```bash 20 | yarn lint 21 | # or 22 | npm run lint 23 | ``` 24 | 25 | 26 | ### Format the files 27 | ```bash 28 | yarn format 29 | # or 30 | npm run format 31 | ``` 32 | 33 | 34 | 35 | ### Build the app for production 36 | ```bash 37 | quasar build 38 | ``` 39 | 40 | ### Customize the configuration 41 | See [Configuring quasar.config.js](https://v2.quasar.dev/quasar-cli-vite/quasar-config-js). 
42 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: flulemon 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Environment (please complete the following information):** 23 | - OS: [e.g. iOS] 24 | - Python version [e.g. 3.10] 25 | - Package Version [e.g. 0.1.4] 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /front/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | <%= productName %> 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /front/jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": ".", 4 | "paths": { 5 | "src/*": [ 6 | "src/*" 7 | ], 8 | "app/*": [ 9 | "*" 10 | ], 11 | "components/*": [ 12 | "src/components/*" 13 | ], 14 | "layouts/*": [ 15 | "src/layouts/*" 16 | ], 17 | "pages/*": [ 18 | "src/pages/*" 19 | ], 20 | "assets/*": [ 21 | "src/assets/*" 22 | ], 23 | "boot/*": [ 24 | "src/boot/*" 25 | ], 26 | "stores/*": [ 27 | "src/stores/*" 28 | ], 29 | "vue$": [ 30 | "node_modules/vue/dist/vue.runtime.esm-bundler.js" 31 | ] 32 | } 33 | }, 34 | "exclude": [ 35 | "dist", 36 | ".quasar", 37 | "node_modules" 38 | ] 39 | } -------------------------------------------------------------------------------- /front/src/components/ScraperJobStatusChip.vue: -------------------------------------------------------------------------------- 1 | 4 | 33 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.linting.flake8Enabled": true, 4 | "python.linting.enabled": true, 5 | "editor.formatOnSave": true, 6 | "editor.codeActionsOnSave": { 7 | "source.organizeImports": true, 8 | "source.fixAll.eslint": true 9 | }, 10 | "python.testing.pytestArgs": [], 11 | "python.testing.unittestEnabled": false, 12 | "python.testing.pytestEnabled": true, 13 | "eslint.validate": ["javascript", "javascriptreact", "typescript", "vue"], 14 | "editor.bracketPairColorization.enabled": true, 15 | "editor.guides.bracketPairs": true, 16 | "editor.defaultFormatter": "esbenp.prettier-vscode", 17 | "typescript.tsdk": "node_modules/typescript/lib", 18 | "[xml]": { 19 | "editor.defaultFormatter": "redhat.vscode-xml" 20 | }, 21 | "esbonio.sphinx.confDir": "" 22 | } 23 | -------------------------------------------------------------------------------- /front/src/router/routes.js: -------------------------------------------------------------------------------- 1 | 2 | const routes = [ 3 | { 4 | name: 'Homepage', 5 | path: '/', 6 | component: () => import('layouts/MainLayout.vue'), 7 | children: [ 
8 | { name: 'ScrapersPage', path: '', component: () => import('src/pages/ScrapersPage.vue') }, 9 | { name: 'NewScraperPage', path: 'new', component: () => import('src/pages/NewScraperPage.vue') }, 10 | { name: 'ScraperIde', path: 'ide', component: () => import('src/pages/ScraperIde.vue') }, 11 | { name: 'ScraperPage', path: 'scraper/:id', component: () => import('src/pages/ScraperPage.vue'), props: true }, 12 | ] 13 | }, 14 | 15 | // Always leave this as last one, 16 | // but you can also remove it 17 | { 18 | path: '/:catchAll(.*)*', 19 | component: () => import('pages/ErrorNotFound.vue') 20 | } 21 | ] 22 | 23 | export default routes 24 | -------------------------------------------------------------------------------- /front/src/css/quasar.variables.scss: -------------------------------------------------------------------------------- 1 | // Quasar SCSS (& Sass) Variables 2 | // -------------------------------------------------- 3 | // To customize the look and feel of this app, you can override 4 | // the Sass/SCSS variables found in Quasar's source Sass/SCSS files. 5 | 6 | // Check documentation for full list of Quasar variables 7 | 8 | // Your own variables (that are declared here) and Quasar's own 9 | // ones will be available out of the box in your .vue/.scss/.sass files 10 | 11 | // It's highly recommended to change the default colors 12 | // to match your app's branding. 13 | // Tip: Use the "Theme Builder" on Quasar's documentation website. 14 | 15 | $primary : #1976d2; 16 | $secondary : #c2c2c2; 17 | $accent : #9C27B0; 18 | 19 | $dark : #3b3535; 20 | $dark-page : #121212; 21 | 22 | $positive : #37994e; 23 | $negative : #c22b3c; 24 | $info : #add5de; 25 | $warning : #ff9b29; 26 | -------------------------------------------------------------------------------- /front/src/components/PriorityChip.vue: -------------------------------------------------------------------------------- 1 | 4 | 36 | -------------------------------------------------------------------------------- /front/postcss.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | // https://github.com/michael-ciniawsky/postcss-load-config 3 | 4 | module.exports = { 5 | plugins: [ 6 | // https://github.com/postcss/autoprefixer 7 | require('autoprefixer')({ 8 | overrideBrowserslist: [ 9 | 'last 4 Chrome versions', 10 | 'last 4 Firefox versions', 11 | 'last 4 Edge versions', 12 | 'last 4 Safari versions', 13 | 'last 4 Android versions', 14 | 'last 4 ChromeAndroid versions', 15 | 'last 4 FirefoxAndroid versions', 16 | 'last 4 iOS versions' 17 | ] 18 | }) 19 | 20 | // https://github.com/elchininet/postcss-rtlcss 21 | // If you want to support RTL css, then 22 | // 1. yarn/npm install postcss-rtlcss 23 | // 2. optionally set quasar.config.js > framework > lang to an RTL language 24 | // 3. uncomment the following line: 25 | // require('postcss-rtlcss') 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /sneakpeek/session_loggers/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC, abstractmethod 3 | from typing import Any, List 4 | 5 | from pydantic import BaseModel 6 | 7 | FIELDS_TO_LOG = [ 8 | "levelname", 9 | "msg", 10 | "filename", 11 | "lineno", 12 | "name", 13 | "funcName", 14 | "task_id", 15 | "task_name", 16 | "task_handler", 17 | "asctime", 18 | "headers", 19 | "kwargs", 20 | "request", 21 | "response", 22 | ] 23 | 24 | 25 | def get_fields_to_log(record: logging.LogRecord) -> dict[str, Any]: 26 | return { 27 | field: value 28 | for field in FIELDS_TO_LOG 29 | if (value := getattr(record, field, None)) is not None 30 | } 31 | 32 | 33 | class LogLine(BaseModel): 34 | id: str 35 | data: dict[str, Any] 36 | 37 | 38 | class SessionLogger(ABC, logging.Handler): 39 | @abstractmethod 40 | async def read( 41 | self, 42 | task_id: str, 43 | last_log_line_id: str | None = None, 44 | max_lines: int = 100, 45 | ) -> List[LogLine]: 46 | ... 47 | -------------------------------------------------------------------------------- /front/src/router/index.js: -------------------------------------------------------------------------------- 1 | import { route } from 'quasar/wrappers' 2 | import { createRouter, createMemoryHistory, createWebHistory, createWebHashHistory } from 'vue-router' 3 | import routes from './routes' 4 | 5 | /* 6 | * If not building with SSR mode, you can 7 | * directly export the Router instantiation; 8 | * 9 | * The function below can be async too; either use 10 | * async/await or return a Promise which resolves 11 | * with the Router instance. 12 | */ 13 | 14 | export default route(function (/* { store, ssrContext } */) { 15 | const createHistory = process.env.SERVER 16 | ? createMemoryHistory 17 | : (process.env.VUE_ROUTER_MODE === 'history' ? createWebHistory : createWebHashHistory) 18 | 19 | const Router = createRouter({ 20 | scrollBehavior: () => ({ left: 0, top: 0 }), 21 | routes, 22 | 23 | // Leave this as is and make changes in quasar.conf.js instead! 
24 | // quasar.conf.js -> build -> vueRouterMode 25 | // quasar.conf.js -> build -> publicPath 26 | history: createHistory(process.env.VUE_ROUTER_BASE) 27 | }) 28 | 29 | return Router 30 | }) 31 | -------------------------------------------------------------------------------- /sneakpeek/middleware/parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dataclasses import dataclass 3 | 4 | from sneakpeek.middleware.base import BaseMiddleware 5 | 6 | 7 | @dataclass 8 | class RegexMatch: 9 | """Regex match""" 10 | 11 | full_match: str #: Full regular expression match 12 | groups: dict[str, str] #: Regular expression group matches 13 | 14 | 15 | class ParserMiddleware(BaseMiddleware): 16 | """Parser middleware provides parsing utilities""" 17 | 18 | @property 19 | def name(self) -> str: 20 | return "parser" 21 | 22 | def regex( 23 | self, 24 | text: str, 25 | pattern: str, 26 | flags: re.RegexFlag = re.UNICODE | re.MULTILINE | re.IGNORECASE, 27 | ) -> list[RegexMatch]: 28 | """Find matches in the text using regular expression 29 | 30 | Args: 31 | text (str): Text to search in 32 | pattern (str): Regular expression 33 | flags (re.RegexFlag, optional): Regular expression flags. Defaults to re.UNICODE | re.MULTILINE | re.IGNORECASE. 34 | 35 | Returns: 36 | list[RegexMatch]: Matches found in the text 37 | """ 38 | return [ 39 | RegexMatch(full_match=match.group(0), groups=match.groupdict()) 40 | for match in re.finditer(pattern, text, flags) 41 | ] 42 | -------------------------------------------------------------------------------- /front/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sneakpeek-front", 3 | "version": "0.2.2", 4 | "description": "A toolbox to create scrapers", 5 | "productName": "Sneakpeek", 6 | "author": "Dan Yazovsky ", 7 | "private": true, 8 | "scripts": { 9 | "lint": "eslint --ext .js,.vue ./", 10 | "format": "prettier --write \"**/*.{js,vue,scss,html,md,json}\" --ignore-path .gitignore", 11 | "test": "echo \"No test specified\" && exit 0", 12 | "dev": "quasar dev", 13 | "build": "quasar build" 14 | }, 15 | "dependencies": { 16 | "@quasar/extras": "^1.0.0", 17 | "axios": "^1.3.5", 18 | "json-editor-vue": "^0.10.5", 19 | "monaco-editor-vue3": "^0.1.6", 20 | "monaco-editor-webpack-plugin": "^7.0.1", 21 | "quasar": "^2.6.0", 22 | "vanilla-jsoneditor": "^0.16.1", 23 | "vscode-ws-jsonrpc": "^3.0.0", 24 | "vue": "^3.0.0", 25 | "vue-router": "^4.0.0" 26 | }, 27 | "devDependencies": { 28 | "@quasar/app-vite": "^1.0.0", 29 | "autoprefixer": "^10.4.2", 30 | "eslint": "^8.10.0", 31 | "eslint-config-prettier": "^8.1.0", 32 | "eslint-plugin-vue": "^9.0.0", 33 | "postcss": "^8.4.14", 34 | "prettier": "^2.5.1" 35 | }, 36 | "engines": { 37 | "node": "^18 || ^16 || ^14.19", 38 | "npm": ">= 6.13.4", 39 | "yarn": ">= 1.21.1" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /sneakpeek/scraper/task_handler.py: -------------------------------------------------------------------------------- 1 | from sneakpeek.queue.model import Task, TaskHandlerABC 2 | from sneakpeek.scraper.model import ( 3 | SCRAPER_PERIODIC_TASK_HANDLER_NAME, 4 | Scraper, 5 | ScraperHandler, 6 | ScraperRunnerABC, 7 | ScraperStorageABC, 8 | UnknownScraperHandlerError, 9 | ) 10 | 11 | 12 | class ScraperTaskHandler(TaskHandlerABC): 13 | def __init__( 14 | self, 15 | scraper_handlers: list[ScraperHandler], 16 | runner: ScraperRunnerABC, 17 | storage: 
ScraperStorageABC, 18 | ) -> None: 19 | self.scraper_handlers = {handler.name: handler for handler in scraper_handlers} 20 | self.runner = runner 21 | self.storage = storage 22 | 23 | def name(self) -> int: 24 | return SCRAPER_PERIODIC_TASK_HANDLER_NAME 25 | 26 | async def process(self, task: Task) -> str: 27 | scraper = await self.storage.get_scraper(task.task_name) 28 | handler = self._get_handler(scraper) 29 | return await self.runner.run(handler, scraper) 30 | 31 | def _get_handler(self, scraper: Scraper) -> ScraperHandler: 32 | if scraper.handler not in self.scraper_handlers: 33 | raise UnknownScraperHandlerError( 34 | f"Unknown scraper handler '{scraper.handler}'" 35 | ) 36 | return self.scraper_handlers[scraper.handler] 37 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | 1. File an issue to notify the maintainers about what you're working on. 4 | 2. Fork the repo, develop and test your code changes, add docs. 5 | 3. Make sure that your commit messages clearly describe the changes. 6 | 4. Send a pull request. 7 | 8 | ## File an Issue 9 | 10 | Use the issue tracker to start the discussion. It is possible that someone 11 | else is already working on your idea, your approach is not quite right, or that 12 | the functionality exists already. The ticket you file in the issue tracker will 13 | be used to hash that all out. 14 | 15 | ## Running tests, building package and docs 16 | 17 | Use the issue tracker to start the discussion. It is possible that someone 18 | else is already working on your idea, your approach is not quite right, or that 19 | the functionality exists already. The ticket you file in the issue tracker will 20 | be used to hash that all out. 21 | 22 | ## Make the Pull Request 23 | 24 | Once you have made all your changes, tests, and updated the documentation, run the tests and build the package: 25 | 26 | ``` 27 | make test 28 | make build 29 | ``` 30 | 31 | Once everything succeeds make a pull request to move everything back into the main branch of the 32 | `repository`. 33 | 34 | Be sure to reference the original issue in the pull request. 35 | Expect some back-and-forth with regards to style and compliance of these 36 | rules. 37 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Overview 3 | ################# 4 | 5 | **Sneakpeek** - is a platform to author, schedule and monitor scrapers in an easy, fast and extensible way. 6 | It's the best choice for scrapers that have some specific complex scraping logic that needs 7 | to be run on a constant basis. 8 | 9 | Key features 10 | ############ 11 | 12 | - Horizontally scalable 13 | - Robust scraper scheduler and priority task queue 14 | - Multiple storage implementations to persist scrapers' configs, tasks, logs, etc. 15 | - JSON RPC API to manage the platform programmatically 16 | - Useful UI to manage all of your scrapers 17 | - Scraper IDE to enable you developing scrapers right in your browser 18 | - Easily extendable via middleware 19 | 20 | Demo 21 | #### 22 | 23 | [Here's a demo project](https://github.com/flulemon/sneakpeek-demo) which uses **Sneakpeek** framework. 24 | 25 | You can also run the demo using Docker: 26 | 27 | .. 
code-block:: bash 28 | 29 | docker run -it --rm -p 8080:8080 flulemon/sneakpeek-demo 30 | 31 | 32 | Once it has started head over to http://localhost:8080 to play around with it. 33 | 34 | Table of contents 35 | ================== 36 | 37 | .. toctree:: 38 | :maxdepth: 2 39 | 40 | self 41 | quick_start 42 | local_debugging 43 | design 44 | deployment 45 | middleware/index 46 | api 47 | 48 | Indices 49 | ================== 50 | * :ref:`genindex` 51 | * :ref:`modindex` 52 | * :ref:`search` -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | API 3 | ################# 4 | 5 | .. automodule:: sneakpeek.server 6 | .. automodule:: sneakpeek.queue.model 7 | .. automodule:: sneakpeek.scheduler.model 8 | .. automodule:: sneakpeek.scraper.model 9 | .. automodule:: sneakpeek.queue.queue 10 | .. automodule:: sneakpeek.queue.consumer 11 | .. automodule:: sneakpeek.queue.in_memory_storage 12 | .. automodule:: sneakpeek.queue.redis_storage 13 | .. automodule:: sneakpeek.queue.tasks 14 | .. automodule:: sneakpeek.scheduler.scheduler 15 | .. automodule:: sneakpeek.scheduler.in_memory_lease_storage 16 | .. automodule:: sneakpeek.scheduler.redis_lease_storage 17 | .. automodule:: sneakpeek.scraper.context 18 | .. automodule:: sneakpeek.scraper.runner 19 | .. automodule:: sneakpeek.scraper.task_handler 20 | .. automodule:: sneakpeek.scraper.redis_storage 21 | .. automodule:: sneakpeek.scraper.in_memory_storage 22 | .. automodule:: sneakpeek.scraper.dynamic_scraper_handler 23 | .. automodule:: sneakpeek.middleware.base 24 | .. automodule:: sneakpeek.middleware.parser 25 | .. automodule:: sneakpeek.middleware.proxy_middleware 26 | .. automodule:: sneakpeek.middleware.rate_limiter_middleware 27 | .. automodule:: sneakpeek.middleware.requests_logging_middleware 28 | .. automodule:: sneakpeek.middleware.robots_txt_middleware 29 | .. automodule:: sneakpeek.middleware.user_agent_injecter_middleware 30 | .. automodule:: sneakpeek.api 31 | .. automodule:: sneakpeek.logging 32 | .. automodule:: sneakpeek.metrics -------------------------------------------------------------------------------- /docs/deployment.rst: -------------------------------------------------------------------------------- 1 | ################## 2 | Deployment options 3 | ################## 4 | 5 | There are multiple options how you can deploy your scrapers depending on your requirements: 6 | 7 | ============================= 8 | One replica that does it all 9 | ============================= 10 | 11 | This is a good option if: 12 | 13 | * you can tolerate some downtime 14 | * you don't need to host thousands of scrapers that can be dynamically changed by users 15 | * you don't care if you lose the information about the scraper jobs 16 | 17 | In this case all you need to do is to: 18 | 19 | * define a list of scrapers in the code (just like in the :doc:`tutorial `) 20 | * use in-memory storage 21 | 22 | ====================== 23 | Using external storage 24 | ====================== 25 | 26 | If you use some external storage (e.g. 
redis or RDBMS) for jobs queue and lease storage you'll be able: 27 | 28 | * to scale workers horizontally until queue, storage or scheduler becomes a bottleneck 29 | * to have a secondary replicas for the scheduler, so when primary dies for some reason there are fallback options 30 | 31 | If you also use the external storage as a scrapers storage you'll be able to dynamically 32 | add, delete and update scrapers via UI or JsonRPC API. 33 | 34 | Note that each **Sneakpeek** server by default runs worker, scheduler and API services, but 35 | it's possible to run only one role at the time, therefore you'll be able to scale 36 | services independently. 37 | 38 | -------------------------------------------------------------------------------- /sneakpeek/middleware/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from traceback import format_exc 3 | from typing import Any, Coroutine, Type, TypeVar 4 | 5 | from aiohttp import ClientResponse 6 | from pydantic import BaseModel 7 | from typing_extensions import override 8 | 9 | from sneakpeek.scraper.model import Middleware, MiddlewareConfig, Request 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | _TBaseModel = TypeVar("_TBaseModel", bound=BaseModel) 14 | 15 | 16 | def parse_config_from_obj( 17 | config: Any | None, 18 | plugin_name: str, 19 | config_type: Type[_TBaseModel], 20 | default_config: _TBaseModel, 21 | ) -> _TBaseModel: 22 | if not config: 23 | return default_config 24 | try: 25 | return config_type.parse_obj(config) 26 | except Exception as e: 27 | logger.warn(f"Failed to parse config for plugin '{plugin_name}': {e}") 28 | logger.debug(f"Traceback: {format_exc()}") 29 | return default_config 30 | 31 | 32 | class BaseMiddleware(Middleware): 33 | @property 34 | def name(self) -> str: 35 | return "proxy" 36 | 37 | @override 38 | async def on_request( 39 | self, 40 | request: Request, 41 | config: Any | None, 42 | ) -> Request: 43 | return request 44 | 45 | async def on_response( 46 | self, 47 | request: Request, 48 | response: ClientResponse, 49 | config: MiddlewareConfig | None = None, 50 | ) -> Coroutine[Any, Any, ClientResponse]: 51 | return response 52 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023, Daniil Iazovskii 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.1", 6 | "configurations": [ 7 | { 8 | "name": "python", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": true, 14 | "env": { 15 | "PYTHONPATH": "${workspaceFolder}" 16 | } 17 | }, 18 | { 19 | "name": "Run all", 20 | "type": "python", 21 | "request": "launch", 22 | "program": "${workspaceFolder}/sneakpeek/app.py", 23 | "args": ["--api", "--scheduler", "--worker"], 24 | "console": "integratedTerminal", 25 | "justMyCode": true, 26 | "env": { 27 | "PYTHONPATH": "${workspaceFolder}" 28 | } 29 | }, 30 | { 31 | "name": "Run demo", 32 | "type": "python", 33 | "request": "launch", 34 | "program": "${workspaceFolder}/demo/app.py", 35 | "console": "integratedTerminal", 36 | "justMyCode": true, 37 | "env": { 38 | "PYTHONPATH": "${workspaceFolder}" 39 | } 40 | }, 41 | { 42 | "name": "Run demo (local handler)", 43 | "type": "python", 44 | "request": "launch", 45 | "program": "${workspaceFolder}/demo/demo_scraper.py", 46 | "console": "integratedTerminal", 47 | "justMyCode": true, 48 | "env": { 49 | "PYTHONPATH": "${workspaceFolder}" 50 | } 51 | } 52 | ] 53 | } 54 | -------------------------------------------------------------------------------- /sneakpeek/scraper/ephemeral_scraper_task_handler.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from sneakpeek.queue.model import Task, TaskHandlerABC 4 | from sneakpeek.scraper.model import ( 5 | EPHEMERAL_SCRAPER_TASK_HANDLER_NAME, 6 | ScraperConfig, 7 | ScraperHandler, 8 | ScraperRunnerABC, 9 | UnknownScraperHandlerError, 10 | ) 11 | 12 | 13 | class EphemeralScraperTask(BaseModel): 14 | scraper_handler: str 15 | scraper_config: ScraperConfig 16 | scraper_state: str | None = None 17 | 18 | 19 | class EphemeralScraperTaskHandler(TaskHandlerABC): 20 | def __init__( 21 | self, 22 | scraper_handlers: list[ScraperHandler], 23 | runner: ScraperRunnerABC, 24 | ) -> None: 25 | self.scraper_handlers = {handler.name: handler for handler in scraper_handlers} 26 | self.runner = runner 27 | 28 | def name(self) -> int: 29 | return EPHEMERAL_SCRAPER_TASK_HANDLER_NAME 30 | 31 | async def process(self, task: Task) -> str: 32 | config = EphemeralScraperTask.parse_raw(task.payload) 33 | handler = self._get_handler(config.scraper_handler) 34 | return await self.runner.run_ephemeral( 35 | handler, 36 | config.scraper_config, 37 | config.scraper_state, 38 | ) 39 | 40 | def _get_handler(self, scraper_handler: str) -> ScraperHandler: 41 | if scraper_handler not in self.scraper_handlers: 
42 | raise UnknownScraperHandlerError( 43 | f"Unknown scraper handler '{scraper_handler}'" 44 | ) 45 | return self.scraper_handlers[scraper_handler] 46 | -------------------------------------------------------------------------------- /docs/local_debugging.rst: -------------------------------------------------------------------------------- 1 | ################################ 2 | Local handler debugging 3 | ################################ 4 | 5 | You can easily test handler without running full-featured server. Here's how you can do that for the `DemoScraper` that we have developed in the :doc:`tutorial `. 6 | 7 | Add import in the beginning of the file: 8 | 9 | .. code-block:: python3 10 | 11 | from sneakpeek.scraper.runner import ScraperRunner 12 | 13 | 14 | And add the following lines to the end of the file: 15 | 16 | 17 | .. code-block:: python3 18 | 19 | 20 | async def main(): 21 | result = await ScraperRunner.debug_handler( 22 | DemoScraper(), 23 | config=ScraperConfig( 24 | params=DemoScraperParams( 25 | start_url="https://www.ycombinator.com/", 26 | max_pages=20, 27 | ).dict(), 28 | ), 29 | middlewares=[ 30 | RequestsLoggingMiddleware(), 31 | ], 32 | ) 33 | logging.info(f"Finished scraper with result: {result}") 34 | 35 | if __name__ == "__main__": 36 | asyncio.run(main()) 37 | 38 | 39 | For the argument `ScraperRunner.debug_handler` takes: 40 | 41 | 1. An instance of your scraper handler 42 | 2. Scraper config 43 | 3. **[Optional]** Middleware that will be used in the handler (:doc:`see full list of the middleware here `) 44 | 45 | Now you can run you handler as an ordinary Python script. Given it's in `demo_scraper.py` file you can use: 46 | 47 | .. code-block:: bash 48 | 49 | python3 demo_scraper.py 50 | -------------------------------------------------------------------------------- /docs/middleware/requests_logging_middleware.rst: -------------------------------------------------------------------------------- 1 | ############################## 2 | Requests logging middleware 3 | ############################## 4 | 5 | Requests logging middleware logs all requests being made and received responses. 6 | 7 | Configuration of the middleware is defined in :py:class:`RequestsLoggingMiddlewareConfig `. 8 | 9 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 10 | 11 | .. code-block:: python3 12 | 13 | from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware, RequestsLoggingMiddlewareConfig 14 | 15 | server = SneakpeekServer.create( 16 | ... 17 | middleware=[ 18 | RequestsLoggingMiddleware( 19 | RequestsLoggingMiddlewareConfig( 20 | log_request=True, 21 | log_response=True, 22 | ) 23 | ) 24 | ], 25 | ) 26 | 27 | 28 | How to override middleware settings for a given scraper: 29 | 30 | .. code-block:: python3 31 | 32 | from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddlewareConfig 33 | 34 | scraper = Scraper( 35 | ... 36 | config=ScraperConfig( 37 | ... 
38 | middleware={ 39 | "requests_logging": RequestsLoggingMiddlewareConfig( 40 | log_request=True, 41 | log_response=False, 42 | ) 43 | } 44 | ), 45 | ) 46 | -------------------------------------------------------------------------------- /docs/middleware/robots_txt_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Robots.txt 3 | ######################### 4 | 5 | Robots.txt middleware can log and optionally block requests if they are disallowed by website robots.txt. 6 | If robots.txt is unavailable (e.g. request returns 5xx code) all requests will be allowed. 7 | 8 | Configuration of the middleware is defined in :py:class:`RobotsTxtMiddlewareConfig `. 9 | 10 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 11 | 12 | .. code-block:: python3 13 | 14 | from sneakpeek.middleware.robots_txt_middleware import RobotsTxtMiddleware, RobotsTxtMiddlewareConfig 15 | 16 | server = SneakpeekServer.create( 17 | ... 18 | middleware=[ 19 | ProxyMiddleware( 20 | ProxyMiddlewareConfig( 21 | violation_strategy = RobotsTxtViolationStrategy.THROW, 22 | ) 23 | ) 24 | ], 25 | ) 26 | 27 | 28 | How to override middleware settings for a given scraper: 29 | 30 | .. code-block:: python3 31 | 32 | from aiohttp import BasicAuth 33 | from sneakpeek.middleware.robots_txt_middleware import RobotsTxtMiddlewareConfig 34 | 35 | scraper = Scraper( 36 | ... 37 | config=ScraperConfig( 38 | ... 39 | middleware={ 40 | "robots_txt": ProxyMiddlewareConfig( 41 | violation_strategy = RobotsTxtViolationStrategy.LOG, 42 | ) 43 | } 44 | ), 45 | ) 46 | -------------------------------------------------------------------------------- /docs/middleware/proxy_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Proxy middleware 3 | ######################### 4 | 5 | Proxy middleware automatically sets proxy arguments for all HTTP requests. 6 | Configuration of the middleware is defined in :py:class:`ProxyMiddlewareConfig `. 7 | 8 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 9 | 10 | .. code-block:: python3 11 | 12 | from aiohttp import BasicAuth 13 | from sneakpeek.middleware.proxy_middleware import ProxyMiddleware, ProxyMiddlewareConfig 14 | 15 | server = SneakpeekServer.create( 16 | ... 17 | middleware=[ 18 | ProxyMiddleware( 19 | ProxyMiddlewareConfig( 20 | proxy = "http://example.proxy.com:3128", 21 | proxy_auth = BasicAuth(login="mylogin", password="securepassword"), 22 | ) 23 | ) 24 | ], 25 | ) 26 | 27 | 28 | How to override middleware settings for a given scraper: 29 | 30 | .. code-block:: python3 31 | 32 | from aiohttp import BasicAuth 33 | from sneakpeek.middleware.proxy_middleware import ProxyMiddlewareConfig 34 | 35 | scraper = Scraper( 36 | ... 37 | config=ScraperConfig( 38 | ... 
39 | middleware={ 40 | "proxy": ProxyMiddlewareConfig( 41 | proxy = "http://example.proxy.com:3128", 42 | proxy_auth = BasicAuth(login="mylogin", password="securepassword"), 43 | ) 44 | } 45 | ), 46 | ) 47 | -------------------------------------------------------------------------------- /docs/middleware/user_agent_injecter_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | User Agent injector 3 | ######################### 4 | 5 | This middleware automatically adds ``User-Agent`` header if it's not present. 6 | It uses `fake-useragent `_ in order to generate fake real world user agents. 7 | 8 | Configuration of the middleware is defined in :py:class:`UserAgentInjecterMiddlewareConfig `. 9 | 10 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 11 | 12 | .. code-block:: python3 13 | 14 | from sneakpeek.middleware.user_agent_injecter_middleware import UserAgentInjecterMiddleware, UserAgentInjecterMiddlewareConfig 15 | 16 | server = SneakpeekServer.create( 17 | ... 18 | middleware=[ 19 | UserAgentInjecterMiddleware( 20 | UserAgentInjecterMiddlewareConfig( 21 | use_external_data = True, 22 | browsers = ["chrome", "firefox"], 23 | ) 24 | ) 25 | ], 26 | ) 27 | 28 | 29 | How to override middleware settings for a given scraper: 30 | 31 | .. code-block:: python3 32 | 33 | from sneakpeek.middleware.user_agent_injecter_middleware import UserAgentInjecterMiddlewareConfig 34 | 35 | scraper = Scraper( 36 | ... 37 | config=ScraperConfig( 38 | ... 39 | middleware={ 40 | "user_agent_injecter": UserAgentInjecterMiddlewareConfig( 41 | use_external_data = False, 42 | browsers = ["chrome", "firefox"], 43 | ) 44 | } 45 | ), 46 | ) 47 | -------------------------------------------------------------------------------- /front/quasar.config.js: -------------------------------------------------------------------------------- 1 | const { configure } = require('quasar/wrappers'); 2 | const MonacoWebpackPlugin = require('monaco-editor-webpack-plugin'); 3 | 4 | module.exports = configure(function (ctx) { 5 | return { 6 | eslint: { 7 | warnings: true, 8 | errors: true 9 | }, 10 | boot: [ 11 | ], 12 | css: [ 13 | 'app.scss' 14 | ], 15 | extras: [ 16 | 'fontawesome-v6', 17 | 'roboto-font', 18 | 'material-icons', 19 | ], 20 | build: { 21 | target: { 22 | browser: [ 'es2019', 'edge88', 'firefox78', 'chrome87', 'safari13.1' ], 23 | node: 'node16' 24 | }, 25 | distDir: '../sneakpeek/static/ui/', 26 | vueRouterMode: 'hash', 27 | env: { 28 | JSONRPC_ENDPOINT: ctx.dev ? 
'http://localhost:8080/api/v1/jsonrpc' : '/api/v1/jsonrpc', 29 | }, 30 | chainWebpack: config => { 31 | config.plugin('monaco-editor').use(MonacoWebpackPlugin, [ 32 | { 33 | languages: ['python', 'javascript', 'html', 'xml'] 34 | } 35 | ]) 36 | } 37 | }, 38 | devServer: { 39 | open: true 40 | }, 41 | framework: { 42 | config: { 43 | dark: "auto", 44 | notify: { 45 | position: "bottom" 46 | } 47 | }, 48 | plugins: [ 49 | "Notify", 50 | "SessionStorage", 51 | ] 52 | }, 53 | ssr: { 54 | pwa: false, 55 | prodPort: 3000, 56 | middlewares: [ 57 | 'render' 58 | ] 59 | }, 60 | pwa: { 61 | workboxMode: 'generateSW', 62 | injectPwaMetaTags: true, 63 | swFilename: 'sw.js', 64 | manifestFilename: 'manifest.json', 65 | useCredentialsForManifestTag: false, 66 | }, 67 | capacitor: { 68 | hideSplashscreen: true 69 | }, 70 | } 71 | }); 72 | -------------------------------------------------------------------------------- /sneakpeek/middleware/proxy_middleware.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from aiohttp import BasicAuth 4 | from fake_useragent import UserAgent 5 | from pydantic import BaseModel 6 | from typing_extensions import override 7 | from yarl import URL 8 | 9 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 10 | from sneakpeek.scraper.model import Request 11 | 12 | 13 | class ProxyMiddlewareConfig(BaseModel): 14 | """Proxy middleware config""" 15 | 16 | proxy: str | URL | None = None #: Proxy URL 17 | proxy_auth: BasicAuth | None = None #: Proxy authentication info to use 18 | 19 | class Config: 20 | arbitrary_types_allowed = True 21 | 22 | 23 | class ProxyMiddleware(BaseMiddleware): 24 | """Proxy middleware automatically sets proxy arguments for all HTTP requests.""" 25 | 26 | def __init__(self, default_config: ProxyMiddlewareConfig | None = None) -> None: 27 | self._default_config = default_config or ProxyMiddlewareConfig() 28 | self._user_agents = UserAgent( 29 | use_external_data=self._default_config.use_external_data, 30 | browsers=self._default_config.browsers, 31 | ) 32 | 33 | @property 34 | def name(self) -> str: 35 | return "proxy" 36 | 37 | @override 38 | async def on_request( 39 | self, 40 | request: Request, 41 | config: Any | None, 42 | ) -> Request: 43 | config = parse_config_from_obj( 44 | config, 45 | self.name, 46 | ProxyMiddlewareConfig, 47 | self._default_config, 48 | ) 49 | if not request.kwargs: 50 | request.kwargs = {} 51 | if config.proxy: 52 | request.kwargs["proxy"] = config.proxy 53 | if config.proxy_auth: 54 | request.kwargs["proxy_auth"] = config.proxy_auth 55 | return request 56 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: push 4 | 5 | jobs: 6 | ci: 7 | name: Build and publish Python pacakage to PyPI 8 | runs-on: "ubuntu-latest" 9 | permissions: 10 | id-token: write 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | python-version: ["3.10"] 15 | poetry-version: ["1.4.2"] 16 | node-version: ["18.16.0"] 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Set up Poetry 26 | uses: abatilo/actions-poetry@v2 27 | with: 28 | poetry-version: ${{ matrix.poetry-version }} 29 | 30 | - name: Set Node.js 31 | uses: actions/setup-node@v3 32 | with: 33 | 
node-version: ${{ matrix.node-version }} 34 | 35 | - name: Run install 36 | run: make install 37 | 38 | - name: Run tests 39 | run: make test 40 | 41 | - name: Tests coverage 42 | run: make coverage 43 | 44 | - name: Upload coverage reports to Codecov 45 | uses: codecov/codecov-action@v3 46 | with: 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | files: ./coverage.xml 49 | verbose: true 50 | 51 | - name: Build package 52 | run: make build 53 | 54 | - name: Publish package to Test PyPI 55 | if: startsWith(github.ref, 'refs/tags') 56 | uses: pypa/gh-action-pypi-publish@release/v1 57 | with: 58 | repository-url: https://test.pypi.org/legacy/ 59 | skip-existing: true 60 | 61 | - name: Publish package to PyPI 62 | if: startsWith(github.ref, 'refs/tags') 63 | uses: pypa/gh-action-pypi-publish@release/v1 64 | with: 65 | password: ${{ secrets.PYPI_API_TOKEN }} 66 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/redis_lease_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from redis.asyncio import Redis 4 | 5 | from sneakpeek.metrics import count_invocations, measure_latency 6 | from sneakpeek.scheduler.model import Lease, LeaseStorageABC 7 | 8 | 9 | class RedisLeaseStorage(LeaseStorageABC): 10 | """Redis storage for leases. Should only be used for development purposes""" 11 | 12 | def __init__(self, redis: Redis) -> None: 13 | """ 14 | Args: 15 | redis (Redis): Async redis client 16 | """ 17 | self._redis = redis 18 | 19 | @count_invocations(subsystem="storage") 20 | @measure_latency(subsystem="storage") 21 | async def maybe_acquire_lease( 22 | self, 23 | lease_name: str, 24 | owner_id: str, 25 | acquire_for: timedelta, 26 | ) -> Lease | None: 27 | lease_key = f"lease:{lease_name}" 28 | existing_lease = await self._redis.get(lease_key) 29 | result = None 30 | if not existing_lease or existing_lease.decode() == owner_id: 31 | result = await self._redis.set( 32 | f"lease:{lease_name}", 33 | owner_id, 34 | ex=acquire_for, 35 | ) 36 | return ( 37 | Lease( 38 | name=lease_name, 39 | owner_id=owner_id, 40 | acquired=datetime.utcnow(), 41 | acquired_until=datetime.utcnow() + acquire_for, 42 | ) 43 | if result 44 | else None 45 | ) 46 | 47 | @count_invocations(subsystem="storage") 48 | @measure_latency(subsystem="storage") 49 | async def release_lease(self, lease_name: str, owner_id: str) -> None: 50 | lease_owner = await self._redis.get(f"lease:{lease_name}") 51 | if lease_owner == owner_id: 52 | await self._redis.delete(f"lease:{lease_name}") 53 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | 16 | print(os.path.abspath("..")) 17 | sys.path.insert(0, os.path.abspath("..")) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "Sneakpeek" 23 | copyright = "2023, Dan Yazovsky" 24 | author = "Dan Yazovsky" 25 | version = "0.2" 26 | release = "0.2.2" 27 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.napoleon"] 28 | templates_path = ["_templates"] 29 | language = "en" 30 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 31 | html_static_path = ["_static"] 32 | autoclass_content = "both" 33 | html_theme = "sphinx_rtd_theme" 34 | html_theme_options = { 35 | "analytics_id": "G-3EW8JNTBHC", 36 | "logo_only": False, 37 | "display_version": True, 38 | "prev_next_buttons_location": "bottom", 39 | "style_external_links": False, 40 | "vcs_pageview_mode": "display_github", 41 | "collapse_navigation": False, 42 | "sticky_navigation": True, 43 | "navigation_depth": 4, 44 | "includehidden": True, 45 | "titles_only": True, 46 | } 47 | github_url = "https://github.com/flulemon/sneakpeek" 48 | highlight_language = "python3" 49 | pygments_style = "sphinx" 50 | 51 | autodoc_default_options = { 52 | "members": True, 53 | "show-inheritance": True, 54 | } 55 | autodoc_typehints = "both" 56 | -------------------------------------------------------------------------------- /sneakpeek/queue/tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from sneakpeek.queue.model import QueueABC, Task, TaskHandlerABC, TaskPriority 4 | from sneakpeek.scheduler.model import ( 5 | PeriodicTask, 6 | StaticPeriodicTasksStorage, 7 | TaskSchedule, 8 | generate_id, 9 | ) 10 | 11 | KILL_DEAD_TASKS_TASK_NAME = "internal::queue::kill_dead_tasks" 12 | DELETE_OLD_TASKS_TASK_NAME = "internal::queue::delete_old_tasks" 13 | 14 | 15 | class KillDeadTasksHandler(TaskHandlerABC): 16 | def __init__(self, queue: QueueABC) -> None: 17 | self.queue = queue 18 | 19 | def name(self) -> int: 20 | return KILL_DEAD_TASKS_TASK_NAME 21 | 22 | async def process(self, task: Task) -> str: 23 | killed = await self.queue.kill_dead_tasks() 24 | return json.dumps( 25 | { 26 | "success": True, 27 | "killed": [item.id for item in killed], 28 | }, 29 | indent=4, 30 | ) 31 | 32 | 33 | class DeleteOldTasksHandler(TaskHandlerABC): 34 | def __init__(self, queue: QueueABC) -> None: 35 | self.queue = queue 36 | 37 | def name(self) -> int: 38 | return DELETE_OLD_TASKS_TASK_NAME 39 | 40 | async def process(self, task: Task) -> str: 41 | await self.queue.delete_old_tasks() 42 | return json.dumps({"success": True}, indent=4) 43 | 44 | 45 | queue_periodic_tasks = StaticPeriodicTasksStorage( 46 | tasks=[ 47 | PeriodicTask( 48 | id=generate_id(), 49 | name=KILL_DEAD_TASKS_TASK_NAME, 50 | handler=KILL_DEAD_TASKS_TASK_NAME, 51 | priority=TaskPriority.NORMAL, 52 | payload="", 53 | schedule=TaskSchedule.EVERY_HOUR, 54 | ), 55 | PeriodicTask( 56 | id=generate_id(), 57 | name=DELETE_OLD_TASKS_TASK_NAME, 58 | handler=DELETE_OLD_TASKS_TASK_NAME, 59 | priority=TaskPriority.NORMAL, 60 | payload="", 61 | schedule=TaskSchedule.EVERY_HOUR, 62 | ), 63 | ] 64 | ) 65 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/in_memory_lease_storage.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import Lock 3 | from datetime import datetime, timedelta 4 | 5 | from 
sneakpeek.metrics import count_invocations, measure_latency 6 | from sneakpeek.scheduler.model import Lease, LeaseStorageABC 7 | 8 | 9 | class InMemoryLeaseStorage(LeaseStorageABC): 10 | """In memory storage for leases. Should only be used for development purposes""" 11 | 12 | def __init__(self) -> None: 13 | self._logger = logging.getLogger(__name__) 14 | self._lock = Lock() 15 | self._leases: dict[str, Lease] = {} 16 | 17 | def _can_acquire_lease(self, lease_name: str, owner_id: str) -> bool: 18 | existing_lease = self._leases.get(lease_name) 19 | return ( 20 | not existing_lease 21 | or existing_lease.acquired_until < datetime.utcnow() 22 | or existing_lease.owner_id == owner_id 23 | ) 24 | 25 | @count_invocations(subsystem="storage") 26 | @measure_latency(subsystem="storage") 27 | async def maybe_acquire_lease( 28 | self, 29 | lease_name: str, 30 | owner_id: str, 31 | acquire_for: timedelta, 32 | ) -> Lease | None: 33 | async with self._lock: 34 | if self._can_acquire_lease(lease_name, owner_id): 35 | self._leases[lease_name] = Lease( 36 | name=lease_name, 37 | owner_id=owner_id, 38 | acquired=datetime.utcnow(), 39 | acquired_until=datetime.utcnow() + acquire_for, 40 | ) 41 | return self._leases[lease_name] 42 | return None 43 | 44 | @count_invocations(subsystem="storage") 45 | @measure_latency(subsystem="storage") 46 | async def release_lease(self, lease_name: str, owner_id: str) -> None: 47 | async with self._lock: 48 | if lease_name not in self._leases: 49 | return 50 | if self._can_acquire_lease(lease_name, owner_id): 51 | del self._leases[lease_name] 52 | -------------------------------------------------------------------------------- /sneakpeek/middleware/user_agent_injecter_middleware.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fake_useragent import UserAgent 4 | from pydantic import BaseModel 5 | from typing_extensions import override 6 | 7 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 8 | from sneakpeek.scraper.model import Request 9 | 10 | 11 | class UserAgentInjecterMiddlewareConfig(BaseModel): 12 | """Middleware configuration""" 13 | 14 | #: Whether to use external data as a fallback 15 | use_external_data: bool = True 16 | 17 | #: List of browsers which are used to generate user agents 18 | browsers: list[str] = ["chrome", "edge", "firefox", "safari", "opera"] 19 | 20 | 21 | class UserAgentInjecterMiddleware(BaseMiddleware): 22 | """ 23 | This middleware automatically adds ``User-Agent`` header if it's not present. 24 | It uses `fake-useragent `_ in order to generate fake real world user agents. 
25 | """ 26 | 27 | def __init__( 28 | self, default_config: UserAgentInjecterMiddlewareConfig | None = None 29 | ) -> None: 30 | self._default_config = default_config or UserAgentInjecterMiddlewareConfig() 31 | self._user_agents = UserAgent( 32 | use_external_data=self._default_config.use_external_data, 33 | browsers=self._default_config.browsers, 34 | ) 35 | 36 | @property 37 | def name(self) -> str: 38 | return "user_agent_injecter" 39 | 40 | @override 41 | async def on_request( 42 | self, 43 | request: Request, 44 | config: Any | None, 45 | ) -> Request: 46 | config = parse_config_from_obj( 47 | config, 48 | self.name, 49 | UserAgentInjecterMiddlewareConfig, 50 | self._default_config, 51 | ) 52 | if (request.headers or {}).get("User-Agent"): 53 | return request 54 | if not request.headers: 55 | request.headers = {} 56 | request.headers["User-Agent"] = self._user_agents.random 57 | return request 58 | -------------------------------------------------------------------------------- /sneakpeek/scraper/dynamic_scraper_handler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import json 3 | from typing import Any, Awaitable, Callable, Mapping 4 | 5 | from pydantic import BaseModel 6 | from typing_extensions import override 7 | 8 | from sneakpeek.scraper.model import ScraperContextABC, ScraperHandler 9 | 10 | 11 | class DynamicScraperParams(BaseModel): 12 | source_code: str 13 | args: list[Any] | None = None 14 | kwargs: Mapping[str, Any] | None = None 15 | 16 | 17 | class DynamicScraperHandler(ScraperHandler): 18 | @property 19 | def name(self) -> str: 20 | return "dynamic_scraper" 21 | 22 | def compile(self, source_code: str) -> Callable[..., Awaitable[None]]: 23 | bytecode = compile(source=source_code, filename="", mode="exec") 24 | session_globals = {} 25 | exec(bytecode, session_globals) 26 | if "context" in session_globals: 27 | raise SyntaxError("`context` is a reserved keyword") 28 | if "handler" not in session_globals: 29 | raise SyntaxError("Expected source code to define a `handler` function") 30 | handler = session_globals["handler"] 31 | if not inspect.iscoroutinefunction(handler): 32 | raise SyntaxError("Expected `handler` to be a function") 33 | if handler.__code__.co_argcount == 0: 34 | raise SyntaxError( 35 | "Expected `handler` to have at least one argument: `context: ScraperContext`" 36 | ) 37 | return handler 38 | 39 | @override 40 | async def run(self, context: ScraperContextABC) -> str: 41 | params = DynamicScraperParams.parse_obj(context.params) 42 | handler = self.compile(params.source_code) 43 | result = await handler(context, *(params.args or []), **(params.kwargs or {})) 44 | if result is None: 45 | return "No result was returned" 46 | if isinstance(result, str): 47 | return result 48 | try: 49 | return json.dumps(result, indent=4) 50 | except TypeError as ex: 51 | return f"Failed to serialize result with error: {ex}" 52 | -------------------------------------------------------------------------------- /front/src/components/ScraperCard.vue: -------------------------------------------------------------------------------- 1 | 27 | 28 | 70 | -------------------------------------------------------------------------------- /sneakpeek/session_loggers/redis_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from asyncio import AbstractEventLoop 3 | from copy import copy 4 | from dataclasses import dataclass 5 | from datetime import datetime, timedelta 
6 | from typing import Any 7 | 8 | from redis.asyncio import Redis 9 | 10 | from sneakpeek.session_loggers.base import SessionLogger, get_fields_to_log 11 | 12 | MAX_BUFFER_AGE = timedelta(seconds=5) 13 | 14 | 15 | @dataclass 16 | class _LogRecord: 17 | task_id: str 18 | data: Any 19 | 20 | 21 | class RedisLoggerHandler(SessionLogger): 22 | def __init__( 23 | self, 24 | redis: Redis, 25 | loop: AbstractEventLoop | None = None, 26 | max_buffer_size: int = 10, 27 | max_buffer_age: timedelta = MAX_BUFFER_AGE, 28 | ) -> None: 29 | super().__init__() 30 | self.redis = redis 31 | self.loop = loop 32 | self.max_buffer_size = max_buffer_size 33 | self.max_buffer_age = max_buffer_age 34 | self.buffer: list[_LogRecord] = [] 35 | self.last_flush = datetime.min 36 | 37 | async def _write_to_log(self, messages: list[_LogRecord]) -> None: 38 | for message in messages: 39 | await self.redis.xadd(name=message.task_id, fields=message.data) 40 | 41 | def flush(self): 42 | """ 43 | Flushes the stream. 44 | """ 45 | if not self.buffer: 46 | return 47 | if ( 48 | len(self.buffer) < self.max_buffer_size 49 | and datetime.utcnow() - self.last_flush < self.max_buffer_age 50 | ): 51 | return 52 | self.acquire() 53 | try: 54 | self.loop.create_task(self._write_to_log(copy(self.buffer))) 55 | finally: 56 | self.buffer.clear() 57 | self.release() 58 | 59 | def emit(self, record: logging.LogRecord) -> None: 60 | if not getattr(record, "task_id", None): 61 | return 62 | 63 | self.buffer.append( 64 | _LogRecord( 65 | task_id=record.task_id, 66 | data=get_fields_to_log(record), 67 | ) 68 | ) 69 | self.flush() 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sneakpeek 2 | 3 | ![CI](https://github.com/flulemon/sneakpeek/actions/workflows/ci.yml/badge.svg) 4 | [![PyPI version](https://badge.fury.io/py/sneakpeek-py.svg)](https://badge.fury.io/py/sneakpeek-py) 5 | [![Downloads](https://static.pepy.tech/badge/sneakpeek-py)](https://pepy.tech/project/sneakpeek-py) 6 | [![Documentation Status](https://readthedocs.org/projects/sneakpeek-py/badge/?version=latest)](https://sneakpeek-py.readthedocs.io/en/latest/?badge=latest) 7 | [![codecov](https://codecov.io/gh/flulemon/sneakpeek/branch/main/graph/badge.svg?token=7h45P8qHRG)](https://codecov.io/gh/flulemon/sneakpeek) 8 | 9 | **Sneakpeek** is a platform to author, schedule and monitor scrapers in an easy, fast and extensible way. 10 | It's the best choice for scrapers with complex scraping logic that needs 11 | to run on a regular basis. 12 | 13 | ## Key features 14 | 15 | - Horizontally scalable 16 | - Robust scraper scheduler and priority task queue 17 | - Multiple storage implementations to persist scrapers' configs, tasks, logs, etc. 18 | - JSON RPC API to manage the platform programmatically 19 | - Useful UI to manage all of your scrapers 20 | - Scraper IDE that lets you develop scrapers right in your browser 21 | - Easily extendable via middleware 22 | 23 | ## Demo 24 | 25 | [Here's a demo project](https://github.com/flulemon/sneakpeek-demo) which uses the **Sneakpeek** framework. 26 | 27 | You can also run the demo using Docker: 28 | 29 | ```bash 30 | docker run -it --rm -p 8080:8080 flulemon/sneakpeek-demo 31 | ``` 32 | 33 | Once it has started head over to http://localhost:8080 to play around with it.
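Under the hood a scraper is just a handler class. Here is a minimal, illustrative sketch based on the `ScraperHandler` and `ScraperContext` interfaces from `sneakpeek.scraper`; the handler name and URL below are placeholders, and the exact object returned by `context.get` is not shown here:

```python
from sneakpeek.scraper.context import ScraperContext
from sneakpeek.scraper.model import ScraperHandler


class DemoScraper(ScraperHandler):
    # Unique handler name that scraper configs refer to
    @property
    def name(self) -> str:
        return "demo_scraper"

    # Invoked by the runner; the context wraps HTTP calls and middleware
    async def run(self, context: ScraperContext) -> str:
        response = await context.get("https://example.com")  # placeholder URL
        # Whatever string is returned is stored as the task result
        return f"fetched {response}"
```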
34 | 35 | ## Documentation 36 | 37 | For the full documentation please visit [sneakpeek-py.readthedocs.io](https://sneakpeek-py.readthedocs.io/en/latest/) 38 | 39 | ## Contributing 40 | 41 | Please take a look at our [contributing](https://github.com/flulemon/sneakpeek/blob/main/CONTRIBUTING.md) guidelines if you're interested in helping! 42 | 43 | ## Future plans 44 | 45 | - Headful and headless browser engines middleware (Selenium and Playwright) 46 | - SQL and AmazonDB storage implementation 47 | - Advanced monitoring for the scrapers' health 48 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/tests/test_lease_storage.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | 4 | import pytest 5 | from fakeredis.aioredis import FakeRedis 6 | 7 | from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage 8 | from sneakpeek.scheduler.model import LeaseStorageABC 9 | from sneakpeek.scheduler.redis_lease_storage import RedisLeaseStorage 10 | 11 | NON_EXISTENT_SCRAPER_ID = 10001 12 | 13 | 14 | @pytest.fixture 15 | def in_memory_storage() -> LeaseStorageABC: 16 | return InMemoryLeaseStorage() 17 | 18 | 19 | @pytest.fixture 20 | def redis_storage() -> LeaseStorageABC: 21 | return RedisLeaseStorage(FakeRedis()) 22 | 23 | 24 | @pytest.fixture( 25 | params=[ 26 | pytest.lazy_fixture(in_memory_storage.__name__), 27 | pytest.lazy_fixture(redis_storage.__name__), 28 | ] 29 | ) 30 | def storage(request) -> LeaseStorageABC: 31 | yield request.param 32 | 33 | 34 | @pytest.mark.asyncio 35 | async def test_lease(storage: LeaseStorageABC): 36 | lease_name_1 = "test_lease_1" 37 | lease_name_2 = "test_lease_2" 38 | owner_1 = "owner_id_1" 39 | owner_2 = "owner_id_2" 40 | owner_1_acquire_until = timedelta(seconds=1) 41 | owner_2_acquire_until = timedelta(seconds=5) 42 | 43 | # initial acquire 44 | assert ( 45 | await storage.maybe_acquire_lease(lease_name_1, owner_1, owner_1_acquire_until) 46 | is not None 47 | ) 48 | # another lease can be acquired 49 | assert ( 50 | await storage.maybe_acquire_lease(lease_name_2, owner_2, owner_2_acquire_until) 51 | is not None 52 | ) 53 | # lock is acquired so no one can acquire 54 | assert ( 55 | await storage.maybe_acquire_lease(lease_name_1, owner_2, owner_2_acquire_until) 56 | is None 57 | ) 58 | # owner can re-acquire 59 | assert ( 60 | await storage.maybe_acquire_lease(lease_name_1, owner_1, owner_1_acquire_until) 61 | is not None 62 | ) 63 | 64 | # lock expires and can be acuired 65 | await asyncio.sleep(1) 66 | assert ( 67 | await storage.maybe_acquire_lease(lease_name_1, owner_2, owner_2_acquire_until) 68 | is not None 69 | ) 70 | -------------------------------------------------------------------------------- /docs/middleware/rate_limiter_middleware.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Rate limiter 3 | ######################### 4 | 5 | Rate limiter implements `leaky bucket algorithm `_ 6 | to limit number of requests made to the hosts. If the request is rate limited it can either 7 | raise an exception or wait until the request won't be limited anymore. 8 | 9 | Configuration of the middleware is defined in :py:class:`RateLimiterMiddlewareConfig `. 10 | 11 | How to configure middleware for the :py:class:`SneakpeekServer ` (will be used globally for all requests): 12 | 13 | .. 
code-block:: python3 14 | 15 | from sneakpeek.middleware.rate_limiter_middleware import RateLimiterMiddleware, RateLimiterMiddlewareConfig 16 | 17 | server = SneakpeekServer.create( 18 | ... 19 | middleware=[ 20 | RateLimiterMiddleware( 21 | RateLimiterMiddlewareConfig( 22 | # maximum number of requests in a given time window 23 | max_requests = 60, 24 | # wait until request won't be rate limited 25 | rate_limited_strategy = RateLimitedStrategy.WAIT, 26 | # only 60 requests per host are allowed within 1 minute 27 | time_window = timedelta(minutes=1), 28 | ) 29 | ) 30 | ], 31 | ) 32 | 33 | 34 | How to override middleware settings for a given scraper: 35 | 36 | .. code-block:: python3 37 | 38 | from sneakpeek.middleware.rate_limiter_middleware import RateLimiterMiddlewareConfig 39 | 40 | scraper = Scraper( 41 | ... 42 | config=ScraperConfig( 43 | ... 44 | middleware={ 45 | "rate_limiter": RateLimiterMiddlewareConfig( 46 | # maximum number of requests in a given time window 47 | max_requests = 120, 48 | # throw RateLimiterException if request is rate limited 49 | rate_limited_strategy = RateLimitedStrategy.THROW, 50 | # only 120 requests per host are allowed within 1 minute 51 | time_window = timedelta(minutes=1), 52 | ) 53 | } 54 | ), 55 | ) 56 | -------------------------------------------------------------------------------- /front/src/components/TaskLogs.vue: -------------------------------------------------------------------------------- 1 | 7 | 80 | 91 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NAME := sneakpeek 2 | PY_INSTALL_STAMP := .py.install.stamp 3 | JS_INSTALL_STAMP := .js.install.stamp 4 | POETRY := $(shell command -v poetry 2> /dev/null) 5 | YARN := $(shell command -v yarn 2> /dev/null) 6 | ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 7 | 8 | .DEFAULT_GOAL := help 9 | 10 | 11 | .PHONY: help 12 | help: ##Show this help. 13 | @fgrep -h "##" $(MAKEFILE_LIST) | sed -e 's/\(\:.*\#\#\)/\:\ /' | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//' 14 | 15 | install-py: $(PY_INSTALL_STAMP) ##Install python dependencies (Poetry is required) 16 | $(PY_INSTALL_STAMP): pyproject.toml poetry.lock 17 | @if [ -z $(POETRY) ]; then echo "Poetry could not be found. See https://python-poetry.org/docs/"; exit 2; fi 18 | $(POETRY) --version 19 | $(POETRY) install --all-extras --with remotesettings,taskcluster --no-ansi --no-interaction --verbose 20 | touch $(PY_INSTALL_STAMP) 21 | 22 | install-js: $(JS_INSTALL_STAMP) ##Install JS dependencies (Yarn is required) 23 | $(JS_INSTALL_STAMP): front/package.json front/yarn.lock 24 | @if [ -z $(YARN) ]; then echo "YARN could not be found.
See https://yarnpkg.com/"; exit 2; fi 25 | $(YARN) --version 26 | cd $(ROOT_DIR)/front; $(YARN) install 27 | touch $(JS_INSTALL_STAMP) 28 | 29 | install: install-py install-js ##Install all dependencies 30 | 31 | gen-requirements: $(PY_INSTALL_STAMP) 32 | $(POETRY) export --without-hashes --format=requirements.txt > requirements.txt 33 | 34 | .PHONY: test 35 | test: $(PY_INSTALL_STAMP) ##Run tests 36 | $(POETRY) run pytest -n 20 37 | 38 | .PHONY: coverage 39 | coverage: $(PY_INSTALL_STAMP) ##Run tests with coverage report 40 | $(POETRY) run pytest --cov=sneakpeek sneakpeek --cov-fail-under=85 --cov-report term-missing --cov-report html --cov-report xml 41 | 42 | build-ui: ##Build frontend 43 | $(YARN) --cwd $(ROOT_DIR)/front/ quasar build 44 | 45 | build-docs: $(PY_INSTALL_STAMP) ##Build documentation 46 | rm -rf $(ROOT_DIR)/sneakpeek/static/docs/ 47 | mkdir -p $(ROOT_DIR)/sneakpeek/static/docs/ 48 | $(POETRY) run sphinx-build $(ROOT_DIR)/docs $(ROOT_DIR)/sneakpeek/static/docs/ 49 | 50 | build-py: ##Build Python package 51 | $(POETRY) build 52 | 53 | build: build-ui build-docs build-py ##Build everything 54 | 55 | .PHONY: clean 56 | clean: ##Cleanup 57 | find . -type d -name "__pycache__" | xargs rm -rf {}; 58 | find . -type d -name ".pytest_cache" | xargs rm -rf {}; 59 | rm -rf $(PY_INSTALL_STAMP) $(JS_INSTALL_STAMP) .coverage .mypy_cache -------------------------------------------------------------------------------- /front/src/assets/logo.svg: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 8 | 10 | 12 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /front/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | // https://eslint.org/docs/user-guide/configuring#configuration-cascading-and-hierarchy 3 | // This option interrupts the configuration hierarchy at this file 4 | // Remove this if you have a higher level ESLint config file (it usually happens in monorepos) 5 | root: true, 6 | 7 | parserOptions: { 8 | ecmaVersion: '2021', // Allows for the parsing of modern ECMAScript features 9 | }, 10 | 11 | env: { 12 | node: true, 13 | browser: true, 14 | 'vue/setup-compiler-macros': true 15 | }, 16 | 17 | // Rules order is important, please avoid shuffling them 18 | extends: [ 19 | // Base ESLint recommended rules 20 | // 'eslint:recommended', 21 | 22 | // Uncomment any of the lines below to choose desired strictness, 23 | // but leave only one uncommented! 24 | // See https://eslint.vuejs.org/rules/#available-rules 25 | 'plugin:vue/vue3-essential', // Priority A: Essential (Error Prevention) 26 | // 'plugin:vue/vue3-strongly-recommended', // Priority B: Strongly Recommended (Improving Readability) 27 | // 'plugin:vue/vue3-recommended', // Priority C: Recommended (Minimizing Arbitrary Choices and Cognitive Overhead) 28 | 29 | // https://github.com/prettier/eslint-config-prettier#installation 30 | // usage with Prettier, provided by 'eslint-config-prettier'.
31 | 'prettier' 32 | ], 33 | 34 | plugins: [ 35 | // https://eslint.vuejs.org/user-guide/#why-doesn-t-it-work-on-vue-files 36 | // required to lint *.vue files 37 | 'vue', 38 | 39 | // https://github.com/typescript-eslint/typescript-eslint/issues/389#issuecomment-509292674 40 | // Prettier has not been included as plugin to avoid performance impact 41 | // add it as an extension for your IDE 42 | 43 | ], 44 | 45 | globals: { 46 | ga: 'readonly', // Google Analytics 47 | cordova: 'readonly', 48 | __statics: 'readonly', 49 | __QUASAR_SSR__: 'readonly', 50 | __QUASAR_SSR_SERVER__: 'readonly', 51 | __QUASAR_SSR_CLIENT__: 'readonly', 52 | __QUASAR_SSR_PWA__: 'readonly', 53 | process: 'readonly', 54 | Capacitor: 'readonly', 55 | chrome: 'readonly' 56 | }, 57 | 58 | // add your custom rules here 59 | rules: { 60 | 61 | 'prefer-promise-reject-errors': 'off', 62 | 63 | // allow debugger during development only 64 | 'no-debugger': process.env.NODE_ENV === 'production' ? 'error' : 'off' 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sneakpeek-py" 3 | packages = [{ include = "sneakpeek" }] 4 | version = "0.2.2" 5 | description = "Sneakpeek is a framework that helps to quickly and conviniently develop scrapers. It's the best choice for scrapers that have some specific complex scraping logic that needs to be run on a constant basis." 6 | authors = ["Dan Yazovsky "] 7 | maintainers = ["Dan Yazovsky "] 8 | repository = "https://github.com/flulemon/sneakpeek" 9 | documentation = "https://sneakpeek-py.readthedocs.io/en/latest/" 10 | homepage = "https://github.com/flulemon/sneakpeek" 11 | license = "BSD-3-Clause" 12 | readme = "README.md" 13 | classifiers = [ 14 | "Operating System :: OS Independent", 15 | "Development Status :: 2 - Pre-Alpha", 16 | "License :: OSI Approved :: BSD License", 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Intended Audience :: Developers", 23 | "Framework :: FastAPI", 24 | "Framework :: Pydantic", 25 | "Topic :: Software Development :: Libraries :: Application Frameworks", 26 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search" 27 | ] 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.10" 31 | pydantic = "^1.10.7" 32 | fastapi = "^0.95.0" 33 | fastapi-jsonrpc = "^2.4.1" 34 | redis = "^4.5.4" 35 | apscheduler = "^3.10.1" 36 | aiohttp = "^3.8.4" 37 | uvicorn = "^0.21.1" 38 | cachetools = "^5.3.0" 39 | prometheus-client = "^0.16.0" 40 | fake-useragent = "^1.1.3" 41 | Sphinx = { version = "4.2.0", optional = true } 42 | sphinx-rtd-theme = { version = "1.0.0", optional = true } 43 | sphinxcontrib-napoleon = { version = "0.7", optional = true } 44 | yarl = "^1.9.1" 45 | 46 | [tool.poetry.group.dev.dependencies] 47 | pytest = "^7.2.2" 48 | fakeredis = "2.11.0" 49 | black = "^23.3.0" 50 | pytest-lazy-fixture = "^0.6.3" 51 | pytest-asyncio = "^0.21.0" 52 | pytest-cov = "^4.0.0" 53 | aioresponses = "^0.7.4" 54 | pytest-xdist = "^3.3.0" 55 | 56 | [build-system] 57 | requires = ["poetry-core"] 58 | build-backend = "poetry.core.masonry.api" 59 | 60 | [tool.pytest.ini_options] 61 | log_cli = true 62 | log_cli_level = "INFO" 63 | log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" 64 | 
log_cli_date_format = "%Y-%m-%d %H:%M:%S" 65 | 66 | [tool.poetry.extras] 67 | docs = ["Sphinx", "sphinx-rtd-theme", "sphinxcontrib-napoleon"] -------------------------------------------------------------------------------- /sneakpeek/middleware/requests_logging_middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | import aiohttp 5 | from pydantic import BaseModel 6 | from typing_extensions import override 7 | 8 | from sneakpeek.middleware.base import parse_config_from_obj 9 | from sneakpeek.scraper.model import Middleware, Request 10 | 11 | 12 | class RequestsLoggingMiddlewareConfig(BaseModel): 13 | """Requests logging middleware config""" 14 | 15 | log_request: bool = True #: Whether to log the request being made 16 | log_response: bool = True #: Whether to log the received response 17 | 18 | 19 | class RequestsLoggingMiddleware(Middleware): 20 | """Requests logging middleware logs all requests being made and received responses.""" 21 | 22 | def __init__( 23 | self, default_config: RequestsLoggingMiddlewareConfig | None = None 24 | ) -> None: 25 | self._default_config = default_config or RequestsLoggingMiddlewareConfig() 26 | self._logger = logging.getLogger(__name__) 27 | 28 | @property 29 | def name(self) -> str: 30 | return "requests_logging" 31 | 32 | @override 33 | async def on_request( 34 | self, 35 | request: Request, 36 | config: Any | None, 37 | ) -> Request: 38 | config = parse_config_from_obj( 39 | config, 40 | self.name, 41 | RequestsLoggingMiddlewareConfig, 42 | self._default_config, 43 | ) 44 | if config.log_request: 45 | self._logger.info( 46 | f"{request.method.upper()} {request.url}", 47 | extra={ 48 | "headers": request.headers, 49 | "kwargs": request.kwargs, 50 | }, 51 | ) 52 | return request 53 | 54 | @override 55 | async def on_response( 56 | self, 57 | request: Request, 58 | response: aiohttp.ClientResponse, 59 | config: Any | None, 60 | ) -> aiohttp.ClientResponse: 61 | config = parse_config_from_obj( 62 | config, 63 | self.name, 64 | RequestsLoggingMiddlewareConfig, 65 | self._default_config, 66 | ) 67 | if config.log_response: 68 | response_body = await response.text() 69 | self._logger.info( 70 | f"{request.method.upper()} {request.url} - {response.status}", 71 | extra={ 72 | "headers": request.headers, 73 | "kwargs": request.kwargs, 74 | "response": response_body, 75 | }, 76 | ) 77 | return response 78 | -------------------------------------------------------------------------------- /sneakpeek/scraper/in_memory_storage.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | from uuid import uuid4 4 | 5 | from typing_extensions import override 6 | 7 | from sneakpeek.scraper.model import ( 8 | CreateScraperRequest, 9 | Scraper, 10 | ScraperId, 11 | ScraperNotFoundError, 12 | ScraperStorageABC, 13 | StorageIsReadOnlyError, 14 | ) 15 | 16 | 17 | class InMemoryScraperStorage(ScraperStorageABC): 18 | def __init__( 19 | self, 20 | initial_scrapers: list[Scraper] | None = None, 21 | is_read_only: bool = False, 22 | ) -> None: 23 | self.read_only = is_read_only 24 | self.scrapers: dict[ScraperId, Scraper] = { 25 | scraper.id: scraper for scraper in initial_scrapers or [] 26 | } 27 | self.lock = asyncio.Lock() 28 | 29 | @override 30 | def is_read_only(self) -> bool: 31 | return self.read_only 32 | 33 | @override 34 | async def create_scraper(self, request: CreateScraperRequest) -> Scraper: 35 |
if self.read_only: 36 | raise StorageIsReadOnlyError() 37 | async with self.lock: 38 | id = str(uuid4()) 39 | self.scrapers[id] = Scraper( 40 | id=id, 41 | name=request.name, 42 | handler=request.handler, 43 | schedule=request.schedule, 44 | schedule_crontab=request.schedule_crontab, 45 | config=request.config, 46 | priority=request.priority, 47 | timeout=( 48 | timedelta(seconds=request.timeout_seconds) 49 | if request.timeout_seconds 50 | else None 51 | ), 52 | ) 53 | return self.scrapers[id] 54 | 55 | @override 56 | async def update_scraper(self, scraper: Scraper) -> Scraper: 57 | if self.read_only: 58 | raise StorageIsReadOnlyError() 59 | async with self.lock: 60 | if scraper.id not in self.scrapers: 61 | raise ScraperNotFoundError() 62 | self.scrapers[scraper.id] = scraper 63 | return scraper 64 | 65 | @override 66 | async def delete_scraper(self, id: ScraperId) -> Scraper: 67 | if self.read_only: 68 | raise StorageIsReadOnlyError() 69 | async with self.lock: 70 | if id not in self.scrapers: 71 | raise ScraperNotFoundError() 72 | return self.scrapers.pop(id) 73 | 74 | @override 75 | async def get_scraper(self, id: ScraperId) -> Scraper: 76 | if id not in self.scrapers: 77 | raise ScraperNotFoundError() 78 | return self.scrapers[id] 79 | 80 | @override 81 | async def get_scrapers(self) -> list[Scraper]: 82 | return list(self.scrapers.values()) 83 | -------------------------------------------------------------------------------- /sneakpeek/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import contextmanager 3 | from contextvars import ContextVar 4 | 5 | from sneakpeek.queue.model import Task 6 | 7 | ctx_task = ContextVar("scraper_job") 8 | 9 | 10 | @contextmanager 11 | def task_context(task: Task) -> None: 12 | """ 13 | Initialize scraper job logging context which automatically adds 14 | scraper and scraper job IDs to the logging metadata 15 | 16 | Args: 17 | scraper_job (ScraperJob): Scraper job definition 18 | """ 19 | try: 20 | token = ctx_task.set(task) 21 | yield 22 | finally: 23 | ctx_task.reset(token) 24 | 25 | 26 | class TaskContextInjectingFilter(logging.Filter): 27 | """ 28 | Scraper context filter which automatically injects 29 | scraper and scraper job IDs to the logging metadata. 30 | 31 | Example of usage: 32 | 33 | .. 
code-block:: python3 34 | 35 | logger = logging.getLogger() 36 | handler = logging.StreamHandler() 37 | handler.addFilter(ScraperContextInjectingFilter()) 38 | logger.addHandler(handler) 39 | """ 40 | 41 | def filter(self, record: logging.LogRecord) -> bool: 42 | """Injects task metadata into log record: 43 | 44 | * ``task_id`` - Task ID 45 | * ``task_name`` - Task name 46 | * ``task_handler`` - Task handler 47 | 48 | Args: 49 | record (logging.LogRecord): Log record to inject metadata into 50 | 51 | Returns: 52 | bool: Always True 53 | """ 54 | task: Task | None = ctx_task.get(None) 55 | record.task_id = task.id if task else "" 56 | record.task_name = task.task_name if task else "" 57 | record.task_handler = task.task_handler if task else "" 58 | return True 59 | 60 | 61 | def configure_logging( 62 | level: int = logging.INFO, 63 | session_logger_handler: logging.Handler | None = None, 64 | ): 65 | """ 66 | Helper function to configure logging: 67 | 68 | * Adds console logger to the root logger 69 | * Adds scraper context injector filter to the console logger 70 | * Configures console formatting to use scraper metadata 71 | 72 | Args: 73 | level (int, optional): Minimum logging level. Defaults to logging.INFO. 74 | """ 75 | logger = logging.getLogger() 76 | handler = logging.StreamHandler() 77 | handler.setFormatter( 78 | logging.Formatter( 79 | "%(asctime)s][%(levelname)s][%(name)s:%(lineno)d]%(task_handler)s:%(task_name)s:%(task_id)s - %(message)s" 80 | ) 81 | ) 82 | handler.addFilter(TaskContextInjectingFilter()) 83 | logger.addHandler(handler) 84 | if session_logger_handler: 85 | logger.addHandler(session_logger_handler) 86 | logger.setLevel(level) 87 | logging.getLogger("apscheduler.executors.default").setLevel(logging.WARNING) 88 | -------------------------------------------------------------------------------- /front/src/api.js: -------------------------------------------------------------------------------- 1 | import { SessionStorage } from 'quasar'; 2 | 3 | function rpc(method, params) { 4 | return fetch( 5 | process.env.JSONRPC_ENDPOINT || "/api/v1/jsonrpc", 6 | { 7 | method: "POST", 8 | headers: { 9 | "Content-Type": "application/json", 10 | }, 11 | body: JSON.stringify({ 12 | jsonrpc: "2.0", 13 | id: 0, 14 | method: method, 15 | params: params, 16 | }) 17 | } 18 | ).then(response => { 19 | if (response.ok) { 20 | return response.json(); 21 | } else { 22 | throw Error(response.statusText); 23 | } 24 | }).then(data => { 25 | if (data.error) { 26 | throw Error(data.error.message); 27 | } 28 | return data.result; 29 | }); 30 | } 31 | 32 | export function getScrapers() { 33 | return rpc("get_scrapers", {}); 34 | } 35 | 36 | export function getScraper(id) { 37 | return rpc("get_scraper", {id: id}); 38 | } 39 | 40 | export function getScraperJobs(id) { 41 | return rpc("get_task_instances", {task_name: id}); 42 | } 43 | 44 | export function getTask(id) { 45 | return rpc("get_task_instance", {task_id: id}); 46 | } 47 | 48 | export function getTaskLogs(id, last_log_line_id, max_lines) { 49 | return rpc( 50 | "get_task_logs", 51 | { 52 | task_id: id, 53 | last_log_line_id: last_log_line_id, 54 | max_lines: max_lines 55 | } 56 | ); 57 | } 58 | 59 | export function getScraperHandlers() { 60 | return rpc("get_scraper_handlers", {}); 61 | } 62 | 63 | export function getSchedules() { 64 | return rpc("get_schedules", {}); 65 | } 66 | 67 | export function getPriorities() { 68 | return rpc("get_priorities", {}); 69 | } 70 | 71 | export function enqueueScraper(id) { 72 | return 
rpc("enqueue_scraper", {scraper_id: id, priority: 0}); 73 | } 74 | export function createScraper(scraper) { 75 | return rpc("create_scraper", {scraper: scraper}); 76 | } 77 | 78 | 79 | export function updateScraper(scraper) { 80 | return rpc("update_scraper", {scraper: scraper}); 81 | } 82 | 83 | export function deleteScraper(id) { 84 | return rpc("delete_scraper", {id: id}); 85 | } 86 | 87 | export function isReadOnly() { 88 | const value = SessionStorage.getItem("is_storage_read_only"); 89 | if (value != null) return Promise.resolve(value); 90 | return rpc("is_read_only", {}) 91 | .then(result => { 92 | SessionStorage.set("is_storage_read_only", result); 93 | return result; 94 | }); 95 | } 96 | 97 | export function runEphemeralScraperTask(config, handler, state, priority) { 98 | return rpc( 99 | "run_ephemeral", 100 | { 101 | task: { 102 | scraper_config: config, 103 | scraper_handler: handler, 104 | scraper_state: state, 105 | }, 106 | priority: priority, 107 | } 108 | ); 109 | } 110 | -------------------------------------------------------------------------------- /sneakpeek/scraper/redis_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from uuid import uuid4 3 | 4 | from redis.asyncio import Redis 5 | from typing_extensions import override 6 | 7 | from sneakpeek.scraper.model import ( 8 | CreateScraperRequest, 9 | Scraper, 10 | ScraperId, 11 | ScraperNotFoundError, 12 | ScraperStorageABC, 13 | StorageIsReadOnlyError, 14 | ) 15 | 16 | _SCRAPER_KEY_PREFIX = "scraper:" 17 | 18 | 19 | class RedisScraperStorage(ScraperStorageABC): 20 | def __init__(self, redis: Redis, is_read_only: bool = False) -> None: 21 | self.redis = redis 22 | self.read_only = is_read_only 23 | 24 | def _get_scraper_key(self, id: ScraperId) -> str: 25 | return f"{_SCRAPER_KEY_PREFIX}{id}" 26 | 27 | @override 28 | def is_read_only(self) -> bool: 29 | return self.read_only 30 | 31 | @override 32 | async def create_scraper(self, request: CreateScraperRequest) -> Scraper: 33 | if self.read_only: 34 | raise StorageIsReadOnlyError() 35 | scraper = Scraper( 36 | id=str(uuid4()), 37 | name=request.name, 38 | handler=request.handler, 39 | schedule=request.schedule, 40 | schedule_crontab=request.schedule_crontab, 41 | config=request.config, 42 | priority=request.priority, 43 | timeout=( 44 | timedelta(seconds=request.timeout_seconds) 45 | if request.timeout_seconds 46 | else None 47 | ), 48 | ) 49 | await self.redis.set(self._get_scraper_key(scraper.id), scraper.json()) 50 | return scraper 51 | 52 | @override 53 | async def update_scraper(self, scraper: Scraper) -> Scraper: 54 | if self.read_only: 55 | raise StorageIsReadOnlyError() 56 | if not await self.redis.exists(self._get_scraper_key(scraper.id)): 57 | raise ScraperNotFoundError() 58 | await self.redis.set(self._get_scraper_key(scraper.id), scraper.json()) 59 | return scraper 60 | 61 | @override 62 | async def delete_scraper(self, id: ScraperId) -> Scraper: 63 | if self.read_only: 64 | raise StorageIsReadOnlyError() 65 | scraper = await self.redis.getdel(self._get_scraper_key(id)) 66 | if not scraper: 67 | raise ScraperNotFoundError() 68 | return Scraper.parse_raw(scraper) 69 | 70 | @override 71 | async def get_scraper(self, id: ScraperId) -> Scraper: 72 | scraper = await self.redis.get(self._get_scraper_key(id)) 73 | if scraper is None: 74 | raise ScraperNotFoundError() 75 | return Scraper.parse_raw(scraper) 76 | 77 | @override 78 | async def get_scrapers(self) -> list[Scraper]: 79 | keys 
= [ 80 | key.decode() 81 | async for key in self.redis.scan_iter(f"{_SCRAPER_KEY_PREFIX}*") 82 | ] 83 | return sorted( 84 | (Scraper.parse_raw(scraper) for scraper in await self.redis.mget(keys)), 85 | key=lambda x: x.id, 86 | ) 87 | -------------------------------------------------------------------------------- /sneakpeek/queue/tests/test_queue_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | import pytest 4 | from fakeredis.aioredis import FakeRedis 5 | 6 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage 7 | from sneakpeek.queue.model import QueueStorageABC, Task, TaskPriority, TaskStatus 8 | from sneakpeek.queue.redis_storage import RedisQueueStorage 9 | 10 | 11 | @pytest.fixture 12 | def in_memory_storage() -> QueueStorageABC: 13 | yield InMemoryQueueStorage() 14 | 15 | 16 | @pytest.fixture 17 | def redis_storage() -> QueueStorageABC: 18 | yield RedisQueueStorage(FakeRedis()) 19 | 20 | 21 | @pytest.fixture( 22 | params=[ 23 | pytest.lazy_fixture(in_memory_storage.__name__), 24 | pytest.lazy_fixture(redis_storage.__name__), 25 | ] 26 | ) 27 | def storage(request) -> QueueStorageABC: 28 | yield request.param 29 | 30 | 31 | @pytest.mark.asyncio 32 | async def test_storage_crud(storage: QueueStorageABC) -> None: 33 | task = Task( 34 | id=0, 35 | task_name=f"{test_storage_crud.__name__}:task_name", 36 | task_handler=f"{test_storage_crud.__name__}:task_handler", 37 | status=TaskStatus.PENDING, 38 | created_at=datetime.utcnow(), 39 | priority=TaskPriority.NORMAL, 40 | payload=f"{test_storage_crud.__name__}:payload", 41 | timeout=timedelta(seconds=1), 42 | ) 43 | # Create task 44 | enqueued = await storage.enqueue_task(task) 45 | assert enqueued.id > 0 46 | assert enqueued.task_name == task.task_name 47 | 48 | # Get task 49 | all_tasks = await storage.get_tasks() 50 | assert all_tasks == [enqueued] 51 | task_name_instances = await storage.get_task_instances(task.task_name) 52 | assert task_name_instances == [enqueued] 53 | actual_task = await storage.get_task_instance(enqueued.id) 54 | assert enqueued == actual_task 55 | 56 | # Update task 57 | enqueued.last_active_at = datetime(year=1, month=10, day=1) 58 | updated = await storage.update_task(enqueued) 59 | assert updated.id == enqueued.id 60 | assert enqueued.last_active_at == updated.last_active_at 61 | 62 | # Queue len 63 | assert await storage.get_queue_len() == 1 64 | 65 | # Dequeue 66 | dequeued = await storage.dequeue_task() 67 | assert dequeued.id == enqueued.id 68 | 69 | 70 | @pytest.mark.asyncio 71 | async def test_delete_old_items(storage: QueueStorageABC) -> None: 72 | keep_last = 2 73 | total_tasks = 4 74 | tasks = [ 75 | Task( 76 | id=0, 77 | task_name=f"{test_delete_old_items.__name__}:task_name", 78 | task_handler=f"{test_delete_old_items.__name__}:task_handler", 79 | status=TaskStatus.PENDING, 80 | created_at=datetime.utcnow(), 81 | priority=TaskPriority.NORMAL, 82 | payload=f"{test_delete_old_items.__name__}:payload:{i}", 83 | timeout=timedelta(seconds=1), 84 | ) 85 | for i in range(total_tasks) 86 | ] 87 | enqueued_tasks = [await storage.enqueue_task(task) for task in tasks] 88 | 89 | await storage.delete_old_tasks(keep_last) 90 | actual_left_tasks = await storage.get_tasks() 91 | assert sorted(actual_left_tasks, key=lambda x: x.id) == sorted( 92 | enqueued_tasks[keep_last:], key=lambda x: x.id 93 | ) 94 | -------------------------------------------------------------------------------- 
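To make the scraper storage interface above concrete, here is a brief, illustrative usage sketch for `RedisScraperStorage`. The constructor and the `CreateScraperRequest` fields mirror the files above; the Redis connection settings, scraper name, handler name and crontab string are assumptions made only for the example:

```python
import asyncio

from redis.asyncio import Redis

from sneakpeek.scheduler.model import TaskSchedule
from sneakpeek.scraper.model import CreateScraperRequest, ScraperConfig
from sneakpeek.scraper.redis_storage import RedisScraperStorage


async def main() -> None:
    # Assumed local Redis instance; any redis.asyncio client works here
    storage = RedisScraperStorage(Redis(host="localhost", port=6379))
    created = await storage.create_scraper(
        CreateScraperRequest(
            name="demo-scraper",
            handler="demo_scraper",        # must match a registered handler name
            schedule=TaskSchedule.CRONTAB,
            schedule_crontab="0 * * * *",  # illustrative: run hourly
            config=ScraperConfig(),
        )
    )
    print(await storage.get_scraper(created.id))


asyncio.run(main())
```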
/front/src/pages/ScrapersPage.vue: -------------------------------------------------------------------------------- 1 | 53 | 54 | 96 | -------------------------------------------------------------------------------- /sneakpeek/scraper/tests/test_dynamic_scraper_handler.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from unittest.mock import AsyncMock, call 3 | 4 | import pytest 5 | 6 | from sneakpeek.scraper.dynamic_scraper_handler import ( 7 | DynamicScraperHandler, 8 | DynamicScraperParams, 9 | ) 10 | 11 | 12 | class FakeScraperContext: 13 | def __init__(self, params: DynamicScraperParams) -> None: 14 | self.params = params.dict() 15 | self.get_mock = AsyncMock() 16 | 17 | async def get(self, url: str) -> str: 18 | return await self.get_mock(url) 19 | 20 | 21 | @pytest.fixture 22 | def handler() -> DynamicScraperHandler: 23 | yield DynamicScraperHandler() 24 | 25 | 26 | SOURCE_CODE_NO_HANDLER_DEFINED = """ 27 | from sneakpeek.scraper.context import ScraperContext 28 | 29 | async def handler_not_defined(context: ScraperContext) -> str: 30 | return "1" 31 | """ 32 | 33 | 34 | def test_Given_SourceCodeHasNoHandlerDefined_When_Compiled_Then_SyntaxErrorIsThrown( 35 | handler: DynamicScraperHandler, 36 | ) -> None: 37 | with pytest.raises(SyntaxError): 38 | handler.compile(SOURCE_CODE_NO_HANDLER_DEFINED) 39 | 40 | 41 | SOURCE_CODE_HANDLER_NOT_ASYNC = """ 42 | from sneakpeek.scraper.context import ScraperContext 43 | 44 | def handler(context: ScraperContext) -> str: 45 | return "1" 46 | """ 47 | 48 | 49 | def test_Given_SourceCodeWithSyncHandler_When_Compiled_Then_SyntaxErrorIsThrown( 50 | handler: DynamicScraperHandler, 51 | ) -> None: 52 | with pytest.raises(SyntaxError): 53 | handler.compile(SOURCE_CODE_HANDLER_NOT_ASYNC) 54 | 55 | 56 | SOURCE_CODE_HANDLER_OBJECT = """ 57 | handler = 1 58 | """ 59 | SOURCE_CODE_HANDLER_CLASS = """ 60 | class handler: 61 | pass 62 | """ 63 | 64 | 65 | def test_Given_SourceCodeWithHandlerNotFunction_When_Compiled_Then_SyntaxErrorIsThrown( 66 | handler: DynamicScraperHandler, 67 | ) -> None: 68 | with pytest.raises(SyntaxError): 69 | handler.compile(SOURCE_CODE_HANDLER_OBJECT) 70 | with pytest.raises(SyntaxError): 71 | handler.compile(SOURCE_CODE_HANDLER_CLASS) 72 | 73 | 74 | SOURCE_CODE_HANDLER_NO_ARGS = """ 75 | async def handler(): 76 | return "1" 77 | """ 78 | 79 | 80 | def test_Given_SourceCodeWithHandleWithNoArgs_When_Compiled_Then_SyntaxErrorIsThrown( 81 | handler: DynamicScraperHandler, 82 | ) -> None: 83 | with pytest.raises(SyntaxError): 84 | handler.compile(SOURCE_CODE_HANDLER_NO_ARGS) 85 | 86 | 87 | SOURCE_CODE_COMPILES = """ 88 | from sneakpeek.scraper.context import ScraperContext 89 | 90 | async def handler(ctx: ScraperContext) -> str: 91 | return "1" 92 | """ 93 | 94 | 95 | def test_Given_SourceCode_When_Compiled_Then_AsyncFunctionIsReturned( 96 | handler: DynamicScraperHandler, 97 | ) -> None: 98 | func = handler.compile(SOURCE_CODE_COMPILES) 99 | assert inspect.iscoroutinefunction(func) 100 | assert func.__code__.co_argcount == 1 101 | 102 | 103 | CUSTOM_SOURCE_CODE = """ 104 | from sneakpeek.scraper.context import ScraperContext 105 | 106 | async def handler(ctx: ScraperContext, param1: str, param2: str = "test2", result="123"): 107 | for param in [param1, param2]: 108 | await ctx.get(param) 109 | return result 110 | """ 111 | 112 | 113 | @pytest.mark.asyncio 114 | async def test_Given_CustomCode_When_RanByHandler_Then_ContextIsCalled( 115 | handler: DynamicScraperHandler, 116 | ) 
-> None: 117 | context = FakeScraperContext( 118 | DynamicScraperParams( 119 | source_code=CUSTOM_SOURCE_CODE, 120 | args=["url1"], 121 | kwargs={"param2": "url2", "result": "some_result"}, 122 | ), 123 | ) 124 | result = await handler.run(context) 125 | assert result == "some_result" 126 | context.get_mock.assert_has_awaits( 127 | [ 128 | call("url1"), 129 | call("url2"), 130 | ] 131 | ) 132 | -------------------------------------------------------------------------------- /sneakpeek/queue/in_memory_storage.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections import defaultdict 3 | from itertools import count 4 | from typing import Iterator 5 | 6 | from typing_extensions import override 7 | 8 | from sneakpeek.metrics import count_invocations, measure_latency 9 | from sneakpeek.queue.model import QueueStorageABC, Task, TaskNotFoundError 10 | 11 | SCORE_PRIORITY_BIT_OFFSET = 32 12 | 13 | 14 | class InMemoryQueueStorage(QueueStorageABC): 15 | """In memory queue storage (should only be used for development purposes)""" 16 | 17 | def __init__(self) -> None: 18 | """ 19 | Args: 20 | redis (Redis): Async redis client 21 | """ 22 | self._id_generator: Iterator[int] = count(1) 23 | self._queue = asyncio.PriorityQueue() 24 | self._tasks: dict[str, set[int]] = defaultdict(set) 25 | self._task_instances: dict[int, Task] = {} 26 | self._lock = asyncio.Lock() 27 | 28 | async def _generate_id(self) -> int: 29 | return next(self._id_generator) 30 | 31 | def _get_task_score(self, task: Task) -> int: 32 | return (task.priority.value << SCORE_PRIORITY_BIT_OFFSET) + task.id 33 | 34 | @count_invocations(subsystem="storage") 35 | @measure_latency(subsystem="storage") 36 | @override 37 | async def get_tasks(self) -> list[Task]: 38 | return sorted(self._task_instances.values(), key=lambda x: x.id) 39 | 40 | @count_invocations(subsystem="storage") 41 | @measure_latency(subsystem="storage") 42 | @override 43 | async def get_task_instances(self, task_name: str) -> list[Task]: 44 | return sorted( 45 | [ 46 | self._task_instances[task_id] 47 | for task_id in self._tasks.get(task_name, []) 48 | ], 49 | key=lambda x: x.id, 50 | reverse=True, 51 | ) 52 | 53 | @count_invocations(subsystem="storage") 54 | @measure_latency(subsystem="storage") 55 | @override 56 | async def get_task_instance(self, id: int) -> Task: 57 | if id not in self._task_instances: 58 | raise TaskNotFoundError() 59 | return self._task_instances[id] 60 | 61 | @count_invocations(subsystem="storage") 62 | @measure_latency(subsystem="storage") 63 | @override 64 | async def enqueue_task(self, task: Task) -> Task: 65 | task.id = await self._generate_id() 66 | self._tasks[task.task_name].add(task.id) 67 | self._task_instances[task.id] = task 68 | await self._queue.put((self._get_task_score(task), task.id)) 69 | return task 70 | 71 | @count_invocations(subsystem="storage") 72 | @measure_latency(subsystem="storage") 73 | @override 74 | async def update_task(self, task: Task) -> Task: 75 | if task.id not in self._task_instances: 76 | raise TaskNotFoundError() 77 | self._task_instances[task.id] = task 78 | return task 79 | 80 | @count_invocations(subsystem="storage") 81 | @measure_latency(subsystem="storage") 82 | @override 83 | async def dequeue_task(self) -> Task | None: 84 | try: 85 | _, task_id = self._queue.get_nowait() 86 | return await self.get_task_instance(task_id) 87 | except asyncio.QueueEmpty: 88 | return None 89 | 90 | @count_invocations(subsystem="storage") 91 | 
@measure_latency(subsystem="storage") 92 | @override 93 | async def delete_old_tasks(self, keep_last: int = 50) -> None: 94 | for task_name, task_ids in self._tasks.items(): 95 | for task_id in sorted(task_ids, reverse=True)[keep_last:]: 96 | self._task_instances.pop(task_id) 97 | self._tasks[task_name].remove(task_id) 98 | 99 | @count_invocations(subsystem="storage") 100 | @measure_latency(subsystem="storage") 101 | @override 102 | async def get_queue_len(self) -> int: 103 | return self._queue.qsize() 104 | -------------------------------------------------------------------------------- /docs/design.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Design 3 | ################# 4 | 5 | .. contents:: Table of contents 6 | 7 | **Sneakpeek** has 6 core components: 8 | 9 | * Scrapers storage - stores list of scrapers and its metadata. 10 | * Tasks queue - populated by the scheduler or user and is consumed by the queue consumers 11 | * Lease storage - stores lease (global lock) for scheduler, to make sure there's only 1 active scheduler at all times. 12 | * Scheduler - schedules periodic tasks using scrapers in the storage 13 | * Consumer - consumes tasks queue and executes tasks logic (e.g. scraper logic) 14 | * API - provides JsonRPC API for interacting with the system 15 | 16 | All of the components are run by the :py:class:`SneakpeekServer `. 17 | 18 | ================ 19 | Scrapers Storage 20 | ================ 21 | 22 | Scraper storage interface is defined in :py:class:`sneakpeek.scraper.model.ScraperStorageABC`. 23 | 24 | * :py:class:`InMemoryScraperStorage ` - in-memory storage. Should either be used in **development** environment or if the list of scrapers is static and wouldn't be changed. 25 | * :py:class:`RedisScraperStorage ` - redis storage. 26 | 27 | ================ 28 | Tasks queue 29 | ================ 30 | 31 | Tasks queue consists of three components: 32 | * :py:class:`Storage ` - tasks storage 33 | * :py:class:`Storage ` - queue implementation 34 | * :py:class:`Storage ` - queue consumer implementation 35 | 36 | Currently there 2 storage implementations: 37 | 38 | * :py:class:`InMemoryQueueStorage ` - in-memory storage. Should only be used in **development** environment. 39 | * :py:class:`RedisQueueStorage ` - redis storage. 40 | 41 | ================ 42 | Lease storage 43 | ================ 44 | 45 | Lease storage is used by scheduler to ensure that at any point of time there's no more 46 | than 1 active scheduler instance which can enqueue scraper jobs. This disallows concurrent 47 | execution of the scraper. 48 | 49 | Lease storage interface is defined in :py:class:`LeaseStorageABC `. 50 | 51 | Currently there 2 storage implementations: 52 | 53 | * :py:class:`InMemoryLeaseStorage ` - in-memory storage. Should only be used in **development** environment. 54 | * :py:class:`RedisLeaseStorage ` - redis storage. 55 | 56 | ================ 57 | Scheduler 58 | ================ 59 | 60 | :py:class:`Scheduler ` is responsible for: 61 | 62 | * scheduling scrapers based on their configuration. 63 | * finding scraper jobs that haven't sent a heartbeat for a while and mark them as dead 64 | * cleaning up jobs queue from old historical scraper jobs 65 | * exporting metrics on number of pending jobs in the queue 66 | 67 | As for now there's only one implementation :py:class:`Scheduler ` 68 | that uses `APScheduler `_. 
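To make the lease mechanics concrete, here is a simplified, illustrative sketch of a lease-guarded scheduling loop built on the ``LeaseStorageABC`` interface described above. This is not the actual ``Scheduler`` implementation; the lease name and timings are arbitrary:

.. code-block:: python3

    import asyncio
    from datetime import timedelta
    from uuid import uuid4

    from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage

    async def scheduler_loop() -> None:
        storage = InMemoryLeaseStorage()
        owner_id = str(uuid4())
        while True:
            lease = await storage.maybe_acquire_lease(
                "scheduler", owner_id, acquire_for=timedelta(seconds=30)
            )
            if lease is not None:
                ...  # this replica holds the lease: enqueue due scraper jobs here
            await asyncio.sleep(10)  # renew well before the lease expires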
69 | 70 | ================ 71 | Queue consumer 72 | ================ 73 | 74 | Consumer constantly tries to dequeue a job and executes dequeued jobs. 75 | As for now there's only one implementation :py:class:`Consumer `. 76 | 77 | 78 | ================ 79 | API 80 | ================ 81 | 82 | Sneakpeek implements: 83 | 84 | * JsonRPC to programmatically interact with the system, it exposes following methods (available at ``/api/v1/jsonrpc``): 85 | * CRUD methods to add, modify and delete scrapers 86 | * Get list of scraper's jobs 87 | * Enqueue scraper jobs 88 | * UI that allows you to interact with the system 89 | * Swagger documentation (available at ``/api``) 90 | * Copy of this documentation (available at ``/docs``) 91 | -------------------------------------------------------------------------------- /sneakpeek/scraper/tests/test_scraper_storage.py: -------------------------------------------------------------------------------- 1 | from uuid import uuid4 2 | 3 | import pytest 4 | from fakeredis.aioredis import FakeRedis 5 | 6 | from sneakpeek.scheduler.model import TaskSchedule 7 | from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage 8 | from sneakpeek.scraper.model import ( 9 | CreateScraperRequest, 10 | Scraper, 11 | ScraperConfig, 12 | ScraperNotFoundError, 13 | ScraperStorageABC, 14 | ) 15 | from sneakpeek.scraper.redis_storage import RedisScraperStorage 16 | 17 | 18 | @pytest.fixture 19 | def in_memory_storage() -> ScraperStorageABC: 20 | yield InMemoryScraperStorage() 21 | 22 | 23 | @pytest.fixture 24 | def redis_storage() -> ScraperStorageABC: 25 | yield RedisScraperStorage(FakeRedis()) 26 | 27 | 28 | @pytest.fixture( 29 | params=[ 30 | pytest.lazy_fixture(in_memory_storage.__name__), 31 | pytest.lazy_fixture(redis_storage.__name__), 32 | ] 33 | ) 34 | def storage(request) -> ScraperStorageABC: 35 | yield request.param 36 | 37 | 38 | def _get_create_scraper_request(name: str) -> CreateScraperRequest: 39 | return CreateScraperRequest( 40 | name=name, 41 | schedule=TaskSchedule.CRONTAB, 42 | schedule_crontab=f"schedule_{name}", 43 | handler=f"handler_{name}", 44 | config=ScraperConfig(), 45 | ) 46 | 47 | 48 | @pytest.mark.asyncio 49 | async def test_read_after_write(storage: ScraperStorageABC): 50 | expected = _get_create_scraper_request("test_read_after_write") 51 | created = await storage.create_scraper(expected) 52 | assert created.id is not None, "Expected storage to create a scraper" 53 | assert created.name == expected.name 54 | assert created.schedule == expected.schedule 55 | assert created.schedule_crontab == expected.schedule_crontab 56 | assert created.handler == expected.handler 57 | assert created.config == expected.config 58 | actual = await storage.get_scraper(created.id) 59 | assert actual == created 60 | created.name = f"{created.name}_updated" 61 | actual = await storage.update_scraper(created) 62 | actual = await storage.get_scraper(created.id) 63 | assert actual == created 64 | 65 | 66 | @pytest.mark.asyncio 67 | async def test_get_scrapers(storage: ScraperStorageABC): 68 | expected = [ 69 | _get_create_scraper_request(f"test_get_scrapers_{i}") for i in range(1, 10) 70 | ] 71 | for item in expected: 72 | await storage.create_scraper(item) 73 | 74 | actual = await storage.get_scrapers() 75 | assert {item.name for item in actual} == {item.name for item in expected} 76 | 77 | 78 | @pytest.mark.asyncio 79 | async def test_read_non_existent_scraper_throws(storage: ScraperStorageABC): 80 | with pytest.raises(ScraperNotFoundError): 81 | await 
storage.get_scraper(uuid4()) 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_update_non_existent_scraper_throws(storage: ScraperStorageABC): 86 | with pytest.raises(ScraperNotFoundError): 87 | await storage.update_scraper( 88 | Scraper( 89 | id=str(uuid4()), 90 | name="test_update_non_existent_scraper_throws", 91 | schedule=TaskSchedule.CRONTAB, 92 | schedule_crontab="schedule_test_update_non_existent_scraper_throws", 93 | handler="handler_test_update_non_existent_scraper_throws", 94 | config=ScraperConfig(), 95 | ) 96 | ) 97 | 98 | 99 | @pytest.mark.asyncio 100 | async def test_delete_non_existent_scraper_throws(storage: ScraperStorageABC): 101 | with pytest.raises(ScraperNotFoundError): 102 | await storage.delete_scraper(uuid4()) 103 | 104 | 105 | @pytest.mark.asyncio 106 | async def test_delete_scraper(storage: ScraperStorageABC): 107 | created = await storage.create_scraper( 108 | _get_create_scraper_request("test_delete_scraper") 109 | ) 110 | actual = await storage.get_scraper(created.id) 111 | assert created == actual 112 | deleted = await storage.delete_scraper(actual.id) 113 | assert deleted == actual 114 | with pytest.raises(ScraperNotFoundError): 115 | await storage.get_scraper(actual.id) 116 | -------------------------------------------------------------------------------- /sneakpeek/session_loggers/file_logger.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import itertools 3 | import json 4 | import logging 5 | import os 6 | import pathlib 7 | from asyncio import AbstractEventLoop 8 | from collections import defaultdict 9 | from concurrent.futures import ThreadPoolExecutor 10 | from datetime import datetime, timedelta 11 | from threading import Lock 12 | from traceback import format_exc 13 | from typing import Any, List 14 | 15 | from sneakpeek.session_loggers.base import LogLine, SessionLogger, get_fields_to_log 16 | 17 | MAX_BUFFER_AGE = timedelta(seconds=5) 18 | 19 | 20 | class Encoder(json.JSONEncoder): 21 | def default(self, obj): 22 | if isinstance(obj, set): 23 | return list(obj) 24 | return json.JSONEncoder.default(self, obj) 25 | 26 | 27 | class FileLoggerHandler(SessionLogger): 28 | def __init__( 29 | self, 30 | directory: str, 31 | loop: AbstractEventLoop | None = None, 32 | max_buffer_size: int = 10, 33 | max_buffer_age: timedelta = MAX_BUFFER_AGE, 34 | max_log_files_to_keep: int = 1000, 35 | ) -> None: 36 | super().__init__() 37 | self.dir = directory 38 | self.loop = loop or asyncio.get_event_loop() 39 | self.max_buffer_size = max_buffer_size 40 | self.max_buffer_age = max_buffer_age 41 | self.buffer: dict[str, Any] = defaultdict(list) 42 | self.last_flush = datetime.min 43 | self.executor = ThreadPoolExecutor(max_workers=10) 44 | self.max_log_files_to_keep = max_log_files_to_keep 45 | self._lock = Lock() 46 | 47 | def _cleanup(self): 48 | if not os.path.exists(self.dir): 49 | return 50 | with self._lock: 51 | with os.scandir(self.dir) as it: 52 | log_files = sorted( 53 | [entry for entry in it if entry.is_file()], 54 | key=lambda x: x.stat().st_mtime, 55 | ) 56 | for file in log_files[: -self.max_log_files_to_keep]: 57 | os.remove(file.path) 58 | 59 | def flush(self): 60 | """ 61 | Flushes the stream. 
62 | """ 63 | with self.lock: 64 | self._cleanup() 65 | try: 66 | pathlib.Path(self.dir).mkdir(parents=True, exist_ok=True) 67 | for group, messages in self.buffer.items(): 68 | with open( 69 | os.path.join(self.dir, f"task_{group}.log"), mode="a" 70 | ) as f: 71 | f.writelines( 72 | [f"{json.dumps(m, cls=Encoder)}\n" for m in messages] 73 | ) 74 | except Exception: 75 | print(format_exc()) 76 | self.buffer.clear() 77 | 78 | def emit(self, record: logging.LogRecord) -> None: 79 | if not getattr(record, "task_id"): 80 | return 81 | 82 | self.buffer[record.task_id].append(get_fields_to_log(record)) 83 | with self._lock: 84 | if ( 85 | len(self.buffer) > self.max_buffer_size 86 | or datetime.utcnow() - self.last_flush > self.max_buffer_age 87 | ): 88 | self.loop.run_in_executor(self.executor, self.flush) 89 | 90 | async def read( 91 | self, 92 | task_id: str, 93 | last_log_line_id: str | None = None, 94 | max_lines: int = 100, 95 | ) -> List[dict[str, Any]]: 96 | path = os.path.join(self.dir, f"task_{task_id}.log") 97 | if not os.path.exists(path): 98 | return [] 99 | last_log_line_id = int(last_log_line_id) if last_log_line_id else 0 100 | 101 | with open(path, "r") as f: 102 | return [ 103 | LogLine( 104 | id=last_log_line_id + line_num + 1, 105 | data=json.loads(line), 106 | ) 107 | for line_num, line in enumerate( 108 | itertools.islice( 109 | f, 110 | last_log_line_id, 111 | last_log_line_id + max_lines, 112 | ) 113 | ) 114 | ] 115 | -------------------------------------------------------------------------------- /front/src/pages/ScraperIde.vue: -------------------------------------------------------------------------------- 1 | 23 | 115 | 120 | -------------------------------------------------------------------------------- /sneakpeek/scraper/runner.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from uuid import uuid4 4 | 5 | from sneakpeek.metrics import count_invocations 6 | from sneakpeek.scheduler.model import TaskSchedule 7 | from sneakpeek.scraper.context import ScraperContext 8 | from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage 9 | from sneakpeek.scraper.model import ( 10 | Middleware, 11 | Scraper, 12 | ScraperConfig, 13 | ScraperHandler, 14 | ScraperRunnerABC, 15 | ScraperStorageABC, 16 | ) 17 | 18 | 19 | class ScraperRunner(ScraperRunnerABC): 20 | """Default scraper runner implementation that is meant to be used in the Sneakpeek server""" 21 | 22 | def __init__( 23 | self, 24 | scraper_storage: ScraperStorageABC, 25 | middlewares: list[Middleware] | None = None, 26 | loop: asyncio.AbstractEventLoop | None = None, 27 | ) -> None: 28 | """ 29 | Args: 30 | handlers (list[ScraperHandler]): List of handlers that implement scraper logic 31 | scrapers_storage (ScrapersStorage): Sneakpeek scrapers storage implementation 32 | jobs_storage (ScraperJobsStorage): Sneakpeek jobs storage implementation 33 | middlewares (list[Middleware] | None, optional): List of middleware that will be used by scraper runner. Defaults to None. 
34 |         """
35 |         self.logger = logging.getLogger(__name__)
36 |         self.scraper_storage = scraper_storage
37 |         self.middlewares = middlewares
38 | 
39 |     @staticmethod
40 |     async def debug_handler(
41 |         handler: ScraperHandler,
42 |         config: ScraperConfig | None = None,
43 |         state: str | None = None,
44 |         middlewares: list[Middleware] | None = None,
45 |     ) -> str:
46 |         scraper = Scraper(
47 |             id=str(uuid4()),
48 |             name="test_handler",
49 |             handler=handler.name,
50 |             schedule=TaskSchedule.INACTIVE,
51 |             config=config,
52 |             state=state,
53 |         )
54 |         return await ScraperRunner(
55 |             InMemoryScraperStorage([scraper]),
56 |             middlewares=middlewares,
57 |         ).run(handler, scraper)
58 | 
59 |     @count_invocations(subsystem="scraper_runner")
60 |     async def run_ephemeral(
61 |         self,
62 |         handler: ScraperHandler,
63 |         config: ScraperConfig | None = None,
64 |         state: str | None = None,
65 |     ) -> str | None:
66 |         self.logger.info(f"Running ephemeral scraper with {handler.name}")
67 | 
68 |         context = ScraperContext(
69 |             config,
70 |             self.middlewares,
71 |             scraper_state=state,
72 |         )
73 |         try:
74 |             await context.start_session()
75 |             result = await handler.run(context)
76 |             self.logger.info(
77 |                 f"Successfully executed ephemeral scraper with {handler.name}: {result}"
78 |             )
79 |             return result
80 |         except Exception:
81 |             self.logger.exception(
82 |                 f"Failed to run ephemeral scraper with {handler.name}"
83 |             )
84 |             raise
85 |         finally:
86 |             await context.close()
87 | 
88 |     @count_invocations(subsystem="scraper_runner")
89 |     async def run(self, handler: ScraperHandler, scraper: Scraper) -> str:
90 |         self.logger.info(f"Running scraper {scraper.handler}::{scraper.name}")
91 | 
92 |         if handler.name != scraper.handler:
93 |             self.logger.warning(
94 |                 f"Provided handler's name ({handler.name}) doesn't match scraper handler name ({scraper.handler})"
95 |             )
96 | 
97 |         async def _update_scraper_state(state: str) -> Scraper:
98 |             scraper.state = state
99 |             return await self.scraper_storage.update_scraper(scraper)
100 | 
101 |         context = ScraperContext(
102 |             scraper.config,
103 |             self.middlewares,
104 |             scraper_state=scraper.state,
105 |             update_scraper_state_func=_update_scraper_state,
106 |         )
107 |         try:
108 |             await context.start_session()
109 |             result = await handler.run(context)
110 |             self.logger.info(
111 |                 f"Successfully executed scraper {scraper.handler}::{scraper.name}: {result}"
112 |             )
113 |             return result
114 |         except Exception:
115 |             self.logger.exception(
116 |                 f"Failed to run scraper {scraper.handler}::{scraper.name}"
117 |             )
118 |             raise
119 |         finally:
120 |             await context.close()
121 | 
-------------------------------------------------------------------------------- /sneakpeek/tests/test_metrics.py: --------------------------------------------------------------------------------
1 | import pytest
2 | from prometheus_client import REGISTRY
3 | 
4 | from sneakpeek.metrics import count_invocations, measure_latency
5 | 
6 | SUBSYSTEM = "test"
7 | 
8 | exception_to_raise = ValueError()
9 | exception_to_raise_name = ValueError.__name__
10 | 
11 | 
12 | @count_invocations(SUBSYSTEM)
13 | @measure_latency(SUBSYSTEM)
14 | async def async_test_fn(fail: bool = False):
15 |     if fail:
16 |         raise exception_to_raise
17 |     return 1
18 | 
19 | 
20 | @count_invocations(SUBSYSTEM)
21 | @measure_latency(SUBSYSTEM)
22 | def sync_test_fn(fail: bool = False):
23 |     if fail:
24 |         raise exception_to_raise
25 |     return 1
26 | 
27 | 
28 | latency_labels_sync = {
29 |     "subsystem": SUBSYSTEM,
30 |     "method": sync_test_fn.__name__,
31 | }
32 | latency_labels_async = {
33 | 
"subsystem": SUBSYSTEM, 34 | "method": async_test_fn.__name__, 35 | } 36 | 37 | 38 | def invocation_labels_sync(type: str, error: str = ""): 39 | return { 40 | "subsystem": SUBSYSTEM, 41 | "method": sync_test_fn.__name__, 42 | "type": type, 43 | "error": error, 44 | } 45 | 46 | 47 | def invocation_labels_async(type: str, error: str = ""): 48 | return { 49 | "subsystem": SUBSYSTEM, 50 | "method": async_test_fn.__name__, 51 | "type": type, 52 | "error": error, 53 | } 54 | 55 | 56 | @pytest.mark.asyncio 57 | async def test_measure_latency_async(): 58 | before = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_async) 59 | await async_test_fn() 60 | after = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_async) 61 | assert after - (before or 0) == 1 62 | 63 | 64 | @pytest.mark.asyncio 65 | async def test_measure_latency_sync(): 66 | before = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_sync) 67 | sync_test_fn() 68 | after = REGISTRY.get_sample_value("sneakpeek_latency_count", latency_labels_sync) 69 | assert after - (before or 0) == 1 70 | 71 | 72 | @pytest.mark.asyncio 73 | async def test_count_invocations_async(): 74 | before_total = REGISTRY.get_sample_value( 75 | "sneakpeek_invocations_total", 76 | invocation_labels_async("total"), 77 | ) 78 | before_success = REGISTRY.get_sample_value( 79 | "sneakpeek_invocations_total", 80 | invocation_labels_async("success"), 81 | ) 82 | before_error = REGISTRY.get_sample_value( 83 | "sneakpeek_invocations_total", 84 | invocation_labels_async("error", exception_to_raise_name), 85 | ) 86 | await async_test_fn(fail=False) 87 | with pytest.raises(type(exception_to_raise)): 88 | await async_test_fn(fail=True) 89 | 90 | after_total = REGISTRY.get_sample_value( 91 | "sneakpeek_invocations_total", 92 | invocation_labels_async("total"), 93 | ) 94 | after_success = REGISTRY.get_sample_value( 95 | "sneakpeek_invocations_total", 96 | invocation_labels_async("success"), 97 | ) 98 | after_error = REGISTRY.get_sample_value( 99 | "sneakpeek_invocations_total", 100 | invocation_labels_async("error", exception_to_raise_name), 101 | ) 102 | assert after_total - (before_total or 0) == 2 103 | assert after_success - (before_success or 0) == 1 104 | assert after_error - (before_error or 0) == 1 105 | 106 | 107 | def test_count_invocations_sync(): 108 | before_total = REGISTRY.get_sample_value( 109 | "sneakpeek_invocations_total", 110 | invocation_labels_sync("total"), 111 | ) 112 | before_success = REGISTRY.get_sample_value( 113 | "sneakpeek_invocations_total", 114 | invocation_labels_sync("success"), 115 | ) 116 | before_error = REGISTRY.get_sample_value( 117 | "sneakpeek_invocations_total", 118 | invocation_labels_sync("error", exception_to_raise_name), 119 | ) 120 | sync_test_fn(fail=False) 121 | with pytest.raises(type(exception_to_raise)): 122 | sync_test_fn(fail=True) 123 | 124 | after_total = REGISTRY.get_sample_value( 125 | "sneakpeek_invocations_total", 126 | invocation_labels_sync("total"), 127 | ) 128 | after_success = REGISTRY.get_sample_value( 129 | "sneakpeek_invocations_total", 130 | invocation_labels_sync("success"), 131 | ) 132 | after_error = REGISTRY.get_sample_value( 133 | "sneakpeek_invocations_total", 134 | invocation_labels_sync("error", exception_to_raise_name), 135 | ) 136 | assert after_total - (before_total or 0) == 2 137 | assert after_success - (before_success or 0) == 1 138 | assert after_error - (before_error or 0) == 1 139 | 
-------------------------------------------------------------------------------- /sneakpeek/middleware/robots_txt_middleware.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from datetime import timedelta 4 | from enum import Enum, auto 5 | from traceback import format_exc 6 | from typing import Any 7 | from urllib.parse import urlparse 8 | from urllib.robotparser import RobotFileParser 9 | 10 | import aiohttp 11 | from cachetools import TTLCache 12 | from pydantic import BaseModel 13 | from typing_extensions import override 14 | 15 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 16 | from sneakpeek.scraper.model import Request 17 | 18 | 19 | class RobotsTxtViolationException(Exception): 20 | """Exception which is raised if request is disallowed by website robots.txt""" 21 | 22 | pass 23 | 24 | 25 | class RobotsTxtViolationStrategy(Enum): 26 | """What to do if the request is disallowed by website robots.txt""" 27 | 28 | LOG = auto() #: Only log violation 29 | THROW = auto() #: Raise an exception on vioalation 30 | 31 | 32 | class RobotsTxtMiddlewareConfig(BaseModel): 33 | """robots.txt middleware configuration""" 34 | 35 | violation_strategy: RobotsTxtViolationStrategy = RobotsTxtViolationStrategy.LOG 36 | 37 | 38 | class RobotsTxtMiddleware(BaseMiddleware): 39 | """Robots.txt middleware can log and optionally block requests if they are disallowed by website robots.txt.""" 40 | 41 | def __init__(self, default_config: RobotsTxtMiddlewareConfig | None = None) -> None: 42 | self._default_config = default_config or RobotsTxtMiddlewareConfig() 43 | self._logger = logging.getLogger(__name__) 44 | self._cache = TTLCache( 45 | maxsize=sys.maxsize, 46 | ttl=timedelta(hours=1).total_seconds(), 47 | ) 48 | 49 | @property 50 | def name(self) -> str: 51 | return "robots_txt" 52 | 53 | def _extract_host(self, url: str) -> str: 54 | return urlparse(url).hostname.replace("www.", "") 55 | 56 | async def _get_robots_txt_by_url(self, url: str) -> RobotFileParser | None: 57 | async with aiohttp.ClientSession() as session: 58 | response = await session.get(url) 59 | if response.status != 200: 60 | return None 61 | contents = await response.text() 62 | rfp = RobotFileParser() 63 | rfp.parse(contents.split("\n")) 64 | return rfp 65 | 66 | async def _load_robots_txt(self, host: str) -> RobotFileParser | None: 67 | if cached := self._cache.get(host): 68 | return cached 69 | for scheme in ["http", "https"]: 70 | for host_prefix in ["", "www."]: 71 | try: 72 | robots_txt = await self._get_robots_txt_by_url( 73 | f"{scheme}://{host_prefix}{host}/robots.txt" 74 | ) 75 | self._cache[host] = robots_txt 76 | if robots_txt: 77 | return robots_txt 78 | except Exception as e: 79 | self._logger.error(f"Failed to get robots.txt for {host}: {e}") 80 | self._logger.debug( 81 | f"Failed to get robots.txt for {host}. Traceback: {format_exc()}" 82 | ) 83 | return None 84 | 85 | @override 86 | async def on_request( 87 | self, 88 | request: Request, 89 | config: Any | None, 90 | ) -> Request: 91 | config = parse_config_from_obj( 92 | config, 93 | self.name, 94 | RobotsTxtMiddlewareConfig, 95 | self._default_config, 96 | ) 97 | host = self._extract_host(request.url) 98 | robots_txt = await self._load_robots_txt(host) 99 | if not robots_txt: 100 | self._logger.debug( 101 | f"No robots.txt was retrieved for {request.url}. 
Defaulting to allow" 102 | ) 103 | return request 104 | 105 | user_agent = (request.headers or {}).get("User-Agent") 106 | if not user_agent: 107 | self._logger.debug( 108 | f"User-Agent is empty while requesting {request.url}. Defaulting to '*'" 109 | ) 110 | user_agent = "*" 111 | 112 | if not robots_txt.can_fetch(user_agent, request.url): 113 | error_message = f"robots.txt prohibits requesting {request.url}" 114 | if config.violation_strategy == RobotsTxtViolationStrategy.THROW: 115 | raise RobotsTxtViolationException(error_message) 116 | self._logger.error( 117 | f"{error_message}. Proceeding because strategy is {config.violation_strategy}" 118 | ) 119 | 120 | return request 121 | -------------------------------------------------------------------------------- /sneakpeek/queue/queue.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime, timedelta 3 | 4 | from typing_extensions import override 5 | 6 | from sneakpeek.metrics import count_invocations, measure_latency 7 | from sneakpeek.queue.model import ( 8 | EnqueueTaskRequest, 9 | QueueABC, 10 | QueueStorageABC, 11 | Task, 12 | TaskHasActiveRunError, 13 | TaskPingFinishedError, 14 | TaskPingNotStartedError, 15 | TaskStatus, 16 | ) 17 | 18 | DEFAULT_DEAD_TIMEOUT = timedelta(minutes=5) 19 | 20 | 21 | class Queue(QueueABC): 22 | """Queue implementation""" 23 | 24 | def __init__( 25 | self, 26 | storage: QueueStorageABC, 27 | dead_task_timeout: timedelta = DEFAULT_DEAD_TIMEOUT, 28 | ) -> None: 29 | self.logger = logging.getLogger(__name__) 30 | self.storage = storage 31 | self.dead_task_timeout = dead_task_timeout 32 | 33 | @count_invocations(subsystem="queue") 34 | @measure_latency(subsystem="queue") 35 | @override 36 | async def enqueue(self, request: EnqueueTaskRequest) -> Task: 37 | existing_tasks = await self.storage.get_task_instances(request.task_name) 38 | if any( 39 | t 40 | for t in existing_tasks 41 | if t.status in (TaskStatus.STARTED, TaskStatus.PENDING) 42 | ): 43 | raise TaskHasActiveRunError() 44 | task = Task( 45 | id=0, 46 | task_name=request.task_name, 47 | task_handler=request.task_handler, 48 | status=TaskStatus.PENDING, 49 | created_at=datetime.utcnow(), 50 | payload=request.payload, 51 | priority=request.priority, 52 | timeout=request.timeout, 53 | ) 54 | return await self.storage.enqueue_task(task) 55 | 56 | @count_invocations(subsystem="queue") 57 | @measure_latency(subsystem="queue") 58 | @override 59 | async def dequeue(self) -> Task | None: 60 | return await self.storage.dequeue_task() 61 | 62 | @count_invocations(subsystem="queue") 63 | @measure_latency(subsystem="queue") 64 | @override 65 | async def get_queue_len(self) -> int: 66 | return await self.storage.get_queue_len() 67 | 68 | @count_invocations(subsystem="queue") 69 | @measure_latency(subsystem="queue") 70 | @override 71 | async def ping_task(self, id: int) -> Task: 72 | task = await self.storage.get_task_instance(id) 73 | if task.status == TaskStatus.PENDING: 74 | raise TaskPingNotStartedError() 75 | if task.status != TaskStatus.STARTED: 76 | raise TaskPingFinishedError() 77 | task.last_active_at = datetime.utcnow() 78 | return await self.storage.update_task(task) 79 | 80 | @count_invocations(subsystem="queue") 81 | @measure_latency(subsystem="queue") 82 | @override 83 | async def kill_dead_tasks(self) -> list[Task]: 84 | tasks = await self.storage.get_tasks() 85 | killed = [] 86 | for task in tasks: 87 | if self._is_task_dead(task): 88 | task.status = TaskStatus.DEAD 
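                # the task was started but hasn't pinged within dead_task_timeout;
                # mark it finished so it no longer counts as an active run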
89 |                 task.finished_at = datetime.utcnow()
90 |                 killed.append(await self.storage.update_task(task))
91 |         return killed
92 | 
93 |     def _is_task_dead(self, task: Task) -> bool:
94 |         if task.status != TaskStatus.STARTED:
95 |             return False
96 |         activity_timestamps = [
97 |             task.last_active_at,
98 |             task.started_at,
99 |             task.created_at,
100 |         ]
101 |         for ts in activity_timestamps:
102 |             if ts and datetime.utcnow() - ts > self.dead_task_timeout:
103 |                 return True
104 |         return False
105 | 
106 |     @count_invocations(subsystem="queue")
107 |     @measure_latency(subsystem="queue")
108 |     @override
109 |     async def delete_old_tasks(self, keep_last: int = 50) -> None:
110 |         await self.storage.delete_old_tasks(keep_last)
111 | 
112 |     @count_invocations(subsystem="queue")
113 |     @measure_latency(subsystem="queue")
114 |     @override
115 |     async def update_task(self, task: Task) -> Task:
116 |         return await self.storage.update_task(task)
117 | 
118 |     @count_invocations(subsystem="queue")
119 |     @measure_latency(subsystem="queue")
120 |     @override
121 |     async def get_task_instances(self, task_name: str) -> list[Task]:
122 |         return await self.storage.get_task_instances(task_name)
123 | 
124 |     @count_invocations(subsystem="queue")
125 |     @measure_latency(subsystem="queue")
126 |     @override
127 |     async def get_task_instance(self, task_id: int) -> Task:
128 |         return await self.storage.get_task_instance(task_id)
129 | 
-------------------------------------------------------------------------------- /sneakpeek/queue/tests/test_queue.py: --------------------------------------------------------------------------------
1 | import asyncio
2 | 
3 | import pytest
4 | from fakeredis.aioredis import FakeRedis
5 | 
6 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage
7 | from sneakpeek.queue.model import (
8 |     EnqueueTaskRequest,
9 |     QueueABC,
10 |     QueueStorageABC,
11 |     TaskHasActiveRunError,
12 |     TaskPriority,
13 | )
14 | from sneakpeek.queue.queue import Queue
15 | from sneakpeek.queue.redis_storage import RedisQueueStorage
16 | 
17 | 
18 | @pytest.fixture
19 | def in_memory_storage() -> QueueStorageABC:
20 |     yield InMemoryQueueStorage()
21 | 
22 | 
23 | @pytest.fixture
24 | def redis_storage() -> QueueStorageABC:
25 |     yield RedisQueueStorage(FakeRedis())
26 | 
27 | 
28 | @pytest.fixture(
29 |     params=[
30 |         pytest.lazy_fixture(in_memory_storage.__name__),
31 |         pytest.lazy_fixture(redis_storage.__name__),
32 |     ]
33 | )
34 | def queue_storage(request) -> QueueStorageABC:
35 |     yield request.param
36 | 
37 | 
38 | @pytest.fixture
39 | def queue(queue_storage: QueueStorageABC) -> QueueABC:
40 |     yield Queue(queue_storage)
41 | 
42 | 
43 | @pytest.mark.asyncio
44 | async def test_enqueue_dequeue(queue: Queue):
45 |     request = EnqueueTaskRequest(
46 |         task_name=test_enqueue_dequeue.__name__ + ":name",
47 |         task_handler=test_enqueue_dequeue.__name__ + ":type",
48 |         priority=TaskPriority.HIGH,
49 |         payload=test_enqueue_dequeue.__name__ + ":payload",
50 |     )
51 |     enqueued = await queue.enqueue(request)
52 |     assert enqueued.id is not None
53 |     assert enqueued.task_name == request.task_name
54 |     assert enqueued.task_handler == request.task_handler
55 |     assert enqueued.priority == request.priority
56 |     assert enqueued.payload == request.payload
57 |     dequeued = await queue.dequeue()
58 |     assert dequeued is not None
59 |     assert dequeued.id == enqueued.id
60 |     assert dequeued.task_name == request.task_name
61 |     assert dequeued.task_handler == request.task_handler
62 |     assert dequeued.priority == request.priority
63 |     assert dequeued.payload == request.payload
64 | 65 | 66 | @pytest.mark.asyncio 67 | async def test_double_enqueue_forbidden(queue: Queue): 68 | request = EnqueueTaskRequest( 69 | task_name=test_double_enqueue_forbidden.__name__ + ":name", 70 | task_handler=test_double_enqueue_forbidden.__name__ + ":type", 71 | priority=TaskPriority.HIGH, 72 | payload=test_double_enqueue_forbidden.__name__ + ":payload", 73 | ) 74 | enqueued = await queue.enqueue(request) 75 | assert enqueued.id is not None 76 | assert enqueued.task_name == request.task_name 77 | with pytest.raises(TaskHasActiveRunError): 78 | await queue.enqueue(request) 79 | 80 | 81 | @pytest.mark.asyncio 82 | async def test_enqueue_count_equals_dequeue_count(queue: Queue): 83 | requests = [ 84 | EnqueueTaskRequest( 85 | task_name=f"{test_enqueue_count_equals_dequeue_count.__name__}:name:{i}", 86 | task_handler=f"{test_enqueue_count_equals_dequeue_count.__name__}:type:{i}", 87 | priority=TaskPriority.HIGH, 88 | payload=f"{test_enqueue_count_equals_dequeue_count.__name__}:payload:{i}", 89 | ) 90 | for i in range(100) 91 | ] 92 | enqueued_tasks = await asyncio.gather( 93 | *{queue.enqueue(request) for request in requests} 94 | ) 95 | assert len(enqueued_tasks) == len(requests) 96 | assert {request.task_name for request in requests} == { 97 | task.task_name for task in enqueued_tasks 98 | } 99 | 100 | dequeued = [] 101 | while task := await queue.dequeue(): 102 | dequeued.append(task) 103 | assert len(dequeued) == len(requests) 104 | assert {request.task_name for request in requests} == { 105 | task.task_name for task in dequeued 106 | } 107 | 108 | 109 | @pytest.mark.asyncio 110 | async def test_scraper_priority_queue_dequeue_order(queue: Queue): 111 | def get_enqueue_request(priority: TaskPriority): 112 | return EnqueueTaskRequest( 113 | task_name=f"{test_scraper_priority_queue_dequeue_order.__name__}:name:{priority}", 114 | task_handler=f"{test_scraper_priority_queue_dequeue_order.__name__}:type:{priority}", 115 | payload=f"{test_scraper_priority_queue_dequeue_order.__name__}:payload:{priority}", 116 | priority=priority, 117 | ) 118 | 119 | requests = [ 120 | get_enqueue_request(TaskPriority.NORMAL), 121 | get_enqueue_request(TaskPriority.HIGH), 122 | get_enqueue_request(TaskPriority.UTMOST), 123 | ] 124 | for request in requests: 125 | await queue.enqueue(request) 126 | 127 | dequeued = [] 128 | while task := await queue.dequeue(): 129 | dequeued.append(task.priority) 130 | assert dequeued == [TaskPriority.UTMOST, TaskPriority.HIGH, TaskPriority.NORMAL] 131 | -------------------------------------------------------------------------------- /sneakpeek/scheduler/model.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from abc import ABC, abstractmethod 3 | from datetime import datetime, timedelta 4 | from enum import Enum 5 | from uuid import uuid4 6 | 7 | from pydantic import BaseModel 8 | 9 | from sneakpeek.queue.model import TaskPriority 10 | 11 | PeriodicTaskId = str 12 | 13 | 14 | def generate_id() -> PeriodicTaskId: 15 | return str(uuid4()) 16 | 17 | 18 | class TaskSchedule(str, Enum): 19 | """ 20 | Periodic task schedule options. 
Note that it's disallowed to have 2 concurrent 21 | task, so if there's an active task new one won't be scheduled 22 | """ 23 | 24 | INACTIVE = "inactive" #: Scraper won't be automatically scheduled 25 | EVERY_SECOND = "every_second" #: Scraper will be scheduled every second 26 | EVERY_MINUTE = "every_minute" #: Scraper will be scheduled every minute 27 | EVERY_HOUR = "every_hour" #: Scraper will be scheduled every hour 28 | EVERY_DAY = "every_day" #: Scraper will be scheduled every day 29 | EVERY_WEEK = "every_week" #: Scraper will be scheduled every week 30 | EVERY_MONTH = "every_month" #: Scraper will be scheduled every month 31 | CRONTAB = "crontab" #: Specify crontab when scraper should be scheduled 32 | 33 | 34 | class PeriodicTask(BaseModel): 35 | id: PeriodicTaskId #: Task unique ID 36 | name: str #: Task name - used to disallow concurrent execution of the task and to defined unique series of tasks 37 | handler: str #: Task handler name 38 | priority: TaskPriority #: Task priority 39 | payload: str #: Serialized task payload 40 | schedule: TaskSchedule #: Task Schedule 41 | schedule_crontab: str | None = None #: Task schedule crontab 42 | timeout: timedelta | None = None #: Task timeout 43 | 44 | 45 | class Lease(BaseModel): 46 | """Global lease metadata""" 47 | 48 | name: str #: Lease name (resource name to be locked) 49 | owner_id: str #: ID of the acquirer (should be the same if you already have the lease and want to prolong it) 50 | acquired: datetime #: Time when the lease was acquired 51 | acquired_until: datetime #: Time until the lease is acquired 52 | 53 | 54 | class LeaseStorageABC(ABC): 55 | """Global lease storage abstract class""" 56 | 57 | @abstractmethod 58 | async def maybe_acquire_lease( 59 | self, 60 | lease_name: str, 61 | owner_id: str, 62 | acquire_for: timedelta, 63 | ) -> Lease | None: 64 | """Try to acquire lease (global lock). 65 | 66 | Args: 67 | lease_name (str): Lease name (resource name to be locked) 68 | owner_id (str): ID of the acquirer (should be the same if you already have the lease and want to prolong it) 69 | acquire_for (timedelta): For how long lease will be acquired 70 | 71 | Returns: 72 | Lease | None: Lease metadata if it was acquired, None otherwise 73 | """ 74 | ... 75 | 76 | @abstractmethod 77 | async def release_lease(self, lease_name: str, owner_id: str) -> None: 78 | """Release lease (global lock) 79 | 80 | Args: 81 | lease_name (str): Lease name (resource name to be unlocked) 82 | owner_id (str): ID of the acquirer 83 | """ 84 | ... 85 | 86 | 87 | class PeriodicTasksStorageABC(ABC): 88 | @abstractmethod 89 | async def get_periodic_tasks(self) -> list[PeriodicTask]: 90 | ... 91 | 92 | 93 | class StaticPeriodicTasksStorage(PeriodicTasksStorageABC): 94 | def __init__(self, tasks: list[PeriodicTask]) -> None: 95 | self.tasks = tasks 96 | 97 | async def get_periodic_tasks(self) -> list[PeriodicTask]: 98 | return self.tasks 99 | 100 | 101 | class MultiPeriodicTasksStorage(PeriodicTasksStorageABC): 102 | def __init__(self, storages: list[PeriodicTasksStorageABC]) -> None: 103 | self.storages = storages 104 | 105 | async def get_periodic_tasks(self) -> list[PeriodicTask]: 106 | return sum( 107 | await asyncio.gather( 108 | *[storage.get_periodic_tasks() for storage in self.storages] 109 | ), 110 | [], 111 | ) 112 | 113 | 114 | class SchedulerABC(ABC): 115 | @abstractmethod 116 | async def enqueue_task( 117 | self, 118 | task_id: PeriodicTaskId, 119 | priority: TaskPriority, 120 | ) -> None: 121 | ... 
122 | 123 | @abstractmethod 124 | async def start_scheduling_task(self, task: PeriodicTask) -> None: 125 | ... 126 | 127 | @abstractmethod 128 | async def stop_scheduling_task(self, task: PeriodicTask) -> None: 129 | ... 130 | 131 | @abstractmethod 132 | async def update_tasks(self) -> None: 133 | ... 134 | 135 | @abstractmethod 136 | async def start(self) -> None: 137 | ... 138 | 139 | @abstractmethod 140 | async def stop(self) -> None: 141 | ... 142 | -------------------------------------------------------------------------------- /sneakpeek/queue/tests/test_consumer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import timedelta 3 | from unittest.mock import AsyncMock 4 | 5 | import pytest 6 | from fakeredis.aioredis import FakeRedis 7 | 8 | from sneakpeek.queue.consumer import Consumer 9 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage 10 | from sneakpeek.queue.model import ( 11 | EnqueueTaskRequest, 12 | QueueABC, 13 | QueueStorageABC, 14 | Task, 15 | TaskHandlerABC, 16 | TaskPriority, 17 | TaskStatus, 18 | ) 19 | from sneakpeek.queue.queue import Queue 20 | from sneakpeek.queue.redis_storage import RedisQueueStorage 21 | 22 | TEST_HANDLER_NAME = "test_handler" 23 | PING_DELAY = timedelta(milliseconds=1) 24 | 25 | 26 | class TestTaskHandler(TaskHandlerABC): 27 | def __init__(self) -> None: 28 | self.process_mock = AsyncMock() 29 | 30 | def name(self): 31 | return TEST_HANDLER_NAME 32 | 33 | async def process(self, task: Task) -> str: 34 | await self.process_mock(task.id) 35 | return task.task_name 36 | 37 | 38 | @pytest.fixture 39 | def in_memory_storage() -> QueueStorageABC: 40 | yield InMemoryQueueStorage() 41 | 42 | 43 | @pytest.fixture 44 | def redis_storage() -> QueueStorageABC: 45 | yield RedisQueueStorage(FakeRedis()) 46 | 47 | 48 | @pytest.fixture( 49 | params=[ 50 | pytest.lazy_fixture(in_memory_storage.__name__), 51 | pytest.lazy_fixture(redis_storage.__name__), 52 | ] 53 | ) 54 | def queue_storage(request) -> QueueStorageABC: 55 | yield request.param 56 | 57 | 58 | @pytest.fixture 59 | def queue(queue_storage: QueueStorageABC) -> QueueABC: 60 | yield Queue(queue_storage) 61 | 62 | 63 | @pytest.fixture 64 | def handler() -> TaskHandlerABC: 65 | yield TestTaskHandler() 66 | 67 | 68 | @pytest.fixture 69 | def consumer(queue: QueueABC, handler: TaskHandlerABC) -> Consumer: 70 | yield Consumer(queue, [handler], ping_delay=PING_DELAY) 71 | 72 | 73 | async def _wait_task_in_finished_state(queue: QueueABC, task: Task, timeout: timedelta): 74 | async def wait(task: Task): 75 | while True: 76 | task = await queue.storage.get_task_instance(task.id) 77 | if task.status not in (TaskStatus.STARTED, TaskStatus.PENDING): 78 | return 79 | await asyncio.sleep(PING_DELAY.total_seconds()) 80 | 81 | await asyncio.wait_for(wait(task), timeout=timeout.total_seconds()) 82 | 83 | 84 | @pytest.mark.asyncio 85 | async def test_task_dequeues_and_succeeds( 86 | consumer: Consumer, 87 | queue: Queue, 88 | handler: TaskHandlerABC, 89 | ): 90 | request = EnqueueTaskRequest( 91 | task_name="test_task", 92 | task_handler=TEST_HANDLER_NAME, 93 | priority=TaskPriority.NORMAL, 94 | payload="payload", 95 | ) 96 | task = await queue.enqueue(request) 97 | assert await consumer.consume() 98 | await _wait_task_in_finished_state(queue, task, timedelta(seconds=2)) 99 | assert await queue.get_queue_len() == 0 100 | task = await queue.storage.get_task_instance(task.id) 101 | assert task.status == TaskStatus.SUCCEEDED 102 | 
assert task.result == task.task_name 103 | assert handler.process_mock.awaited_once_with(task.id) 104 | 105 | 106 | @pytest.mark.asyncio 107 | async def test_dequeues_and_fails( 108 | consumer: Consumer, 109 | queue: Queue, 110 | handler: TaskHandlerABC, 111 | ): 112 | handler.process_mock.side_effect = Exception() 113 | request = EnqueueTaskRequest( 114 | task_name="test_task", 115 | task_handler=TEST_HANDLER_NAME, 116 | priority=TaskPriority.NORMAL, 117 | payload="payload", 118 | ) 119 | task = await queue.enqueue(request) 120 | assert await consumer.consume() 121 | await _wait_task_in_finished_state(queue, task, timedelta(seconds=2)) 122 | assert await queue.get_queue_len() == 0 123 | task = await queue.storage.get_task_instance(task.id) 124 | assert task.status == TaskStatus.FAILED 125 | assert handler.process_mock.awaited_once_with(task.id) 126 | 127 | 128 | @pytest.mark.asyncio 129 | async def test_dequeues_and_times_out( 130 | consumer: Consumer, 131 | queue: Queue, 132 | handler: TaskHandlerABC, 133 | ): 134 | handler.process_mock.side_effect = asyncio.sleep(10) 135 | request = EnqueueTaskRequest( 136 | task_name="test_task", 137 | task_handler=TEST_HANDLER_NAME, 138 | priority=TaskPriority.NORMAL, 139 | payload="payload", 140 | timeout=timedelta(milliseconds=10), 141 | ) 142 | task = await queue.enqueue(request) 143 | assert await consumer.consume() 144 | await _wait_task_in_finished_state(queue, task, timedelta(seconds=2)) 145 | assert await queue.get_queue_len() == 0 146 | task = await queue.storage.get_task_instance(task.id) 147 | assert task.status == TaskStatus.FAILED 148 | assert handler.process_mock.awaited_once_with(task.id) 149 | -------------------------------------------------------------------------------- /sneakpeek/queue/redis_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from redis.asyncio import Redis 4 | from typing_extensions import override 5 | 6 | from sneakpeek.metrics import count_invocations, measure_latency 7 | from sneakpeek.queue.model import QueueStorageABC, Task, TaskNotFoundError 8 | 9 | DEFAULT_TASK_TTL = timedelta(days=7) 10 | SCORE_PRIORITY_BIT_OFFSET = 32 11 | 12 | 13 | class RedisQueueStorage(QueueStorageABC): 14 | """ 15 | Redis queue storage. Queue has two components: priority queue 16 | implemented by sorted set (ZADD and ZPOPMIN) and key (task name) 17 | values (set of task instances) set 18 | """ 19 | 20 | def __init__(self, redis: Redis, task_ttl: timedelta = DEFAULT_TASK_TTL) -> None: 21 | """ 22 | 23 | Args: 24 | redis (Redis): Async redis client 25 | task_ttl (timedelta): TTL of the task record in the redis. Defaults to 7 days. 
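 
        Tasks are serialized into ``task::<id>`` keys, grouped per task name in
        ``task_name::<name>`` sets, and ordered for dequeuing via the
        ``internal::queue`` sorted set.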
26 |         """
27 |         self._redis = redis
28 |         self._queue_set_name = "internal::queue"
29 |         self._task_ttl = task_ttl
30 | 
31 |     async def _generate_id(self) -> int:
32 |         return await self._redis.incr("internal::id_counter")
33 | 
34 |     def _get_task_key(self, task_id: int) -> str:
35 |         return f"task::{task_id}"
36 | 
37 |     def _get_task_name_key(self, task_name: str) -> str:
38 |         return f"task_name::{task_name}"
39 | 
40 |     def _get_task_name_from_key(self, key: str) -> str:
41 |         return key.replace("task_name::", "", 1)
42 | 
43 |     def _get_task_score(self, task: Task) -> int:
44 |         # Values in redis sorted sets with the same score are stored lexicographically
45 |         # So in order for a queue to be ordered by priority then by the ID
46 |         # we can define score as (priority << SCORE_PRIORITY_BIT_OFFSET) + id
47 |         return (task.priority << SCORE_PRIORITY_BIT_OFFSET) + task.id
48 | 
49 |     @count_invocations(subsystem="storage")
50 |     @measure_latency(subsystem="storage")
51 |     @override
52 |     async def get_tasks(self) -> list[Task]:
53 |         tasks = []
54 |         async for key in self._redis.scan_iter("task_name::*"):
55 |             tasks += await self.get_task_instances(
56 |                 self._get_task_name_from_key(key.decode())
57 |             )
58 |         return sorted(tasks, key=lambda x: x.id, reverse=True)
59 | 
60 |     @count_invocations(subsystem="storage")
61 |     @measure_latency(subsystem="storage")
62 |     @override
63 |     async def get_task_instances(self, task_name: str) -> list[Task]:
64 |         task_keys = await self._redis.smembers(self._get_task_name_key(task_name))
65 |         return sorted(
66 |             [Task.parse_raw(task) for task in await self._redis.mget(task_keys)],
67 |             key=lambda x: x.id,
68 |             reverse=True,
69 |         )
70 | 
71 |     @count_invocations(subsystem="storage")
72 |     @measure_latency(subsystem="storage")
73 |     @override
74 |     async def get_task_instance(self, id: int) -> Task:
75 |         task = await self._redis.get(self._get_task_key(id))
76 |         if task is None:
77 |             raise TaskNotFoundError()
78 |         return Task.parse_raw(task)
79 | 
80 |     @count_invocations(subsystem="storage")
81 |     @measure_latency(subsystem="storage")
82 |     @override
83 |     async def enqueue_task(self, task: Task) -> Task:
84 |         task.id = await self._generate_id()
85 |         task_key = self._get_task_key(task.id)
86 |         pipe = self._redis.pipeline()
87 |         pipe.set(task_key, task.json(), ex=self._task_ttl)
88 |         pipe.sadd(self._get_task_name_key(task.task_name), task_key)
89 |         pipe.zadd(self._queue_set_name, {task_key: self._get_task_score(task)})
90 |         await pipe.execute()
91 |         return task
92 | 
93 |     @count_invocations(subsystem="storage")
94 |     @measure_latency(subsystem="storage")
95 |     @override
96 |     async def update_task(self, task: Task) -> Task:
97 |         task_key = self._get_task_key(task.id)
98 |         await self._redis.set(task_key, task.json(), ex=self._task_ttl, xx=True)
99 |         return task
100 | 
101 |     @count_invocations(subsystem="storage")
102 |     @measure_latency(subsystem="storage")
103 |     @override
104 |     async def dequeue_task(self) -> Task | None:
105 |         tasks = await self._redis.zpopmin(self._queue_set_name)
106 |         if not tasks:
107 |             return None
108 |         task_key, _ = tasks[0]
109 |         task = await self._redis.get(task_key)
110 |         if task is None:
111 |             raise TaskNotFoundError()
112 |         return Task.parse_raw(task)
113 | 
114 |     @count_invocations(subsystem="storage")
115 |     @measure_latency(subsystem="storage")
116 |     @override
117 |     async def delete_old_tasks(self, keep_last: int = 50) -> None:
118 |         async for key in self._redis.scan_iter("task_name::*"):
119 |             task_instances = sorted(
120 |                 await self.get_task_instances(
121 |                     self._get_task_name_from_key(key.decode())
122 |                 ),
123 |                 key=lambda x: x.id,
124 |                 reverse=True,
125 |             )
126 |             for task in task_instances[keep_last:]:
127 |                 task_key = self._get_task_key(task.id)
128 |                 pipe = self._redis.pipeline()
129 | 
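                # delete the task record and drop it from its task-name set in a single pipeline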
pipe.delete(task_key) 130 | pipe.srem(key, task_key) 131 | await pipe.execute() 132 | 133 | @count_invocations(subsystem="storage") 134 | @measure_latency(subsystem="storage") 135 | @override 136 | async def get_queue_len(self) -> int: 137 | return await self._redis.zcount(self._queue_set_name, 0, "+inf") 138 | -------------------------------------------------------------------------------- /sneakpeek/middleware/rate_limiter_middleware.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from asyncio import Lock 4 | from datetime import datetime, timedelta 5 | from enum import Enum, auto 6 | from random import randint 7 | from typing import Any 8 | from urllib.parse import urlparse 9 | 10 | from cachetools.func import ttl_cache 11 | from pydantic import BaseModel, validator 12 | 13 | from sneakpeek.middleware.base import BaseMiddleware, parse_config_from_obj 14 | from sneakpeek.scraper.model import Request 15 | 16 | DEFAULT_BUCKET_TIME_WINDOW = timedelta(minutes=1) 17 | 18 | 19 | def rate_limited_delay_jitter() -> timedelta: 20 | return timedelta(milliseconds=randint(0, 500)) 21 | 22 | 23 | class _LeakyBucket: 24 | def __init__( 25 | self, size: int, time_window: timedelta = DEFAULT_BUCKET_TIME_WINDOW 26 | ) -> None: 27 | self.size = size 28 | self.time_window = time_window 29 | self.queue: list[datetime] = [] 30 | self.lock = Lock() 31 | 32 | def last_used(self) -> datetime | None: 33 | if not self.queue: 34 | return None 35 | return self.queue[0] 36 | 37 | async def add(self) -> datetime | None: 38 | async with self.lock: 39 | now = datetime.utcnow() 40 | while self.queue and self.queue[0] <= now - self.time_window: 41 | self.queue.pop(0) 42 | if not self.size: 43 | raise ValueError("Queue size is 0") 44 | if len(self.queue) >= self.size: 45 | return self.queue[0] + self.time_window 46 | 47 | self.queue.append(now) 48 | return None 49 | 50 | 51 | class RateLimitedException(Exception): 52 | """Request is rate limited because too many requests were made to the host""" 53 | 54 | pass 55 | 56 | 57 | class RateLimitedStrategy(Enum): 58 | """What to do if the request is rate limited""" 59 | 60 | THROW = auto() #: Throw an exception 61 | WAIT = auto() #: Wait until request is no longer rate limited 62 | 63 | 64 | class RateLimiterMiddlewareConfig(BaseModel): 65 | """Rate limiter middleware configuration""" 66 | 67 | #: Maximum number of allowed requests per host within time window 68 | max_requests: int = 60 69 | #: What to do if the request is rate limited 70 | rate_limited_strategy: RateLimitedStrategy = RateLimitedStrategy.WAIT 71 | #: Time window to aggregate requests 72 | time_window: timedelta = DEFAULT_BUCKET_TIME_WINDOW 73 | 74 | @validator("max_requests") 75 | def check_max_requests(cls, v: int) -> int: 76 | if v <= 0: 77 | raise ValueError( 78 | f"`max_requests` must be a positive integer. Received: {v}" 79 | ) 80 | return v 81 | 82 | def __hash__(self): 83 | return hash( 84 | ( 85 | self.max_requests, 86 | self.rate_limited_strategy, 87 | self.time_window, 88 | ) 89 | ) 90 | 91 | 92 | class RateLimiterMiddleware(BaseMiddleware): 93 | """ 94 | Rate limiter implements `leaky bucket algorithm `_ 95 | to limit number of requests made to the hosts. If the request is rate limited it can either 96 | raise an exception or wait until the request won't be limited anymore. 
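    A minimal configuration sketch (the names come from the config model defined above; treat it as illustrative rather than canonical):

    .. code-block:: python3

        RateLimiterMiddleware(
            RateLimiterMiddlewareConfig(
                max_requests=60,
                time_window=timedelta(minutes=1),
                rate_limited_strategy=RateLimitedStrategy.THROW,
            )
        )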
97 | """ 98 | 99 | def __init__( 100 | self, default_config: RateLimiterMiddlewareConfig | None = None 101 | ) -> None: 102 | self._default_config = default_config or RateLimiterMiddlewareConfig() 103 | self._logger = logging.getLogger(__name__) 104 | 105 | @property 106 | def name(self) -> str: 107 | return "rate_limiter" 108 | 109 | def _extract_key(self, url: str) -> str: 110 | return urlparse(url).hostname 111 | 112 | @ttl_cache(maxsize=None, ttl=timedelta(minutes=5).total_seconds()) 113 | def _get_bucket( 114 | self, key: str, config: RateLimiterMiddlewareConfig 115 | ) -> _LeakyBucket: 116 | return _LeakyBucket( 117 | size=config.max_requests, 118 | time_window=config.time_window, 119 | ) 120 | 121 | async def _wait_for_admission( 122 | self, 123 | url: str, 124 | config: RateLimiterMiddlewareConfig, 125 | ) -> None: 126 | key = self._extract_key(url) 127 | bucket = self._get_bucket(key, config) 128 | while True: 129 | next_attempt_dt = await bucket.add() 130 | if not next_attempt_dt: 131 | return 132 | error_message = ( 133 | f"Rate limited request to '{url}' because there were " 134 | f"more than {bucket.size} calls in the last {int(bucket.time_window.total_seconds())}s " 135 | f"to the domain {key}. " 136 | f"Next available call will be permitted at {next_attempt_dt}." 137 | ) 138 | 139 | if config.rate_limited_strategy == RateLimitedStrategy.THROW: 140 | raise RateLimitedException(error_message) 141 | self._logger.info(error_message) 142 | attempt_delay = next_attempt_dt - datetime.utcnow() 143 | attempt_delay += rate_limited_delay_jitter() 144 | await asyncio.sleep(attempt_delay.total_seconds()) 145 | 146 | async def on_request( 147 | self, 148 | request: Request, 149 | config: Any | None, 150 | ) -> Request: 151 | config = parse_config_from_obj( 152 | config, 153 | self.name, 154 | RateLimiterMiddlewareConfig, 155 | self._default_config, 156 | ) 157 | await self._wait_for_admission(request.url, config) 158 | return request 159 | -------------------------------------------------------------------------------- /sneakpeek/metrics.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from functools import wraps 3 | from typing import Any 4 | 5 | from prometheus_client import Counter, Gauge, Histogram 6 | 7 | invocations_counter = Counter( 8 | name="invocations", 9 | documentation="Methods invocations counter", 10 | namespace="sneakpeek", 11 | labelnames=["subsystem", "method", "type", "error"], 12 | ) 13 | latency_histogram = Histogram( 14 | name="latency", 15 | documentation="Time spent processing method", 16 | namespace="sneakpeek", 17 | labelnames=["subsystem", "method"], 18 | ) 19 | delay_histogram = Histogram( 20 | name="delay", 21 | documentation="Execution and scheduling delay", 22 | namespace="sneakpeek", 23 | labelnames=["type"], 24 | ) 25 | replicas_gauge = Gauge( 26 | name="replicas", 27 | documentation="Number of active subsytem replicas", 28 | namespace="sneakpeek", 29 | labelnames=["type"], 30 | ) 31 | 32 | 33 | def _get_full_class_name(obj: Any) -> str: 34 | module = obj.__class__.__module__ 35 | if module is None or module == str.__class__.__module__: 36 | return obj.__class__.__name__ 37 | return module + "." + obj.__class__.__name__ 38 | 39 | 40 | def measure_latency(subsystem: str): 41 | """ 42 | Decorator for measuring latency of the function (works for both sync and async functions). 43 | 44 | .. code-block:: python3 45 | 46 | @measure_latency(subsytem="my subsystem") 47 | def my_awesome_func(): 48 | ... 
49 | 50 | 51 | This will export following Prometheus histogram metric: 52 | 53 | 54 | .. code-block:: 55 | 56 | sneakpeek_latency{subsystem="my subsystem", method="my_awesome_func"} 57 | 58 | Args: 59 | subsystem (str): Subsystem name to be used in the metric annotation 60 | """ 61 | 62 | def wrapper(func): 63 | @wraps(func) 64 | def sync_wrapper(*args, **kwargs): 65 | with latency_histogram.labels( 66 | subsystem=subsystem, method=func.__name__ 67 | ).time(): 68 | return func(*args, **kwargs) 69 | 70 | @wraps(func) 71 | async def async_wrapper(*args, **kwargs): 72 | with latency_histogram.labels( 73 | subsystem=subsystem, method=func.__name__ 74 | ).time(): 75 | return await func(*args, **kwargs) 76 | 77 | return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper 78 | 79 | return wrapper 80 | 81 | 82 | def count_invocations(subsystem: str): 83 | """ 84 | Decorator for measuring number of function invocations (works for both sync and async functions). 85 | 86 | .. code-block:: python3 87 | 88 | @count_invocations(subsytem="my subsystem") 89 | def my_awesome_func(): 90 | ... 91 | 92 | 93 | This will export following Prometheus counter metrics: 94 | 95 | 96 | .. code-block:: 97 | 98 | # Total number of invocations 99 | sneakpeek_invocations{subsystem="my subsystem", method="my_awesome_func", type="total", error=""} 100 | # Total number of successful invocations (ones that haven't thrown an exception) 101 | sneakpeek_invocations{subsystem="my subsystem", method="my_awesome_func", type="success", error=""} 102 | # Total number of failed invocations (ones that have thrown an exception) 103 | sneakpeek_invocations{subsystem="my subsystem", method="my_awesome_func", type="error", error=""} 104 | 105 | Args: 106 | subsystem (str): Subsystem name to be used in the metric annotation 107 | """ 108 | 109 | def wrapper(func): 110 | @wraps(func) 111 | def sync_wrapper(*args, **kwargs): 112 | invocations_counter.labels( 113 | subsystem=subsystem, 114 | method=func.__name__, 115 | type="total", 116 | error="", 117 | ).inc() 118 | try: 119 | result = func(*args, **kwargs) 120 | invocations_counter.labels( 121 | subsystem=subsystem, 122 | method=func.__name__, 123 | type="success", 124 | error="", 125 | ).inc() 126 | return result 127 | except Exception as e: 128 | invocations_counter.labels( 129 | subsystem=subsystem, 130 | method=func.__name__, 131 | type="error", 132 | error=_get_full_class_name(e), 133 | ).inc() 134 | raise 135 | 136 | @wraps(func) 137 | async def async_wrapper(*args, **kwargs): 138 | invocations_counter.labels( 139 | subsystem=subsystem, 140 | method=func.__name__, 141 | type="total", 142 | error="", 143 | ).inc() 144 | try: 145 | result = await func(*args, **kwargs) 146 | invocations_counter.labels( 147 | subsystem=subsystem, 148 | method=func.__name__, 149 | type="success", 150 | error="", 151 | ).inc() 152 | return result 153 | except Exception as e: 154 | invocations_counter.labels( 155 | subsystem=subsystem, 156 | method=func.__name__, 157 | type="error", 158 | error=_get_full_class_name(e), 159 | ).inc() 160 | raise 161 | 162 | return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper 163 | 164 | return wrapper 165 | -------------------------------------------------------------------------------- /docs/quick_start.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Quick start 3 | ################# 4 | 5 | So you want to create a new scraper, first you need to make sure you have 
installed **Sneakpeek**: 6 | 7 | .. code-block:: bash 8 | 9 | pip install sneakpeek-py 10 | 11 | The next step would be implementing scraper logic (or so called scraper handler): 12 | 13 | .. code-block:: python3 14 | 15 | # file: demo_scraper.py 16 | 17 | import json 18 | import logging 19 | 20 | from pydantic import BaseModel 21 | 22 | from sneakpeek.scraper.model import ScraperContextABC, ScraperHandler 23 | 24 | 25 | # This defines model of handler parameters that are defined 26 | # in the scraper config and then passed to the handler 27 | class DemoScraperParams(BaseModel): 28 | url: str 29 | 30 | # This is a class which actually implements logic 31 | # Note that you need to inherit the implementation from 32 | # the `sneakpeek.scraper_handler.ScraperHandler` 33 | class DemoScraper(ScraperHandler): 34 | # You can have any dependencies you want and pass them 35 | # in the server configuration 36 | def __init__(self) -> None: 37 | self._logger = logging.getLogger(__name__) 38 | 39 | # Each handler must define its name so it later 40 | # can be referenced in scrapers' configuration 41 | @property 42 | def name(self) -> str: 43 | return "demo_scraper" 44 | 45 | # Some example function that processes the response 46 | # and extracts valuable information 47 | async def process_page(self, response: str): 48 | ... 49 | 50 | # This function is called by the worker to execute the logic 51 | # The only argument that is passed is `sneakpeek.scraper_context.ScraperContext` 52 | # It implements basic async HTTP client and also provides parameters 53 | # that are defined in the scraper config 54 | async def run(self, context: ScraperContextABC) -> str: 55 | params = DemoScraperParams.parse_obj(context.params) 56 | # Perform GET request to the URL defined in the scraper config 57 | response = await context.get(params.url) 58 | response_body = await response.text() 59 | 60 | # Perform some business logic on a response 61 | result = await self.process_page(response_body) 62 | 63 | # Return meaningful job summary - must return a string 64 | return json.dumps({ 65 | "processed_urls": 1, 66 | "found_results": len(result), 67 | }) 68 | 69 | 70 | Now that we have some scraper logic, let's make it run periodically. 71 | To do so let's configure **SneakpeekServer**: 72 | 73 | .. 
code-block:: python3 74 | 75 | # file: main.py 76 | 77 | import random 78 | from uuid import uuid4 79 | 80 | from demo.demo_scraper import DemoScraper 81 | from sneakpeek.logging import configure_logging 82 | from sneakpeek.middleware.parser import ParserMiddleware 83 | from sneakpeek.middleware.rate_limiter_middleware import ( 84 | RateLimiterMiddleware, 85 | RateLimiterMiddlewareConfig, 86 | ) 87 | from sneakpeek.middleware.requests_logging_middleware import RequestsLoggingMiddleware 88 | from sneakpeek.middleware.robots_txt_middleware import RobotsTxtMiddleware 89 | from sneakpeek.middleware.user_agent_injecter_middleware import ( 90 | UserAgentInjecterMiddleware, 91 | UserAgentInjecterMiddlewareConfig, 92 | ) 93 | from sneakpeek.queue.in_memory_storage import InMemoryQueueStorage 94 | from sneakpeek.queue.model import TaskPriority 95 | from sneakpeek.scheduler.in_memory_lease_storage import InMemoryLeaseStorage 96 | from sneakpeek.scheduler.model import TaskSchedule 97 | from sneakpeek.scraper.in_memory_storage import InMemoryScraperStorage 98 | from sneakpeek.scraper.model import Scraper 99 | from sneakpeek.server import SneakpeekServer 100 | 101 | 102 | def get_server(urls: list[str], is_read_only: bool) -> SneakpeekServer: 103 | handler = DemoScraper() 104 | return SneakpeekServer.create( 105 | handlers=[handler], 106 | scraper_storage=InMemoryScraperStorage([ 107 | Scraper( 108 | id=str(uuid4()), 109 | name=f"Demo Scraper", 110 | schedule=TaskSchedule.EVERY_MINUTE, 111 | handler=handler.name, 112 | config=ScraperConfig(params={"start_url": "http://example.com"}), 113 | schedule_priority=TaskPriority.NORMAL, 114 | ) 115 | ]), 116 | queue_storage=InMemoryQueueStorage(), 117 | lease_storage=InMemoryLeaseStorage(), 118 | middlewares=[ 119 | RequestsLoggingMiddleware(), 120 | RobotsTxtMiddleware(), 121 | RateLimiterMiddleware(RateLimiterMiddlewareConfig(max_rpm=60)), 122 | UserAgentInjecterMiddleware( 123 | UserAgentInjecterMiddlewareConfig(use_external_data=False) 124 | ), 125 | ParserMiddleware(), 126 | ], 127 | ) 128 | 129 | 130 | def main(): 131 | args = parser.parse_args() 132 | server = get_server(args.urls, args.read_only) 133 | configure_logging() 134 | server.serve() 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | 140 | 141 | 142 | Now, the only thing is left is to actually run the server: 143 | 144 | .. code-block:: bash 145 | 146 | python3 run main.py 147 | 148 | That's it! Now you can open http://localhost:8080 and explore the UI to see 149 | how you scraper is being automatically scheduled and executed. 150 | -------------------------------------------------------------------------------- /front/src/components/ScraperIdeComponent.vue: -------------------------------------------------------------------------------- 1 | 34 | 181 | 192 | -------------------------------------------------------------------------------- /docs/middleware/new_middleware.rst: -------------------------------------------------------------------------------- 1 | ################################ 2 | Implementing your own middleware 3 | ################################ 4 | 5 | The interface for middleware is defined in :py:class:`Middleware `. 6 | There are 3 ways how middleware can be used: 7 | 1. Perform custom logic before request is processed (implement `on_request` method) 8 | 2. Perform custom logic before response is returned to the scraper logic (implement `on_response` method) 9 | 3. 
Provide some additional functionality a for the scraper implementation - scraper can call any middleware method using :py:class:`ScraperContext `. Each middleware is added as an attribute to the passed context, so you can call it like :code:`context..(...)` 10 | 11 | 12 | ===================================== 13 | Middleware implementation example 14 | ===================================== 15 | 16 | ----------------------- 17 | On request middleware 18 | ----------------------- 19 | Each request is wrapped in the :py:class:`Request ` class 20 | and you can modify its parameters before it's dispatched, here's the schema: 21 | 22 | .. code-block:: python3 23 | 24 | @dataclass 25 | class Request: 26 | method: HttpMethod 27 | url: str 28 | headers: HttpHeaders | None = None 29 | kwargs: dict[str, Any] | None = None 30 | 31 | Here's the example of the middleware which logs each request URL: 32 | 33 | .. code-block:: python3 34 | 35 | import logging 36 | from typing import Any 37 | 38 | import aiohttp 39 | from pydantic import BaseModel 40 | 41 | from sneakpeek.middlewares.utils import parse_config_from_obj 42 | from sneakpeek.scraper.model import Middleware, Request 43 | 44 | 45 | # Each middleware can be configured, its configuration can be 46 | # set globally for all requests or it can be overriden for 47 | # specific scrapers 48 | class MyLoggingMiddlewareConfig(BaseModel): 49 | some_param: str = "defaul value" 50 | 51 | class MyMiddleware(BeforeRequestMiddleware): 52 | """Middleware description""" 53 | 54 | def __init__(self, default_config: MyLoggingMiddlewareConfig | None = None) -> None: 55 | self._default_config = default_config or MyLoggingMiddlewareConfig() 56 | self._logger = logging.getLogger(__name__) 57 | 58 | # The name property is mandatory, it's used in scraper config to override 59 | # middleware configuration for the given scraper 60 | @property 61 | def name(self) -> str: 62 | return "my_middleware" 63 | 64 | async def on_request(self, request: Request, config: Any | None) -> Request: 65 | # This converts freeform dictionary into a typed config (it's optional) 66 | config = parse_config_from_obj( 67 | config, 68 | self.name, 69 | MyLoggingMiddlewareConfig, 70 | self._default_config, 71 | ) 72 | self._logger.info(f"Making {request.method.upper()} to {request.url}. {config.some_param}") 73 | return request 74 | 75 | 76 | 77 | ----------------------- 78 | On response middleware 79 | ----------------------- 80 | 81 | On response method recieves both request and response objects. Response is `aiohttp.ClientResponse `_ object. 82 | 83 | 84 | Here's the example of the middleware which logs each response body: 85 | 86 | .. 
code-block:: python3 87 | 88 | import logging 89 | from typing import Any 90 | 91 | import aiohttp 92 | from pydantic import BaseModel 93 | 94 | from sneakpeek.middleware.base import parse_config_from_obj 95 | from sneakpeek.scraper.model import Middleware, Request 96 | 97 | 98 | # Each middleware can be configured, its configuration can be 99 | # set globally for all requests or it can be overriden for 100 | # specific scrapers 101 | class MyLoggingMiddlewareConfig(BaseModel): 102 | some_param: str = "defaul value" 103 | 104 | 105 | class MyOnResponseMiddleware(Middleware): 106 | """Middleware description""" 107 | 108 | def __init__(self, default_config: MyLoggingMiddlewareConfig | None = None) -> None: 109 | self._default_config = default_config or MyLoggingMiddlewareConfig() 110 | self._logger = logging.getLogger(__name__) 111 | 112 | # The name property is mandatory, it's used in scraper config to override 113 | # middleware configuration for the given scraper 114 | @property 115 | def name(self) -> str: 116 | return "my_middleware" 117 | 118 | async def on_response( 119 | self, 120 | request: Request, 121 | response: aiohttp.ClientResponse, 122 | config: Any | None, 123 | ) -> aiohttp.ClientResponse: 124 | config = parse_config_from_obj( 125 | config, 126 | self.name, 127 | MyLoggingMiddlewareConfig, 128 | self._default_config, 129 | ) 130 | response_body = await response.text() 131 | self._logger.info(f"Made {request.method.upper()} request to {request.url} - received: status={response.status} body={response_body}") 132 | return response 133 | 134 | ------------------------ 135 | Functional middleware 136 | ------------------------ 137 | 138 | If the middleware doesn't need to interact with the request or response you can derive it 139 | from :py:class:`BaseMiddleware `, so that both 140 | `on_request` and `on_response` method are implemented as pass-through. 141 | 142 | Here's an example of such implementation 143 | 144 | .. 
code-block:: python3 145 | 146 | import logging 147 | from typing import Any 148 | 149 | from sneakpeek.middleware.base import parse_config_from_obj, BaseMiddleware 150 | 151 | 152 | class MyFunctionalMiddleware(BaseMiddleware): 153 | """Middleware description""" 154 | 155 | def __init__(self) -> None: 156 | self._logger = logging.getLogger(__name__) 157 | 158 | # The name property is mandatory, it's used in scraper config to override 159 | # middleware configuration for the given scraper 160 | @property 161 | def name(self) -> str: 162 | return "my_middleware" 163 | 164 | # This function will be available for scrapers by using 165 | # `context.my_middleware.custom_funct(some_arg)` 166 | def custom_func(self, arg1: Any) -> Any: 167 | return do_something(arg1) 168 | 169 | -------------------------------------------------------------------------------- /front/src/layouts/MainLayout.vue: -------------------------------------------------------------------------------- 1 | 102 | 103 | 120 | -------------------------------------------------------------------------------- /front/src/components/ScraperJobs.vue: -------------------------------------------------------------------------------- 1 | 70 | 71 | 174 | 184 | -------------------------------------------------------------------------------- /sneakpeek/queue/consumer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from datetime import datetime, timedelta 4 | from traceback import format_exc 5 | 6 | from prometheus_client import Counter 7 | 8 | from sneakpeek.logging import task_context 9 | from sneakpeek.metrics import ( 10 | count_invocations, 11 | delay_histogram, 12 | measure_latency, 13 | replicas_gauge, 14 | ) 15 | from sneakpeek.queue.model import ( 16 | QueueABC, 17 | Task, 18 | TaskHandlerABC, 19 | TaskPingFinishedError, 20 | TaskStatus, 21 | TaskTimedOut, 22 | UnknownTaskHandlerError, 23 | ) 24 | 25 | POLL_DELAY = timedelta(milliseconds=100) 26 | TASK_PING_DELAY = timedelta(seconds=1) 27 | 28 | 29 | task_executed = Counter( 30 | name="task_executed", 31 | documentation="Tasks executed", 32 | namespace="sneakpeek", 33 | labelnames=["handler", "name", "status"], 34 | ) 35 | 36 | 37 | class Consumer: 38 | """ 39 | Generic queue consumer implementation 40 | """ 41 | 42 | def __init__( 43 | self, 44 | queue: QueueABC, 45 | handlers: list[TaskHandlerABC], 46 | loop: asyncio.AbstractEventLoop | None = None, 47 | max_concurrency: int = 50, 48 | poll_delay: timedelta = POLL_DELAY, 49 | ping_delay: timedelta = TASK_PING_DELAY, 50 | ) -> None: 51 | """ 52 | Args: 53 | queue (QueueABC): Queue implementation 54 | handlers (list[TaskHandlerABC]): List of the task handlers 55 | loop (asyncio.AbstractEventLoop | None, optional): asyncio loop. Defaults to asyncio.get_event_loop(). 56 | max_concurrency (int, optional): Maximum number of concurrent tasks that a consumer can handle. Defaults to 50. 57 | poll_delay (timedelta, optional): Delay between queue polling in case there are no items in the queue. Defaults to POLL_DELAY. 58 | ping_delay (timedelta, optional): Task heartbeat frequency. Defaults to TASK_PING_DELAY. 
59 | """ 60 | self.logger = logging.getLogger(__name__) 61 | self.queue = queue 62 | self.handlers = {handler.name(): handler for handler in handlers} 63 | self.max_concurrency = max_concurrency 64 | self.active: set[asyncio.Task] = set() 65 | self.loop = loop or asyncio.get_event_loop() 66 | self.ping_delay = ping_delay.total_seconds() 67 | self.poll_delay = poll_delay.total_seconds() 68 | self.running = False 69 | self.cycle_task: asyncio.Task | None = None 70 | 71 | async def _handle_task(self, handler: TaskHandlerABC, task: Task) -> str: 72 | with task_context(task): 73 | return await handler.process(task) 74 | 75 | @count_invocations(subsystem="consumer") 76 | async def process_task(self, task: Task) -> None: 77 | """Process dequeued task 78 | 79 | Args: 80 | task (Task): Dequeued Task 81 | 82 | Raises: 83 | UnknownTaskHandlerError: Raised when there's no handler for given task type 84 | TaskTimedOut: Raised when a task has exceeded maximum process time 85 | """ 86 | delay_histogram.labels(type="time_spent_in_queue").observe( 87 | (datetime.utcnow() - task.created_at).total_seconds() 88 | ) 89 | handler_task: asyncio.Task | None = None 90 | self.logger.info(f"Executing task id={task.id}") 91 | try: 92 | task.started_at = datetime.utcnow() 93 | task.status = TaskStatus.STARTED 94 | task = await self.queue.update_task(task) 95 | 96 | if task.task_handler not in self.handlers: 97 | raise UnknownTaskHandlerError(task.task_handler) 98 | handler = self.handlers[task.task_handler] 99 | handler_task = self.loop.create_task(self._handle_task(handler, task)) 100 | 101 | while not handler_task.done(): 102 | if task.timeout and datetime.utcnow() - task.started_at > task.timeout: 103 | raise TaskTimedOut() 104 | task = await self.queue.ping_task(task.id) 105 | await asyncio.sleep(self.ping_delay) 106 | 107 | result = handler_task.result() 108 | task.finished_at = datetime.utcnow() 109 | task.status = TaskStatus.SUCCEEDED 110 | task.result = result 111 | self.logger.info(f"Successfully executed task id={task.id}") 112 | except TaskPingFinishedError: 113 | if handler_task and not handler_task.done(): 114 | handler_task.cancel() 115 | self.logger.exception(f"Seems like task {task.id} was killed") 116 | except Exception: 117 | if handler_task and not handler_task.done(): 118 | handler_task.cancel() 119 | self.logger.exception(f"Failed to execute {task.id}") 120 | task.finished_at = datetime.utcnow() 121 | task.status = TaskStatus.FAILED 122 | task.result = format_exc() 123 | finally: 124 | try: 125 | task = await self.queue.update_task(task) 126 | task_executed.labels( 127 | handler=task.task_handler, 128 | name=task.task_name, 129 | status=task.status.name.lower(), 130 | ) 131 | except Exception: 132 | self.logger.exception(f"Failed to update task {task.id}") 133 | 134 | @measure_latency(subsystem="consumer") 135 | @count_invocations(subsystem="consumer") 136 | async def consume(self) -> bool: 137 | """Consume from the queue 138 | 139 | Returns: 140 | bool: True if anything has been consumed, False otherwise 141 | """ 142 | replicas_gauge.labels(type="active_tasks").set(len(self.active)) 143 | if len(self.active) >= self.max_concurrency: 144 | self.logger.debug( 145 | f"Not dequeuing any tasks because worker has reached max concurrency," 146 | f" there are {len(self.active)} of active tasks" 147 | ) 148 | return False 149 | 150 | dequeued = await self.queue.dequeue() 151 | if not dequeued: 152 | self.logger.debug("No pending tasks in the queue") 153 | return False 154 | 155 | 
self.logger.info(f"Dequeued a task id={dequeued.id}") 156 | task_handle = self.loop.create_task(self.process_task(dequeued)) 157 | self.active.add(task_handle) 158 | task_handle.add_done_callback(self.active.discard) 159 | return True 160 | 161 | async def _cycle(self): 162 | while self.running: 163 | if not await self.consume(): 164 | await asyncio.sleep(self.poll_delay) 165 | 166 | def start(self): 167 | """Start consuming from the queue""" 168 | self.running = True 169 | self.cycle_task = self.loop.create_task(self._cycle()) 170 | 171 | def stop(self): 172 | """Stop consuming from the queue""" 173 | self.running = False 174 | if self.cycle_task: 175 | self.cycle_task.cancel() 176 | --------------------------------------------------------------------------------