├── .dockerignore
├── .github
│   ├── ISSUE_TEMPLATE
│   │   └── bug_report.md
│   ├── actions
│   │   ├── push-to-helm
│   │   │   └── action.yaml
│   │   └── run-cypress-tests
│   │       └── action.yaml
│   └── workflows
│       ├── docker-image.yml
│       └── unit-tests.yml
├── .gitignore
├── .prettierignore
├── .python-version
├── FUNDING.yml
├── LICENSE
├── Makefile
├── README.md
├── api
│   └── backend
│       ├── __init__.py
│       ├── ai
│       │   ├── agent
│       │   │   ├── actions.py
│       │   │   ├── agent.py
│       │   │   ├── prompts.py
│       │   │   └── utils.py
│       │   ├── ai_router.py
│       │   └── clients.py
│       ├── app.py
│       ├── auth
│       │   ├── __init__.py
│       │   ├── auth_router.py
│       │   └── auth_utils.py
│       ├── constants.py
│       ├── database
│       │   ├── __init__.py
│       │   ├── common.py
│       │   ├── queries
│       │   │   ├── __init__.py
│       │   │   └── queries.py
│       │   ├── schema
│       │   │   ├── __init__.py
│       │   │   └── schema.py
│       │   └── startup.py
│       ├── job
│       │   ├── __init__.py
│       │   ├── cron_scheduling
│       │   │   └── cron_scheduling.py
│       │   ├── job.py
│       │   ├── models
│       │   │   ├── __init__.py
│       │   │   ├── job_options.py
│       │   │   └── site_map.py
│       │   ├── scraping
│       │   │   ├── add_custom.py
│       │   │   ├── collect_media.py
│       │   │   └── scraping_utils.py
│       │   ├── site_mapping
│       │   │   ├── __init__.py
│       │   │   └── site_mapping.py
│       │   └── utils
│       │       ├── clean_job_format.py
│       │       └── stream_md_from_job_results.py
│       ├── models.py
│       ├── routers
│       │   ├── job_router.py
│       │   └── stats_router.py
│       ├── scheduler.py
│       ├── schemas.py
│       ├── scraping.py
│       ├── tests
│       │   ├── factories
│       │   │   └── job_factory.py
│       │   ├── job
│       │   │   ├── __init__.py
│       │   │   └── test_download_job.py
│       │   └── scraping
│       │       ├── __init__.py
│       │       └── test_scraping.py
│       ├── utils.py
│       └── worker
│           ├── job_worker.py
│           ├── logger.py
│           └── post_job_complete
│               ├── discord_notification.py
│               ├── email_notifcation.py
│               ├── models.py
│               └── post_job_complete.py
├── cypress.config.ts
├── cypress
│   ├── e2e
│   │   ├── authentication.cy.ts
│   │   ├── navigation.cy.ts
│   │   └── submit-job.cy.ts
│   ├── fixtures
│   │   └── example.json
│   └── support
│       ├── commands.ts
│       └── e2e.ts
├── docker-compose.dev.yml
├── docker-compose.yml
├── docker
│   ├── api
│   │   └── Dockerfile
│   └── frontend
│       └── Dockerfile
├── docs
│   ├── chat_page.png
│   ├── docs_page.png
│   ├── job_page.png
│   ├── log_page.png
│   ├── login.png
│   ├── logo_picture.png
│   ├── main_page.png
│   └── stats_page.png
├── helm
│   ├── .helmignore
│   ├── Chart.yaml
│   ├── templates
│   │   ├── deployment.yaml
│   │   └── service.yaml
│   └── values.yaml
├── next-env.d.ts
├── next.config.mjs
├── package.json
├── pdm.lock
├── postcss.config.js
├── public
│   ├── favicon.ico
│   ├── images
│   │   └── scraperr_logo.png
│   ├── manifest.json
│   └── robots.txt
├── pyproject.toml
├── src
│   ├── components
│   │   ├── ai
│   │   │   ├── Chat.tsx
│   │   │   ├── JobSelector.tsx
│   │   │   └── index.ts
│   │   ├── common
│   │   │   ├── advanced-job-options
│   │   │   │   ├── advanced-job-options.tsx
│   │   │   │   ├── dialog
│   │   │   │   │   ├── advanced-job-options-dialog.tsx
│   │   │   │   │   └── index.ts
│   │   │   │   └── index.ts
│   │   │   ├── csv-table
│   │   │   │   ├── csv-table.tsx
│   │   │   │   └── index.ts
│   │   │   ├── disabled
│   │   │   │   ├── disabled.tsx
│   │   │   │   └── index.ts
│   │   │   ├── expanded-table-input
│   │   │   │   ├── expanded-table-input.tsx
│   │   │   │   └── index.ts
│   │   │   ├── index.ts
│   │   │   ├── job-download-dialog
│   │   │   │   ├── index.ts
│   │   │   │   └── job-download-dialog.tsx
│   │   │   ├── media-viewer
│   │   │   │   ├── audio
│   │   │   │   │   ├── audio-viewer.tsx
│   │   │   │   │   └── index.ts
│   │   │   │   ├── image
│   │   │   │   │   ├── image-viewer.tsx
│   │   │   │   │   └── index.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── media-viewer.tsx
│   │   │   │   ├── pdf-viewer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   └── pdf-viewer.tsx
│   │   │   │   ├── tile-grid-view
│   │   │   │   │   ├── index.ts
│   │   │   │   │   └── tile-grid-view.tsx
│   │   │   │   └── video
│   │   │   │       ├── index.ts
│   │   │   │       └── video-viewer.tsx
│   │   │   └── nav-drawer
│   │   │       ├── index.ts
│   │   │       ├── nav-drawer.module.css
│   │   │       ├── nav-drawer.tsx
│   │   │       ├── nav-item
│   │   │       │   ├── index.ts
│   │   │       │   └── nav-item.tsx
│   │   │       ├── nav-items
│   │   │       │   ├── index.ts
│   │   │       │   └── nav-items.tsx
│   │   │       └── user-control
│   │   │           ├── index.ts
│   │   │           ├── logged-in-control
│   │   │           │   ├── index.ts
│   │   │           │   ├── logged-in-control.module.css
│   │   │           │   └── logged-in-control.tsx
│   │   │           ├── logged-out-control
│   │   │           │   ├── index.ts
│   │   │           │   ├── logged-out-control.module.css
│   │   │           │   └── logged-out-control.tsx
│   │   │           ├── user-control.module.css
│   │   │           └── user-control.tsx
│   │   ├── jobs
│   │   │   ├── Favorites.tsx
│   │   │   ├── JobQueue.tsx
│   │   │   ├── JobTable.tsx
│   │   │   └── index.tsx
│   │   ├── nav
│   │   │   └── quick-settings
│   │   │       ├── index.ts
│   │   │       ├── quick-settings.module.css
│   │   │       └── quick-settings.tsx
│   │   ├── pages
│   │   │   ├── agent
│   │   │   │   ├── agent.tsx
│   │   │   │   └── index.ts
│   │   │   ├── chat
│   │   │   │   └── chat.tsx
│   │   │   ├── cron-jobs
│   │   │   │   ├── create-cron-jobs
│   │   │   │   │   ├── create-cron-jobs.tsx
│   │   │   │   │   └── index.ts
│   │   │   │   ├── cron-jobs.module.css
│   │   │   │   ├── cron-jobs.tsx
│   │   │   │   ├── get-server-side-props.ts
│   │   │   │   └── index.ts
│   │   │   ├── home
│   │   │   │   ├── home.tsx
│   │   │   │   └── index.ts
│   │   │   ├── job
│   │   │   │   └── csv
│   │   │   │       └── id
│   │   │   │           ├── get-server-side-props.ts
│   │   │   │           ├── id.tsx
│   │   │   │           └── index.ts
│   │   │   ├── media
│   │   │   │   └── id
│   │   │   │       ├── id.tsx
│   │   │   │       └── index.ts
│   │   │   └── recordings
│   │   │       └── id
│   │   │           ├── id.tsx
│   │   │           └── index.ts
│   │   └── submit
│   │       ├── index.ts
│   │       └── job-submitter
│   │           ├── element-table
│   │           │   ├── element-table.tsx
│   │           │   └── index.ts
│   │           ├── index.ts
│   │           ├── job-submitter-header
│   │           │   ├── index.ts
│   │           │   ├── job-submitter-header.module.css
│   │           │   └── job-submitter-header.tsx
│   │           ├── job-submitter-input
│   │           │   ├── index.ts
│   │           │   ├── job-submitter-input.module.css
│   │           │   └── job-submitter-input.tsx
│   │           ├── job-submitter-options
│   │           │   ├── index.ts
│   │           │   └── job-submitter-options.tsx
│   │           ├── job-submitter.tsx
│   │           ├── provider.tsx
│   │           └── site-map
│   │               ├── index.ts
│   │               ├── site-map-input
│   │               │   ├── index.ts
│   │               │   ├── site-map-input.module.css
│   │               │   └── site-map-input.tsx
│   │               └── site-map.tsx
│   ├── contexts
│   │   └── AuthContext.tsx
│   ├── declaration.d.ts
│   ├── lib
│   │   ├── constants.ts
│   │   ├── helpers
│   │   │   ├── index.ts
│   │   │   ├── parse-job-options.ts
│   │   │   ├── parse-json-to-entries.ts
│   │   │   └── validate-url.ts
│   │   ├── hooks
│   │   │   └── use-advanced-job-options
│   │   │       ├── index.ts
│   │   │       └── use-advanced-job-options.ts
│   │   ├── index.ts
│   │   └── utils.ts
│   ├── pages
│   │   ├── _app.tsx
│   │   ├── _document.tsx
│   │   ├── agent.tsx
│   │   ├── api
│   │   │   ├── ai
│   │   │   │   └── index.ts
│   │   │   ├── check.ts
│   │   │   ├── delete-cron-job.ts
│   │   │   ├── delete.ts
│   │   │   ├── download.ts
│   │   │   ├── get-average-element-per-link.ts
│   │   │   ├── get-average-jobs-per-day.ts
│   │   │   ├── job
│   │   │   │   └── [id].ts
│   │   │   ├── logs.ts
│   │   │   ├── me.ts
│   │   │   ├── media
│   │   │   │   ├── get-media.ts
│   │   │   │   └── index.ts
│   │   │   ├── recordings
│   │   │   │   └── [id].ts
│   │   │   ├── retrieve.ts
│   │   │   ├── schedule-cron-job.ts
│   │   │   ├── signup.ts
│   │   │   ├── submit-scrape-job.ts
│   │   │   ├── token.ts
│   │   │   └── update.ts
│   │   ├── chat.tsx
│   │   ├── cron-jobs.tsx
│   │   ├── index.tsx
│   │   ├── job
│   │   │   └── csv
│   │   │       └── [id].tsx
│   │   ├── jobs.tsx
│   │   ├── login.tsx
│   │   ├── media
│   │   │   └── index.tsx
│   │   ├── recordings
│   │   │   └── index.tsx
│   │   └── statistics.tsx
│   ├── services
│   │   ├── api-service
│   │   │   ├── api-service.ts
│   │   │   ├── functions
│   │   │   │   ├── index.ts
│   │   │   │   └── submit-job.ts
│   │   │   └── index.ts
│   │   └── index.ts
│   ├── store
│   │   ├── hooks.ts
│   │   ├── slices
│   │   │   └── settingsSlice.ts
│   │   └── store.ts
│   ├── styles
│   │   ├── globals.css
│   │   └── themes.ts
│   └── types
│       ├── element.ts
│       ├── index.ts
│       ├── job.ts
│       ├── message.ts
│       └── result.ts
├── start.sh
├── supervisord.conf
├── tailwind.config.js
├── tsconfig.json
└── yarn.lock
/.dockerignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | npm-debug.log
3 | Dockerfile
4 | .dockerignore
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: 'Bug reporting'
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Additional context**
32 | Add any other context about the problem here.
33 |
--------------------------------------------------------------------------------
/.github/actions/push-to-helm/action.yaml:
--------------------------------------------------------------------------------
1 | name: Publish Helm Chart
2 | description: Publish a Helm chart to a target repository
3 |
4 | inputs:
5 | app-repo-token:
6 | required: true
7 | description: "The token for the target repository"
8 |
9 | runs:
10 | using: 'composite'
11 | steps:
12 | - name: Checkout app repo
13 | uses: actions/checkout@v4
14 |
15 | - name: Set up Helm
16 | uses: azure/setup-helm@v3
17 |
18 | - name: Package Helm chart
19 | run: |
20 | mkdir -p packaged
21 | helm package helm -d packaged
22 | shell: bash
23 |
24 | - name: Clone target Helm repo
25 | run: |
26 | git clone https://github.com/jaypyles/helm.git target-repo
27 | cd target-repo
28 | git config user.name "github-actions"
29 | git config user.email "github-actions@github.com"
30 | git fetch origin gh-pages # Fetch gh-pages explicitly
31 | git checkout gh-pages # Checkout gh-pages branch
32 | git pull origin gh-pages # Pull latest changes from gh-pages
33 | shell: bash
34 |
35 | - name: Copy package and update index
36 | run: |
37 | APP_NAME="scraperr"
38 | mkdir -p target-repo/charts/$APP_NAME
39 | cp packaged/*.tgz target-repo/charts/$APP_NAME/
40 | cd target-repo/charts/$APP_NAME
41 | helm repo index . --url https://jaypyles.github.io/helm/charts/$APP_NAME
42 | shell: bash
43 |
44 | - name: Commit and push to target repo
45 | run: |
46 | cd target-repo
47 | git add charts/
48 | git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes"
49 | git push https://x-access-token:${{ inputs.app-repo-token }}@github.com/jaypyles/helm.git gh-pages
50 | shell: bash
--------------------------------------------------------------------------------
/.github/actions/run-cypress-tests/action.yaml:
--------------------------------------------------------------------------------
1 | name: Run Cypress Tests
2 |
3 | description: Run Cypress tests
4 |
5 | runs:
6 | using: "composite"
7 | steps:
8 | - name: Checkout code
9 | uses: actions/checkout@v4
10 |
11 | - name: Setup Node
12 | uses: actions/setup-node@v4
13 | with:
14 | node-version: 22
15 |
16 | - name: Setup Docker project
17 | shell: bash
18 | run: make build-ci up-ci
19 |
20 | - name: Install dependencies
21 | shell: bash
22 | run: yarn install
23 |
24 | - name: Wait for frontend to be ready
25 | shell: bash
26 | run: |
27 | for i in {1..10}; do
28 | curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0
29 | echo "Waiting for frontend to be ready... attempt $i"
30 | sleep 1
31 | done
32 | echo "Frontend failed to be ready after 10 retries"
33 | exit 1
34 |
35 | - name: Wait for backend to be ready
36 | shell: bash
37 | run: |
38 | for i in {1..10}; do
39 | curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0
40 | echo "Waiting for backend to be ready... attempt $i"
41 | sleep 1
42 | done
43 | echo "Backend failed to be ready after 10 retries"
44 | exit 1
45 |
46 | - name: Show backend logs on failure
47 | if: failure()
48 | shell: bash
49 | run: |
50 | echo "== Docker Containers =="
51 | docker ps -a
52 | echo "== Backend Logs =="
53 | docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs"
54 |
55 | - name: Run Cypress tests
56 | shell: bash
57 | run: npm run cy:run
58 |
59 |
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image
2 | on:
3 | workflow_dispatch:
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - name: Checkout
10 | uses: actions/checkout@v4
11 |
12 | - name: Get version from helm chart
13 | run: |
14 | VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ')
15 | echo "VERSION=$VERSION" >> $GITHUB_ENV
16 | echo "Version is $VERSION"
17 |
18 | - name: Login to Docker Hub
19 | uses: docker/login-action@v3
20 | with:
21 | username: ${{ secrets.DOCKERHUB_USERNAME }}
22 | password: ${{ secrets.DOCKERHUB_TOKEN }}
23 |
24 | - name: Set up Docker Buildx
25 | uses: docker/setup-buildx-action@v3
26 |
27 | - name: Build and push frontend
28 | uses: docker/build-push-action@v5
29 | with:
30 | context: .
31 | file: ./docker/frontend/Dockerfile
32 | push: true
33 | tags: |
34 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest
35 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }}
36 |
37 | - name: Build and push api
38 | uses: docker/build-push-action@v5
39 | with:
40 | context: .
41 | file: ./docker/api/Dockerfile
42 | push: true
43 | tags: |
44 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest
45 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }}
46 |
47 | push-helm-chart:
48 | runs-on: ubuntu-latest
49 | needs:
50 | - build
51 | steps:
52 | - uses: actions/checkout@v4
53 |
54 | - name: Push Helm Chart
55 | uses: ./.github/actions/push-to-helm
56 | with:
57 | app-repo-token: ${{ secrets.GPAT_TOKEN }}
58 |
59 | success-message:
60 | runs-on: ubuntu-latest
61 | needs:
62 | - build
63 | - push-helm-chart
64 | steps:
65 | - name: Send Discord Message
66 | uses: jaypyles/discord-webhook-action@v1.0.0
67 | with:
68 | webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
69 | content: "Scraperr Successfully Built Docker Images"
70 | username: "Scraperr CI"
71 | embed-title: "✅ Deployment Status"
72 | embed-description: "Scraperr successfully built docker images."
73 | embed-color: 3066993 # Green
74 | embed-footer-text: "Scraperr CI"
75 | embed-timestamp: ${{ github.event.head_commit.timestamp }}
76 |
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yml:
--------------------------------------------------------------------------------
1 | name: Unit Tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 |
8 | pull_request:
9 | types: [opened, synchronize, reopened]
10 |
11 | workflow_dispatch:
12 |
13 | jobs:
14 | unit-tests:
15 | runs-on: ubuntu-latest
16 | steps:
17 | - name: Checkout
18 | uses: actions/checkout@v4
19 |
20 | - name: Set env
21 | run: echo "ENV=test" >> $GITHUB_ENV
22 |
23 | - name: Install pdm
24 | run: pip install pdm
25 |
26 | - name: Install project dependencies
27 | run: pdm install
28 |
29 | - name: Install playwright
30 | run: pdm run playwright install
31 |
32 | - name: Run tests
33 | run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests
34 |
35 | cypress-tests:
36 | runs-on: ubuntu-latest
37 | steps:
38 | - uses: actions/checkout@v4
39 | - uses: ./.github/actions/run-cypress-tests
40 |
41 | success-message:
42 | runs-on: ubuntu-latest
43 | needs:
44 | - unit-tests
45 | - cypress-tests
46 | steps:
47 | - name: Send Discord Message
48 | uses: jaypyles/discord-webhook-action@v1.0.0
49 | with:
50 | webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }}
51 | content: "Scraperr Successfully Passed Tests"
52 | username: "Scraperr CI"
53 | embed-title: "✅ Deployment Status"
54 | embed-description: "Scraperr successfully passed all tests."
55 | embed-color: 3066993 # Green
56 | embed-footer-text: "Scraperr CI"
57 | embed-timestamp: ${{ github.event.head_commit.timestamp }}
58 |
--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | *.yaml
2 | *.yml
3 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.12
2 |
--------------------------------------------------------------------------------
/FUNDING.yml:
--------------------------------------------------------------------------------
1 | custom: ["https://www.buymeacoffee.com/jaypyles"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Jayden Pyles
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .DEFAULT_GOAL := help
2 |
3 | COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
4 | COMPOSE_PROD = docker compose -f docker-compose.yml
5 |
6 | .PHONY: help logs deps build build-force pull up up-dev down setup deploy build-ci up-ci cypress-start
7 |
8 | help:
9 | @echo "Usage:"
10 | @echo " make logs - Check Docker container logs"
11 | @echo " make deps - Build frontend assets"
12 | @echo " make build - Build Docker images"
13 | @echo " make build-force - Build Docker images without cache"
14 | @echo " make pull - Pull Docker images"
15 | @echo " make up - Start production environment"
16 | @echo " make up-dev - Start development environment"
17 | @echo " make down - Stop and remove containers, networks, images, and volumes"
18 | @echo " make setup - Setup server with dependencies and clone repo"
19 | @echo " make deploy - Deploy site onto server"
20 | @echo " make cypress-start - Start Cypress"
21 | @echo ""
22 |
23 | logs:
24 | docker compose logs -f
25 |
26 | deps:
27 | pdm install
28 | npm install
29 | npm run build
30 |
31 | build:
32 | $(COMPOSE_DEV) build
33 |
34 | build-force:
35 | $(COMPOSE_DEV) build --no-cache
36 |
37 | pull:
38 | docker compose pull
39 |
40 | up:
41 | $(COMPOSE_PROD) up -d --force-recreate
42 |
43 | up-dev:
44 | $(COMPOSE_DEV) up -d --force-recreate
45 |
46 | down:
47 | $(COMPOSE_DEV) down
48 | $(COMPOSE_PROD) down
49 |
50 | setup:
51 | ansible-playbook -i ./ansible/inventory.yaml ./ansible/setup.yaml
52 |
53 | deploy:
54 | ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v
55 |
56 | build-ci:
57 | docker compose -f docker-compose.yml -f docker-compose.dev.yml build
58 |
59 | up-ci:
60 | docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
61 |
62 | cypress-start:
63 | DISPLAY=:0 npx cypress open
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | **A powerful self-hosted web scraping solution**
5 |
6 |
12 |
13 |
14 | ## 📋 Overview
15 |
16 | Scrape websites without writing a single line of code.
17 |
18 | > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
19 |
20 |
21 |
22 |
23 |
24 | ## ✨ Key Features
25 |
26 | - **XPath-Based Extraction**: Precisely target page elements
27 | - **Queue Management**: Submit and manage multiple scraping jobs
28 | - **Domain Spidering**: Option to scrape all pages within the same domain
29 | - **Custom Headers**: Add JSON headers to your scraping requests
30 | - **Media Downloads**: Automatically download images, videos, and other media
31 | - **Results Visualization**: View scraped data in a structured table format
32 | - **Data Export**: Export your results in Markdown and CSV formats
33 | - **Notification Channels**: Send completion notifications through various channels
34 |
35 | ## 🚀 Getting Started
36 |
37 | ### Docker
38 |
39 | ```bash
40 | make up
41 | ```
42 |
43 | ### Helm
44 |
45 | > Refer to the docs for helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment
46 |
47 | ## ⚖️ Legal and Ethical Guidelines
48 |
49 | When using Scraperr, please remember to:
50 |
51 | 1. **Respect `robots.txt`**: Always check a website's `robots.txt` file to verify which pages permit scraping
52 | 2. **Terms of Service**: Adhere to each website's Terms of Service regarding data extraction
53 | 3. **Rate Limiting**: Implement reasonable delays between requests to avoid overloading servers
54 |
55 | > **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool.
56 |
57 | ## 💬 Join the Community
58 |
59 | Get support, report bugs, and chat with other users and contributors.
60 |
61 | 👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK)
62 |
63 | ## 📄 License
64 |
65 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
66 |
67 | ## 👏 Contributions
68 |
69 | Development made easier with the [webapp template](https://github.com/jaypyles/webapp-template).
70 |
71 | To get started, simply run `make build up-dev`.
--------------------------------------------------------------------------------
/api/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/__init__.py
--------------------------------------------------------------------------------
/api/backend/ai/agent/actions.py:
--------------------------------------------------------------------------------
1 | from typing_extensions import TypedDict
2 |
3 |
4 | class Action(TypedDict):
5 | type: str
6 | url: str
7 |
--------------------------------------------------------------------------------
/api/backend/ai/agent/agent.py:
--------------------------------------------------------------------------------
1 | import random
2 | from typing import Any
3 |
4 | from camoufox import AsyncCamoufox
5 | from playwright.async_api import Page
6 |
7 | from api.backend.ai.agent.utils import (
8 | capture_elements,
9 | convert_to_markdown,
10 | parse_response,
11 | )
12 |
13 | from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key
14 |
15 | from api.backend.ai.agent.prompts import (
16 | ELEMENT_EXTRACTION_PROMPT,
17 | EXTRACT_ELEMENTS_PROMPT,
18 | )
19 |
20 | from api.backend.job.scraping.collect_media import collect_media
21 | from api.backend.worker.logger import LOG
22 |
23 | from api.backend.job.scraping.add_custom import add_custom_items
24 |
25 | from api.backend.models import CapturedElement
26 |
27 |
28 | ask_ai = ask_open_ai if open_ai_key else ask_ollama
29 |
30 |
31 | async def scrape_with_agent(agent_job: dict[str, Any]):
32 | LOG.info(f"Starting work for agent job: {agent_job}")
33 | pages = set()
34 |
35 | if agent_job["job_options"]["proxies"]:
36 | proxy = random.choice(agent_job["job_options"]["proxies"])
37 | LOG.info(f"Using proxy: {proxy}")
38 |
39 | async with AsyncCamoufox(headless=True) as browser:
40 | page: Page = await browser.new_page()
41 |
42 | await add_custom_items(
43 | agent_job["url"],
44 | page,
45 | agent_job["job_options"]["custom_cookies"],
46 | agent_job["job_options"]["custom_headers"],
47 | )
48 |
49 | try:
50 | await page.set_viewport_size({"width": 1920, "height": 1080})
51 | await page.goto(agent_job["url"], timeout=60000)
52 |
53 | if agent_job["job_options"]["collect_media"]:
54 | await collect_media(agent_job["id"], page)
55 |
56 | html_content = await page.content()
57 | markdown_content = convert_to_markdown(html_content)
58 |
59 | response = await ask_ai(
60 | ELEMENT_EXTRACTION_PROMPT.format(
61 | extraction_prompt=EXTRACT_ELEMENTS_PROMPT,
62 | webpage=markdown_content,
63 | prompt=agent_job["prompt"],
64 | )
65 | )
66 |
67 | xpaths = parse_response(response)
68 |
69 | captured_elements = await capture_elements(page, xpaths)
70 |
71 | final_url = page.url
72 |
73 | pages.add((html_content, final_url))
74 | finally:
75 | await page.close()
76 | await browser.close()
77 |
78 | name_to_elements = {}
79 |
80 | for page in pages:
81 | for element in captured_elements:
82 | if element.name not in name_to_elements:
83 | name_to_elements[element.name] = []
84 |
85 | name_to_elements[element.name].append(element)
86 |
87 | scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [
88 | {
89 | page[1]: name_to_elements,
90 | }
91 | for page in pages
92 | ]
93 |
94 | return scraped_elements
95 |
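A note on the input shape: `scrape_with_agent` indexes `url`, `id`, `prompt`, and the `job_options` keys `proxies`, `custom_cookies`, `custom_headers`, and `collect_media` directly, so all must be present in the job dict. A minimal invocation sketch (field values illustrative; assumes an AI backend is configured via `OPENAI_KEY` or `OLLAMA_URL` and a Camoufox browser is installed):

```python
import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

# Every key below is read by scrape_with_agent or the helpers it calls.
agent_job = {
    "id": "job-1",
    "url": "https://example.com",
    "prompt": "Extract the page title",
    "job_options": {
        "proxies": [],
        "custom_cookies": [],
        "custom_headers": {},
        "collect_media": False,
    },
}

# Returns a list of {final_url: {element_name: [CapturedElement, ...]}}.
results = asyncio.run(scrape_with_agent(agent_job))
print(results)
```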
--------------------------------------------------------------------------------
/api/backend/ai/agent/prompts.py:
--------------------------------------------------------------------------------
1 | EXTRACT_ELEMENTS_PROMPT = """
2 | You are an assistant that extracts XPath expressions from webpages.
3 |
4 | You will receive HTML content in markdown format.
5 |
6 | Each element in the markdown has its XPath shown above it in a path like:
7 |
8 |
9 | Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags.
10 |
11 | You will also decide what to do next. If there is no decision available, return nothing for that section.
12 | """
13 |
14 | ELEMENT_EXTRACTION_PROMPT = """
15 | {extraction_prompt}
16 |
17 | **Guidelines:**
18 | - Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`.
19 | - Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`.
20 | - Do **not** chain multiple elements deeply (e.g., `//div/span/a`).
21 | - Use XPaths further down the tree when possible.
22 | - Do not include any extra explanation or text.
23 | - One XPath is acceptable if that's all that's needed.
24 | - Try and limit it down to 1 - 3 xpaths.
25 | - Include a name for each xpath.
26 |
27 |
28 | - USE THE MOST SIMPLE XPATHS POSSIBLE.
29 | - USE THE MOST GENERAL XPATHS POSSIBLE.
30 | - USE THE MOST SPECIFIC XPATHS POSSIBLE.
31 | - USE THE MOST GENERAL XPATHS POSSIBLE.
32 |
33 |
34 | **Example Format:**
35 | ```xml
36 | <xpaths>
37 | - <name>: <xpath>
38 | - <name>: <xpath>
39 | - <name>: <xpath>
40 | - <name>: <xpath>
41 | - <name>: <xpath>
42 | - etc
43 | </xpaths>
44 |
45 |
46 | <decision>
47 | - //a[@href='next_page_url']
48 | </decision>
49 |
50 | ```
51 |
52 | **Input webpage:**
53 | {webpage}
54 |
55 | **Target content:**
56 | {prompt}
57 |
58 | """
59 |
--------------------------------------------------------------------------------
/api/backend/ai/ai_router.py:
--------------------------------------------------------------------------------
1 | # STL
2 | import logging
3 | from collections.abc import Iterable, AsyncGenerator
4 |
5 | # PDM
6 | from fastapi import APIRouter
7 | from fastapi.responses import JSONResponse, StreamingResponse
8 | from openai.types.chat import ChatCompletionMessageParam
9 |
10 | # LOCAL
11 | from ollama import Message
12 | from api.backend.models import AI
13 |
14 | from api.backend.ai.clients import (
15 | llama_client,
16 | llama_model,
17 | openai_client,
18 | open_ai_model,
19 | open_ai_key,
20 | )
21 |
22 |
23 | LOG = logging.getLogger(__name__)
24 |
25 | ai_router = APIRouter()
26 |
27 |
28 | async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]:
29 | if llama_client and llama_model:
30 | try:
31 | async for part in await llama_client.chat(
32 | model=llama_model, messages=chat_messages, stream=True
33 | ):
34 | yield part["message"]["content"]
35 | except Exception as e:
36 | LOG.error(f"Error during chat: {e}")
37 | yield "An error occurred while processing your request."
38 |
39 |
40 | async def openai_chat(
41 | chat_messages: Iterable[ChatCompletionMessageParam],
42 | ) -> AsyncGenerator[str, None]:
43 | if openai_client and not open_ai_model:
44 | LOG.error("OpenAI model is not set")
45 | yield "An error occurred while processing your request."
46 |
47 | if not openai_client:
48 | LOG.error("OpenAI client is not set")
49 | yield "An error occurred while processing your request."
50 |
51 | if openai_client and open_ai_model:
52 | try:
53 | response = openai_client.chat.completions.create(
54 | model=open_ai_model, messages=chat_messages, stream=True
55 | )
56 | for part in response:
57 | yield part.choices[0].delta.content or ""
58 | except Exception as e:
59 | LOG.error(f"Error during OpenAI chat: {e}")
60 | yield "An error occurred while processing your request."
61 |
62 |
63 | chat_function = llama_chat if llama_client else openai_chat
64 |
65 |
66 | @ai_router.post("/ai")
67 | async def ai(c: AI):
68 | return StreamingResponse(
69 | chat_function(chat_messages=c.messages), media_type="text/plain"
70 | )
71 |
72 |
73 | @ai_router.get("/ai/check")
74 | async def check():
75 | return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)})
76 |
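Since `/ai` returns a plain-text `StreamingResponse`, clients can render tokens as they arrive. A minimal consumer sketch using `httpx`; the localhost base URL is an assumption for a local deployment (the app is mounted under `root_path="/api"`, see app.py below):

```python
import asyncio

import httpx  # third-party HTTP client, assumed available


async def stream_chat() -> None:
    payload = {"messages": [{"role": "user", "content": "Hello"}]}

    # Host and port are assumptions for a local deployment.
    async with httpx.AsyncClient(base_url="http://localhost:8000/api") as client:
        async with client.stream("POST", "/ai", json=payload) as response:
            async for chunk in response.aiter_text():
                print(chunk, end="", flush=True)


asyncio.run(stream_chat())
```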
--------------------------------------------------------------------------------
/api/backend/ai/clients.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from openai import OpenAI
4 | from ollama import AsyncClient
5 |
6 |
7 | # Load environment variables
8 | open_ai_key = os.getenv("OPENAI_KEY")
9 | open_ai_model = os.getenv("OPENAI_MODEL")
10 | llama_url = os.getenv("OLLAMA_URL")
11 | llama_model = os.getenv("OLLAMA_MODEL")
12 |
13 | # Initialize clients
14 | openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None
15 | llama_client = AsyncClient(host=llama_url) if llama_url else None
16 |
17 |
18 | async def ask_open_ai(prompt: str) -> str:
19 | if not openai_client:
20 | raise ValueError("OpenAI client not initialized")
21 |
22 | response = openai_client.chat.completions.create(
23 | model=open_ai_model or "gpt-4.1-mini",
24 | messages=[{"role": "user", "content": prompt}],
25 | )
26 |
27 | return response.choices[0].message.content or ""
28 |
29 |
30 | async def ask_ollama(prompt: str) -> str:
31 | if not llama_client:
32 | raise ValueError("Ollama client not initialized")
33 |
34 | response = await llama_client.chat(
35 | model=llama_model or "", messages=[{"role": "user", "content": prompt}]
36 | )
37 |
38 | return response.message.content or ""
39 |
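Backend selection happens once, from environment variables, and agent.py above picks a backend with the same `ask_open_ai if open_ai_key else ask_ollama` rule. A runnable sketch of that fallback (the `ask` alias is local to the example, not a repo symbol):

```python
import asyncio

from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key

# Same selection rule the agent uses: OpenAI when a key is present,
# Ollama otherwise.
ask = ask_open_ai if open_ai_key else ask_ollama


async def main() -> None:
    # Requires OPENAI_KEY, or OLLAMA_URL plus OLLAMA_MODEL, to be exported.
    print(await ask("Summarize what Scraperr does in one sentence."))


asyncio.run(main())
```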
--------------------------------------------------------------------------------
/api/backend/app.py:
--------------------------------------------------------------------------------
1 | # STL
2 | import os
3 | import logging
4 | import apscheduler # type: ignore
5 | from contextlib import asynccontextmanager
6 |
7 | # PDM
8 | import apscheduler.schedulers
9 | import apscheduler.schedulers.background
10 | from fastapi import FastAPI, Request, status
11 | from fastapi.exceptions import RequestValidationError
12 | from fastapi.middleware.cors import CORSMiddleware
13 |
14 | # LOCAL
15 | from api.backend.ai.ai_router import ai_router
16 | from api.backend.auth.auth_router import auth_router
17 | from api.backend.utils import get_log_level
18 | from api.backend.routers.job_router import job_router
19 | from api.backend.routers.stats_router import stats_router
20 | from api.backend.database.startup import init_database
21 | from fastapi.responses import JSONResponse
22 |
23 | from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler
24 | from api.backend.scheduler import scheduler
25 |
26 | log_level = os.getenv("LOG_LEVEL")
27 | LOG_LEVEL = get_log_level(log_level)
28 |
29 | logging.basicConfig(
30 | level=LOG_LEVEL,
31 | format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
32 | handlers=[logging.StreamHandler()],
33 | )
34 |
35 | LOG = logging.getLogger(__name__)
36 |
37 |
38 | @asynccontextmanager
39 | async def lifespan(app: FastAPI):
40 | # Startup
41 | LOG.info("Starting application...")
42 |
43 | init_database()
44 |
45 | LOG.info("Starting cron scheduler...")
46 | start_cron_scheduler(scheduler)
47 | scheduler.start()
48 | LOG.info("Cron scheduler started successfully")
49 |
50 | yield
51 |
52 | # Shutdown
53 | LOG.info("Shutting down application...")
54 | LOG.info("Stopping cron scheduler...")
55 | scheduler.shutdown(wait=False) # Set wait=False to not block shutdown
56 | LOG.info("Cron scheduler stopped")
57 | LOG.info("Application shutdown complete")
58 |
59 |
60 | app = FastAPI(title="api", root_path="/api", lifespan=lifespan)
61 |
62 | app.add_middleware(
63 | CORSMiddleware,
64 | allow_origins=["*"],
65 | allow_credentials=True,
66 | allow_methods=["*"],
67 | allow_headers=["*"],
68 | )
69 |
70 | app.include_router(auth_router)
71 | app.include_router(ai_router)
72 | app.include_router(job_router)
73 | app.include_router(stats_router)
74 |
75 |
76 | @app.exception_handler(RequestValidationError)
77 | async def validation_exception_handler(request: Request, exc: RequestValidationError):
78 | exc_str = f"{exc}".replace("\n", " ").replace("   ", " ")
79 | logging.error(f"{request}: {exc_str}")
80 | content = {"status_code": 10422, "message": exc_str, "data": None}
81 | return JSONResponse(
82 | content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY
83 | )
84 |
--------------------------------------------------------------------------------
/api/backend/auth/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/auth/__init__.py
--------------------------------------------------------------------------------
/api/backend/auth/auth_router.py:
--------------------------------------------------------------------------------
1 | # STL
2 | from datetime import timedelta
3 | import os
4 |
5 | # PDM
6 | from fastapi import Depends, APIRouter, HTTPException, status
7 | from fastapi.security import OAuth2PasswordRequestForm
8 |
9 | # LOCAL
10 | from api.backend.schemas import User, Token, UserCreate
11 | from api.backend.auth.auth_utils import (
12 | ACCESS_TOKEN_EXPIRE_MINUTES,
13 | get_current_user,
14 | authenticate_user,
15 | get_password_hash,
16 | create_access_token,
17 | )
18 | import logging
19 |
20 | from api.backend.database.common import update
21 |
22 | auth_router = APIRouter()
23 |
24 | LOG = logging.getLogger("auth_router")
25 |
26 |
27 | @auth_router.post("/auth/token", response_model=Token)
28 | async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
29 | user = await authenticate_user(form_data.username, form_data.password)
30 | if not user:
31 | raise HTTPException(
32 | status_code=status.HTTP_401_UNAUTHORIZED,
33 | detail="Incorrect username or password",
34 | headers={"WWW-Authenticate": "Bearer"},
35 | )
36 |
37 | expire_minutes = (
38 | int(ACCESS_TOKEN_EXPIRE_MINUTES) if ACCESS_TOKEN_EXPIRE_MINUTES else 60
39 | )
40 |
41 | access_token_expires = timedelta(minutes=expire_minutes)
42 | access_token = create_access_token(
43 | data={"sub": user.email}, expires_delta=access_token_expires
44 | )
45 |
46 | return {"access_token": access_token, "token_type": "bearer"}
47 |
48 |
49 | @auth_router.post("/auth/signup", response_model=User)
50 | async def create_user(user: UserCreate):
51 | hashed_password = get_password_hash(user.password)
52 | user_dict = user.model_dump()
53 | user_dict["hashed_password"] = hashed_password
54 | del user_dict["password"]
55 |
56 | query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
57 | _ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"]))
58 |
59 | return user_dict
60 |
61 |
62 | @auth_router.get("/auth/users/me", response_model=User)
63 | async def read_users_me(current_user: User = Depends(get_current_user)):
64 | return current_user
65 |
66 |
67 | @auth_router.get("/auth/check")
68 | async def check_auth():
69 | return {
70 | "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True",
71 | "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower()
72 | == "true",
73 | }
74 |
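`OAuth2PasswordRequestForm` means the token endpoint takes form-encoded `username`/`password` fields, not JSON. A login sketch against a local deployment (base URL and credentials are assumptions):

```python
import httpx  # assumed available

BASE = "http://localhost:8000/api"  # assumption: local deployment

# The token endpoint reads form fields, per OAuth2PasswordRequestForm.
resp = httpx.post(
    BASE + "/auth/token",
    data={"username": "user@example.com", "password": "secret"},
)
resp.raise_for_status()
token = resp.json()["access_token"]

# The bearer token then authorizes /auth/users/me.
me = httpx.get(BASE + "/auth/users/me", headers={"Authorization": f"Bearer {token}"})
print(me.json())
```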
--------------------------------------------------------------------------------
/api/backend/constants.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import os
3 |
4 | DATABASE_PATH = "data/database.db"
5 | RECORDINGS_DIR = Path("media/recordings")
6 | RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
7 | MEDIA_DIR = Path("media")
8 | MEDIA_TYPES = [
9 | "audio",
10 | "documents",
11 | "images",
12 | "pdfs",
13 | "presentations",
14 | "spreadsheets",
15 | "videos",
16 | ]
17 |
--------------------------------------------------------------------------------
/api/backend/database/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import insert, QUERIES, update
2 |
3 | __all__ = ["insert", "QUERIES", "update"]
4 |
--------------------------------------------------------------------------------
/api/backend/database/common.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | from typing import Any, Optional
3 | from api.backend.constants import DATABASE_PATH
4 | from api.backend.utils import format_json, format_sql_row_to_python
5 | from api.backend.database.schema import INIT_QUERY
6 | from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
7 | import logging
8 |
9 | LOG = logging.getLogger(__name__)
10 |
11 |
12 | def connect():
13 | connection = sqlite3.connect(DATABASE_PATH)
14 | connection.set_trace_callback(print)
15 | cursor = connection.cursor()
16 | return cursor
17 |
18 |
19 | def insert(query: str, values: tuple[Any, ...]):
20 | connection = sqlite3.connect(DATABASE_PATH)
21 | cursor = connection.cursor()
22 | copy = list(values)
23 | format_json(copy)
24 |
25 | try:
26 | _ = cursor.execute(query, copy)
27 | connection.commit()
28 | except sqlite3.Error as e:
29 | LOG.error(f"An error occurred: {e}")
30 | finally:
31 | cursor.close()
32 | connection.close()
33 |
34 |
35 | def query(query: str, values: Optional[tuple[Any, ...]] = None):
36 | connection = sqlite3.connect(DATABASE_PATH)
37 | connection.row_factory = sqlite3.Row
38 | cursor = connection.cursor()
39 | rows = []
40 | try:
41 | if values:
42 | _ = cursor.execute(query, values)
43 | else:
44 | _ = cursor.execute(query)
45 |
46 | rows = cursor.fetchall()
47 |
48 | finally:
49 | cursor.close()
50 | connection.close()
51 |
52 | formatted_rows: list[dict[str, Any]] = []
53 |
54 | for row in rows:
55 | row = dict(row)
56 | formatted_row = format_sql_row_to_python(row)
57 | formatted_rows.append(formatted_row)
58 |
59 | return formatted_rows
60 |
61 |
62 | def update(query: str, values: Optional[tuple[Any, ...]] = None):
63 | connection = sqlite3.connect(DATABASE_PATH)
64 | cursor = connection.cursor()
65 |
66 | copy = None
67 |
68 | if values:
69 | copy = list(values)
70 | format_json(copy)
71 |
72 | try:
73 | if copy:
74 | res = cursor.execute(query, copy)
75 | else:
76 | res = cursor.execute(query)
77 | connection.commit()
78 | return res.rowcount
79 | except sqlite3.Error as e:
80 | LOG.error(f"An error occurred: {e}")
81 | finally:
82 | cursor.close()
83 | connection.close()
84 |
85 | return 0
86 |
87 |
88 | QUERIES = {
89 | "init": INIT_QUERY,
90 | "insert_job": JOB_INSERT_QUERY,
91 | "delete_job": DELETE_JOB_QUERY,
92 | }
93 |
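Each helper opens and closes its own connection, so callers just pass SQL plus a values tuple. A usage sketch against the `users` table from the schema below; it assumes `format_json` (defined in api/backend/utils.py, outside this section) JSON-encodes any dict or list values before binding:

```python
from api.backend.database.common import insert, query, update

# insert() runs format_json over the values before binding them.
insert(
    "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)",
    ("demo@example.com", "not-a-real-hash", "Demo User"),
)

# query() returns rows as dicts, run through format_sql_row_to_python.
rows = query("SELECT * FROM users WHERE email = ?", ("demo@example.com",))
print(rows)

# update() returns the affected row count, or 0 on error.
count = update(
    "UPDATE users SET full_name = ? WHERE email = ?",
    ("Renamed User", "demo@example.com"),
)
print(f"updated {count} row(s)")
```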
--------------------------------------------------------------------------------
/api/backend/database/queries/__init__.py:
--------------------------------------------------------------------------------
1 | from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY
2 |
3 | __all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"]
4 |
--------------------------------------------------------------------------------
/api/backend/database/queries/queries.py:
--------------------------------------------------------------------------------
1 | JOB_INSERT_QUERY = """
2 | INSERT INTO jobs
3 | (id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt)
4 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
5 | """
6 |
7 | DELETE_JOB_QUERY = """
8 | DELETE FROM jobs WHERE id IN ()
9 | """
10 |
--------------------------------------------------------------------------------
/api/backend/database/schema/__init__.py:
--------------------------------------------------------------------------------
1 | from .schema import INIT_QUERY
2 |
3 | __all__ = ["INIT_QUERY"]
4 |
--------------------------------------------------------------------------------
/api/backend/database/schema/schema.py:
--------------------------------------------------------------------------------
1 | INIT_QUERY = """
2 | CREATE TABLE IF NOT EXISTS jobs (
3 | id STRING PRIMARY KEY NOT NULL,
4 | url STRING NOT NULL,
5 | elements JSON NOT NULL,
6 | user STRING,
7 | time_created DATETIME NOT NULL,
8 | result JSON NOT NULL,
9 | status STRING NOT NULL,
10 | chat JSON,
11 | job_options JSON
12 | );
13 |
14 | CREATE TABLE IF NOT EXISTS users (
15 | email STRING PRIMARY KEY NOT NULL,
16 | hashed_password STRING NOT NULL,
17 | full_name STRING,
18 | disabled BOOLEAN
19 | );
20 |
21 | CREATE TABLE IF NOT EXISTS cron_jobs (
22 | id STRING PRIMARY KEY NOT NULL,
23 | user_email STRING NOT NULL,
24 | job_id STRING NOT NULL,
25 | cron_expression STRING NOT NULL,
26 | time_created DATETIME NOT NULL,
27 | time_updated DATETIME NOT NULL,
28 | FOREIGN KEY (job_id) REFERENCES jobs(id)
29 | );
30 |
31 | ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE;
32 | ALTER TABLE jobs ADD COLUMN prompt STRING;
33 | """
34 |
--------------------------------------------------------------------------------
/api/backend/database/startup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from api.backend.database.common import connect, QUERIES, insert
3 | import logging
4 | import sqlite3
5 |
6 | from api.backend.auth.auth_utils import get_password_hash
7 |
8 | LOG = logging.getLogger(__name__)
9 |
10 |
11 | def init_database():
12 | cursor = connect()
13 |
14 | for query in QUERIES["init"].strip().split(";"):
15 | query = query.strip()
16 | if not query:
17 | continue
18 |
19 | try:
20 | LOG.info(f"Executing query: {query}")
21 | _ = cursor.execute(query)
22 | except sqlite3.OperationalError as e:
23 | if "duplicate column name" in str(e).lower():
24 | LOG.warning(f"Skipping duplicate column error: {e}")
25 | continue
26 | else:
27 | LOG.error(f"Error executing query: {query}")
28 | raise
29 |
30 | if os.environ.get("REGISTRATION_ENABLED", "true").lower() == "false":
31 | default_user_email = os.environ.get("DEFAULT_USER_EMAIL")
32 | default_user_password = os.environ.get("DEFAULT_USER_PASSWORD")
33 | default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME")
34 |
35 | if (
36 | not default_user_email
37 | or not default_user_password
38 | or not default_user_full_name
39 | ):
40 | LOG.error(
41 | "DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!"
42 | )
43 | exit(1)
44 |
45 | query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)"
46 | _ = insert(
47 | query,
48 | (
49 | default_user_email,
50 | get_password_hash(default_user_password),
51 | default_user_full_name,
52 | ),
53 | )
54 |
55 | cursor.close()
56 |
--------------------------------------------------------------------------------
/api/backend/job/__init__.py:
--------------------------------------------------------------------------------
1 | from .job import (
2 | insert,
3 | update_job,
4 | delete_jobs,
5 | get_jobs_per_day,
6 | get_queued_job,
7 | average_elements_per_link,
8 | )
9 |
10 | __all__ = [
11 | "insert",
12 | "update_job",
13 | "delete_jobs",
14 | "get_jobs_per_day",
15 | "get_queued_job",
16 | "average_elements_per_link",
17 | ]
18 |
--------------------------------------------------------------------------------
/api/backend/job/cron_scheduling/cron_scheduling.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from typing import Any
3 | import uuid
4 | from api.backend.database.common import insert, query
5 | from api.backend.models import CronJob
6 | from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
7 | from apscheduler.triggers.cron import CronTrigger # type: ignore
8 |
9 | from api.backend.job import insert as insert_job
10 | import logging
11 |
12 | LOG = logging.getLogger("Cron Scheduler")
13 |
14 |
15 | def insert_cron_job(cron_job: CronJob):
16 | query = """
17 | INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated)
18 | VALUES (?, ?, ?, ?, ?, ?)
19 | """
20 | values = (
21 | cron_job.id,
22 | cron_job.user_email,
23 | cron_job.job_id,
24 | cron_job.cron_expression,
25 | cron_job.time_created,
26 | cron_job.time_updated,
27 | )
28 |
29 | insert(query, values)
30 |
31 | return True
32 |
33 |
34 | def delete_cron_job(id: str, user_email: str):
35 | query = """
36 | DELETE FROM cron_jobs
37 | WHERE id = ? AND user_email = ?
38 | """
39 | values = (id, user_email)
40 | insert(query, values)
41 |
42 | return True
43 |
44 |
45 | def get_cron_jobs(user_email: str):
46 | cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,))
47 |
48 | return cron_jobs
49 |
50 |
51 | def get_all_cron_jobs():
52 | cron_jobs = query("SELECT * FROM cron_jobs")
53 |
54 | return cron_jobs
55 |
56 |
57 | def insert_job_from_cron_job(job: dict[str, Any]):
58 | insert_job(
59 | {
60 | **job,
61 | "id": uuid.uuid4().hex,
62 | "status": "Queued",
63 | "result": "",
64 | "chat": None,
65 | "time_created": datetime.datetime.now(),
66 | "time_updated": datetime.datetime.now(),
67 | }
68 | )
69 |
70 |
71 | def get_cron_job_trigger(cron_expression: str):
72 | expression_parts = cron_expression.split()
73 |
74 | if len(expression_parts) != 5:
75 | print(f"Invalid cron expression: {cron_expression}")
76 | return None
77 |
78 | minute, hour, day, month, day_of_week = expression_parts
79 |
80 | return CronTrigger(
81 | minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week
82 | )
83 |
84 |
85 | def start_cron_scheduler(scheduler: BackgroundScheduler):
86 | cron_jobs = get_all_cron_jobs()
87 |
88 | LOG.info(f"Cron jobs: {cron_jobs}")
89 |
90 | for job in cron_jobs:
91 | queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],))
92 |
93 | LOG.info(f"Adding job: {queried_job}")
94 |
95 | scheduler.add_job(
96 | insert_job_from_cron_job,
97 | get_cron_job_trigger(job["cron_expression"]),
98 | id=job["id"],
99 | args=[queried_job[0]],
100 | )
101 |
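The stored `cron_expression` is a standard five-field string that `get_cron_job_trigger` unpacks into APScheduler's `CronTrigger`. A self-contained sketch of the same mapping with a no-op job (the real scheduler runs `insert_job_from_cron_job` instead):

```python
import time

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

# "* * * * *" -> minute, hour, day, month, day_of_week (every minute).
minute, hour, day, month, day_of_week = "* * * * *".split()
trigger = CronTrigger(
    minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week
)

scheduler = BackgroundScheduler()
scheduler.add_job(lambda: print("tick"), trigger, id="demo")
scheduler.start()

time.sleep(65)  # let the job fire at least once
scheduler.shutdown()
```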
--------------------------------------------------------------------------------
/api/backend/job/job.py:
--------------------------------------------------------------------------------
1 | # STL
2 | import logging
3 | from typing import Any
4 |
5 | # LOCAL
6 | from api.backend.utils import format_list_for_query
7 | from api.backend.database.common import (
8 | insert as common_insert,
9 | query as common_query,
10 | QUERIES,
11 | update as common_update,
12 | )
13 |
14 | LOG = logging.getLogger(__name__)
15 |
16 |
17 | def insert(item: dict[str, Any]) -> None:
18 | common_insert(
19 | QUERIES["insert_job"],
20 | (
21 | item["id"],
22 | item["url"],
23 | item["elements"],
24 | item["user"],
25 | item["time_created"],
26 | item["result"],
27 | item["status"],
28 | item["chat"],
29 | item["job_options"],
30 | item["agent_mode"],
31 | item["prompt"],
32 | ),
33 | )
34 | LOG.info(f"Inserted item: {item}")
35 |
36 |
37 | async def get_queued_job():
38 | query = (
39 | "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
40 | )
41 | res = common_query(query)
42 | LOG.info(f"Got queued job: {res}")
43 | return res[0] if res else None
44 |
45 |
46 | async def update_job(ids: list[str], field: str, value: Any):
47 | query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
48 | res = common_update(query, tuple([value] + ids))
49 | LOG.info(f"Updated job: {res}")
50 |
51 |
52 | async def delete_jobs(jobs: list[str]):
53 | if not jobs:
54 | LOG.info("No jobs to delete.")
55 | return False
56 |
57 | query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}"
58 | res = common_update(query, tuple(jobs))
59 |
60 | return res > 0
61 |
62 |
63 | async def average_elements_per_link(user: str):
64 | job_query = """
65 | SELECT
66 | DATE(time_created) AS date,
67 | AVG(json_array_length(elements)) AS average_elements,
68 | COUNT(*) AS count
69 | FROM
70 | jobs
71 | WHERE
72 | status = 'Completed' AND user = ?
73 | GROUP BY
74 | DATE(time_created)
75 | ORDER BY
76 | date ASC;
77 | """
78 | results = common_query(job_query, (user,))
79 |
80 | return results
81 |
82 |
83 | async def get_jobs_per_day(user: str):
84 | job_query = """
85 | SELECT
86 | DATE(time_created) AS date,
87 | COUNT(*) AS job_count
88 | FROM
89 | jobs
90 | WHERE
91 | status = 'Completed' AND user = ?
92 | GROUP BY
93 | DATE(time_created)
94 | ORDER BY
95 | date ASC;
96 | """
97 | results = common_query(job_query, (user,))
98 |
99 | return results
100 |
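`update_job` and `delete_jobs` splice `format_list_for_query(ids)` into the SQL, so that helper (defined in api/backend/utils.py, not shown in this section) must expand to a parenthesized run of `?` placeholders. A sketch of that contract; the helper body below is an inference, not the repo's code:

```python
def format_list_for_query(ids: list[str]) -> str:
    # Inferred contract: produces "(?, ?, ?)" so values bind positionally.
    return "(" + ", ".join("?" for _ in ids) + ")"


ids = ["job-1", "job-2"]

# update_job builds: UPDATE jobs SET status = ? WHERE id IN (?, ?)
sql = f"UPDATE jobs SET status = ? WHERE id IN {format_list_for_query(ids)}"
params = tuple(["Completed"] + ids)
print(sql, params)
```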
--------------------------------------------------------------------------------
/api/backend/job/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .job_options import JobOptions
2 |
3 | __all__ = ["JobOptions"]
4 |
--------------------------------------------------------------------------------
/api/backend/job/models/job_options.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from typing import Any, Optional
3 | from api.backend.job.models.site_map import SiteMap
4 |
5 |
6 | class FetchOptions(BaseModel):
7 | chat: Optional[bool] = None
8 |
9 |
10 | class JobOptions(BaseModel):
11 | multi_page_scrape: bool = False
12 | custom_headers: dict[str, Any] = {}
13 | proxies: list[str] = []
14 | site_map: Optional[SiteMap] = None
15 | collect_media: bool = False
16 | custom_cookies: list[dict[str, Any]] = []
17 |
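Every field has a default, so an empty `job_options` payload validates. A construction sketch (values illustrative; assumes pydantic v2, which the `model_dump` calls elsewhere in the repo imply):

```python
from api.backend.job.models.job_options import JobOptions

# All fields default, so an empty payload is valid.
print(JobOptions().model_dump())

options = JobOptions(
    multi_page_scrape=True,                     # spider same-domain links
    custom_headers={"User-Agent": "Scraperr"},  # applied via set_extra_http_headers
    custom_cookies=[{"name": "session", "value": "abc123"}],
    collect_media=True,
)
print(options.model_dump_json())
```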
--------------------------------------------------------------------------------
/api/backend/job/models/site_map.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from typing import Literal
3 |
4 |
5 | class Action(BaseModel):
6 | type: Literal["click", "input"]
7 | xpath: str
8 | name: str
9 | input: str = ""
10 | do_once: bool = True
11 |
12 |
13 | class SiteMap(BaseModel):
14 | actions: list[Action]
15 |
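A site map is an ordered list of actions replayed before scraping; actions marked `do_once` are dropped after the first pass (see `clear_done_actions` in site_mapping.py below). A minimal payload validated through the model (XPaths illustrative):

```python
from api.backend.job.models.site_map import SiteMap

site_map = SiteMap.model_validate(
    {
        "actions": [
            # Runs once: fill the search box.
            {
                "type": "input",
                "xpath": "//input[@name='q']",
                "name": "search",
                "input": "laptops",
            },
            # Repeats every pass: click through pagination.
            {
                "type": "click",
                "xpath": "//a[@rel='next']",
                "name": "next-page",
                "do_once": False,
            },
        ]
    }
)
print(site_map.model_dump())
```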
--------------------------------------------------------------------------------
/api/backend/job/scraping/add_custom.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 | from urllib.parse import urlparse
3 |
4 | from playwright.async_api import Page, BrowserContext
5 |
6 | import logging
7 |
8 | LOG = logging.getLogger(__name__)
9 |
10 |
11 | async def add_custom_cookies(
12 | custom_cookies: list[dict[str, Any]],
13 | url: str,
14 | context: BrowserContext,
15 | ) -> None:
16 | parsed_url = urlparse(url)
17 | domain = parsed_url.netloc
18 |
19 | for cookie in custom_cookies:
20 | cookie_dict = {
21 | "name": cookie.get("name", "default_name"),
22 | "value": cookie.get("value", "default_value"),
23 | "domain": domain,
24 | "path": "/",
25 | }
26 |
27 | LOG.info(f"Adding cookie: {cookie_dict}")
28 | await context.add_cookies([cookie_dict]) # type: ignore
29 |
30 |
31 | async def add_custom_headers(
32 | custom_headers: dict[str, Any],
33 | page: Page,
34 | ) -> None:
35 | await page.set_extra_http_headers(custom_headers)
36 |
37 |
38 | async def add_custom_items(
39 | url: str,
40 | page: Page,
41 | cookies: Optional[list[dict[str, Any]]] = None,
42 | headers: Optional[dict[str, Any]] = None,
43 | ) -> None:
44 | if cookies:
45 | await add_custom_cookies(cookies, url, page.context)
46 |
47 | if headers:
48 | await add_custom_headers(headers, page)
49 |
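A wiring sketch showing these helpers in a plain Playwright session, mirroring how agent.py calls `add_custom_items` before navigation (URL and values illustrative):

```python
import asyncio

from playwright.async_api import async_playwright

from api.backend.job.scraping.add_custom import add_custom_items


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Cookies are scoped to the target domain; headers apply to all requests.
        await add_custom_items(
            "https://example.com",
            page,
            cookies=[{"name": "session", "value": "abc123"}],
            headers={"X-Demo": "1"},
        )

        await page.goto("https://example.com")
        await browser.close()


asyncio.run(main())
```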
--------------------------------------------------------------------------------
/api/backend/job/scraping/scraping_utils.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from typing import Set, Tuple
3 | from playwright.async_api import Page
4 |
5 | from api.backend.utils import LOG
6 |
7 | from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
8 |
9 |
10 | async def scrape_content(
11 | id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
12 | ) -> str:
13 | last_height = await page.evaluate("document.body.scrollHeight")
14 |
15 | while True:
16 | await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
17 | await asyncio.sleep(3)
18 | new_height = await page.evaluate("document.body.scrollHeight")
19 |
20 | if new_height == last_height:
21 | break
22 |
23 | last_height = new_height
24 |
25 | html = await page.content()
26 | pages.add((html, page.url))
27 |
28 | if collect_media:
29 | LOG.info("Collecting media")
30 | await collect_media_utils(id, page)
31 |
32 | return html
33 |
34 |
35 | def clean_format_characters(text: str) -> str:
36 | text = text.strip()
37 | text = text.replace("\n", " ")
38 | text = text.replace("\t", " ")
39 | text = text.replace("\r", " ")
40 | text = text.replace("\f", " ")
41 | text = text.replace("\v", " ")
42 | text = text.replace("\b", " ")
43 | text = text.replace("\a", " ")
44 |
45 | return text
46 |
--------------------------------------------------------------------------------
/api/backend/job/site_mapping/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/job/site_mapping/__init__.py
--------------------------------------------------------------------------------
/api/backend/job/site_mapping/site_mapping.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import asyncio
3 | from copy import deepcopy
4 | from typing import Any
5 |
6 | from playwright.async_api import Page
7 |
8 | from api.backend.job.models.site_map import Action, SiteMap
9 | from api.backend.job.scraping.scraping_utils import scrape_content
10 |
11 | LOG = logging.getLogger(__name__)
12 |
13 |
14 | def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]:
15 | """Clear all actions that have been clicked."""
16 | cleared_site_map = deepcopy(site_map)
17 | cleared_site_map["actions"] = [
18 | action for action in cleared_site_map["actions"] if not action["do_once"]
19 | ]
20 |
21 | return cleared_site_map
22 |
23 |
24 | async def handle_input(action: Action, page: Page) -> bool:
25 | try:
26 | element = page.locator(f"xpath={action.xpath}")
27 | LOG.info(f"Sending keys: {action.input} to element: {action.xpath}")
28 | await element.fill(action.input)
29 | return True
30 | except Exception as e:
31 | LOG.warning(f"Error handling input for xpath '{action.xpath}': {e}")
32 | return False
33 |
34 |
35 | async def handle_click(action: Action, page: Page) -> bool:
36 | try:
37 | element = page.locator(f"xpath={action.xpath}")
38 | LOG.info(f"Clicking element: {action.xpath}")
39 | await element.click()
40 | return True
41 | except Exception as e:
42 | LOG.warning(f"Error clicking element at xpath '{action.xpath}': {e}")
43 | return False
44 |
45 |
46 | ACTION_MAP = {
47 | "click": handle_click,
48 | "input": handle_input,
49 | }
50 |
51 |
52 | async def handle_site_mapping(
53 | id: str,
54 | site_map_dict: dict[str, Any],
55 | page: Page,
56 | pages: set[tuple[str, str]],
57 | collect_media: bool = False,
58 | ):
59 | site_map = SiteMap(**site_map_dict)
60 |
61 | for action in site_map.actions:
62 | action_handler = ACTION_MAP[action.type]
63 | success = await action_handler(action, page)
64 |
65 | if not success:
66 | return
67 |
68 | await asyncio.sleep(2)
69 |
70 | await scrape_content(id, page, pages, collect_media=collect_media)
71 |
72 | cleared_site_map_dict = clear_done_actions(site_map_dict)
73 |
74 | if cleared_site_map_dict["actions"]:
75 | await handle_site_mapping(
76 | id, cleared_site_map_dict, page, pages, collect_media=collect_media
77 | )
78 |
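Termination is worth noting: after each pass, `clear_done_actions` strips the `do_once` actions, and the recursion ends either when no actions remain or when an action fails (a failing next-page click is what stops pagination). A driving sketch, assuming Camoufox as agent.py uses; the URL and XPath are illustrative:

```python
import asyncio

from camoufox import AsyncCamoufox

from api.backend.job.site_mapping.site_mapping import handle_site_mapping

site_map = {
    "actions": [
        # Repeats each round until the locator fails, ending the recursion.
        {
            "type": "click",
            "xpath": "//a[@rel='next']",
            "name": "next",
            "input": "",
            "do_once": False,
        },
    ]
}


async def main() -> None:
    pages: set[tuple[str, str]] = set()
    async with AsyncCamoufox(headless=True) as browser:
        page = await browser.new_page()
        await page.goto("https://example.com")
        await handle_site_mapping("demo-job", site_map, page, pages)
    print(f"captured {len(pages)} page snapshot(s)")


asyncio.run(main())
```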
--------------------------------------------------------------------------------
/api/backend/job/utils/clean_job_format.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from api.backend.utils import clean_text
4 |
5 |
6 | def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]:
7 |     """
8 |     Flatten a list of jobs into a table-like dict of headers and cleaned rows.
9 |     """
10 | headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
11 |
12 | cleaned_rows = []
13 |
14 | for job in jobs:
15 | for res in job["result"]:
16 | for url, elements in res.items():
17 | for element_name, values in elements.items():
18 | for value in values:
19 | text = clean_text(value.get("text", "")).strip()
20 | if text:
21 | cleaned_rows.append(
22 | {
23 | "id": job.get("id", ""),
24 | "url": url,
25 | "element_name": element_name,
26 | "xpath": value.get("xpath", ""),
27 | "text": text,
28 | "user": job.get("user", ""),
29 | "time_created": job.get("time_created", ""),
30 | }
31 | )
32 |
33 | return {
34 | "headers": headers,
35 | "rows": cleaned_rows,
36 | }
37 |
--------------------------------------------------------------------------------
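To make the nested result shape concrete, a small sketch of feeding one job
dict (mirroring the Job model's result field) through clean_job_format:

    from api.backend.job.utils.clean_job_format import clean_job_format

    job = {
        "id": "abc123",
        "user": "user@example.com",
        "time_created": "2024-01-01T00:00:00",
        "result": [
            {
                "https://example.com": {
                    "title": [{"xpath": "//h1", "text": "Example Domain"}]
                }
            }
        ],
    }

    table = clean_job_format([job])
    print(table["headers"])          # ['id', 'url', 'element_name', 'xpath', ...]
    print(table["rows"][0]["text"])  # 'Example Domain'

--------------------------------------------------------------------------------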
/api/backend/job/utils/stream_md_from_job_results.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from api.backend.utils import clean_text
4 |
5 |
6 | def stream_md_from_job_results(jobs: list[dict[str, Any]]):
7 |     yield "# Job Results Summary\n\n"
8 |     for i, job in enumerate(jobs, start=1):
9 |         yield f"## Job #{i}\n"
10 | yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
11 | yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
12 | yield f"- **ID:** {job.get('id', 'N/A')}\n"
13 | yield "### Extracted Results:\n"
14 |
15 | for res in job.get("result", []):
16 | for url, elements in res.items():
17 | yield f"\n#### URL: {url}\n"
18 | for element_name, values in elements.items():
19 | for value in values:
20 | text = clean_text(value.get("text", "")).strip()
21 | if text:
22 | yield f"- **Element:** `{element_name}`\n"
23 | yield f" - **Text:** {text}\n"
24 | yield "\n---\n"
25 |
--------------------------------------------------------------------------------
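Because stream_md_from_job_results is a generator, the markdown can be
streamed chunk by chunk rather than built up front; joining the chunks is
enough to see the whole document:

    from api.backend.job.utils.stream_md_from_job_results import (
        stream_md_from_job_results,
    )

    jobs = [{"id": "abc123", "url": "https://example.com", "result": []}]
    markdown = "".join(stream_md_from_job_results(jobs))
    print(markdown)

--------------------------------------------------------------------------------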
/api/backend/models.py:
--------------------------------------------------------------------------------
1 | # STL
2 | from typing import Any, Literal, Optional, Union
3 | from datetime import datetime
4 |
5 | # LOCAL
6 | from api.backend.job.models.job_options import JobOptions
7 |
8 | # PDM
9 | import pydantic
10 |
11 |
12 | class Element(pydantic.BaseModel):
13 | name: str
14 | xpath: str
15 | url: Optional[str] = None
16 |
17 |
18 | class CapturedElement(pydantic.BaseModel):
19 | xpath: str
20 | text: str
21 | name: str
22 |
23 |
24 | class RetrieveScrapeJobs(pydantic.BaseModel):
25 | user: str
26 |
27 |
28 | class DownloadJob(pydantic.BaseModel):
29 | ids: list[str]
30 | job_format: Literal["csv", "md"]
31 |
32 |
33 | class DeleteScrapeJobs(pydantic.BaseModel):
34 | ids: list[str]
35 |
36 |
37 | class GetStatistics(pydantic.BaseModel):
38 | user: str
39 |
40 |
41 | class UpdateJobs(pydantic.BaseModel):
42 | ids: list[str]
43 | field: str
44 | value: Any
45 |
46 |
47 | class AI(pydantic.BaseModel):
48 | messages: list[Any]
49 |
50 |
51 | class Job(pydantic.BaseModel):
52 | id: Optional[str] = None
53 | url: str
54 | elements: list[Element]
55 | user: str = ""
56 | time_created: Optional[Union[datetime, str]] = None
57 | result: list[dict[str, dict[str, list[CapturedElement]]]] = []
58 | job_options: JobOptions
59 | status: str = "Queued"
60 | chat: Optional[str] = None
61 | agent_mode: bool = False
62 | prompt: Optional[str] = None
63 |
64 |
65 | class CronJob(pydantic.BaseModel):
66 | id: Optional[str] = None
67 | user_email: str
68 | job_id: str
69 | cron_expression: str
70 | time_created: Optional[Union[datetime, str]] = None
71 | time_updated: Optional[Union[datetime, str]] = None
72 |
73 |
74 | class DeleteCronJob(pydantic.BaseModel):
75 | id: str
76 | user_email: str
77 |
--------------------------------------------------------------------------------
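A minimal construction example for the Job model; the JobOptions fields shown
(multi_page_scrape, custom_headers) are the ones the test factory later in
this dump uses, and everything else falls back to the defaults above:

    from api.backend.models import Element, Job
    from api.backend.job.models.job_options import JobOptions

    job = Job(
        url="https://example.com",
        elements=[Element(name="title", xpath="//h1")],
        job_options=JobOptions(multi_page_scrape=False, custom_headers={}),
    )
    print(job.status)  # "Queued"

--------------------------------------------------------------------------------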
/api/backend/routers/stats_router.py:
--------------------------------------------------------------------------------
1 | # STL
2 | import logging
3 |
4 | # PDM
5 | from fastapi import APIRouter, Depends
6 |
7 | # LOCAL
8 | from api.backend.job import (
9 | get_jobs_per_day,
10 | average_elements_per_link,
11 | )
12 | from api.backend.auth.auth_utils import get_current_user
13 | from api.backend.schemas import User
14 |
15 |
16 | LOG = logging.getLogger(__name__)
17 |
18 | stats_router = APIRouter()
19 |
20 |
21 | @stats_router.get("/statistics/get-average-element-per-link")
22 | async def get_average_element_per_link(user: User = Depends(get_current_user)):
23 | return await average_elements_per_link(user.email)
24 |
25 |
26 | @stats_router.get("/statistics/get-average-jobs-per-day")
27 | async def average_jobs_per_day(user: User = Depends(get_current_user)):
28 | data = await get_jobs_per_day(user.email)
29 | return data
30 |
--------------------------------------------------------------------------------
/api/backend/scheduler.py:
--------------------------------------------------------------------------------
1 | from apscheduler.schedulers.background import BackgroundScheduler # type: ignore
2 |
3 | scheduler = BackgroundScheduler()
4 |
--------------------------------------------------------------------------------
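This module just exposes a shared BackgroundScheduler; registering work on it
follows the standard apscheduler API. A sketch with a hypothetical job
function:

    from apscheduler.triggers.cron import CronTrigger

    from api.backend.scheduler import scheduler

    def heartbeat() -> None:
        print("tick")

    # CronTrigger.from_crontab parses a standard five-field cron line.
    scheduler.add_job(heartbeat, CronTrigger.from_crontab("*/5 * * * *"), id="heartbeat")
    scheduler.start()
    # BackgroundScheduler runs in a daemon thread; in a real process something
    # else (here, the FastAPI app) keeps the main thread alive.

--------------------------------------------------------------------------------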
/api/backend/schemas.py:
--------------------------------------------------------------------------------
1 | # STL
2 | from typing import Union, Literal, Optional
3 |
4 | # PDM
5 | from pydantic import EmailStr, BaseModel
6 |
7 |
8 | class Token(BaseModel):
9 | access_token: str
10 | token_type: str
11 |
12 |
13 | class TokenData(BaseModel):
14 | email: Optional[str] = None
15 |
16 |
17 | class User(BaseModel):
18 | email: Union[EmailStr, Literal[""]]
19 | full_name: Optional[str] = None
20 | disabled: Optional[bool] = None
21 |
22 |
23 | class UserInDB(User):
24 | hashed_password: str
25 |
26 |
27 | class UserCreate(BaseModel):
28 | email: EmailStr
29 | password: str
30 | full_name: Optional[str] = None
31 |
--------------------------------------------------------------------------------
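These are plain pydantic models, so malformed payloads are rejected at parse
time; for example, UserCreate enforces EmailStr:

    from pydantic import ValidationError

    from api.backend.schemas import UserCreate

    try:
        UserCreate(email="not-an-email", password="secret")
    except ValidationError as e:
        print(e.errors()[0]["loc"])  # ('email',)

--------------------------------------------------------------------------------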
/api/backend/tests/factories/job_factory.py:
--------------------------------------------------------------------------------
1 | from api.backend.models import Element, Job, JobOptions, CapturedElement
2 | import uuid
3 | from faker import Faker
4 |
5 | fake = Faker()
6 |
7 |
8 | def create_job(
9 | job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={})
10 | ):
11 | return Job(
12 | id=uuid.uuid4().hex,
13 | url="https://example.com",
14 | elements=[Element(name="test", xpath="xpath")],
15 | job_options=job_options,
16 | )
17 |
18 |
19 | def create_completed_job() -> Job:
20 | return Job(
21 | id=uuid.uuid4().hex,
22 | url="http://example.com",
23 | elements=[
24 | Element(
25 | name="element_name",
26 | xpath="//div",
27 | url="https://example.com",
28 | )
29 | ],
30 | job_options=JobOptions(multi_page_scrape=False, custom_headers={}),
31 | user=fake.name(),
32 | time_created=fake.date(),
33 | result=[
34 | {
35 | "https://example.com": {
36 | "element_name": [
37 | CapturedElement(
38 | xpath="//div", text="example", name="element_name"
39 | )
40 | ]
41 | }
42 | }
43 | ],
44 | )
45 |
--------------------------------------------------------------------------------
/api/backend/tests/job/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/tests/job/__init__.py
--------------------------------------------------------------------------------
/api/backend/tests/job/test_download_job.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from fastapi.testclient import TestClient
3 | from unittest.mock import AsyncMock, patch
4 | from api.backend.app import app
5 | from api.backend.models import DownloadJob
6 | from api.backend.tests.factories.job_factory import create_completed_job
7 |
8 | client = TestClient(app)
9 |
10 | mocked_job = create_completed_job().model_dump()
11 | mock_results = [mocked_job]
12 | mocked_random_int = 123456
13 |
14 |
15 | @pytest.mark.asyncio
16 | @patch("api.backend.routers.job_router.query")
17 | @patch("api.backend.routers.job_router.random.randint")
18 | async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
19 | # Ensure the mock returns immediately
20 | mock_query.return_value = mock_results
21 | mock_randint.return_value = mocked_random_int
22 |
23 | # Create a DownloadJob instance
24 | download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
25 |
26 | # Make a POST request to the /download endpoint
27 | response = client.post("/download", json=download_job.model_dump())
28 |
29 | # Assertions
30 | assert response.status_code == 200
31 | assert response.headers["Content-Disposition"] == "attachment; filename=export.csv"
32 |
33 | # Check the content of the CSV
34 | csv_content = response.content.decode("utf-8")
35 | expected_csv = (
36 | f'"id","url","element_name","xpath","text","user","time_created"\r\n'
37 | f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",'
38 | f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n'
39 | )
40 | assert csv_content == expected_csv
41 |
--------------------------------------------------------------------------------
/api/backend/tests/scraping/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/tests/scraping/__init__.py
--------------------------------------------------------------------------------
/api/backend/tests/scraping/test_scraping.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import logging
3 | from typing import Dict
4 | from playwright.async_api import async_playwright, Cookie, Route
5 | from api.backend.job.scraping.add_custom import add_custom_items
6 |
7 | logging.basicConfig(level=logging.DEBUG)
8 | LOG = logging.getLogger(__name__)
9 |
10 |
11 | @pytest.mark.asyncio
12 | async def test_add_custom_items():
13 | test_cookies = [{"name": "big", "value": "cookie"}]
14 | test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
15 |
16 | async with async_playwright() as p:
17 | browser = await p.chromium.launch(headless=True)
18 | context = await browser.new_context()
19 | page = await context.new_page()
20 |
21 | # Set up request interception
22 | captured_headers: Dict[str, str] = {}
23 |
24 | async def handle_route(route: Route) -> None:
25 | nonlocal captured_headers
26 | captured_headers = route.request.headers
27 | await route.continue_()
28 |
29 | await page.route("**/*", handle_route)
30 |
31 | await add_custom_items(
32 | url="http://example.com",
33 | page=page,
34 | cookies=test_cookies,
35 | headers=test_headers,
36 | )
37 |
38 | # Navigate to example.com
39 | await page.goto("http://example.com")
40 |
41 | # Verify cookies were added
42 | cookies: list[Cookie] = await page.context.cookies()
43 | test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
44 |
45 | assert test_cookie is not None
46 | assert test_cookie.get("value") == "cookie"
47 | assert test_cookie.get("path") == "/" # Default path should be set
48 | assert test_cookie.get("sameSite") == "Lax" # Default sameSite should be set
49 |
50 | # Verify headers were added
51 | assert captured_headers.get("user-agent") == "test-agent"
52 |
53 | await browser.close()
54 |
--------------------------------------------------------------------------------
/api/backend/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 | import logging
3 | import json
4 |
5 | LOG = logging.getLogger(__name__)
6 |
7 |
8 | def clean_text(text: str):
9 | text = text.replace("\r\n", "\n") # Normalize newlines
10 | text = text.replace("\n", "\\n") # Escape newlines
11 | text = text.replace('"', '\\"') # Escape double quotes
12 | return text
13 |
14 |
15 | def get_log_level(level_name: Optional[str]) -> int:
16 | level = logging.INFO
17 |
18 | if level_name:
19 | level_name = level_name.upper()
20 | level = getattr(logging, level_name, logging.INFO)
21 |
22 | return level
23 |
24 |
25 | def format_list_for_query(ids: list[str]):
26 |     return (
27 |         f"({','.join(['?' for _ in ids])})"  # Returns placeholders, e.g., "(?,?,?)"
28 |     )
29 |
30 |
31 | def format_sql_row_to_python(row: dict[str, Any]):
32 | new_row: dict[str, Any] = {}
33 | for key, value in row.items():
34 | if isinstance(value, str):
35 | try:
36 | new_row[key] = json.loads(value)
37 | except json.JSONDecodeError:
38 | new_row[key] = value
39 | else:
40 | new_row[key] = value
41 |
42 | return new_row
43 |
44 |
45 | def format_json(items: list[Any]):
46 | for idx, item in enumerate(items):
47 | if isinstance(item, (dict, list)):
48 | formatted_item = json.dumps(item)
49 | items[idx] = formatted_item
50 |
--------------------------------------------------------------------------------
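format_list_for_query only builds the placeholder list; the ids themselves
still travel separately as bound parameters. A self-contained sqlite3 sketch
(table and column names are hypothetical):

    import sqlite3

    from api.backend.utils import format_list_for_query

    ids = ["a1", "b2", "c3"]
    placeholders = format_list_for_query(ids)  # "(?,?,?)"

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE jobs (id TEXT)")
    conn.executemany("INSERT INTO jobs VALUES (?)", [(i,) for i in ids])

    # The query stays parameterized even with a variable-length IN list.
    rows = conn.execute(f"SELECT id FROM jobs WHERE id IN {placeholders}", ids).fetchall()
    print(rows)  # [('a1',), ('b2',), ('c3',)]

--------------------------------------------------------------------------------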
/api/backend/worker/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from api.backend.utils import get_log_level
5 |
6 | logging.basicConfig(
7 | level=get_log_level(os.getenv("LOG_LEVEL")),
8 | format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
9 | handlers=[logging.StreamHandler()],
10 | )
11 |
12 | LOG = logging.getLogger(__name__)
13 |
--------------------------------------------------------------------------------
/api/backend/worker/post_job_complete/discord_notification.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any
3 |
4 | import requests
5 |
6 | from api.backend.worker.logger import LOG
7 | from api.backend.worker.post_job_complete.models import (
8 | PostJobCompleteOptions,
9 | JOB_COLOR_MAP,
10 | )
11 |
12 |
13 | def discord_notification(job: dict[str, Any], options: PostJobCompleteOptions):
14 | webhook_url = options["webhook_url"]
15 | scraperr_frontend_url = options["scraperr_frontend_url"]
16 |
17 | LOG.info(f"Sending discord notification to {webhook_url}")
18 |
19 | embed = {
20 | "title": "Job Completed",
21 | "description": "Scraping job has been completed.",
22 | "color": JOB_COLOR_MAP[job["status"]],
23 | "url": f"{scraperr_frontend_url}/jobs?search={job['id']}&type=id",
24 | "image": {
25 | "url": "https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png",
26 | },
27 | "author": {
28 | "name": "Scraperr",
29 | "url": "https://github.com/jaypyles/Scraperr",
30 | },
31 | "fields": [
32 | {
33 | "name": "Status",
34 | "value": "Completed",
35 | "inline": True,
36 | },
37 | {
38 | "name": "URL",
39 | "value": job["url"],
40 | "inline": True,
41 | },
42 | {
43 | "name": "ID",
44 | "value": job["id"],
45 | "inline": False,
46 | },
47 | {
48 | "name": "Options",
49 | "value": f"```json\n{json.dumps(job['job_options'], indent=4)}\n```",
50 | "inline": False,
51 | },
52 | ],
53 | }
54 |
55 | payload = {"embeds": [embed]}
56 | requests.post(webhook_url, json=payload)
57 |
--------------------------------------------------------------------------------
/api/backend/worker/post_job_complete/email_notifcation.py:
--------------------------------------------------------------------------------
1 | import smtplib
2 | import ssl
3 | from email.mime.text import MIMEText
4 | from email.mime.multipart import MIMEMultipart
5 | import json
6 | from typing import Any
7 |
8 | from api.backend.worker.logger import LOG
9 |
10 | from api.backend.worker.post_job_complete.models import (
11 | JOB_COLOR_MAP,
12 | PostJobCompleteOptions,
13 | )
14 |
15 |
16 | def send_job_complete_email(
17 | job: dict[str, Any],
18 | options: PostJobCompleteOptions,
19 | ):
20 | status = job["status"]
21 | status_color = JOB_COLOR_MAP.get(status, 0x808080)
22 | job_url = job["url"]
23 | job_id = job["id"]
24 | job_options_json = json.dumps(job["job_options"], indent=4)
25 | frontend_url = options["scraperr_frontend_url"]
26 |
27 | subject = "📦 Job Completed - Scraperr Notification"
28 |
29 | html = f"""
30 |
31 |
32 | ✅ Job Completed
33 | Scraping job has been completed successfully.
34 |
35 |
36 |
37 |
38 |
39 | Job Info:
40 |
41 | Status: {status}
42 | Job URL: {job_url}
43 | Job ID: {job_id}
44 |
45 |
46 | Options:
47 |
48 | {job_options_json}
49 |
50 |
51 | View your job here:
52 | Scraperr Job
53 |
54 |
55 | Sent by Scraperr
56 |
57 |
58 |
59 | """
60 |
61 | # Create email
62 | message = MIMEMultipart("alternative")
63 | message["From"] = options["email"]
64 | message["To"] = options["to"]
65 | message["Subject"] = subject
66 | message.attach(
67 | MIMEText(
68 | "Job completed. View this email in HTML format for full details.", "plain"
69 | )
70 | )
71 | message.attach(MIMEText(html, "html"))
72 |
73 | context = ssl.create_default_context()
74 |
75 | try:
76 | if options["use_tls"]:
77 | with smtplib.SMTP(options["smtp_host"], options["smtp_port"]) as server:
78 | server.starttls(context=context)
79 | server.login(options["smtp_user"], options["smtp_password"])
80 | server.sendmail(
81 | from_addr=options["email"],
82 | to_addrs=options["to"],
83 | msg=message.as_string(),
84 | )
85 | else:
86 | with smtplib.SMTP_SSL(
87 | options["smtp_host"], options["smtp_port"], context=context
88 | ) as server:
89 | server.login(options["smtp_user"], options["smtp_password"])
90 | server.sendmail(
91 | from_addr=options["email"],
92 | to_addrs=options["to"],
93 | msg=message.as_string(),
94 | )
95 | LOG.info("✅ Email sent successfully!")
96 | except Exception as e:
97 | LOG.error(f"❌ Failed to send email: {e}")
98 |
--------------------------------------------------------------------------------
/api/backend/worker/post_job_complete/models.py:
--------------------------------------------------------------------------------
1 | from typing import TypedDict
2 |
3 |
4 | class PostJobCompleteOptions(TypedDict):
5 | channel: str
6 | webhook_url: str
7 | scraperr_frontend_url: str
8 | email: str
9 | to: str
10 | smtp_host: str
11 | smtp_port: int
12 | smtp_user: str
13 | smtp_password: str
14 | use_tls: bool
15 |
16 |
17 | JOB_COLOR_MAP = {
18 | "Queued": 0x0000FF,
19 | "Scraping": 0x0000FF,
20 | "Completed": 0x00FF00,
21 | "Failed": 0xFF0000,
22 | }
23 |
--------------------------------------------------------------------------------
/api/backend/worker/post_job_complete/post_job_complete.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from api.backend.worker.post_job_complete.models import PostJobCompleteOptions
4 | from api.backend.worker.post_job_complete.email_notifcation import (
5 | send_job_complete_email,
6 | )
7 | from api.backend.worker.post_job_complete.discord_notification import (
8 | discord_notification,
9 | )
10 |
11 |
12 | async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions):
13 | if options["channel"] == "":
14 | return
15 |
16 |     if not any(options.values()):  # dict.values() alone was always truthy
17 |         return
18 |
19 | if options["channel"] == "discord":
20 | discord_notification(job, options)
21 | elif options["channel"] == "email":
22 | send_job_complete_email(job, options)
23 | else:
24 | raise ValueError(f"Invalid channel: {options['channel']}")
25 |
--------------------------------------------------------------------------------
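A hedged wiring example for post_job_complete; every option value below is a
placeholder, and the email-only fields can stay empty when the channel is
discord:

    import asyncio

    from api.backend.worker.post_job_complete.post_job_complete import post_job_complete

    options = {
        "channel": "discord",
        "webhook_url": "https://discord.com/api/webhooks/<id>/<token>",
        "scraperr_frontend_url": "http://localhost",
        "email": "",
        "to": "",
        "smtp_host": "",
        "smtp_port": 0,
        "smtp_user": "",
        "smtp_password": "",
        "use_tls": False,
    }

    job = {
        "id": "abc123",
        "url": "https://example.com",
        "status": "Completed",
        "job_options": {},
    }

    asyncio.run(post_job_complete(job, options))

--------------------------------------------------------------------------------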
/cypress.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from "cypress";
2 |
3 | export default defineConfig({
4 | e2e: {
5 | setupNodeEvents(on, config) {},
6 | baseUrl: "http://localhost",
7 | },
8 | });
9 |
--------------------------------------------------------------------------------
/cypress/e2e/authentication.cy.ts:
--------------------------------------------------------------------------------
1 | describe("Authentication", () => {
2 | it("should register", () => {
3 | cy.intercept("POST", "/api/signup").as("signup");
4 |
5 | cy.visit("/").then(() => {
6 | cy.get("button").contains("Login").click();
7 | cy.url().should("include", "/login");
8 |
9 | cy.get("form").should("be.visible");
10 | cy.get("button")
11 | .contains("No Account? Sign up")
12 | .should("be.visible")
13 | .click();
14 |
15 | cy.get("input[name='email']").type("test@test.com");
16 | cy.get("input[name='password']").type("password");
17 | cy.get("input[name='fullName']").type("John Doe");
18 | cy.get("button[type='submit']").contains("Signup").click();
19 |
20 | cy.wait("@signup").then((interception) => {
21 | if (!interception.response) {
22 | cy.log("No response received!");
23 | throw new Error("signup request did not return a response");
24 | }
25 |
26 | cy.log("Response status: " + interception.response.statusCode);
27 | cy.log("Response body: " + JSON.stringify(interception.response.body));
28 |
29 | expect(interception.response.statusCode).to.eq(200);
30 | });
31 | });
32 | });
33 |
34 | it("should login", () => {
35 | cy.intercept("POST", "/api/token").as("token");
36 |
37 | cy.visit("/").then(() => {
38 | cy.get("button")
39 | .contains("Login")
40 | .click()
41 | .then(() => {
42 | cy.get("input[name='email']").type("test@test.com");
43 | cy.get("input[name='password']").type("password");
44 | cy.get("button[type='submit']").contains("Login").click();
45 |
46 | cy.wait("@token").then((interception) => {
47 | if (!interception.response) {
48 | cy.log("No response received!");
49 | throw new Error("token request did not return a response");
50 | }
51 |
52 | cy.log("Response status: " + interception.response.statusCode);
53 | cy.log("Response body: " + JSON.stringify(interception.response.body));
54 |
55 | expect(interception.response.statusCode).to.eq(200);
56 | });
57 | });
58 | });
59 | });
60 | });
61 |
--------------------------------------------------------------------------------
/cypress/e2e/navigation.cy.ts:
--------------------------------------------------------------------------------
1 | describe("General site navigation", () => {
2 | it("passes", () => {
3 | cy.visit("/");
4 | });
5 | });
6 |
--------------------------------------------------------------------------------
/cypress/e2e/submit-job.cy.ts:
--------------------------------------------------------------------------------
1 | describe("Job", () => {
2 | it("should create a job", () => {
3 | cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
4 |
5 | cy.visit("/");
6 |
7 | cy.get('[data-cy="url-input"]').type("https://example.com");
8 | cy.get('[data-cy="name-field"]').type("example");
9 | cy.get('[data-cy="xpath-field"]').type("//body");
10 | cy.get('[data-cy="add-button"]').click();
11 |
12 | cy.contains("Submit").click();
13 |
14 | cy.wait("@submitScrapeJob").then((interception) => {
15 | if (!interception.response) {
16 | cy.log("No response received!");
17 | cy.log("Request body: " + JSON.stringify(interception.request?.body));
18 | throw new Error("submitScrapeJob request did not return a response");
19 | }
20 |
21 | cy.log("Response status: " + interception.response.statusCode);
22 | cy.log("Response body: " + JSON.stringify(interception.response.body));
23 |
24 | expect(interception.response.statusCode).to.eq(200);
25 | });
26 |
27 | cy.get("li").contains("Jobs").click();
28 |
29 | cy.contains("div", "https://example.com", { timeout: 10000 }).should(
30 | "exist"
31 | );
32 | cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
33 |
34 | cy.get("tbody tr")
35 | .first()
36 | .within(() => {
37 | cy.get('input[type="checkbox"]').click();
38 | });
39 |
40 | cy.get("[data-testid='DeleteIcon']").click();
41 |
42 | cy.contains("div", "https://example.com", { timeout: 10000 }).should(
43 | "not.exist"
44 | );
45 | });
46 |
47 | it("should create a job with advanced options (media)", () => {
48 | cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
49 |
50 | cy.visit("/");
51 |
52 | cy.get("button").contains("Advanced Job Options").click();
53 |
54 | cy.get('[data-cy="collect-media-checkbox"]').click();
55 | cy.get("body").type("{esc}");
56 |
57 | cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
58 | cy.get('[data-cy="name-field"]').type("example");
59 | cy.get('[data-cy="xpath-field"]').type("//body");
60 | cy.get('[data-cy="add-button"]').click();
61 |
62 | cy.get("button").contains("Submit").click();
63 |
64 | cy.get("li").contains("Jobs").click();
65 |
66 | cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
67 | "exist"
68 | );
69 |
70 | cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
71 | cy.get("li").contains("Media").click();
72 |
73 | cy.get("div[id='select-job']").click();
74 | cy.get("li[role='option']").click();
75 |
76 | cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
77 |
78 | cy.get("li").contains("Jobs").click();
79 |
80 | cy.get("tbody tr")
81 | .first()
82 | .within(() => {
83 | cy.get('input[type="checkbox"]').click();
84 | });
85 |
86 | cy.get("[data-testid='DeleteIcon']").click();
87 | });
88 | });
89 |
--------------------------------------------------------------------------------
/cypress/fixtures/example.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Using fixtures to represent data",
3 | "email": "hello@cypress.io",
4 | "body": "Fixtures are a great way to mock data for responses to routes"
5 | }
6 |
--------------------------------------------------------------------------------
/cypress/support/commands.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="cypress" />
2 | // ***********************************************
3 | // This example commands.ts shows you how to
4 | // create various custom commands and overwrite
5 | // existing commands.
6 | //
7 | // For more comprehensive examples of custom
8 | // commands please read more here:
9 | // https://on.cypress.io/custom-commands
10 | // ***********************************************
11 | //
12 | //
13 | // -- This is a parent command --
14 | // Cypress.Commands.add('login', (email, password) => { ... })
15 | //
16 | //
17 | // -- This is a child command --
18 | // Cypress.Commands.add('drag', { prevSubject: 'element'}, (subject, options) => { ... })
19 | //
20 | //
21 | // -- This is a dual command --
22 | // Cypress.Commands.add('dismiss', { prevSubject: 'optional'}, (subject, options) => { ... })
23 | //
24 | //
25 | // -- This will overwrite an existing command --
26 | // Cypress.Commands.overwrite('visit', (originalFn, url, options) => { ... })
27 | //
28 | // declare global {
29 | // namespace Cypress {
30 | // interface Chainable {
31 | //       login(email: string, password: string): Chainable<void>
32 | //       drag(subject: string, options?: Partial<TypeOptions>): Chainable<Element>
33 | //       dismiss(subject: string, options?: Partial<TypeOptions>): Chainable<Element>
34 | //       visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element>
35 | // }
36 | // }
37 | // }
38 |
--------------------------------------------------------------------------------
/cypress/support/e2e.ts:
--------------------------------------------------------------------------------
1 | // ***********************************************************
2 | // This example support/e2e.ts is processed and
3 | // loaded automatically before your test files.
4 | //
5 | // This is a great place to put global configuration and
6 | // behavior that modifies Cypress.
7 | //
8 | // You can change the location of this file or turn off
9 | // automatically serving support files with the
10 | // 'supportFile' configuration option.
11 | //
12 | // You can read more here:
13 | // https://on.cypress.io/configuration
14 | // ***********************************************************
15 |
16 | // Import commands.js using ES2015 syntax:
17 | import './commands'
18 |
19 | // Alternatively you can use CommonJS syntax:
20 | // require('./commands')
--------------------------------------------------------------------------------
/docker-compose.dev.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 | scraperr:
4 | build:
5 | context: .
6 | dockerfile: docker/frontend/Dockerfile
7 | command: ["npm", "run", "dev"]
8 | volumes:
9 | - "$PWD/src:/app/src"
10 | - "$PWD/public:/app/public"
11 | - "$PWD/next.config.mjs:/app/next.config.mjs"
12 | - "$PWD/package.json:/app/package.json"
13 | - "$PWD/package-lock.json:/app/package-lock.json"
14 | - "$PWD/tsconfig.json:/app/tsconfig.json"
15 | scraperr_api:
16 | build:
17 | context: .
18 | dockerfile: docker/api/Dockerfile
19 | environment:
20 | - LOG_LEVEL=INFO
21 | volumes:
22 | - "$PWD/api:/project/app/api"
23 | ports:
24 | - "5900:5900"
25 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | scraperr:
3 | image: jpyles0524/scraperr:latest
4 | container_name: scraperr
5 | command: ["npm", "run", "start"]
6 | environment:
7 | - NEXT_PUBLIC_API_URL=http://scraperr_api:8000 # your API URL
8 | - SERVER_URL=http://scraperr_api:8000 # your docker container API URL
9 | ports:
10 | - 80:3000
11 | networks:
12 | - web
13 | scraperr_api:
14 |     init: true
15 | image: jpyles0524/scraperr_api:latest
16 | environment:
17 | - LOG_LEVEL=INFO
18 | container_name: scraperr_api
19 | ports:
20 | - 8000:8000
21 | volumes:
22 | - "$PWD/data:/project/app/data"
23 | - "$PWD/media:/project/app/media"
24 | networks:
25 | - web
26 |
27 | networks:
28 | web:
29 |
--------------------------------------------------------------------------------
/docker/api/Dockerfile:
--------------------------------------------------------------------------------
1 | # Build python dependencies
2 | FROM python:3.10.12-slim as pybuilder
3 |
4 | RUN apt-get update && \
5 | apt-get install -y curl && \
6 | apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \
7 | curl -LsSf https://astral.sh/uv/install.sh | sh && \
8 | apt-get remove -y curl && \
9 | apt-get autoremove -y && \
10 | rm -rf /var/lib/apt/lists/*
11 |
12 | RUN python -m pip --no-cache-dir install pdm
13 | RUN pdm config python.use_venv false
14 |
15 | WORKDIR /project/app
16 | COPY pyproject.toml pdm.lock /project/app/
17 |
18 | RUN pdm install -v --frozen-lockfile
19 |
20 | RUN pdm run playwright install --with-deps
21 |
22 | RUN pdm run camoufox fetch
23 |
24 | COPY ./api/ /project/app/api
25 |
26 | ENV PYTHONPATH=/project/pkgs
27 |
28 | COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
29 |
30 | EXPOSE 8000
31 |
32 | WORKDIR /project/app
33 |
34 | RUN mkdir -p /project/app/media
35 | RUN mkdir -p /project/app/data
36 | RUN touch /project/app/data/database.db
37 |
38 | EXPOSE 5900
39 |
40 | COPY start.sh /project/app/start.sh
41 |
42 | CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]
--------------------------------------------------------------------------------
/docker/frontend/Dockerfile:
--------------------------------------------------------------------------------
1 | # Build next dependencies
2 | FROM node:23.1-slim
3 | WORKDIR /app
4 |
5 | # Copy package files first to leverage Docker cache
6 | COPY package.json yarn.lock ./
7 |
8 | # Install dependencies in a separate layer
9 | RUN yarn install --frozen-lockfile
10 |
11 | # Copy the rest of the application
12 | COPY tsconfig.json /app/tsconfig.json
13 | COPY tailwind.config.js /app/tailwind.config.js
14 | COPY next.config.mjs /app/next.config.mjs
15 | COPY postcss.config.js /app/postcss.config.js
16 |
17 | COPY public /app/public
18 | COPY src /app/src
19 |
20 | # Build the application
21 | RUN yarn build
22 |
23 | EXPOSE 3000
--------------------------------------------------------------------------------
/docs/chat_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/chat_page.png
--------------------------------------------------------------------------------
/docs/docs_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/docs_page.png
--------------------------------------------------------------------------------
/docs/job_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/job_page.png
--------------------------------------------------------------------------------
/docs/log_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/log_page.png
--------------------------------------------------------------------------------
/docs/login.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/login.png
--------------------------------------------------------------------------------
/docs/logo_picture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/logo_picture.png
--------------------------------------------------------------------------------
/docs/main_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/main_page.png
--------------------------------------------------------------------------------
/docs/stats_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/stats_page.png
--------------------------------------------------------------------------------
/helm/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/helm/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: scraperr
3 | description: A Helm chart for Kubernetes
4 |
5 | # A chart can be either an 'application' or a 'library' chart.
6 | #
7 | # Application charts are a collection of templates that can be packaged into versioned archives
8 | # to be deployed.
9 | #
10 | # Library charts provide useful utilities or functions for the chart developer. They're included as
11 | # a dependency of application charts to inject those utilities and functions into the rendering
12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed.
13 | type: application
14 |
15 | # This is the chart version. This version number should be incremented each time you make changes
16 | # to the chart and its templates, including the app version.
17 | # Versions are expected to follow Semantic Versioning (https://semver.org/)
18 | version: 1.1.0
19 |
20 | # This is the version number of the application being deployed. This version number should be
21 | # incremented each time you make changes to the application. Versions are not expected to
22 | # follow Semantic Versioning. They should reflect the version the application is using.
23 | # It is recommended to use it with quotes.
24 | appVersion: "1.16.0"
25 |
--------------------------------------------------------------------------------
/helm/templates/deployment.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: apps/v1
3 | kind: Deployment
4 | metadata:
5 | name: scraperr
6 | spec:
7 | replicas: {{ .Values.replicaCount }}
8 | selector:
9 | matchLabels:
10 | app: scraperr
11 | template:
12 | metadata:
13 | labels:
14 | app: scraperr
15 | spec:
16 | containers:
17 | - name: scraperr
18 | {{ if .Values.scraperr.image.repository }}
19 | image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}"
20 | {{ else }}
21 | image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
22 | {{ end }}
23 | imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }}
24 | command: {{ .Values.scraperr.containerCommand | toJson }}
25 | ports:
26 | - containerPort: {{ .Values.scraperr.containerPort }}
27 | env: {{ toYaml .Values.scraperr.env | nindent 12 }}
28 |
29 | ---
30 | apiVersion: apps/v1
31 | kind: Deployment
32 | metadata:
33 | name: scraperr-api
34 | spec:
35 | replicas: {{ .Values.replicaCount }}
36 | selector:
37 | matchLabels:
38 | app: scraperr-api
39 | template:
40 | metadata:
41 | labels:
42 | app: scraperr-api
43 | spec:
44 | containers:
45 | - name: scraperr-api
46 | {{ if .Values.scraperrApi.image.repository }}
47 | image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}"
48 | {{ else }}
49 | image: "{{ .Chart.Name }}:{{ .Chart.Version }}"
50 | {{ end }}
51 | imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }}
52 | ports:
53 | - containerPort: {{ .Values.scraperrApi.containerPort }}
54 | env: {{ toYaml .Values.scraperrApi.env | nindent 12 }}
55 |           volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }}
56 |       volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 8 }}
--------------------------------------------------------------------------------
/helm/templates/service.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | apiVersion: v1
3 | kind: Service
4 | metadata:
5 | name: scraperr
6 | spec:
7 | type: {{ .Values.scraperr.serviceType }}
8 | selector:
9 | app: scraperr
10 | ports:
11 | {{- range .Values.scraperr.ports }}
12 | - port: {{ .port }}
13 | targetPort: {{ .targetPort }}
14 | {{- if .nodePort }}
15 | nodePort: {{ .nodePort }}
16 | {{- end }}
17 | protocol: {{ .protocol | default "TCP" }}
18 | {{- end }}
19 |
20 | ---
21 | apiVersion: v1
22 | kind: Service
23 | metadata:
24 | name: scraperr-api
25 | spec:
26 | type: {{ .Values.scraperrApi.serviceType }}
27 | selector:
28 | app: scraperr-api
29 | ports:
30 | {{- range .Values.scraperrApi.ports }}
31 | - port: {{ .port }}
32 | targetPort: {{ .targetPort }}
33 | {{- if .nodePort }}
34 | nodePort: {{ .nodePort }}
35 | {{- end }}
36 | protocol: {{ .protocol | default "TCP" }}
37 | {{- end }}
38 |
--------------------------------------------------------------------------------
/helm/values.yaml:
--------------------------------------------------------------------------------
1 | scraperr:
2 | image:
3 | repository: jpyles0524/scraperr
4 | tag: latest
5 | pullPolicy: IfNotPresent
6 | containerCommand: ["npm", "run","start"]
7 | containerPort: 3000
8 | serviceType: NodePort
9 | ports:
10 | - port: 80
11 | targetPort: 3000
12 | nodePort: 32300
13 | protocol: TCP
14 | env:
15 | - name: NEXT_PUBLIC_API_URL
16 | value: "http://scraperr-api:8000"
17 | - name: SERVER_URL
18 | value: "http://scraperr-api:8000"
19 |
20 | scraperrApi:
21 | image:
22 | repository: jpyles0524/scraperr_api
23 | tag: latest
24 | pullPolicy: IfNotPresent
25 | containerPort: 8000
26 | serviceType: ClusterIP
27 | ports:
28 | - port: 8000
29 | targetPort: 8000
30 | protocol: TCP
31 | env:
32 | - name: LOG_LEVEL
33 | value: "INFO"
34 | volumeMounts:
35 | - name: data
36 | mountPath: /project/app/data
37 | - name: media
38 | mountPath: /project/app/media
39 | volumes:
40 | - name: data
41 | hostPath:
42 | path: /data/scraperr/data
43 | type: DirectoryOrCreate
44 | - name: media
45 | hostPath:
46 | path: /data/scraperr/media
47 | replicaCount: 1
--------------------------------------------------------------------------------
/next-env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="next" />
2 | /// <reference types="next/image-types/global" />
3 |
4 | // NOTE: This file should not be edited
5 | // see https://nextjs.org/docs/basic-features/typescript for more information.
6 |
--------------------------------------------------------------------------------
/next.config.mjs:
--------------------------------------------------------------------------------
1 | import dotenv from "dotenv";
2 | dotenv.config();
3 |
4 | /** @type {import('next').NextConfig} */
5 | const nextConfig = {
6 | distDir: "./dist",
7 | images: { unoptimized: true },
8 | env: {
9 | DOMAIN: `${process.env.NEXT_PUBLIC_API_PATH}`,
10 | },
11 | };
12 |
13 | export default nextConfig;
14 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "webapp-template",
3 | "version": "0.1.0",
4 | "private": true,
5 | "dependencies": {
6 | "@auth0/auth0-react": "^2.2.4",
7 | "@auth0/nextjs-auth0": "^3.5.0",
8 | "@chakra-ui/react": "^2.8.2",
9 | "@emotion/react": "^11.11.4",
10 | "@emotion/styled": "^11.11.5",
11 | "@fontsource/roboto": "^5.0.13",
12 | "@minchat/react-chat-ui": "^0.16.2",
13 | "@mui/icons-material": "^5.15.3",
14 | "@mui/material": "^5.16.0",
15 | "@reduxjs/toolkit": "^2.8.2",
16 | "@testing-library/jest-dom": "^5.16.5",
17 | "@testing-library/react": "^13.4.0",
18 | "@testing-library/user-event": "^13.5.0",
19 | "@types/react": "^18.3.21",
20 | "axios": "^1.7.2",
21 | "bootstrap": "^5.3.0",
22 | "chart.js": "^4.4.3",
23 | "cookie": "^0.6.0",
24 | "dotenv": "^16.5.0",
25 | "framer-motion": "^4.1.17",
26 | "js-cookie": "^3.0.5",
27 | "next": "^14.2.4",
28 | "next-auth": "^4.24.7",
29 | "nookies": "^2.5.2",
30 | "react": "^18.3.1",
31 | "react-bootstrap": "^2.8.0",
32 | "react-dom": "^18.3.1",
33 | "react-markdown": "^9.0.0",
34 | "react-modal-image": "^2.6.0",
35 | "react-redux": "^9.2.0",
36 | "react-router": "^6.14.1",
37 | "react-router-dom": "^6.14.1",
38 | "react-spinners": "^0.14.1",
39 | "redux-persist": "^6.0.0",
40 | "typescript": "^4.9.5",
41 | "web-vitals": "^2.1.4"
42 | },
43 | "scripts": {
44 | "dev": "yarn next dev",
45 | "build": "yarn next build",
46 | "start": "yarn next start",
47 | "serve": "serve -s ./dist",
48 | "cy:open": "cypress open",
49 | "cy:run": "cypress run"
50 | },
51 | "eslintConfig": {
52 | "extends": [
53 | "react-app",
54 | "react-app/jest"
55 | ]
56 | },
57 | "browserslist": {
58 | "production": [
59 | ">0.2%",
60 | "not dead",
61 | "not op_mini all"
62 | ],
63 | "development": [
64 | "last 1 chrome version",
65 | "last 1 firefox version",
66 | "last 1 safari version"
67 | ]
68 | },
69 | "devDependencies": {
70 | "@types/cypress": "^1.1.6",
71 | "@types/js-cookie": "^3.0.6",
72 | "autoprefixer": "^10.4.21",
73 | "cypress": "^13.17.0",
74 | "eslint": "^9.26.0",
75 | "postcss": "^8.5.3",
76 | "tailwindcss": "^3.3.5"
77 | },
78 | "overrides": {
79 | "react-refresh": "0.11.0"
80 | },
81 | "resolutions": {
82 | "postcss": "^8.4.31"
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/postcss.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | plugins: {
3 | tailwindcss: {},
4 | autoprefixer: {},
5 | },
6 | };
7 |
--------------------------------------------------------------------------------
/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/public/favicon.ico
--------------------------------------------------------------------------------
/public/images/scraperr_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/public/images/scraperr_logo.png
--------------------------------------------------------------------------------
/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "React App",
3 | "name": "Create React App Sample",
4 | "icons": [
5 | {
6 | "src": "favicon.ico",
7 | "sizes": "64x64 32x32 24x24 16x16",
8 | "type": "image/x-icon"
9 | },
10 | {
11 | "src": "logo192.png",
12 | "type": "image/png",
13 | "sizes": "192x192"
14 | },
15 | {
16 | "src": "logo512.png",
17 | "type": "image/png",
18 | "sizes": "512x512"
19 | }
20 | ],
21 | "start_url": ".",
22 | "display": "standalone",
23 | "theme_color": "#000000",
24 | "background_color": "#ffffff"
25 | }
26 |
--------------------------------------------------------------------------------
/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "web-scrape"
3 | version = "0.1.0"
4 | description = ""
5 | authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }]
6 | dependencies = [
7 | "uvicorn>=0.30.1",
8 | "fastapi>=0.111.0",
9 | "boto3>=1.34.140",
10 | "python-dotenv>=1.0.1",
11 | "boto3-stubs[essential]>=1.34.140",
12 | "asyncio>=3.4.3",
13 | "aiohttp>=3.9.5",
14 | "bs4>=0.0.2",
15 | "lxml[html_clean]>=5.2.2",
16 | "lxml-stubs>=0.5.1",
17 | "fake-useragent>=1.5.1",
18 | "requests-html>=0.10.0",
19 | "webdriver-manager>=4.0.1",
20 | "pydantic[email]>=2.9.2",
21 | "pandas>=2.2.2",
22 | "openpyxl>=3.1.5",
23 | "xlsxwriter>=3.2.0",
24 | "python-keycloak>=4.2.0",
25 | "fastapi-keycloak>=1.0.11",
26 | "pymongo>=4.8.0",
27 | "motor[asyncio]>=3.5.0",
28 | "python-jose[cryptography]>=3.3.0",
29 | "passlib[bcrypt]>=1.7.4",
30 | "selenium-wire>=5.1.0",
31 | "blinker<1.8.0",
32 | "setuptools>=71.0.4",
33 | "docker>=7.1.0",
34 | "ollama>=0.3.0",
35 | "openai>=1.37.1",
36 | "exceptiongroup>=1.2.2",
37 | "Faker>=30.6.0",
38 | "pytest-asyncio>=0.24.0",
39 | "python-multipart>=0.0.1",
40 | "bcrypt==4.0.1",
41 | "apscheduler>=3.11.0",
42 | "playwright>=1.52.0",
43 | "camoufox>=0.4.11",
44 | "html2text>=2025.4.15",
45 | ]
46 | requires-python = ">=3.10"
47 | readme = "README.md"
48 | license = { text = "MIT" }
49 |
50 | [tool.pdm]
51 | distribution = true
52 |
53 | [tool.pdm.dev-dependencies]
54 | dev = ["ipython>=8.26.0", "pytest>=8.3.3"]
55 | [tool.pyright]
56 | include = ["./api/backend/"]
57 | exclude = ["**/node_modules", "**/__pycache__"]
58 | ignore = []
59 | defineConstant = { DEBUG = true }
60 | stubPath = ""
61 |
62 | # Type checking strictness
63 | typeCheckingMode = "strict" # Enables strict type checking mode
64 | reportPrivateUsage = "none"
65 | reportMissingTypeStubs = "none"
66 | reportUntypedFunctionDecorator = "error"
67 | reportUntypedClassDecorator = "error"
68 | reportUntypedBaseClass = "error"
69 | reportInvalidTypeVarUse = "error"
70 | reportUnnecessaryTypeIgnoreComment = "information"
71 | reportUnknownVariableType = "none"
72 | reportUnknownMemberType = "none"
73 | reportUnknownParameterType = "none"
74 |
75 | # Additional checks
76 | reportImplicitStringConcatenation = "error"
77 | reportInvalidStringEscapeSequence = "error"
78 | reportMissingImports = "error"
79 | reportMissingModuleSource = "error"
80 | reportOptionalCall = "error"
81 | reportOptionalIterable = "error"
82 | reportOptionalMemberAccess = "error"
83 | reportOptionalOperand = "error"
84 | reportOptionalSubscript = "error"
85 | reportTypedDictNotRequiredAccess = "error"
86 |
87 | # Function return type checking
88 | reportIncompleteStub = "error"
89 | reportIncompatibleMethodOverride = "error"
90 | reportInvalidStubStatement = "error"
91 | reportInconsistentOverload = "error"
92 |
93 | # Misc settings
94 | pythonVersion = "3.10" # Matches your Python version from pyproject.toml
95 | strictListInference = true
96 | strictDictionaryInference = true
97 | strictSetInference = true
98 |
99 |
100 | [tool.isort]
101 | length_sort = "1"
102 | profile = "black"
103 | sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
104 | import_heading_stdlib = "STL"
105 | import_heading_thirdparty = "PDM"
106 | import_heading_firstparty = "LOCAL"
107 | import_heading_localfolder = "LOCAL"
108 |
--------------------------------------------------------------------------------
/src/components/ai/Chat.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 |
3 | export const Chat = () => {
4 |   return <div>Chat</div>;
5 | };
6 |
--------------------------------------------------------------------------------
/src/components/ai/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./Chat";
2 | export * from "./JobSelector";
3 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/advanced-job-options.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Link, Typography } from "@mui/material";
2 | import { SetStateAction, Dispatch, useState } from "react";
3 | import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
4 | import { RawJobOptions } from "@/types";
5 |
6 | export type AdvancedJobOptionsProps = {
7 | jobOptions: RawJobOptions;
8 |   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
9 | multiPageScrapeEnabled?: boolean;
10 | };
11 |
12 | export const AdvancedJobOptions = ({
13 | jobOptions,
14 | setJobOptions,
15 | multiPageScrapeEnabled = true,
16 | }: AdvancedJobOptionsProps) => {
17 | const [open, setOpen] = useState(false);
18 | return (
19 |     <Box>
20 |       <Link
21 |         component="button"
22 |         variant="body1"
23 |         onClick={() => setOpen(true)}
24 |         sx={{
25 |           textDecoration: "none",
26 |           color: "primary.main",
27 |           "&:hover": {
28 |             color: "primary.dark",
29 |             textDecoration: "underline",
30 |           },
31 |           paddingLeft: 1,
32 |           display: "inline-flex",
33 |           alignItems: "center",
34 |           gap: 0.5,
35 |         }}
36 |       >
37 |         Advanced Job Options
38 |       </Link>
39 |       <AdvancedJobOptionsDialog
40 |         open={open}
41 |         onClose={() => setOpen(false)}
42 |         jobOptions={jobOptions}
43 |         setJobOptions={setJobOptions}
44 |         multiPageScrapeEnabled={multiPageScrapeEnabled}
45 |       />
46 |     </Box>
47 | );
48 | };
49 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/dialog/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./advanced-job-options-dialog";
2 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./advanced-job-options";
2 |
--------------------------------------------------------------------------------
/src/components/common/csv-table/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./csv-table";
2 |
--------------------------------------------------------------------------------
/src/components/common/disabled/disabled.tsx:
--------------------------------------------------------------------------------
1 | import { Box } from "@mui/material";
2 |
3 | export type DisabledProps = {
4 | message: string;
5 | };
6 |
7 | export const Disabled = ({ message }: DisabledProps) => {
8 | return (
9 |     <Box
10 |       sx={{
11 |         display: "flex",
12 |         justifyContent: "center",
13 |         alignItems: "center",
14 |         height: "100%",
15 |       }}
16 |     >
17 |       {/* sx values are a minimal reconstruction; originals lost in extraction */}
18 |       <Typography
19 |         variant="h6"
20 |         sx={{
21 |           color: "text.secondary",
22 |           textAlign: "center",
23 |         }}
24 |       >
25 |         {message}
26 |       </Typography>
27 |     </Box>
28 | );
29 | };
30 |
--------------------------------------------------------------------------------
/src/components/common/disabled/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./disabled";
2 |
--------------------------------------------------------------------------------
/src/components/common/expanded-table-input/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./expanded-table-input";
2 |
--------------------------------------------------------------------------------
/src/components/common/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./nav-drawer";
2 |
--------------------------------------------------------------------------------
/src/components/common/job-download-dialog/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./job-download-dialog";
2 |
--------------------------------------------------------------------------------
/src/components/common/job-download-dialog/job-download-dialog.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | Dialog,
3 | DialogTitle,
4 | DialogContent,
5 | DialogActions,
6 | Button,
7 | FormControl,
8 | RadioGroup,
9 | FormControlLabel,
10 | Radio,
11 | FormLabel,
12 | Typography,
13 | Box,
14 | } from "@mui/material";
15 | import { useState } from "react";
16 |
17 | export type JobDownloadDialogProps = {
18 | open: boolean;
19 | onClose: () => void;
20 | ids: string[];
21 | };
22 |
23 | export const JobDownloadDialog = ({
24 | open,
25 | onClose,
26 | ids,
27 | }: JobDownloadDialogProps) => {
28 | const [jobFormat, setJobFormat] = useState("csv");
29 | const handleDownload = async () => {
30 | const response = await fetch("/api/download", {
31 | method: "POST",
32 | headers: { "Content-Type": "application/json" },
33 | body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
34 | });
35 |
36 | if (response.ok) {
37 | const blob = await response.blob();
38 | const url = window.URL.createObjectURL(blob);
39 | const a = document.createElement("a");
40 | a.style.display = "none";
41 | a.href = url;
42 | a.download = `job_${ids[0]}.${jobFormat}`;
43 | document.body.appendChild(a);
44 | a.click();
45 | window.URL.revokeObjectURL(url);
46 | document.body.removeChild(a);
47 | } else {
48 | console.error("Failed to download the file.");
49 | }
50 | };
51 |
52 | return (
53 |     <Dialog open={open} onClose={onClose}>
54 |       <DialogTitle>Download Job</DialogTitle>
55 |       <DialogContent>
56 |         <Box>
57 |           <Typography>
58 |             You are about to download {ids.length} job(s). Please select the
59 |             format that you would like to download them in.
60 |           </Typography>
61 |         </Box>
62 |         {/* The sx values below are a minimal reconstruction; the original
63 |             styling for this control was lost in extraction. */}
64 |         <FormControl
65 |           component="fieldset"
66 |           sx={{
67 |             marginTop: 2,
68 |             display: "flex",
69 |             alignItems: "center",
70 |           }}
71 |         >
72 |           <FormLabel>Format</FormLabel>
73 |           <RadioGroup
74 |             row
75 |             value={jobFormat}
76 |             onChange={(e) => setJobFormat(e.target.value)}
77 |           >
78 |             <FormControlLabel value="csv" control={<Radio />} label="CSV" />
79 |             <FormControlLabel
80 |               value="md"
81 |               control={<Radio />}
82 |               label="Markdown"
83 |             />
84 |           </RadioGroup>
85 |         </FormControl>
86 |       </DialogContent>
87 |
88 |       <DialogActions>
89 |         <Button onClick={handleDownload} variant="contained">
90 |           Download
91 |         </Button>
92 |       </DialogActions>
93 |     </Dialog>
94 |   );
96 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/audio/audio-viewer.tsx:
--------------------------------------------------------------------------------
1 |
2 | import { Box, Typography } from "@mui/material";
3 |
4 | interface AudioViewerProps {
5 | mediaUrl: string;
6 | selectedMedia: string;
7 | onError: () => void;
8 | }
9 |
10 | export const AudioViewer = ({
11 | mediaUrl,
12 | selectedMedia,
13 | onError,
14 | }: AudioViewerProps) => {
15 | return (
16 |     <Box
17 |       sx={{
18 |         display: "flex",
19 |         flexDirection: "column",
20 |         alignItems: "center",
21 |         gap: 2,
22 |         p: 2,
23 |       }}
24 |     >
25 |       {/* sx values are a minimal reconstruction; originals lost in extraction */}
26 |       <Typography variant="subtitle1">
27 |         {selectedMedia}
28 |       </Typography>
29 |
30 |       <audio
31 |         controls
32 |         src={mediaUrl}
33 |         onError={onError}
34 |         style={{ width: "100%" }}
35 |       >
36 |         Your browser does not support the audio element.
37 |       </audio>
38 |     </Box>
39 | );
40 | };
41 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/audio/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./audio-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/image/image-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, useTheme } from "@mui/material";
2 |
3 | export const ImageViewer = ({
4 | mediaUrl,
5 | selectedMedia,
6 | }: {
7 | mediaUrl: string;
8 | selectedMedia: string;
9 | }) => {
10 | const theme = useTheme();
11 | return (
12 |     <Box
13 |       sx={{
14 |         display: "flex",
15 |         justifyContent: "center",
16 |         alignItems: "center",
17 |         height: "100%",
18 |         backgroundColor: theme.palette.background.default,
19 |       }}
20 |     >
21 |       {/* sx values are a minimal reconstruction; originals lost in extraction */}
22 |       <Box
23 |         component="img"
24 |         src={mediaUrl}
25 |         alt={selectedMedia}
26 |         sx={{
27 |           maxWidth: "100%",
28 |           maxHeight: "100%",
29 |           width: "auto",
30 |           height: "auto",
31 |           objectFit: "contain",
32 |         }}
33 |       />
34 |     </Box>
35 | );
36 | };
37 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/image/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./image-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./media-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/media-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Typography } from "@mui/material";
2 | import { ImageViewer } from "./image";
3 | import { VideoViewer } from "./video";
4 | import { AudioViewer } from "./audio";
5 | import { PDFViewer } from "./pdf-viewer";
6 |
7 | interface MediaViewerProps {
8 | selectedMedia: string;
9 | activeTab: string;
10 | getMediaUrl: (fileName: string) => string;
11 | onError: (error: string) => void;
12 | }
13 |
14 | export const MediaViewer = ({
15 | selectedMedia,
16 | activeTab,
17 | getMediaUrl,
18 | onError,
19 | }: MediaViewerProps) => {
20 | if (!selectedMedia) {
 21 |     return (
 22 |       <Box
 23 |         sx={{
 24 |           display: "flex",
 25 |           alignItems: "center",
 26 |           justifyContent: "center",
 27 |           height: "100%",
 28 |         }}
 29 |       >
 30 |         <Typography color="text.secondary">
 31 |           Select a file to view
 32 |         </Typography>
 33 |       </Box>
 34 |     );
35 | }
36 |
37 | const mediaUrl = getMediaUrl(selectedMedia);
38 |
39 | switch (activeTab) {
40 | case "images":
 41 |       return <ImageViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
42 | case "videos":
 43 |       return (
 44 |         <VideoViewer
 45 |           mediaUrl={mediaUrl}
 46 |           onError={() => onError("Error loading video")}
 47 |         />
 48 |       );
49 | case "audio":
 50 |       return (
 51 |         <AudioViewer
 52 |           mediaUrl={mediaUrl}
 53 |           selectedMedia={selectedMedia}
 54 |           onError={() => onError("Error loading audio")}
 55 |         />
 56 |       );
57 | case "pdfs":
 58 |       return <PDFViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
59 | default:
 60 |       return (
 61 |         <Box
 62 |           sx={{
 63 |             display: "flex",
 64 |             alignItems: "center",
 65 |             justifyContent: "center",
 66 |             height: "100%",
 67 |           }}
 68 |         >
 69 |           <Typography>
 70 |             {selectedMedia} - Download this file to view it
 71 |           </Typography>
 72 |         </Box>
 73 |       );
74 | }
75 | };
76 |
--------------------------------------------------------------------------------
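A sketch of how MediaViewer might be wired up (illustrative; the getMediaUrl endpoint shown here is an assumption, not confirmed anywhere in this listing):

  <MediaViewer
    selectedMedia={selectedMedia}
    activeTab="images"
    getMediaUrl={(fileName) => `/api/media?file=${encodeURIComponent(fileName)}`} // hypothetical endpoint
    onError={(message) => console.error(message)}
  />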
/src/components/common/media-viewer/pdf-viewer/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./pdf-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/pdf-viewer/pdf-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, useTheme } from "@mui/material";
2 |
3 | interface PDFViewerProps {
4 | mediaUrl: string;
5 | selectedMedia: string;
6 | }
7 |
8 | export const PDFViewer = ({ mediaUrl, selectedMedia }: PDFViewerProps) => {
9 | const theme = useTheme();
10 |
 11 |   return (
 12 |     <Box
 13 |       sx={{
 14 |         height: "100%",
 15 |         backgroundColor: theme.palette.background.default,
 16 |       }}
 17 |     >
 18 |       {/* rendered as an embedded frame; the original element's attributes were lost in extraction */}
 19 |       <Box
 20 |         component="iframe"
 21 |         src={mediaUrl}
 22 |         title={selectedMedia}
 23 |         sx={{ width: "100%", height: "100%", border: 0 }}
 24 |       />
 25 |     </Box>
 26 |   );
33 | };
34 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/tile-grid-view/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./tile-grid-view";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/tile-grid-view/tile-grid-view.tsx:
--------------------------------------------------------------------------------
1 | import { MediaFiles } from "@/components/pages/media/id/id";
2 | import {
3 | Card,
4 | CardActionArea,
5 | CardMedia,
6 | CardContent,
7 | Typography,
8 | Box,
9 | Grid,
10 | useTheme,
11 | } from "@mui/material";
12 |
13 | interface TileGridViewProps {
14 | mediaFiles: MediaFiles;
15 | activeTab: string;
16 | selectedMedia: string;
17 | handleMediaSelect: (fileName: string) => void;
18 | getMediaUrl: (fileName: string) => string;
19 | }
20 |
21 | export const TileGridView = ({
22 | mediaFiles,
23 | activeTab,
24 | selectedMedia,
25 | handleMediaSelect,
26 | getMediaUrl,
27 | }: TileGridViewProps) => {
28 | const theme = useTheme();
29 |
 30 |   return (
 31 |     <Grid container spacing={2}>
 32 |       {mediaFiles[activeTab].map((fileName: string) => (
 33 |         <Grid item xs={12} sm={6} md={4} lg={3} key={fileName}>
 34 |           {/* border highlight for the selected tile; the exact styling was lost in extraction */}
 35 |           <Card
 36 |             sx={{
 37 |               border:
 38 |                 selectedMedia === fileName
 39 |                   ? `2px solid ${theme.palette.primary.main}`
 40 |                   : "2px solid transparent",
 41 |             }}
 42 |           >
 43 |             <CardActionArea onClick={() => handleMediaSelect(fileName)}>
 44 |               {activeTab === "images" ? (
 45 |                 <CardMedia
 46 |                   component="img"
 47 |                   image={getMediaUrl(fileName)}
 48 |                   alt={fileName}
 49 |                   onError={(e) => {
 50 |                     const target = e.target as HTMLImageElement;
 51 |                     if (target.src !== "/placeholder-image.png") {
 52 |                       target.src = "";
 53 |                     }
 54 |                   }}
 55 |                 />
 56 |               ) : (
 57 |                 <Box
 58 |                   sx={{
 59 |                     display: "flex",
 60 |                     alignItems: "center",
 61 |                     justifyContent: "center",
 62 |                     height: 140,
 63 |                   }}
 64 |                 >
 65 |                   <Typography variant="h6">
 66 |                     {fileName.split(".").pop()?.toUpperCase() || "FILE"}
 67 |                   </Typography>
 68 |                 </Box>
 69 |               )}
 70 |               <CardContent>
 71 |                 <Typography variant="body2" noWrap>
 72 |                   {fileName}
 73 |                 </Typography>
 74 |               </CardContent>
 75 |             </CardActionArea>
 76 |           </Card>
 77 |         </Grid>
 78 |       ))}
 79 |     </Grid>
 80 |   );
 81 | };
115 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/video/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./video-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/video/video-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, useTheme } from "@mui/material";
2 |
3 | export const VideoViewer = ({
4 | mediaUrl,
5 | onError,
6 | }: {
7 | mediaUrl: string;
8 | onError: () => void;
9 | }) => {
10 | const theme = useTheme();
 11 |   return (
 12 |     <Box
 13 |       sx={{
 14 |         display: "flex",
 15 |         justifyContent: "center",
 16 |         backgroundColor: theme.palette.background.default,
 17 |       }}
 18 |     >
 19 |       <video controls src={mediaUrl} onError={onError} style={{ maxWidth: "100%" }}>
 20 |         Your browser does not support the video tag.
 21 |       </video>
 22 |     </Box>
 23 |   );
39 | };
40 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/index.ts:
--------------------------------------------------------------------------------
1 | export { NavDrawer } from "./nav-drawer";
2 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/nav-drawer.module.css:
--------------------------------------------------------------------------------
1 | .userControl {
2 | margin-bottom: 1rem;
3 | }
4 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/nav-drawer.tsx:
--------------------------------------------------------------------------------
1 | "use client";
2 |
3 | import React from "react";
4 | import { useAuth } from "../../../contexts/AuthContext";
5 | import { Box, Drawer } from "@mui/material";
6 |
7 | import { QuickSettings } from "../../nav/quick-settings";
8 | import { NavItems } from "./nav-items/nav-items";
9 | import { UserControl } from "./user-control";
10 |
11 | import classes from "./nav-drawer.module.css";
12 |
13 | interface NavDrawerProps {
14 | toggleTheme: () => void;
15 | isDarkMode: boolean;
16 | }
17 |
18 | const drawerWidth = 240;
19 |
 20 | export const NavDrawer: React.FC<NavDrawerProps> = ({
21 | toggleTheme,
22 | isDarkMode,
23 | }) => {
24 | const { logout, user, isAuthenticated } = useAuth();
25 |
 26 |   return (
 27 |     <Drawer
 28 |       variant="permanent"
 29 |       sx={{
 30 |         width: drawerWidth,
 31 |         "& .MuiDrawer-paper": { width: drawerWidth },
 32 |       }}
 33 |     >
 34 |       <Box>
 35 |         <NavItems />
 36 |       </Box>
 37 |       <Box>
 38 |         <QuickSettings toggleTheme={toggleTheme} isDarkMode={isDarkMode} />
 39 |         <UserControl
 40 |           isAuthenticated={isAuthenticated}
 41 |           user={user}
 42 |           logout={logout}
 43 |           className={classes.userControl}
 44 |         />
 45 |       </Box>
 46 |     </Drawer>
 47 |   );
 48 | };
63 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/nav-item/index.ts:
--------------------------------------------------------------------------------
1 | export { default as NavItem } from "./nav-item";
2 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/nav-item/nav-item.tsx:
--------------------------------------------------------------------------------
1 | import {
2 | ListItem,
3 | ListItemButton,
4 | ListItemIcon,
5 | ListItemText,
6 | } from "@mui/material";
7 | import { useRouter } from "next/router";
8 | import React from "react";
9 |
10 | export type NavItemProps = {
11 | icon: React.ReactNode;
12 | text: string;
13 | href: string;
14 | };
15 |
 16 | const NavItem: React.FC<NavItemProps> = ({ icon, text, href }) => {
17 | const router = useRouter();
18 |
19 | const handleClick = () => {
20 | router.push(href);
21 | };
22 |
 23 |   return (
 24 |     <ListItem disablePadding>
 25 |       <ListItemButton onClick={handleClick}>
 26 |         <ListItemIcon>{icon}</ListItemIcon>
 27 |         <ListItemText primary={text} />
 28 |       </ListItemButton>
 29 |     </ListItem>
 30 |   );
31 | };
32 |
33 | export default NavItem;
34 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/nav-items/index.ts:
--------------------------------------------------------------------------------
1 | export { NavItems } from "./nav-items";
2 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/nav-items/nav-items.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { NavItem } from "../nav-item";
3 |
4 | import HomeIcon from "@mui/icons-material/Home";
5 | import HttpIcon from "@mui/icons-material/Http";
6 | import BarChart from "@mui/icons-material/BarChart";
7 | import AutoAwesomeIcon from "@mui/icons-material/AutoAwesome";
8 | import { List } from "@mui/material";
9 | import { Folder, Person, Schedule, VideoFile } from "@mui/icons-material";
10 |
 11 | // icon assignments reconstructed from the imports above; the original JSX was stripped in extraction
 12 | const items = [
 13 |   {
 14 |     icon: <HomeIcon />,
 15 |     text: "Home",
 16 |     href: "/",
 17 |   },
 18 |   {
 19 |     icon: <HttpIcon />,
 20 |     text: "Jobs",
 21 |     href: "/jobs",
 22 |   },
 23 |   {
 24 |     icon: <Person />,
 25 |     text: "Agent",
 26 |     href: "/agent",
 27 |   },
 28 |   {
 29 |     icon: <AutoAwesomeIcon />,
 30 |     text: "Chat",
 31 |     href: "/chat",
 32 |   },
 33 |   {
 34 |     icon: <BarChart />,
 35 |     text: "Statistics",
 36 |     href: "/statistics",
 37 |   },
 38 |   {
 39 |     icon: <Schedule />,
 40 |     text: "Cron Jobs",
 41 |     href: "/cron-jobs",
 42 |   },
 43 |   {
 44 |     icon: <VideoFile />,
 45 |     text: "Recordings",
 46 |     href: "/recordings",
 47 |   },
 48 |   {
 49 |     icon: <Folder />,
 50 |     text: "Media",
 51 |     href: "/media",
 52 |   },
 53 | ];
53 |
54 | export const NavItems = () => {
 55 |   return (
 56 |     <List>
 57 |       {items.map((item) => (
 58 |         <NavItem key={item.href} {...item} />
 59 |       ))}
 60 |     </List>
 61 |   );
62 | };
63 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./user-control";
2 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/logged-in-control/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./logged-in-control";
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/logged-in-control/logged-in-control.module.css:
--------------------------------------------------------------------------------
1 | .welcome {
2 | margin: 0.25rem;
3 | }
4 |
5 | .userControlButton {
6 | width: 100%;
7 | }
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/logged-in-control/logged-in-control.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { Typography, Button } from "@mui/material";
3 | import classes from "./logged-in-control.module.css";
4 |
5 | type LoggedInControlProps = {
6 | user: any;
7 | logout: () => void;
8 | children?: React.ReactNode;
9 | };
10 |
11 | export const LoggedInControl = ({
12 | user,
13 | logout,
14 | children,
15 | }: LoggedInControlProps) => {
16 | if (children) {
 17 |     return <>{children}</>;
18 | }
19 |
 20 |   return (
 21 |     <>
 22 |       <Typography className={classes.welcome}>
 23 |         Welcome, {user?.full_name}
 24 |       </Typography>
 25 |       <Button
 26 |         variant="contained"
 27 |         onClick={logout}
 28 |         className={classes.userControlButton}
 29 |       >
 30 |         Logout
 31 |       </Button>
 32 |     </>
 33 |   );
34 | };
35 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/logged-out-control/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./logged-out-control";
2 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/logged-out-control/logged-out-control.module.css:
--------------------------------------------------------------------------------
1 | .userControlButton {
2 | width: 100%;
3 | }
4 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/logged-out-control/logged-out-control.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { Button } from "@mui/material";
3 | import classes from "./logged-out-control.module.css";
4 | import { useRouter } from "next/navigation";
5 |
6 | export type LoggedOutControlProps = {
7 | children?: React.ReactNode;
8 | };
9 |
 10 | export const LoggedOutControl: React.FC<LoggedOutControlProps> = ({
11 | children,
12 | }) => {
13 | const router = useRouter();
14 | const login = () => router.push("/login");
15 |
16 | if (children) {
 17 |     return <>{children}</>;
18 | }
19 |
 20 |   return (
 21 |     <Button
 22 |       variant="contained"
 23 |       onClick={login}
 24 |       className={classes.userControlButton}
 25 |     >
 26 |       Login
 27 |     </Button>
 28 |   );
29 | };
30 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/user-control.module.css:
--------------------------------------------------------------------------------
1 | .userControl {
2 | display: flex;
3 | flex-direction: column;
4 | align-items: center;
5 | }
6 |
7 | .welcome {
8 | margin: 0.25rem;
9 | }
10 |
11 | .userControlButton {
12 | width: 100%;
13 | }
14 |
--------------------------------------------------------------------------------
/src/components/common/nav-drawer/user-control/user-control.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { Box } from "@mui/material";
3 | import clsx from "clsx";
4 |
5 | import classes from "./user-control.module.css";
6 | import { LoggedInControl } from "./logged-in-control";
7 | import { LoggedOutControl } from "./logged-out-control";
8 |
9 | export type UserControlProps = {
10 | isAuthenticated: boolean;
11 | user: any;
12 | logout: () => void;
13 | loggedInChildren?: React.ReactNode;
14 | loggedOutChildren?: React.ReactNode;
15 | className?: string;
16 | };
17 |
18 | export const UserControl = ({
19 | isAuthenticated,
20 | user,
21 | logout,
22 | loggedInChildren,
23 | loggedOutChildren,
24 | className,
25 | }: UserControlProps) => {
 26 |   return (
 27 |     <Box className={clsx(classes.userControl, className)}>
 28 |       {isAuthenticated ? (
 29 |         <LoggedInControl user={user} logout={logout}>
 30 |           {loggedInChildren}
 31 |         </LoggedInControl>
 32 |       ) : (
 33 |         <LoggedOutControl>{loggedOutChildren}</LoggedOutControl>
 34 |       )}
 35 |     </Box>
 36 |   );
37 | };
38 |
--------------------------------------------------------------------------------
/src/components/jobs/Favorites.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import {
3 | Tooltip,
4 | IconButton,
5 | Table,
6 | TableBody,
7 | TableCell,
8 | TableHead,
9 | TableRow,
10 | Box,
11 | Checkbox,
12 | Button,
13 | } from "@mui/material";
14 | import { Job } from "../../types";
15 | import StarIcon from "@mui/icons-material/Star";
16 |
 17 | interface StateProps {
 18 |   selectedJobs: Set<string>;
 19 |   filteredJobs: Job[];
 20 | }
 21 |
 22 | interface Props {
 23 |   onSelectJob: (job: string) => void;
 24 |   onNavigate: (elements: Object[], url: string, options: any) => void;
 25 |   onFavorite: (ids: string[], field: string, value: any) => void;
 26 |   stateProps: StateProps;
 27 | }
28 |
29 | export const Favorites = ({
30 | stateProps,
31 | onSelectJob,
32 | onNavigate,
33 | onFavorite,
34 | }: Props) => {
35 | const { selectedJobs, filteredJobs } = stateProps;
36 | const favoritedJobs = filteredJobs.filter((job) => job.favorite);
37 |
 38 |   return (
 39 |     <Table>
 40 |       <TableHead>
 41 |         <TableRow>
 42 |           <TableCell>Select</TableCell>
 43 |           <TableCell>Id</TableCell>
 44 |           <TableCell>Url</TableCell>
 45 |           <TableCell>Elements</TableCell>
 46 |           <TableCell>Time Created</TableCell>
 47 |           <TableCell>Actions</TableCell>
 48 |         </TableRow>
 49 |       </TableHead>
 50 |       <TableBody>
 51 |         {favoritedJobs.map((row, index) => (
 52 |           <TableRow key={index}>
 53 |             <TableCell>
 54 |               <Checkbox
 55 |                 checked={selectedJobs.has(row.id)}
 56 |                 onChange={() => onSelectJob(row.id)}
 57 |               />
 58 |             </TableCell>
 59 |             <TableCell>
 60 |               <Tooltip title="Unfavorite">
 61 |                 <IconButton
 62 |                   onClick={() => {
 63 |                     onFavorite([row.id], "favorite", !row.favorite);
 64 |                     row.favorite = !row.favorite;
 65 |                   }}
 66 |                 >
 67 |                   <StarIcon />
 68 |                 </IconButton>
 69 |               </Tooltip>
 70 |             </TableCell>
 71 |             <TableCell>{row.id}</TableCell>
 72 |             <TableCell>{row.url}</TableCell>
 73 |             <TableCell>
 74 |               <Box>{JSON.stringify(row.elements)}</Box>
 75 |             </TableCell>
 76 |             <TableCell>
 77 |               {new Date(row.time_created).toLocaleString()}
 78 |             </TableCell>
 79 |             <TableCell>
 80 |               <Button
 81 |                 onClick={() =>
 82 |                   onNavigate(row.elements, row.url, row.job_options)
 83 |                 }
 84 |               >
 85 |                 Run
 86 |               </Button>
 87 |             </TableCell>
 88 |           </TableRow>
 89 |         ))}
 90 |       </TableBody>
 91 |     </Table>
 92 |   );
 93 | };
103 |
--------------------------------------------------------------------------------
/src/components/jobs/index.tsx:
--------------------------------------------------------------------------------
1 | export * from "./JobQueue";
2 | export * from "./Favorites";
3 | export * from "./JobTable";
4 |
--------------------------------------------------------------------------------
/src/components/nav/quick-settings/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./quick-settings";
--------------------------------------------------------------------------------
/src/components/nav/quick-settings/quick-settings.module.css:
--------------------------------------------------------------------------------
1 | .quickSettings {
2 | padding: 0;
3 | margin-bottom: 0.25rem;
4 | }
5 |
6 | .details {
7 | display: flex;
8 | margin-right: 0.25rem;
9 | }
10 |
11 | .detailsText p {
12 | font-size: 1rem;
13 | }
14 |
--------------------------------------------------------------------------------
/src/components/nav/quick-settings/quick-settings.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 |
3 | import classes from "./quick-settings.module.css";
4 | import {
5 | Accordion,
6 | AccordionDetails,
7 | AccordionSummary,
8 | Switch,
9 | Typography,
10 | } from "@mui/material";
11 | import { ExpandMore } from "@mui/icons-material";
12 |
13 | type QuickSettingsProps = {
14 | toggleTheme: () => void;
15 | isDarkMode: boolean;
16 | };
17 |
 18 | export const QuickSettings: React.FC<QuickSettingsProps> = ({
19 | toggleTheme,
20 | isDarkMode,
21 | }) => {
 22 |   return (
 23 |     <Accordion className={classes.quickSettings}>
 24 |       <AccordionSummary
 25 |         expandIcon={<ExpandMore />}
 26 |         aria-controls="panel1a-content"
 27 |         id="panel1a-header"
 28 |       >
 29 |         <Typography>Quick Settings</Typography>
 30 |       </AccordionSummary>
 31 |       <AccordionDetails className={classes.details}>
 32 |         <div className={classes.detailsText}>
 33 |           <Typography>Dark Theme Toggle</Typography>
 34 |         </div>
 35 |         <Switch checked={isDarkMode} onChange={toggleTheme} />
 36 |       </AccordionDetails>
 37 |     </Accordion>
 38 |   );
 39 | };
42 |
--------------------------------------------------------------------------------
/src/components/pages/agent/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./agent";
2 |
--------------------------------------------------------------------------------
/src/components/pages/cron-jobs/create-cron-jobs/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./create-cron-jobs";
2 |
--------------------------------------------------------------------------------
/src/components/pages/cron-jobs/cron-jobs.module.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/src/components/pages/cron-jobs/cron-jobs.module.css
--------------------------------------------------------------------------------
/src/components/pages/cron-jobs/cron-jobs.tsx:
--------------------------------------------------------------------------------
1 | import { Job, CronJob } from "@/types/job";
2 | import { useState, useEffect } from "react";
3 | import { CreateCronJobs } from "./create-cron-jobs";
4 | import {
5 | Table,
6 | TableHead,
7 | TableRow,
8 | TableCell,
9 | TableBody,
10 | Button,
11 | Box,
12 | Typography,
13 | useTheme,
14 | } from "@mui/material";
15 | import Cookies from "js-cookie";
16 |
17 | export type CronJobsProps = {
18 | initialJobs: Job[];
19 | initialCronJobs: CronJob[];
20 | initialUser: any;
21 | };
22 |
23 | export const CronJobs = ({
24 | initialJobs,
25 | initialCronJobs,
26 | initialUser,
27 | }: CronJobsProps) => {
 28 |   const [jobs, setJobs] = useState<Job[]>(initialJobs);
 29 |   const [cronJobs, setCronJobs] = useState<CronJob[]>(initialCronJobs);
 30 |   const [user, setUser] = useState<any>(initialUser);
31 | const theme = useTheme();
32 |
33 | useEffect(() => {
34 | setJobs(initialJobs);
35 | setCronJobs(initialCronJobs);
36 | setUser(initialUser);
37 | }, [initialJobs, initialCronJobs, initialUser]);
38 |
39 | const handleDeleteCronJob = async (id: string) => {
40 | const token = Cookies.get("token");
41 | const response = await fetch("/api/delete-cron-job", {
42 | method: "POST",
43 | headers: {
44 | "Content-Type": "application/json",
45 | Authorization: `Bearer ${token}`,
46 | },
47 | body: JSON.stringify({ data: { id, user_email: user.email } }),
48 | });
49 |
50 | if (response.ok) {
51 | console.log("Cron job deleted successfully");
52 | setCronJobs(cronJobs.filter((cronJob) => cronJob.id !== id));
53 | } else {
54 | console.error("Failed to delete cron job");
55 | }
56 | };
57 |
58 | if (!user) {
 59 |     return (
 60 |       <Box
 61 |         sx={{
 62 |           display: "flex",
 63 |           alignItems: "center",
 64 |           justifyContent: "center",
 65 |           height: "100vh",
 66 |         }}
 67 |       >
 68 |         <Typography variant="h5" sx={{ color: theme.palette.text.primary }}>
 69 |           Please login to view your cron jobs
 70 |         </Typography>
 71 |       </Box>
 72 |     );
 73 |   }
85 |
 86 |   return (
 87 |     <Box>
 88 |       {/* prop names for CreateCronJobs are reconstructed; the original attributes were stripped */}
 89 |       <CreateCronJobs jobs={jobs} user={user} />
 90 |       <Table>
 91 |         <TableHead>
 92 |           <TableRow>
 93 |             <TableCell>Cron Expression</TableCell>
 94 |             <TableCell>Job ID</TableCell>
 95 |             <TableCell>User Email</TableCell>
 96 |             <TableCell>Created At</TableCell>
 97 |             <TableCell>Updated At</TableCell>
 98 |             <TableCell>Actions</TableCell>
 99 |           </TableRow>
100 |         </TableHead>
101 |         <TableBody>
102 |           {cronJobs.map((cronJob) => (
103 |             <TableRow key={cronJob.id}>
104 |               <TableCell>{cronJob.cron_expression}</TableCell>
105 |               <TableCell>{cronJob.job_id}</TableCell>
106 |               <TableCell>{cronJob.user_email}</TableCell>
107 |               <TableCell>
108 |                 {new Date(cronJob.time_created).toLocaleString()}
109 |               </TableCell>
110 |               <TableCell>
111 |                 {new Date(cronJob.time_updated).toLocaleString()}
112 |               </TableCell>
113 |               <TableCell>
114 |                 <Button onClick={() => handleDeleteCronJob(cronJob.id)}>
115 |                   Delete
116 |                 </Button>
117 |               </TableCell>
118 |             </TableRow>
119 |           ))}
120 |         </TableBody>
121 |       </Table>
122 |     </Box>
123 |   );
124 | };
125 |
--------------------------------------------------------------------------------
/src/components/pages/cron-jobs/get-server-side-props.ts:
--------------------------------------------------------------------------------
1 | import axios from "axios";
2 | import { GetServerSideProps } from "next";
3 | import { parseCookies } from "nookies";
4 | import { CronJob, Job } from "../../../types";
5 |
6 | export const getServerSideProps: GetServerSideProps = async (context) => {
7 | const { req } = context;
8 | const cookies = parseCookies({ req });
9 | const token = cookies.token;
10 | let user = null;
11 | let initialJobs: Job[] = [];
12 | let initialCronJobs: CronJob[] = [];
13 | if (token) {
14 | try {
15 | const userResponse = await axios.get(
16 | `${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
17 | {
18 | headers: { Authorization: `Bearer ${token}` },
19 | }
20 | );
21 |
22 | user = userResponse.data;
23 |
24 | const jobsResponse = await fetch(
25 | `${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`,
26 | {
27 | method: "POST",
28 | body: JSON.stringify({ user: user.email }),
29 | headers: {
30 | "content-type": "application/json",
31 | Authorization: `Bearer ${token}`,
32 | },
33 | }
34 | );
35 |
36 | initialJobs = await jobsResponse.json();
37 | console.log(initialJobs);
38 |
39 | const cronJobsResponse = await fetch(
40 | `${process.env.NEXT_PUBLIC_API_URL}/api/cron-jobs`,
41 | {
42 | headers: {
43 | "content-type": "application/json",
44 | Authorization: `Bearer ${token}`,
45 | },
46 | }
47 | );
48 |
49 | initialCronJobs = await cronJobsResponse.json();
50 | } catch (error) {
51 | console.error("Error fetching user or jobs:", error);
52 | }
53 | }
54 |
55 | return {
56 | props: {
57 | initialJobs,
58 | initialUser: user,
59 | initialCronJobs,
60 | },
61 | };
62 | };
63 |
--------------------------------------------------------------------------------
/src/components/pages/cron-jobs/index.ts:
--------------------------------------------------------------------------------
1 | export { CronJobs } from "./cron-jobs";
2 |
--------------------------------------------------------------------------------
/src/components/pages/home/home.tsx:
--------------------------------------------------------------------------------
1 | "use client";
2 |
3 | import React, { useState, useEffect, useRef } from "react";
4 | import { Button, Container, Box, Snackbar, Alert } from "@mui/material";
5 | import { useRouter } from "next/router";
6 | import { Element, Result } from "@/types";
7 | import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter";
8 | import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider";
9 |
10 | export const Home = () => {
11 | const {
12 | submittedURL,
13 | setSubmittedURL,
14 | rows,
15 | setRows,
16 | results,
17 | snackbarOpen,
18 | setSnackbarOpen,
19 | snackbarMessage,
20 | snackbarSeverity,
21 | } = useJobSubmitterProvider();
22 | const router = useRouter();
23 | const { elements, url } = router.query;
24 |
 25 |   const resultsRef = useRef<HTMLDivElement>(null);
26 |
27 | useEffect(() => {
28 | if (elements) {
29 | setRows(JSON.parse(elements as string));
30 | }
31 | if (url) {
32 | setSubmittedURL(url as string);
33 | }
34 | }, [elements, url]);
35 |
36 | useEffect(() => {
37 | if (results && resultsRef.current) {
38 | resultsRef.current.scrollIntoView({ behavior: "smooth" });
39 | }
40 | }, [results]);
41 |
42 | const handleCloseSnackbar = () => {
43 | setSnackbarOpen(false);
44 | };
45 |
46 | const ErrorSnackbar = () => {
 47 |     return (
 48 |       <Snackbar
 49 |         open={snackbarOpen}
 50 |         autoHideDuration={6000}
 51 |         onClose={handleCloseSnackbar}
 52 |       >
 53 |         <Alert onClose={handleCloseSnackbar} severity="error">
 54 |           {snackbarMessage}
 55 |         </Alert>
 56 |       </Snackbar>
 57 |     );
58 | };
59 |
60 | const NotifySnackbar = () => {
61 | const goTo = () => {
62 | router.push("/jobs");
63 | };
64 |
 65 |     const action = (
 66 |       <Button color="inherit" size="small" onClick={goTo}>
 67 |         Go To Job
 68 |       </Button>
 69 |     );
70 |
 71 |     return (
 72 |       <Snackbar
 73 |         open={snackbarOpen}
 74 |         autoHideDuration={6000}
 75 |         onClose={handleCloseSnackbar}
 76 |       >
 77 |         <Alert severity="info" action={action} onClose={handleCloseSnackbar}>
 78 |           {snackbarMessage}
 79 |         </Alert>
 80 |       </Snackbar>
 81 |     );
82 | };
83 |
 84 |   return (
 85 |     <Container maxWidth="lg">
 86 |       {/* ElementTable props reconstructed; the original attributes were stripped */}
 87 |       <Box ref={resultsRef}>
 88 |         <JobSubmitter />
 89 |         {submittedURL.length ? (
 90 |           <ElementTable
 91 |             rows={rows}
 92 |             setRows={setRows}
 93 |             submittedURL={submittedURL}
 94 |           />
 95 |         ) : null}
 96 |       </Box>
 97 |       {snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />}
 98 |     </Container>
 99 |   );
100 | };
108 |
--------------------------------------------------------------------------------
/src/components/pages/home/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./home";
2 |
--------------------------------------------------------------------------------
/src/components/pages/job/csv/id/get-server-side-props.ts:
--------------------------------------------------------------------------------
1 | import { GetServerSideProps } from "next";
2 | import { parseCookies } from "nookies";
3 |
4 | export const getServerSideProps: GetServerSideProps = async (context) => {
5 | const { req, params } = context;
6 | const id = params?.id;
7 |
8 | const cookies = parseCookies({ req });
9 | const token = cookies.token;
10 | let csv = null;
11 |
12 | try {
13 | const csvResponse = await fetch(
14 | `${process.env.NEXT_PUBLIC_API_URL}/api/job/${id}/convert-to-csv`,
15 | {
16 |
17 | method: "GET",
18 | headers: {
19 | "content-type": "application/json",
20 | Authorization: `Bearer ${token}`,
21 | },
22 | }
23 | );
24 |
25 | csv = await csvResponse.json();
26 | } catch (error) {
27 | console.error("Error fetching job:", error);
28 | }
29 |
30 | return {
31 | props: {
32 | csv,
33 | },
34 | };
35 | };
36 |
--------------------------------------------------------------------------------
/src/components/pages/job/csv/id/id.tsx:
--------------------------------------------------------------------------------
1 | import { CsvRow, CsvTable } from "@/components/common/csv-table/csv-table";
2 |
3 | export type Csv = {
4 | rows: CsvRow[];
5 | headers: string[];
6 | };
7 |
8 | export const JobCsvId = ({ csv }: { csv: Csv }) => {
  9 |   return <CsvTable rows={csv.rows} headers={csv.headers} />; // props reconstructed from the Csv type above
10 | };
11 |
--------------------------------------------------------------------------------
/src/components/pages/job/csv/id/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./id";
2 |
--------------------------------------------------------------------------------
/src/components/pages/media/id/index.ts:
--------------------------------------------------------------------------------
1 | export { MediaId } from "./id";
2 |
--------------------------------------------------------------------------------
/src/components/pages/recordings/id/index.ts:
--------------------------------------------------------------------------------
1 | export { RecordingId } from "./id";
2 |
--------------------------------------------------------------------------------
/src/components/submit/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./job-submitter";
2 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/element-table/index.ts:
--------------------------------------------------------------------------------
1 | export { ElementTable } from "./element-table";
2 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/index.ts:
--------------------------------------------------------------------------------
1 | export { JobSubmitter } from "./job-submitter";
2 | export { ElementTable } from "./element-table";
3 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter-header/index.ts:
--------------------------------------------------------------------------------
1 | export { JobSubmitterHeader } from "./job-submitter-header";
2 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.module.css:
--------------------------------------------------------------------------------
1 | .jobSubmitterHeader {
2 | margin-bottom: 1rem;
3 | text-align: center;
4 | }
5 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter-header/job-submitter-header.tsx:
--------------------------------------------------------------------------------
1 | import React, { ReactNode } from "react";
2 | import { Typography } from "@mui/material";
3 | import classes from "./job-submitter-header.module.css";
4 |
5 | interface JobSubmitterHeaderProps {
6 | title?: string;
7 | children?: ReactNode;
8 | }
9 |
 10 | export const JobSubmitterHeader: React.FC<JobSubmitterHeaderProps> = ({
11 | title = "Scraping Made Easy",
12 | children,
13 | }) => {
 14 |   return (
 15 |     <div className={classes.jobSubmitterHeader}>
 16 |       <Typography variant="h4">{title}</Typography>
 17 |       {children}
 18 |     </div>
 19 |   );
20 | };
21 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter-input/index.ts:
--------------------------------------------------------------------------------
1 | export { JobSubmitterInput } from "./job-submitter-input";
2 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.module.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.module.css
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter-input/job-submitter-input.tsx:
--------------------------------------------------------------------------------
1 | import React from "react";
2 | import { TextField, Button, CircularProgress } from "@mui/material";
3 | import { useJobSubmitterProvider } from "../provider";
4 |
5 | export type JobSubmitterInputProps = {
6 | urlError: string | null;
7 | handleSubmit: () => void;
8 | loading: boolean;
9 | };
10 |
11 | export const JobSubmitterInput = ({
12 | handleSubmit,
13 | loading,
14 | urlError,
15 | }: JobSubmitterInputProps) => {
16 | const { submittedURL, setSubmittedURL, isValidURL, rows } =
17 | useJobSubmitterProvider();
 18 |   return (
 19 |     <div className="flex gap-2">
 20 |       <TextField
 21 |         label="URL"
 22 |         fullWidth
 23 |         value={submittedURL}
 24 |         onChange={(e) => setSubmittedURL(e.target.value)}
 25 |         error={!isValidURL}
 26 |         helperText={!isValidURL ? urlError : ""}
 27 |         className="rounded-md"
 28 |       />
 29 |       <Button
 30 |         variant="contained"
 31 |         onClick={handleSubmit}
 32 |         disabled={!(rows.length > 0) || loading}
 33 |         className={`bg-[#034efc] text-white font-semibold rounded-md
 34 |         transition-transform transform hover:scale-105 disabled:opacity-50`}
 35 |       >
 36 |         {loading ? <CircularProgress size={24} /> : "Submit"}
 37 |       </Button>
 38 |     </div>
 39 |   );
44 | };
45 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter-options/index.ts:
--------------------------------------------------------------------------------
1 | export { JobSubmitterOptions } from "./job-submitter-options";
2 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/job-submitter.tsx:
--------------------------------------------------------------------------------
1 | "use client";
2 |
3 | import React, { useEffect, useState } from "react";
4 | import { useAuth } from "@/contexts/AuthContext";
5 | import { useRouter } from "next/router";
6 | import { RawJobOptions } from "@/types/job";
7 | import { parseJobOptions, validateURL } from "@/lib";
8 | import { JobSubmitterHeader } from "./job-submitter-header";
9 | import { JobSubmitterInput } from "./job-submitter-input";
10 | import { JobSubmitterOptions } from "./job-submitter-options";
11 | import { ApiService } from "@/services";
12 | import { useJobSubmitterProvider } from "./provider";
13 | import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
14 |
15 | const initialJobOptions: RawJobOptions = {
16 | multi_page_scrape: false,
17 | custom_headers: null,
18 | proxies: null,
19 | collect_media: false,
20 | custom_cookies: null,
21 | };
22 |
23 | export const JobSubmitter = () => {
24 | const { user } = useAuth();
25 | const router = useRouter();
26 | const { job_options } = router.query;
27 |
28 | const {
29 | submittedURL,
30 | rows,
31 | siteMap,
32 | setIsValidUrl,
33 | setSnackbarMessage,
34 | setSnackbarOpen,
35 | setSnackbarSeverity,
36 | setSiteMap,
37 | } = useJobSubmitterProvider();
38 |
 39 |   const [urlError, setUrlError] = useState<string | null>(null);
 40 |   const [loading, setLoading] = useState<boolean>(false);
 41 |   const [jobOptions, setJobOptions] =
 42 |     useState<RawJobOptions>(initialJobOptions);
43 |
44 | const handleSubmit = async () => {
45 | if (!validateURL(submittedURL)) {
46 | setIsValidUrl(false);
47 | setUrlError("Please enter a valid URL.");
48 | return;
49 | }
50 |
51 | setIsValidUrl(true);
52 | setUrlError(null);
53 | setLoading(true);
54 |
55 | let customHeaders;
56 | let customCookies;
57 |
58 | try {
59 | customHeaders = jobOptions.custom_headers || null;
60 | customCookies = jobOptions.custom_cookies || null;
61 | } catch (error: any) {
62 | console.error(error);
63 | setSnackbarMessage("Invalid JSON in custom headers.");
64 | setSnackbarOpen(true);
65 | setSnackbarSeverity("error");
66 | setLoading(false);
67 | return;
68 | }
69 |
70 | await ApiService.submitJob(
71 | submittedURL,
72 | rows,
73 | user,
74 | jobOptions,
75 | customHeaders,
76 | customCookies,
77 | siteMap
78 | )
79 | .then(async (response) => {
80 | if (!response.ok) {
81 | return response.json().then((error) => {
82 | throw new Error(error.error);
83 | });
84 | }
85 | return response.json();
86 | })
87 | .then((data) => {
 88 |       // a template literal is always truthy, so the old `|| "..."` fallback could never fire
 89 |       setSnackbarMessage(
 90 |         data.id
 91 |           ? `Job: ${data.id} submitted successfully.`
 92 |           : "Job submitted successfully."
 93 |       );
92 | setSnackbarSeverity("info");
93 | setSnackbarOpen(true);
94 | })
95 | .catch((error) => {
 96 |         setSnackbarMessage(error.message || "An error occurred.");
97 | setSnackbarSeverity("error");
98 | setSnackbarOpen(true);
99 | })
100 | .finally(() => setLoading(false));
101 | };
102 |
103 | useEffect(() => {
104 | if (job_options) {
105 | parseJobOptions(job_options as string, setJobOptions, setSiteMap);
106 | }
107 | }, [job_options]);
108 |
109 |   return (
110 |     <div>
111 |       <JobSubmitterHeader />
112 |       <JobSubmitterInput
113 |         urlError={urlError}
114 |         handleSubmit={handleSubmit}
115 |         loading={loading}
116 |       />
117 |       {/* prop names reconstructed; the original attributes were stripped */}
118 |       <AdvancedJobOptions
119 |         jobOptions={jobOptions}
120 |         setJobOptions={setJobOptions}
121 |       />
122 |     </div>
123 |   );
124 | };
123 | };
124 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/provider.tsx:
--------------------------------------------------------------------------------
  1 | import React, {
  2 |   createContext,
  3 |   PropsWithChildren,
  4 |   useContext,
  5 |   useState,
  6 |   Dispatch,
  7 |   SetStateAction,
  8 |   useMemo,
  9 | } from "react";
 10 | import { Element, Result, SiteMap } from "@/types";
10 |
 11 | type JobSubmitterProviderType = {
 12 |   submittedURL: string;
 13 |   setSubmittedURL: Dispatch<SetStateAction<string>>;
 14 |   rows: Element[];
 15 |   setRows: Dispatch<SetStateAction<Element[]>>;
 16 |   results: Result;
 17 |   setResults: Dispatch<SetStateAction<Result>>;
 18 |   snackbarOpen: boolean;
 19 |   setSnackbarOpen: Dispatch<SetStateAction<boolean>>;
 20 |   snackbarMessage: string;
 21 |   setSnackbarMessage: Dispatch<SetStateAction<string>>;
 22 |   snackbarSeverity: string;
 23 |   setSnackbarSeverity: Dispatch<SetStateAction<string>>;
 24 |   isValidURL: boolean;
 25 |   setIsValidUrl: Dispatch<SetStateAction<boolean>>;
 26 |   siteMap: SiteMap | null;
 27 |   setSiteMap: Dispatch<SetStateAction<SiteMap | null>>;
 28 | };
29 |
 30 | const JobSubmitterProvider = createContext<JobSubmitterProviderType>(
 31 |   {} as JobSubmitterProviderType
 32 | );
33 |
34 | export const Provider = ({ children }: PropsWithChildren) => {
 35 |   const [submittedURL, setSubmittedURL] = useState<string>("");
 36 |   const [rows, setRows] = useState<Element[]>([]);
 37 |   const [results, setResults] = useState<Result>({} as Result);
 38 |   const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false);
 39 |   const [snackbarMessage, setSnackbarMessage] = useState<string>("");
 40 |   const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error");
 41 |   const [isValidURL, setIsValidUrl] = useState<boolean>(true);
 42 |   const [siteMap, setSiteMap] = useState<SiteMap | null>(null);
43 |
44 | const value: JobSubmitterProviderType = useMemo(
45 | () => ({
46 | submittedURL,
47 | setSubmittedURL,
48 | rows,
49 | setRows,
50 | results,
51 | setResults,
52 | snackbarOpen,
53 | setSnackbarOpen,
54 | snackbarMessage,
55 | setSnackbarMessage,
56 | snackbarSeverity,
57 | setSnackbarSeverity,
58 | isValidURL,
59 | setIsValidUrl,
60 | siteMap,
61 | setSiteMap,
62 | }),
63 | [
64 | submittedURL,
65 | rows,
66 | results,
67 | snackbarOpen,
68 | snackbarMessage,
69 | snackbarSeverity,
70 | isValidURL,
71 | siteMap,
72 | ]
73 | );
74 |
 75 |   return (
 76 |     <JobSubmitterProvider.Provider value={value}>
 77 |       {children}
 78 |     </JobSubmitterProvider.Provider>
 79 |   );
80 | };
81 |
82 | export const useJobSubmitterProvider = () => {
83 | return useContext(JobSubmitterProvider);
84 | };
85 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/site-map/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./site-map";
2 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/site-map/site-map-input/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./site-map-input";
2 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.module.css:
--------------------------------------------------------------------------------
1 | .button {
2 | height: 3rem;
3 | width: 2rem;
4 |
5 | color: #ffffff;
6 | font-weight: 600;
7 | border-radius: 0.375rem;
8 | transition: transform 0.2s ease-in-out;
9 | transform: scale(1);
10 | }
11 |
12 | .button:hover {
13 | transform: scale(1.05);
14 | }
15 |
16 | .remove {
17 | background-color: var(--delete-red) !important;
18 | }
19 |
20 | .remove:hover {
21 | background-color: var(--delete-red-hover) !important;
22 | }
23 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/site-map/site-map-input/site-map-input.tsx:
--------------------------------------------------------------------------------
1 | import { useState } from "react";
2 | import { useJobSubmitterProvider } from "../../provider";
3 | import {
4 | MenuItem,
5 | Select,
6 | TextField,
7 | FormControl,
8 | Button,
9 | Checkbox,
10 | FormControlLabel,
11 | } from "@mui/material";
12 | import { ActionOption } from "@/types/job";
13 | import classes from "./site-map-input.module.css";
14 | import { clsx } from "clsx";
15 |
16 | export type SiteMapInputProps = {
17 | disabled?: boolean;
18 | xpath?: string;
19 | option?: ActionOption;
20 | clickOnce?: boolean;
21 | input?: string;
22 | };
23 |
24 | export const SiteMapInput = ({
25 | disabled,
26 | xpath,
27 | option,
28 | clickOnce,
29 | input,
30 | }: SiteMapInputProps) => {
31 | console.log(clickOnce);
 32 |   const [optionState, setOptionState] = useState<ActionOption>(
 33 |     option || "click"
 34 |   );
 35 |   const [xpathState, setXpathState] = useState<string>(xpath || "");
 36 |   const [clickOnceState, setClickOnceState] = useState<boolean>(
 37 |     clickOnce || false
 38 |   );
 39 |   const [inputState, setInputState] = useState<string>(input || "");
40 |
41 | const { siteMap, setSiteMap } = useJobSubmitterProvider();
42 |
43 | const handleAdd = () => {
44 | if (!siteMap) return;
45 |
46 | console.log(optionState, xpathState, clickOnceState, inputState);
47 |
48 | setSiteMap((prevSiteMap) => ({
49 | ...prevSiteMap,
50 | actions: [
51 | {
52 | type: optionState,
53 | xpath: xpathState,
54 | name: "",
55 | do_once: clickOnceState,
56 | input: inputState,
57 | },
58 | ...(prevSiteMap?.actions || []),
59 | ],
60 | }));
61 |
62 | setXpathState("");
63 | };
64 |
65 | const handleRemove = () => {
66 | if (!siteMap) return;
67 |
68 | setSiteMap((prevSiteMap) => ({
69 | ...prevSiteMap,
70 | actions: (prevSiteMap?.actions || []).slice(0, -1),
71 | }));
72 | };
73 |
 74 |   return (
 75 |     <div>
 76 |       <div>
 77 |         <FormControl>
 78 |           <Select
 79 |             value={optionState}
 80 |             disabled={disabled}
 81 |             onChange={(e) => setOptionState(e.target.value as ActionOption)}
 82 |           >
 83 |             <MenuItem value="click">Click</MenuItem>
 84 |             <MenuItem value="input">Input</MenuItem>
 85 |           </Select>
 86 |         </FormControl>
 87 |         {optionState === "input" && (
 88 |           <TextField
 89 |             label="Input"
 90 |             value={inputState}
 91 |             onChange={(e) => setInputState(e.target.value)}
 92 |             disabled={disabled}
 93 |           />
 94 |         )}
 95 |         <TextField
 96 |           label="XPath"
 97 |           value={xpathState}
 98 |           onChange={(e) => setXpathState(e.target.value)}
 99 |           disabled={disabled}
100 |         />
101 |         {disabled ? (
102 |           <Button
103 |             onClick={handleRemove}
104 |             className={clsx(classes.button, classes.remove)}
105 |           >
106 |             Delete
107 |           </Button>
108 |         ) : (
109 |           <Button onClick={handleAdd} className={classes.button}>
110 |             Add
111 |           </Button>
112 |         )}
113 |       </div>
114 |       {!disabled && (
115 |         <FormControlLabel
116 |           label="Click Once" // label text reconstructed from the clickOnce prop; the original string was stripped
117 |           control={
118 |             <Checkbox
119 |               checked={clickOnceState}
120 |               onChange={() => setClickOnceState(!clickOnceState)}
121 |             />
122 |           }
123 |         />
124 |       )}
125 |     </div>
126 |   );
127 | };
136 |
--------------------------------------------------------------------------------
/src/components/submit/job-submitter/site-map/site-map.tsx:
--------------------------------------------------------------------------------
1 | import { useEffect, useState } from "react";
2 | import { useJobSubmitterProvider } from "../provider";
3 | import { Button, Divider, Typography, useTheme } from "@mui/material";
4 | import { SiteMapInput } from "./site-map-input";
5 |
6 | export const SiteMap = () => {
7 | const { siteMap, setSiteMap } = useJobSubmitterProvider();
8 | const [showSiteMap, setShowSiteMap] = useState(false);
9 | const theme = useTheme();
10 |
11 | const handleCreateSiteMap = () => {
12 | setSiteMap({ actions: [] });
13 | setShowSiteMap(true);
14 | };
15 |
16 | const handleClearSiteMap = () => {
17 | setSiteMap(null);
18 | setShowSiteMap(false);
19 | };
20 |
21 | useEffect(() => {
22 | if (siteMap) {
23 | setShowSiteMap(true);
24 | }
25 | }, [siteMap]);
26 |
 27 |   return (
 28 |     <div>
 29 |       {siteMap ? (
 30 |         <Button onClick={handleClearSiteMap}>Clear Site Map</Button>
 31 |       ) : (
 32 |         <Button onClick={handleCreateSiteMap}>Create Site Map</Button>
 33 |       )}
 34 |       {showSiteMap && (
 35 |         <div>
 36 |           <SiteMapInput />
 37 |           {siteMap?.actions && siteMap.actions.length > 0 && (
 38 |             <>
 39 |               <Divider
 40 |                 sx={{ my: 2, borderColor: theme.palette.divider }}
 41 |               />
 42 |               <Typography variant="h6">Site Map Actions</Typography>
 43 |             </>
 44 |           )}
 45 |           {/* copy before reversing: Array.prototype.reverse mutates in place and would corrupt provider state */}
 46 |           {siteMap?.actions &&
 47 |             [...siteMap.actions].reverse().map((action, index) => (
 48 |               <div key={index}>
 49 |                 <Typography>Action {index + 1}:</Typography>
 50 |                 <SiteMapInput
 51 |                   disabled
 52 |                   xpath={action.xpath}
 53 |                   option={action.type}
 54 |                   clickOnce={action.do_once}
 55 |                   input={action.input}
 56 |                 />
 57 |               </div>
 58 |             ))}
 59 |         </div>
 60 |       )}
 61 |     </div>
 62 |   );
 63 | };
71 |
--------------------------------------------------------------------------------
/src/contexts/AuthContext.tsx:
--------------------------------------------------------------------------------
1 | import React, { createContext, useContext, useState, useEffect } from "react";
2 | import axios from "axios";
3 | import Cookies from "js-cookie";
4 |
5 | interface AuthContextProps {
6 | user: any;
7 | isAuthenticated: boolean;
  8 |   login: (email: string, password: string) => Promise<void>;
9 | logout: () => void;
10 | setUser: (user: any) => void;
11 | }
12 |
 13 | const AuthContext = createContext<AuthContextProps | undefined>(undefined);
14 |
15 | interface AuthProps {
16 | children: React.ReactNode;
17 | }
18 |
 19 | export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
 20 |   const [user, setUser] = useState<any>(null);
 21 |   const [isAuthenticated, setIsAuthenticated] = useState<boolean>(false);
22 |
23 | useEffect(() => {
24 | const token = Cookies.get("token");
25 | if (token) {
26 | axios
27 | .get(`/api/me`, {
28 | headers: { Authorization: `Bearer ${token}` },
29 | })
30 | .then((response) => {
31 | setUser(response.data);
32 | setIsAuthenticated(true);
33 | })
34 | .catch(() => {
 35 |           Cookies.remove("token"); // the token is stored in a cookie, not localStorage
36 | });
37 | }
38 | }, []);
39 |
40 | const login = async (email: string, password: string) => {
41 | const params = new URLSearchParams();
42 | params.append("username", email);
43 | params.append("password", password);
44 | const response = await axios.post(`/api/token`, params);
45 | const isSecure = window.location.protocol === "https:";
46 |
47 | Cookies.set("token", response.data.access_token, {
48 | expires: 7,
49 | path: "/",
50 | secure: isSecure,
51 | sameSite: "Lax",
52 | });
53 |
54 | const userResponse = await axios.get(`/api/me`, {
55 | headers: { Authorization: `Bearer ${response.data.access_token}` },
56 | });
57 |
58 | setUser(userResponse.data);
59 | setIsAuthenticated(true);
60 | };
61 |
62 | const logout = () => {
63 | Cookies.remove("token");
64 | setUser(null);
65 | setIsAuthenticated(false);
66 | };
67 |
 68 |   return (
 69 |     <AuthContext.Provider
 70 |       value={{ user, isAuthenticated, login, logout, setUser }}
 71 |     >
 72 |       {children}
 73 |     </AuthContext.Provider>
 74 |   );
75 | };
76 |
77 | export const useAuth = () => {
78 | const context = useContext(AuthContext);
79 | if (!context) {
80 | throw new Error("useAuth must be used within an AuthProvider");
81 | }
82 | return context;
83 | };
84 |
--------------------------------------------------------------------------------
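A minimal consumer of the context above (illustrative sketch; the component name is hypothetical):

  const Profile = () => {
    const { user, isAuthenticated, logout } = useAuth();
    if (!isAuthenticated) return null;
    return <button onClick={logout}>Log out {user?.full_name}</button>;
  };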
/src/declaration.d.ts:
--------------------------------------------------------------------------------
1 | declare module "*.png";
2 |
--------------------------------------------------------------------------------
/src/lib/constants.ts:
--------------------------------------------------------------------------------
1 | export const Constants = {
2 | DOMAIN: "",
3 | };
4 |
--------------------------------------------------------------------------------
/src/lib/helpers/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./parse-job-options";
2 | export * from "./validate-url";
3 |
--------------------------------------------------------------------------------
/src/lib/helpers/parse-job-options.ts:
--------------------------------------------------------------------------------
1 | import { Dispatch, SetStateAction } from "react";
2 |
3 | import { RawJobOptions, SiteMap } from "@/types";
4 |
5 | export const parseJobOptions = (
6 | job_options: string,
  7 |   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
  8 |   setSiteMap?: Dispatch<SetStateAction<SiteMap | null>>
9 | ) => {
10 | if (job_options) {
11 | const jsonOptions = JSON.parse(job_options as string);
12 | const newJobOptions: RawJobOptions = {
13 | multi_page_scrape: false,
14 | custom_headers: null,
15 | proxies: null,
16 | collect_media: false,
17 | custom_cookies: null,
18 | };
19 |
20 | if (jsonOptions.collect_media) {
21 | newJobOptions.collect_media = true;
22 | }
23 |
24 | if (
25 | jsonOptions.custom_headers &&
26 | Object.keys(jsonOptions.custom_headers).length
27 | ) {
28 | newJobOptions.custom_headers = jsonOptions.custom_headers;
29 | }
30 |
31 | if (jsonOptions.custom_cookies && jsonOptions.custom_cookies.length > 0) {
32 | newJobOptions.custom_cookies = jsonOptions.custom_cookies;
33 | }
34 |
35 | newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape;
36 |
 37 |     // guard against a missing proxies key, matching the checks above
 38 |     if (jsonOptions.proxies && jsonOptions.proxies.length > 0) {
38 | newJobOptions.proxies = jsonOptions.proxies.join(",");
39 | }
40 |
41 | if (jsonOptions.site_map && setSiteMap) {
42 | setSiteMap(jsonOptions.site_map);
43 | }
44 |
45 | setJobOptions(newJobOptions);
46 | }
47 | };
48 |
--------------------------------------------------------------------------------
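Example invocation (illustrative values; setJobOptions and setSiteMap are the React state setters named in the signature):

  const raw = JSON.stringify({
    multi_page_scrape: true,
    custom_headers: { "User-Agent": "test" },
    proxies: ["http://proxy:8080"],
    collect_media: false,
    custom_cookies: [],
  });
  parseJobOptions(raw, setJobOptions, setSiteMap);
  // setJobOptions receives proxies joined into the string "http://proxy:8080"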
/src/lib/helpers/parse-json-to-entries.ts:
--------------------------------------------------------------------------------
1 | export const parseJsonToEntries = (json: string): [string, string][] | null => {
2 | try {
3 | const parsed = JSON.parse(json);
4 |
5 | if (Array.isArray(parsed)) {
6 | if (
7 | parsed.length > 0 &&
8 | Array.isArray(parsed[0]) &&
9 | parsed[0].length === 2 &&
10 | typeof parsed[0][0] === "string"
11 | ) {
12 | // Already array of [key, val] tuples
13 | // Just ensure values are strings
14 | return parsed.map(([k, v]) => [k, String(v)]);
15 | }
16 |
17 | // Array of objects
18 | const allEntries: [string, string][] = [];
19 | for (const item of parsed) {
20 | if (typeof item === "object" && item !== null) {
21 | allEntries.push(
22 | // @ts-ignore
23 | ...Object.entries(item).map(([k, v]) => [k, String(v)])
24 | );
25 | } else {
26 | return null;
27 | }
28 | }
29 | return allEntries.length > 0 ? allEntries : null;
30 | } else if (typeof parsed === "object" && parsed !== null) {
31 | return Object.entries(parsed).map(([k, v]) => [k, String(v)]);
32 | }
33 | return null;
34 | } catch {
35 | return null;
36 | }
37 | };
38 |
--------------------------------------------------------------------------------
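The helper accepts three JSON shapes; a quick sketch of each (return values follow directly from the code above):

  parseJsonToEntries('{"a": 1}');              // [["a", "1"]]
  parseJsonToEntries('[["a", 1], ["b", 2]]');  // [["a", "1"], ["b", "2"]]
  parseJsonToEntries('[{"a": 1}, {"b": 2}]');  // [["a", "1"], ["b", "2"]]
  parseJsonToEntries("not json");              // null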
/src/lib/helpers/validate-url.ts:
--------------------------------------------------------------------------------
1 | export function validateURL(url: string): boolean {
2 | try {
3 | new URL(url);
4 | return true;
5 | } catch (_) {
6 | return false;
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
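Note that the URL constructor requires a scheme, so bare hostnames fail this check:

  validateURL("https://example.com"); // true
  validateURL("example.com");         // false - no scheme
  validateURL("not a url");           // false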
/src/lib/hooks/use-advanced-job-options/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./use-advanced-job-options";
2 |
--------------------------------------------------------------------------------
/src/lib/hooks/use-advanced-job-options/use-advanced-job-options.ts:
--------------------------------------------------------------------------------
1 | import { useEffect, useState } from "react";
2 |
3 | import { RawJobOptions } from "@/types";
4 | import { parseJobOptions } from "@/lib/helpers/parse-job-options";
5 | import { useRouter } from "next/router";
6 |
7 | export const useAdvancedJobOptions = () => {
8 | const initialJobOptions: RawJobOptions = {
9 | multi_page_scrape: false,
10 | custom_headers: null,
11 | proxies: null,
12 | collect_media: false,
13 | custom_cookies: null,
14 | };
15 |
16 | const router = useRouter();
17 | const { job_options } = router.query;
18 |
19 | const [jobOptions, setJobOptions] =
 20 |     useState<RawJobOptions>(initialJobOptions);
21 |
22 | useEffect(() => {
23 | if (job_options) {
24 | parseJobOptions(job_options as string, setJobOptions);
25 | }
26 | }, [job_options]);
27 |
28 | return { jobOptions, setJobOptions };
29 | };
30 |
--------------------------------------------------------------------------------
/src/lib/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./constants";
2 | export * from "./utils";
3 | export * from "./helpers";
4 |
--------------------------------------------------------------------------------
/src/lib/utils.ts:
--------------------------------------------------------------------------------
1 | import Cookies from "js-cookie";
2 | import React, { Dispatch } from "react";
3 | import { Job } from "../types";
4 |
  5 | interface FetchOptions {
  6 |   chat?: boolean;
  7 | }
  8 |
  9 | export const fetchJobs = async (
 10 |   setJobs: Dispatch<React.SetStateAction<Job[]>>,
 11 |   fetchOptions: FetchOptions = {}
12 | ) => {
13 | const token = Cookies.get("token");
14 | await fetch("/api/retrieve", {
15 | method: "POST",
16 | headers: {
17 | "content-type": "application/json",
18 | Authorization: `Bearer ${token}`,
19 | },
20 | body: JSON.stringify({ data: fetchOptions }),
21 | })
22 | .then((response) => response.json())
23 | .then((data) => setJobs(data))
24 | .catch((error) => {
25 | console.error("Error fetching jobs:", error);
26 | });
27 | };
28 |
29 | export const fetchJob = async (id: string) => {
30 | const token = Cookies.get("token");
31 | try {
32 | const response = await fetch(`/api/job/${id}`, {
33 | headers: {
34 | "content-type": "application/json",
35 | Authorization: `Bearer ${token}`,
36 | },
37 | });
38 | const data = await response.json();
39 | return data;
40 | } catch (error) {
 41 |     console.error("Error fetching job:", error);
42 | throw error;
43 | }
44 | };
45 |
46 | export const checkAI = async (
 47 |   setAiEnabled: Dispatch<React.SetStateAction<boolean>>
48 | ) => {
49 | const token = Cookies.get("token");
50 | try {
51 | const response = await fetch("/api/check", {
52 | headers: {
53 | "content-type": "application/json",
54 | Authorization: `Bearer ${token}`,
55 | },
56 | });
57 | const data = await response.json();
58 | setAiEnabled(data.ai_enabled);
59 | } catch (error) {
 60 |     console.error("Error checking AI availability:", error);
61 | throw error;
62 | }
63 | };
64 |
65 | export const updateJob = async (ids: string[], field: string, value: any) => {
66 | const token = Cookies.get("token");
67 | const postBody = {
68 | ids: ids,
69 | field: field,
70 | value: value,
71 | };
72 | await fetch("/api/update", {
73 | method: "POST",
74 | headers: {
75 | "content-type": "application/json",
76 | Authorization: `Bearer ${token}`,
77 | },
78 | body: JSON.stringify({ data: postBody }),
79 | }).catch((error) => {
 80 |     console.error("Error updating jobs:", error);
81 | });
82 | };
83 |
84 | export const getUserSettings = async () => {
85 | const token = Cookies.get("token");
86 |
87 | try {
88 | const response = await fetch("/api/check", {
89 | headers: {
90 | "content-type": "application/json",
91 | Authorization: `Bearer ${token}`,
92 | },
93 | });
94 |
95 | const data = await response.json();
96 | return data;
97 | } catch (error) {
 98 |     console.error("Error fetching user settings:", error);
99 | throw error;
100 | }
101 | };
102 |
--------------------------------------------------------------------------------
/src/pages/_app.tsx:
--------------------------------------------------------------------------------
1 | import "bootstrap/dist/css/bootstrap.min.css";
2 | import "../styles/globals.css";
3 |
4 | import React, { useState, useEffect } from "react";
5 | import type { AppProps } from "next/app";
6 | import Head from "next/head";
7 | import { ThemeProvider, CssBaseline, Box } from "@mui/material";
8 | import { NavDrawer } from "../components/common";
9 | import { darkTheme, lightTheme } from "../styles/themes";
10 | import { AuthProvider } from "../contexts/AuthContext";
11 | import { Provider } from "react-redux";
12 | import { PersistGate } from "redux-persist/integration/react";
13 | import { store, persistor } from "@/store/store";
14 |
 15 | const App: React.FC<AppProps> = ({ Component, pageProps }) => {
16 | const [isDarkMode, setIsDarkMode] = useState(false);
17 |
18 | useEffect(() => {
19 | const savedTheme = localStorage.getItem("theme");
20 | if (savedTheme) {
21 | setIsDarkMode(savedTheme === "dark");
22 | } else {
23 | const prefersDarkMode = window.matchMedia(
24 | "(prefers-color-scheme: dark)"
25 | ).matches;
26 | setIsDarkMode(prefersDarkMode);
27 | }
28 | }, []);
29 |
30 | const toggleTheme = () => {
31 | const newTheme = !isDarkMode;
32 | setIsDarkMode(newTheme);
33 | localStorage.setItem("theme", newTheme ? "dark" : "light");
34 | };
35 |
 36 |   return (
 37 |     <>
 38 |       {/* provider nesting reconstructed; the original JSX was stripped in extraction */}
 39 |       <Head>
 40 |         <title>Scraperr</title>
 41 |       </Head>
 42 |       <ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>
 43 |         <CssBaseline />
 44 |         <Provider store={store}>
 45 |           <PersistGate loading={null} persistor={persistor}>
 46 |             <AuthProvider>
 47 |               <Box sx={{ display: "flex" }}>
 48 |                 <NavDrawer toggleTheme={toggleTheme} isDarkMode={isDarkMode} />
 49 |                 <Box component="main" sx={{ flexGrow: 1 }}>
 50 |                   <Component {...pageProps} />
 51 |                 </Box>
 52 |               </Box>
 53 |             </AuthProvider>
 54 |           </PersistGate>
 55 |         </Provider>
 56 |       </ThemeProvider>
 57 |     </>
 58 |   );
 59 | };
68 |
69 | export default App;
70 |
--------------------------------------------------------------------------------
/src/pages/_document.tsx:
--------------------------------------------------------------------------------
1 | import { Html, Head, Main, NextScript } from "next/document";
2 | import React from "react";
3 |
4 | export default function Document() {
  5 |   return (
  6 |     <Html lang="en">
  7 |       <Head>
  8 |         {/* head tags reconstructed; favicon and manifest exist under public/ in this repository */}
  9 |         <link rel="icon" href="/favicon.ico" />
 10 |         <link rel="manifest" href="/manifest.json" />
 11 |       </Head>
 12 |       <body>
 13 |         <noscript>
 14 |           You need to enable JavaScript to run this app.
 15 |         </noscript>
 16 |         <Main />
 17 |         <NextScript />
 18 |       </body>
 19 |     </Html>
 20 |   );
 21 | }
25 |
--------------------------------------------------------------------------------
/src/pages/agent.tsx:
--------------------------------------------------------------------------------
1 | export { Agent as default } from "@/components/pages/agent";
2 |
--------------------------------------------------------------------------------
/src/pages/api/ai/index.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | const { data } = req.body;
8 |
9 | try {
10 | const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/ai`, {
11 | method: "POST",
12 | headers: {
13 | Accept: "text/event-stream",
14 | "Content-Type": "application/json",
15 | },
16 | body: JSON.stringify(data),
17 | });
18 |
19 | if (!response.ok) {
20 | const errorDetails = await response.text();
21 | if (response.status === 422) {
22 | console.error(`422 Error: ${errorDetails}`);
23 | }
24 | throw new Error(
25 | `Error fetching logs: ${response.statusText} - ${errorDetails}`
26 | );
27 | }
28 |
29 | if (!response.body) {
30 | throw new Error(`No response body from API`);
31 | }
32 |
33 | res.writeHead(200, {
34 | "Content-Type": "text/event-stream",
35 | "Cache-Control": "no-cache, no-transform",
36 | Connection: "keep-alive",
37 | "Transfer-Encoding": "chunked",
38 | });
39 |
40 | let responseStream = response.body;
41 | const reader = responseStream.getReader();
42 | const decoder = new TextDecoder();
43 |
44 | while (true) {
45 | const { done, value } = await reader.read();
46 | if (done) break;
47 | const chunk = decoder.decode(value, { stream: true });
48 | res.write(`${chunk}`);
49 | }
50 |
51 | res.end();
52 | } catch (error) {
53 | console.error("Error streaming logs:", error);
54 | res.status(500).json({ error: "Internal Server Error" });
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
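A sketch of how a client might consume the event stream proxied above (the payload shape is an assumption; the repository's actual chat client is not shown in this file):

  const response = await fetch("/api/ai", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ data: { messages } }), // hypothetical payload
  });
  const reader = response.body!.getReader();
  const decoder = new TextDecoder();
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    console.log(decoder.decode(value, { stream: true })); // append each chunk to the UI
  }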
/src/pages/api/check.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | try {
8 | const headers = new Headers(req.headers as Record<string, string>);
9 | headers.set("content-type", "application/json");
10 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
11 |
12 | const response = await fetch(
13 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/ai/check`,
14 | {
15 | method: "GET",
16 | headers,
17 | }
18 | );
19 |
20 | const checksResponse = await fetch(
21 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/check`,
22 | {
23 | method: "GET",
24 | headers,
25 | }
26 | );
27 |
28 | if (!response.ok || !checksResponse.ok) {
29 | throw new Error(`Error: ${response.statusText}`);
30 | }
31 |
32 | const result = await response.json();
33 | const checksResult = await checksResponse.json();
34 | res.status(200).json({ ...result, ...checksResult });
35 | } catch (error) {
36 | console.error("Error checking AI and auth status:", error);
37 | res.status(500).json({ error: "Internal Server Error" });
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/pages/api/delete-cron-job.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 | console.log("Data", data);
10 |
11 | const headers = new Headers();
12 | headers.set("content-type", "application/json");
13 |
14 | try {
15 | const response = await fetch(
16 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/delete-cron-job`,
17 | {
18 | method: "POST",
19 | headers,
20 | body: JSON.stringify(data),
21 | }
22 | );
23 |
24 | if (!response.ok) {
25 | console.error(response);
26 | throw new Error(`Error: ${response.statusText}`);
27 | }
28 |
29 | const result = await response.json();
30 | res.status(200).json(result);
31 | } catch (error) {
32 | console.error("Error deleting cron job:", error);
33 | res.status(500).json({ error: "Internal Server Error" });
34 | }
35 | } else {
36 | res.setHeader("Allow", ["POST"]);
37 | res.status(405).end(`Method ${req.method} Not Allowed`);
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/pages/api/delete.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 |
10 | const headers = new Headers();
11 | headers.set("content-type", "application/json");
12 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
13 |
14 | try {
15 | const response = await fetch(
16 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/delete-scrape-jobs`,
17 | {
18 | method: "POST",
19 | headers,
20 | body: JSON.stringify(data),
21 | }
22 | );
23 |
24 | if (!response.ok) {
25 | throw new Error(`Error: ${response.statusText}`);
26 | }
27 |
28 | const result = await response.json();
29 | res.status(200).json(result);
30 | } catch (error) {
31 | console.error("Error deleting scrape jobs:", error);
32 | res.status(500).json({ error: "Internal Server Error" });
33 | }
34 | } else {
35 | res.setHeader("Allow", ["POST"]);
36 | res.status(405).end(`Method ${req.method} Not Allowed`);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/pages/api/download.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 |
10 | const headers = new Headers();
11 | headers.set("content-type", "application/json");
12 |
13 | try {
14 | const response = await fetch(
15 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/download`,
16 | {
17 | method: "POST",
18 | headers,
19 | body: JSON.stringify(data),
20 | }
21 | );
22 |
23 | if (!response.ok) {
24 | throw new Error(`Error: ${response.statusText}`);
25 | }
26 |
27 | const csvText = await response.text();
28 | res.status(200).send(csvText);
29 | } catch (error) {
30 | console.error("Error downloading job results:", error);
31 | res.status(500).json({ error: "Internal Server Error" });
32 | }
33 | } else {
34 | res.setHeader("Allow", ["POST"]);
35 | res.status(405).end(`Method ${req.method} Not Allowed`);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/pages/api/get-average-element-per-link.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | const headers = new Headers();
8 | headers.set("content-type", "application/json");
9 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
10 |
11 | try {
12 | const response = await fetch(
13 | `${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-element-per-link`,
14 | {
15 | method: "GET",
16 | headers,
17 | }
18 | );
19 |
20 | if (!response.ok) {
21 | throw new Error(`Error: ${response.statusText}`);
22 | }
23 |
24 | const statsText = await response.text();
25 | res.status(200).send(statsText);
26 | } catch (error) {
27 | console.error("Error fetching average elements per link:", error);
28 | res.status(500).json({ error: "Internal Server Error" });
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/pages/api/get-average-jobs-per-day.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | const headers = new Headers();
8 | headers.set("content-type", "application/json");
9 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
10 |
11 | try {
12 | const response = await fetch(
13 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-jobs-per-day`,
14 | {
15 | method: "GET",
16 | headers,
17 | }
18 | );
19 |
20 | if (!response.ok) {
21 | throw new Error(`Error: ${response.statusText}`);
22 | }
23 |
24 | const statsText = await response.text();
25 | res.status(200).send(statsText);
26 | } catch (error) {
27 | console.error("Error fetching average jobs per day:", error);
28 | res.status(500).json({ error: "Internal Server Error" });
29 | }
30 | }
31 |
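Note: both statistics proxies expect the raw token in the `Authorization` header, since they prepend `Bearer ` themselves. A minimal client sketch under that assumption; the function name is illustrative:

```ts
// Hypothetical caller for the statistics proxy routes above.
export async function fetchAverageJobsPerDay(token: string) {
  const response = await fetch("/api/get-average-jobs-per-day", {
    // The route adds the "Bearer " prefix, so only the raw token is sent.
    headers: { Authorization: token },
  });

  if (!response.ok) {
    throw new Error(`Statistics request failed: ${response.statusText}`);
  }

  return response.json();
}
```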
--------------------------------------------------------------------------------
/src/pages/api/job/[id].ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | const { id } = req.query;
8 |
9 | const headers = new Headers();
10 | headers.set("content-type", "application/json");
11 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
12 |
13 | try {
14 | const response = await fetch(
15 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/job/${id}`,
16 | {
17 | headers,
18 | }
19 | );
20 |
21 | if (!response.ok) {
22 | throw new Error(`Error: ${response.statusText}`);
23 | }
24 |
25 | const result = await response.json();
26 | res.status(200).json(result);
27 | } catch (error) {
28 | console.error("Error fetching job:", error);
29 | res.status(500).json({ error: "Internal Server Error" });
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/pages/api/logs.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | try {
8 | const response = await fetch(
9 | `${process.env.NEXT_PUBLIC_API_URL}/api/logs`,
10 | {
11 | method: "GET",
12 | headers: {
13 | Accept: "text/event-stream",
14 | },
15 | }
16 | );
17 |
18 | if (!response.ok || !response.body) {
19 | throw new Error(`Error fetching logs: ${response.statusText}`);
20 | }
21 |
22 | res.writeHead(200, {
23 | "Content-Type": "text/event-stream",
24 | "Cache-Control": "no-cache, no-transform",
25 | Connection: "keep-alive",
26 | "Transfer-Encoding": "chunked",
27 | });
28 |
29 | let responseStream = response.body;
30 | const reader = responseStream.getReader();
31 | const decoder = new TextDecoder();
32 |
33 | while (true) {
34 | const { done, value } = await reader.read();
35 | if (done) break;
36 | const chunk = decoder.decode(value, { stream: true });
37 | res.write(`data: ${chunk}\n\n`);
38 | }
39 |
40 | res.end();
41 | } catch (error) {
42 | console.error("Error streaming logs:", error);
43 | res.status(500).json({ error: "Internal Server Error" });
44 | }
45 | }
46 |
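Note: unlike the AI route, this handler wraps every chunk in an SSE `data:` frame, so the browser's built-in `EventSource` can consume it directly. A minimal sketch:

```ts
// Subscribe to the proxied log stream from the browser.
const source = new EventSource("/api/logs");

source.onmessage = (event: MessageEvent<string>) => {
  console.log("log chunk:", event.data);
};

source.onerror = () => {
  // EventSource reconnects automatically; close it if that is unwanted.
  source.close();
};
```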
--------------------------------------------------------------------------------
/src/pages/api/me.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | try {
8 | const headers = new Headers();
9 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
10 | headers.set("content-type", "application/json");
11 |
12 | const response = await fetch(
13 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
14 | {
15 | method: "GET",
16 | headers,
17 | }
18 | );
19 |
20 | if (!response.ok) {
21 | throw new Error(`Error: ${response.statusText}`);
22 | }
23 |
24 | const result = await response.json();
25 | res.status(200).json(result);
26 | } catch (error) {
27 | console.error("Error fetching current user:", error);
28 | res.status(500).json({ error: "Internal Server Error" });
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/pages/api/media/get-media.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | const { id } = req.query;
8 |
9 | try {
10 | const response = await fetch(
11 | `${process.env.NEXT_PUBLIC_API_URL}/get-media?id=${id}`
12 | );
13 |
14 | if (!response.ok) {
15 | throw new Error(`Error: ${response.statusText}`);
16 | }
17 |
18 | const data = await response.json();
19 | res.status(200).json(data);
20 | } catch (error) {
21 | console.error("Error fetching media list:", error);
22 | res.status(404).json({ error: "Error fetching media list" });
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/pages/api/media/index.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | const { id, type, file } = req.query;
8 |
9 | if (!id || !type || !file) {
10 | return res.status(400).json({ error: "Missing required parameters" });
11 | }
12 |
13 | try {
14 | const response = await fetch(
15 | `${process.env.NEXT_PUBLIC_API_URL}/media?id=${id}&type=${type}&file=${file}`
16 | );
17 |
18 | if (!response.ok) {
19 | throw new Error(`Error: ${response.statusText}`);
20 | }
21 |
22 | const contentType =
23 | response.headers.get("content-type") || "application/octet-stream";
24 |
25 | res.setHeader("Content-Type", contentType);
26 |
27 | const arrayBuffer = await response.arrayBuffer();
28 | res.status(200).send(Buffer.from(arrayBuffer));
29 | } catch (error) {
30 | console.error("Error streaming media:", error);
31 | res.status(404).json({ error: "Error retrieving media file" });
32 | }
33 | }
34 |
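Note: because the route forwards the upstream `Content-Type`, media can be embedded directly by URL. A sketch with illustrative query values; the real `id`/`type`/`file` come from the job's collected media:

```tsx
import React from "react";

// Hypothetical component; the query values are placeholders.
export const CollectedImage: React.FC = () => (
  <img
    src="/api/media?id=job-123&type=images&file=logo.png"
    alt="media collected by the scrape job"
  />
);
```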
--------------------------------------------------------------------------------
/src/pages/api/recordings/[id].ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | const { id } = req.query;
8 |
9 | try {
10 | const response = await fetch(
11 | `${process.env.NEXT_PUBLIC_API_URL}/recordings/${id}`
12 | );
13 |
14 | if (!response.ok) {
15 | throw new Error(`Error: ${response.statusText}`);
16 | }
17 |
18 | res.setHeader("Content-Type", "video/mp4");
19 | res.setHeader("Accept-Ranges", "bytes");
20 |
21 | const reader = response.body?.getReader();
22 |
23 | if (!reader) {
24 | res.status(404).json({ error: "Recording not found" });
25 | return;
26 | }
27 |
28 | while (true) {
29 | const { done, value } = await reader.read();
30 | if (done) break;
31 | res.write(value);
32 | }
33 |
34 | res.end();
35 | } catch (error) {
36 | console.error("Error streaming video:", error);
37 | res.status(404).json({ error: "Error streaming video" });
38 | }
39 | }
40 |
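Note: since the handler streams the body with `Content-Type: video/mp4`, the simplest consumer is a plain `<video>` element. A sketch with an illustrative prop name; the route advertises `Accept-Ranges: bytes` but never parses `Range` headers, so seeking depends on the full download:

```tsx
import React from "react";

// Hypothetical player for the /api/recordings/[id] route above.
export const RecordingPlayer: React.FC<{ recordingId: string }> = ({
  recordingId,
}) => <video controls src={`/api/recordings/${recordingId}`} />;
```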
--------------------------------------------------------------------------------
/src/pages/api/retrieve.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 |
10 | const headers = new Headers();
11 | headers.set("content-type", "application/json");
12 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
13 |
14 | try {
15 | const response = await fetch(
16 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`,
17 | {
18 | method: "POST",
19 | headers,
20 | body: JSON.stringify(data),
21 | }
22 | );
23 |
24 | if (!response.ok) {
25 | throw new Error(`Error: ${response.statusText}`);
26 | }
27 |
28 | const result = await response.json();
29 | res.status(200).json(result);
30 | } catch (error) {
31 | console.error("Error retrieving scrape jobs:", error);
32 | res.status(500).json({ error: "Internal Server Error" });
33 | }
34 | } else {
35 | res.setHeader("Allow", ["POST"]);
36 | res.status(405).end(`Method ${req.method} Not Allowed`);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/pages/api/schedule-cron-job.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 | console.log("Data", data);
10 |
11 | const headers = new Headers();
12 | headers.set("content-type", "application/json");
13 |
14 | try {
15 | const response = await fetch(
16 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/schedule-cron-job`,
17 | {
18 | method: "POST",
19 | headers,
20 | body: JSON.stringify(data),
21 | }
22 | );
23 |
24 | if (!response.ok) {
25 | console.error(response);
26 | throw new Error(`Error: ${response.statusText}`);
27 | }
28 |
29 | const result = await response.json();
30 | res.status(200).json(result);
31 | } catch (error) {
32 | console.error("Error scheduling cron job:", error);
33 | res.status(500).json({ error: "Internal Server Error" });
34 | }
35 | } else {
36 | res.setHeader("Allow", ["POST"]);
37 | res.status(405).end(`Method ${req.method} Not Allowed`);
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/pages/api/signup.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 |
10 | const headers = new Headers();
11 | headers.set("content-type", "application/json");
12 |
13 | try {
14 | const response = await fetch(
15 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/signup`,
16 | {
17 | method: "POST",
18 | headers,
19 | body: JSON.stringify(data),
20 | }
21 | );
22 |
23 | if (!response.ok) {
24 | throw new Error(`Error: ${response.statusText}`);
25 | }
26 |
27 | const result = await response.json();
28 | res.status(200).json(result);
29 | } catch (error) {
30 | console.error("Error signing up:", error);
31 | res.status(500).json({ error: "Internal Server Error" });
32 | }
33 | } else {
34 | res.setHeader("Allow", ["POST"]);
35 | res.status(405).end(`Method ${req.method} Not Allowed`);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/pages/api/submit-scrape-job.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 |
10 | const headers = new Headers();
11 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
12 | headers.set("content-type", "application/json");
13 |
14 | try {
15 | const response = await fetch(
16 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/submit-scrape-job`,
17 | {
18 | method: "POST",
19 | headers,
20 | body: JSON.stringify(data),
21 | }
22 | );
23 |
24 | if (!response.ok) {
25 | throw new Error(`Error: ${response.statusText}`);
26 | }
27 |
28 | const result = await response.json();
29 | res.status(200).json(result);
30 | } catch (error) {
31 | console.error("Error submitting scrape job:", error);
32 | res.status(500).json({ error: "Internal Server Error" });
33 | }
34 | } else {
35 | res.setHeader("Allow", ["POST"]);
36 | res.status(405).end(`Method ${req.method} Not Allowed`);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/pages/api/token.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const body = new URLSearchParams(req.body as string);
9 | const username = body.get("username") || "";
10 | const password = body.get("password") || "";
11 |
12 | const headers = new Headers();
13 | headers.set("content-type", "application/x-www-form-urlencoded");
14 |
15 | try {
16 | const response = await fetch(
17 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/auth/token`,
18 | {
19 | method: "POST",
20 | headers,
21 | body: new URLSearchParams({ username, password }).toString(),
22 | }
23 | );
24 |
25 | if (!response.ok) {
26 | throw new Error(`Error: ${response.statusText}`);
27 | }
28 |
29 | const result = await response.json();
30 | res.status(200).json(result);
31 | } catch (error) {
32 | console.error("Error requesting auth token:", error);
33 | res.status(500).json({ error: "Internal Server Error" });
34 | }
35 | } else {
36 | res.setHeader("Allow", ["POST"]);
37 | res.status(405).end(`Method ${req.method} Not Allowed`);
38 | }
39 | }
40 |
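Note: the handler parses `application/x-www-form-urlencoded` credentials, so a client must post that encoding rather than JSON. A minimal login sketch; the function name and response shape are assumptions:

```ts
// Hypothetical login helper for the /api/token proxy above.
export async function login(username: string, password: string) {
  const response = await fetch("/api/token", {
    method: "POST",
    headers: { "Content-Type": "application/x-www-form-urlencoded" },
    body: new URLSearchParams({ username, password }).toString(),
  });

  if (!response.ok) {
    throw new Error(`Login failed: ${response.statusText}`);
  }

  // Assumed to contain the bearer token issued by the FastAPI backend.
  return response.json();
}
```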
--------------------------------------------------------------------------------
/src/pages/api/update.ts:
--------------------------------------------------------------------------------
1 | import { NextApiRequest, NextApiResponse } from "next";
2 |
3 | export default async function handler(
4 | req: NextApiRequest,
5 | res: NextApiResponse
6 | ) {
7 | if (req.method === "POST") {
8 | const { data } = req.body;
9 |
10 | const headers = new Headers();
11 | headers.set("content-type", "application/json");
12 | headers.set("Authorization", `Bearer ${req.headers.authorization}`);
13 |
14 | try {
15 | const response = await fetch(
16 | `${global.process.env.NEXT_PUBLIC_API_URL}/api/update`,
17 | {
18 | method: "POST",
19 | headers,
20 | body: JSON.stringify(data),
21 | }
22 | );
23 |
24 | if (!response.ok) {
25 | const errorDetails = await response.text();
26 | if (response.status === 422) {
27 | console.error(`422 Error: ${errorDetails}`);
28 | }
29 | throw new Error(
30 | `Error updating job: ${response.statusText} - ${errorDetails}`
31 | );
32 | }
33 |
38 | const result = await response.json();
39 | res.status(200).json(result);
40 | } catch (error) {
41 | console.error("Error updating job:", error);
42 | res.status(500).json({ error: "Internal Server Error" });
43 | }
44 | } else {
45 | res.setHeader("Allow", ["POST"]);
46 | res.status(405).end(`Method ${req.method} Not Allowed`);
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/pages/chat.tsx:
--------------------------------------------------------------------------------
1 | export { AI as default } from "../components/pages/chat/chat";
2 |
--------------------------------------------------------------------------------
/src/pages/cron-jobs.tsx:
--------------------------------------------------------------------------------
1 | import { CronJobs } from "../components/pages/cron-jobs";
2 | import { getServerSideProps } from "../components/pages/cron-jobs/get-server-side-props";
3 | export { getServerSideProps };
4 | export default CronJobs;
5 |
--------------------------------------------------------------------------------
/src/pages/index.tsx:
--------------------------------------------------------------------------------
1 | import { Provider as JobSubmitterProvider } from "@/components/submit/job-submitter/provider";
2 | import { Home } from "@/components/pages/home/home";
3 |
4 | export default function Main() {
5 | return (
6 |     <JobSubmitterProvider>
7 |       <Home />
8 |     </JobSubmitterProvider>
9 | );
10 | }
11 |
--------------------------------------------------------------------------------
/src/pages/job/csv/[id].tsx:
--------------------------------------------------------------------------------
1 | export { JobCsvId as default } from "@/components/pages/job/csv/id";
2 | export { getServerSideProps } from "@/components/pages/job/csv/id/get-server-side-props";
3 |
--------------------------------------------------------------------------------
/src/pages/jobs.tsx:
--------------------------------------------------------------------------------
1 | import React, { useEffect, useState } from "react";
2 | import { JobTable } from "../components/jobs";
3 | import { useAuth } from "../contexts/AuthContext";
4 | import { Job } from "../types";
5 | import { GetServerSideProps } from "next/types";
6 | import axios from "axios";
7 | import { parseCookies } from "nookies";
8 | import { fetchJobs } from "../lib";
9 |
10 | interface JobsProps {
11 | initialJobs: Job[];
12 | initialUser: any;
13 | }
14 |
15 | export const getServerSideProps: GetServerSideProps = async (context) => {
16 | const { req } = context;
17 | const cookies = parseCookies({ req });
18 | const token = cookies.token;
19 | let user = null;
20 | let initialJobs: Job[] = [];
21 |
22 | if (token) {
23 | try {
24 | const userResponse = await axios.get(`/api/me`, {
25 | headers: { Authorization: `Bearer ${token}` },
26 | });
27 | user = userResponse.data;
28 |
29 | const jobsResponse = await fetch(`/api/retrieve-scrape-jobs`, {
30 | method: "POST",
31 | body: JSON.stringify({ user: user.email }),
32 | headers: {
33 | "content-type": "application/json",
34 | Authorization: `Bearer ${token}`,
35 | },
36 | });
37 |
38 | initialJobs = await jobsResponse.json();
39 | } catch (error) {
40 | console.error("Error fetching user or jobs:", error);
41 | }
42 | }
43 |
44 | return {
45 | props: {
46 | initialJobs,
47 | initialUser: user,
48 | },
49 | };
50 | };
51 |
52 | const Jobs: React.FC<JobsProps> = ({ initialJobs, initialUser }) => {
53 | const { user, setUser } = useAuth();
54 | const [jobs, setJobs] = useState<Job[]>(initialJobs || []);
55 |
56 | useEffect(() => {
57 | if (!user && initialUser) {
58 | setUser(initialUser);
59 | }
60 | }, [user, initialUser, setUser]);
61 |
62 | useEffect(() => {
63 | fetchJobs(setJobs);
64 | }, [user]);
65 |
66 | useEffect(() => {
67 | const intervalId = setInterval(() => {
68 | fetchJobs(setJobs);
69 | }, 5000);
70 | return () => clearInterval(intervalId);
71 | }, []);
72 |
73 | return <JobTable jobs={jobs} setJobs={setJobs} />;
74 | };
75 |
76 | export default Jobs;
77 |
--------------------------------------------------------------------------------
/src/pages/media/index.tsx:
--------------------------------------------------------------------------------
1 | export { MediaId as default } from "@/components/pages/media/id";
2 |
--------------------------------------------------------------------------------
/src/pages/recordings/index.tsx:
--------------------------------------------------------------------------------
1 | export { RecordingId as default } from "@/components/pages/recordings/id";
2 |
--------------------------------------------------------------------------------
/src/services/api-service/api-service.ts:
--------------------------------------------------------------------------------
1 | import * as functions from "./functions";
2 |
3 | export const ApiService = {
4 | ...functions,
5 | };
6 |
--------------------------------------------------------------------------------
/src/services/api-service/functions/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./submit-job";
2 |
--------------------------------------------------------------------------------
/src/services/api-service/functions/submit-job.ts:
--------------------------------------------------------------------------------
1 | import { SiteMap } from "@/types/job";
2 |
3 | export const submitJob = async (
4 | submittedURL: string,
5 | rows: any[],
6 | user: any,
7 | jobOptions: any,
8 | customHeaders: any,
9 | customCookies: any,
10 | siteMap: SiteMap | null,
11 | agentMode: boolean = false,
12 | prompt?: string
13 | ) => {
14 | return await fetch(`/api/submit-scrape-job`, {
15 | method: "POST",
16 | headers: { "content-type": "application/json" },
17 | body: JSON.stringify({
18 | data: {
19 | url: submittedURL,
20 | elements: rows,
21 | user: user?.email,
22 | time_created: new Date().toISOString(),
23 | job_options: {
24 | ...jobOptions,
25 | collect_media: jobOptions.collect_media || false,
26 | custom_headers: customHeaders || {},
27 | proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
28 | site_map: siteMap,
29 | custom_cookies: customCookies || [],
30 | },
31 | agent_mode: agentMode,
32 | prompt: prompt || "",
33 | },
34 | }),
35 | });
36 | };
37 |
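Note: a usage sketch for `submitJob`. All values are illustrative; `proxies` is passed as a comma-separated string because the function splits it before sending:

```ts
import { submitJob } from "@/services/api-service/functions/submit-job";

// Hypothetical call; element rows follow the Element shape in src/types.
async function submitExample() {
  const response = await submitJob(
    "https://example.com",
    [{ name: "title", xpath: "//h1", url: "https://example.com" }],
    { email: "user@example.com" },
    { multi_page_scrape: false, collect_media: false, proxies: "" },
    {}, // customHeaders
    [], // customCookies
    null, // siteMap
    false // agentMode
  );

  if (!response.ok) {
    throw new Error(`Submit failed: ${response.statusText}`);
  }
}
```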
--------------------------------------------------------------------------------
/src/services/api-service/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./api-service";
2 |
--------------------------------------------------------------------------------
/src/services/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./api-service";
2 |
--------------------------------------------------------------------------------
/src/store/hooks.ts:
--------------------------------------------------------------------------------
1 | import { TypedUseSelectorHook, useDispatch, useSelector } from "react-redux";
2 | import type { RootState, AppDispatch } from "./store";
3 | import {
4 | SettingsState,
5 | setAiEnabled,
6 | setRecordingsEnabled,
7 | } from "./slices/settingsSlice";
8 |
9 | export const useAppDispatch = () => useDispatch<AppDispatch>();
10 | export const useAppSelector: TypedUseSelectorHook<RootState> = useSelector;
11 |
12 | export const useUserSettings = () => {
13 | const userSettings = useAppSelector((state) => state.settings);
14 | const dispatch = useAppDispatch();
15 |
16 | const setUserSettings = (userSettings: any) => {
17 | dispatch(setAiEnabled(userSettings.ai_enabled));
18 | dispatch(setRecordingsEnabled(userSettings.recordings_enabled));
19 | return userSettings;
20 | };
21 |
22 | return { userSettings, setUserSettings };
23 | };
24 |
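Note: `useUserSettings` pairs the typed selector with the two settings actions, so components can read and write the flags without touching the store directly. A minimal sketch of gating UI on a flag; the component name is illustrative:

```tsx
import React from "react";
import { useUserSettings } from "@/store/hooks";

// Hypothetical component that hides itself while AI is disabled.
export const AiMenuEntry: React.FC = () => {
  const { userSettings } = useUserSettings();

  if (!userSettings.aiEnabled) return null;
  return <span>Chat</span>;
};
```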
--------------------------------------------------------------------------------
/src/store/slices/settingsSlice.ts:
--------------------------------------------------------------------------------
1 | import { createSlice, PayloadAction } from "@reduxjs/toolkit";
2 |
3 | export interface SettingsState {
4 | aiEnabled: boolean;
5 | recordingsEnabled: boolean;
6 | }
7 |
8 | const initialState: SettingsState = {
9 | aiEnabled: false,
10 | recordingsEnabled: false,
11 | };
12 |
13 | const settingsSlice = createSlice({
14 | name: "settings",
15 | initialState,
16 | reducers: {
17 | setAiEnabled: (state, action: PayloadAction<boolean>) => {
18 | state.aiEnabled = action.payload;
19 | },
20 | setRecordingsEnabled: (state, action: PayloadAction<boolean>) => {
21 | state.recordingsEnabled = action.payload;
22 | },
23 | },
24 | });
25 |
26 | export const { setAiEnabled, setRecordingsEnabled } = settingsSlice.actions;
27 |
28 | export default settingsSlice.reducer;
29 |
--------------------------------------------------------------------------------
/src/store/store.ts:
--------------------------------------------------------------------------------
1 | import { configureStore } from "@reduxjs/toolkit";
2 | import { persistStore, persistReducer } from "redux-persist";
3 | import storage from "redux-persist/lib/storage";
4 | import { combineReducers } from "@reduxjs/toolkit";
5 | import settingsReducer from "./slices/settingsSlice";
6 |
7 | const persistConfig = {
8 | key: "root",
9 | storage,
10 | whitelist: ["settings"], // only settings will be persisted
11 | };
12 |
13 | const rootReducer = combineReducers({
14 | settings: settingsReducer,
15 | });
16 |
17 | const persistedReducer = persistReducer(persistConfig, rootReducer);
18 |
19 | export const store = configureStore({
20 | reducer: persistedReducer,
21 | middleware: (getDefaultMiddleware) =>
22 | getDefaultMiddleware({
23 | serializableCheck: {
24 | ignoredActions: ["persist/PERSIST", "persist/REHYDRATE"],
25 | },
26 | }),
27 | });
28 |
29 | export const persistor = persistStore(store);
30 |
31 | export type RootState = ReturnType<typeof store.getState>;
32 | export type AppDispatch = typeof store.dispatch;
33 |
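Note: the persisted store is typically mounted once at the app root with `Provider` and `PersistGate`. The `_app.tsx` JSX above was stripped in this dump, so treat this as the usual wiring rather than the project's exact tree:

```tsx
import React from "react";
import { Provider } from "react-redux";
import { PersistGate } from "redux-persist/integration/react";
import { store, persistor } from "@/store/store";

// Sketch: delay rendering until the persisted settings slice rehydrates.
export const withStore = (children: React.ReactNode) => (
  <Provider store={store}>
    <PersistGate loading={null} persistor={persistor}>
      {children}
    </PersistGate>
  </Provider>
);
```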
--------------------------------------------------------------------------------
/src/styles/globals.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 |
5 | :root {
6 | --delete-red: #ef4444;
7 | --delete-red-hover: #ff6969;
8 | --primary-blue: #007bff;
9 | --primary-gray: #f8f9fa;
10 | }
11 |
12 | #__next {
13 | height: 100%;
14 | }
15 |
16 | html,
17 | body {
18 | height: 100vh;
19 | font-family: "Schibsted Grotesk", sans-serif;
20 | }
21 |
22 | .MuiPopover-paper {
23 | padding: 0 !important;
24 | }
25 |
26 | ::-webkit-scrollbar {
27 | width: 8px;
28 | height: 8px;
29 | }
30 |
31 | ::-webkit-scrollbar-track {
32 | background-color: rgba(0, 0, 0, 0.05);
33 | border-radius: 8px;
34 | }
35 |
36 | ::-webkit-scrollbar-thumb {
37 | background-color: rgba(0, 0, 0, 0.2);
38 | border-radius: 8px;
39 | }
40 |
41 | ::-webkit-scrollbar-thumb:hover {
42 | background-color: rgba(0, 0, 0, 0.3);
43 | }
44 |
--------------------------------------------------------------------------------
/src/types/element.ts:
--------------------------------------------------------------------------------
1 | export type Element = {
2 | name: string;
3 | xpath: string;
4 | url: string;
5 | };
6 |
--------------------------------------------------------------------------------
/src/types/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./element";
2 | export * from "./result";
3 | export * from "./job";
4 | export * from "./message";
5 |
--------------------------------------------------------------------------------
/src/types/job.ts:
--------------------------------------------------------------------------------
1 | import { Message } from "./message";
2 |
3 | export interface Job {
4 | id: string;
5 | url: string;
6 | elements: Object[];
7 | result: Object;
8 | time_created: Date;
9 | status: string;
10 | job_options: RawJobOptions;
11 | favorite: boolean;
12 | chat?: Message[];
13 | agent_mode?: boolean;
14 | prompt?: string;
15 | }
16 |
17 | export type JobOptions = {
18 | multi_page_scrape: boolean;
19 | custom_headers: null | string;
20 | proxies: string[];
21 | site_map?: SiteMap;
22 | };
23 |
24 | export type RawJobOptions = {
25 | multi_page_scrape: boolean;
26 | custom_headers: string | null;
27 | proxies: string | null;
28 | collect_media: boolean;
29 | custom_cookies: string | null;
30 | };
31 |
32 | export type ActionOption = "click" | "input";
33 |
34 | export type Action = {
35 | type: ActionOption;
36 | xpath: string;
37 | name: string;
38 | do_once?: boolean;
39 | input?: string;
40 | };
41 |
42 | export type SiteMap = {
43 | actions: Action[];
44 | };
45 |
46 | export type CronJob = {
47 | id: string;
48 | user_email: string;
49 | job_id: string;
50 | cron_expression: string;
51 | time_created: Date;
52 | time_updated: Date;
53 | };
54 |
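Note: a `SiteMap` is just an ordered list of page actions. An illustrative value showing both `ActionOption` variants:

```ts
import { SiteMap } from "@/types/job";

// Hypothetical site map: dismiss a banner once, then fill a search box.
export const exampleSiteMap: SiteMap = {
  actions: [
    { type: "click", xpath: "//button[@id='accept']", name: "accept", do_once: true },
    { type: "input", xpath: "//input[@name='q']", name: "search", input: "laptops" },
  ],
};
```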
--------------------------------------------------------------------------------
/src/types/message.ts:
--------------------------------------------------------------------------------
1 | export interface Message {
2 | role: string;
3 | content: string;
4 | }
5 |
--------------------------------------------------------------------------------
/src/types/result.ts:
--------------------------------------------------------------------------------
1 | interface ScrapeResult {
2 | xpath: string;
3 | text: string;
4 | name: string;
5 | }
6 |
7 | export type Result = {
8 | [key: string]: ScrapeResult[];
9 | };
10 |
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | RECORDINGS_ENABLED=${RECORDINGS_ENABLED:-true}
4 |
5 | if [ "$RECORDINGS_ENABLED" == "false" ]; then
6 | pdm run python -m api.backend.worker.job_worker
7 | else
8 | Xvfb :99 -screen 0 1280x1024x24 &
9 | XVFB_PID=$!
10 | sleep 2
11 | x11vnc -display :99 -rfbport 5900 -forever -nopw &
12 | VNC_PID=$!
13 | DISPLAY=:99 pdm run python -m api.backend.worker.job_worker
14 | fi
15 |
--------------------------------------------------------------------------------
/supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | nodaemon=true
3 |
4 | [program:api]
5 | command=pdm run python -m uvicorn api.backend.app:app --reload --host 0.0.0.0 --port 8000
6 | directory=/project/app
7 | autostart=true
8 | autorestart=true
9 | stdout_logfile=/dev/stdout
10 | stderr_logfile=/dev/stderr
11 | stdout_logfile_maxbytes=0
12 | stderr_logfile_maxbytes=0
13 |
14 | [program:worker]
15 | command=/project/app/start.sh
16 | directory=/project/app
17 | autostart=true
18 | autorestart=true
19 | stdout_logfile=/dev/stdout
20 | stderr_logfile=/dev/stderr
21 | stdout_logfile_maxbytes=0
22 | stderr_logfile_maxbytes=0
23 |
--------------------------------------------------------------------------------
/tailwind.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('tailwindcss').Config} */
2 | module.exports = {
3 | content: ["./src/**/*.{js,jsx,ts,tsx}"],
4 | theme: {
5 | extend: {
6 | animation: {
7 | fadeIn: "fadeIn 0.5s ease-in-out",
8 | fadeOut: "fadeOut 0.5s ease-in-out",
9 | },
10 | keyframes: {
11 | fadeIn: {
12 | "0%": { opacity: 0 },
13 | "100%": { opacity: 1 },
14 | },
15 | fadeOut: {
16 | "0%": { opacity: 1 },
17 | "100%": { opacity: 0 },
18 | },
19 | },
20 | },
21 | },
22 | plugins: [],
23 | };
24 |
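Note: the `animation`/`keyframes` extensions register `animate-fadeIn` and `animate-fadeOut` utility classes for anything under `src/`. A usage sketch; the component is illustrative:

```tsx
import React from "react";

// Hypothetical toast that fades in via the custom Tailwind animation.
export const Toast: React.FC<{ message: string }> = ({ message }) => (
  <div className="animate-fadeIn rounded bg-gray-800 p-2 text-white">
    {message}
  </div>
);
```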
--------------------------------------------------------------------------------