├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md ├── actions │ ├── push-to-helm │ │ └── action.yaml │ └── run-cypress-tests │ │ └── action.yaml └── workflows │ ├── docker-image.yml │ └── unit-tests.yml ├── .gitignore ├── .prettierignore ├── .python-version ├── FUNDING.yml ├── LICENSE ├── Makefile ├── README.md ├── api └── backend │ ├── __init__.py │ ├── ai │ ├── agent │ │ ├── actions.py │ │ ├── agent.py │ │ ├── prompts.py │ │ └── utils.py │ ├── ai_router.py │ └── clients.py │ ├── app.py │ ├── auth │ ├── __init__.py │ ├── auth_router.py │ └── auth_utils.py │ ├── constants.py │ ├── database │ ├── __init__.py │ ├── common.py │ ├── queries │ │ ├── __init__.py │ │ └── queries.py │ ├── schema │ │ ├── __init__.py │ │ └── schema.py │ └── startup.py │ ├── job │ ├── __init__.py │ ├── cron_scheduling │ │ └── cron_scheduling.py │ ├── job.py │ ├── models │ │ ├── __init__.py │ │ ├── job_options.py │ │ └── site_map.py │ ├── scraping │ │ ├── add_custom.py │ │ ├── collect_media.py │ │ └── scraping_utils.py │ ├── site_mapping │ │ ├── __init__.py │ │ └── site_mapping.py │ └── utils │ │ ├── clean_job_format.py │ │ └── stream_md_from_job_results.py │ ├── models.py │ ├── routers │ ├── job_router.py │ └── stats_router.py │ ├── scheduler.py │ ├── schemas.py │ ├── scraping.py │ ├── tests │ ├── factories │ │ └── job_factory.py │ ├── job │ │ ├── __init__.py │ │ └── test_download_job.py │ └── scraping │ │ ├── __init__.py │ │ └── test_scraping.py │ ├── utils.py │ └── worker │ ├── job_worker.py │ ├── logger.py │ └── post_job_complete │ ├── discord_notification.py │ ├── email_notifcation.py │ ├── models.py │ └── post_job_complete.py ├── cypress.config.ts ├── cypress ├── e2e │ ├── authentication.cy.ts │ ├── navigation.cy.ts │ └── submit-job.cy.ts ├── fixtures │ └── example.json └── support │ ├── commands.ts │ └── e2e.ts ├── docker-compose.dev.yml ├── docker-compose.yml ├── docker ├── api │ └── Dockerfile └── frontend │ └── Dockerfile ├── docs ├── chat_page.png ├── docs_page.png ├── job_page.png ├── log_page.png ├── login.png ├── logo_picture.png ├── main_page.png └── stats_page.png ├── helm ├── .helmignore ├── Chart.yaml ├── templates │ ├── deployment.yaml │ └── service.yaml └── values.yaml ├── next-env.d.ts ├── next.config.mjs ├── package.json ├── pdm.lock ├── postcss.config.js ├── public ├── favicon.ico ├── images │ └── scraperr_logo.png ├── manifest.json └── robots.txt ├── pyproject.toml ├── src ├── components │ ├── ai │ │ ├── Chat.tsx │ │ ├── JobSelector.tsx │ │ └── index.ts │ ├── common │ │ ├── advanced-job-options │ │ │ ├── advanced-job-options.tsx │ │ │ ├── dialog │ │ │ │ ├── advanced-job-options-dialog.tsx │ │ │ │ └── index.ts │ │ │ └── index.ts │ │ ├── csv-table │ │ │ ├── csv-table.tsx │ │ │ └── index.ts │ │ ├── disabled │ │ │ ├── disabled.tsx │ │ │ └── index.ts │ │ ├── expanded-table-input │ │ │ ├── expanded-table-input.tsx │ │ │ └── index.ts │ │ ├── index.ts │ │ ├── job-download-dialog │ │ │ ├── index.ts │ │ │ └── job-download-dialog.tsx │ │ ├── media-viewer │ │ │ ├── audio │ │ │ │ ├── audio-viewer.tsx │ │ │ │ └── index.ts │ │ │ ├── image │ │ │ │ ├── image-viewer.tsx │ │ │ │ └── index.ts │ │ │ ├── index.ts │ │ │ ├── media-viewer.tsx │ │ │ ├── pdf-viewer │ │ │ │ ├── index.ts │ │ │ │ └── pdf-viewer.tsx │ │ │ ├── tile-grid-view │ │ │ │ ├── index.ts │ │ │ │ └── tile-grid-view.tsx │ │ │ └── video │ │ │ │ ├── index.ts │ │ │ │ └── video-viewer.tsx │ │ └── nav-drawer │ │ │ ├── index.ts │ │ │ ├── nav-drawer.module.css │ │ │ ├── nav-drawer.tsx │ │ │ ├── nav-item │ │ │ ├── index.ts │ │ │ └── 
nav-item.tsx │ │ │ ├── nav-items │ │ │ ├── index.ts │ │ │ └── nav-items.tsx │ │ │ └── user-control │ │ │ ├── index.ts │ │ │ ├── logged-in-control │ │ │ ├── index.ts │ │ │ ├── logged-in-control.module.css │ │ │ └── logged-in-control.tsx │ │ │ ├── logged-out-control │ │ │ ├── index.ts │ │ │ ├── logged-out-control.module.css │ │ │ └── logged-out-control.tsx │ │ │ ├── user-control.module.css │ │ │ └── user-control.tsx │ ├── jobs │ │ ├── Favorites.tsx │ │ ├── JobQueue.tsx │ │ ├── JobTable.tsx │ │ └── index.tsx │ ├── nav │ │ └── quick-settings │ │ │ ├── index.ts │ │ │ ├── quick-settings.module.css │ │ │ └── quick-settings.tsx │ ├── pages │ │ ├── agent │ │ │ ├── agent.tsx │ │ │ └── index.ts │ │ ├── chat │ │ │ └── chat.tsx │ │ ├── cron-jobs │ │ │ ├── create-cron-jobs │ │ │ │ ├── create-cron-jobs.tsx │ │ │ │ └── index.ts │ │ │ ├── cron-jobs.module.css │ │ │ ├── cron-jobs.tsx │ │ │ ├── get-server-side-props.ts │ │ │ └── index.ts │ │ ├── home │ │ │ ├── home.tsx │ │ │ └── index.ts │ │ ├── job │ │ │ └── csv │ │ │ │ └── id │ │ │ │ ├── get-server-side-props.ts │ │ │ │ ├── id.tsx │ │ │ │ └── index.ts │ │ ├── media │ │ │ └── id │ │ │ │ ├── id.tsx │ │ │ │ └── index.ts │ │ └── recordings │ │ │ └── id │ │ │ ├── id.tsx │ │ │ └── index.ts │ └── submit │ │ ├── index.ts │ │ └── job-submitter │ │ ├── element-table │ │ ├── element-table.tsx │ │ └── index.ts │ │ ├── index.ts │ │ ├── job-submitter-header │ │ ├── index.ts │ │ ├── job-submitter-header.module.css │ │ └── job-submitter-header.tsx │ │ ├── job-submitter-input │ │ ├── index.ts │ │ ├── job-submitter-input.module.css │ │ └── job-submitter-input.tsx │ │ ├── job-submitter-options │ │ ├── index.ts │ │ └── job-submitter-options.tsx │ │ ├── job-submitter.tsx │ │ ├── provider.tsx │ │ └── site-map │ │ ├── index.ts │ │ ├── site-map-input │ │ ├── index.ts │ │ ├── site-map-input.module.css │ │ └── site-map-input.tsx │ │ └── site-map.tsx ├── contexts │ └── AuthContext.tsx ├── declaration.d.ts ├── lib │ ├── constants.ts │ ├── helpers │ │ ├── index.ts │ │ ├── parse-job-options.ts │ │ ├── parse-json-to-entries.ts │ │ └── validate-url.ts │ ├── hooks │ │ └── use-advanced-job-options │ │ │ ├── index.ts │ │ │ └── use-advanced-job-options.ts │ ├── index.ts │ └── utils.ts ├── pages │ ├── _app.tsx │ ├── _document.tsx │ ├── agent.tsx │ ├── api │ │ ├── ai │ │ │ └── index.ts │ │ ├── check.ts │ │ ├── delete-cron-job.ts │ │ ├── delete.ts │ │ ├── download.ts │ │ ├── get-average-element-per-link.ts │ │ ├── get-average-jobs-per-day.ts │ │ ├── job │ │ │ └── [id].ts │ │ ├── logs.ts │ │ ├── me.ts │ │ ├── media │ │ │ ├── get-media.ts │ │ │ └── index.ts │ │ ├── recordings │ │ │ └── [id].ts │ │ ├── retrieve.ts │ │ ├── schedule-cron-job.ts │ │ ├── signup.ts │ │ ├── submit-scrape-job.ts │ │ ├── token.ts │ │ └── update.ts │ ├── chat.tsx │ ├── cron-jobs.tsx │ ├── index.tsx │ ├── job │ │ └── csv │ │ │ └── [id].tsx │ ├── jobs.tsx │ ├── login.tsx │ ├── media │ │ └── index.tsx │ ├── recordings │ │ └── index.tsx │ └── statistics.tsx ├── services │ ├── api-service │ │ ├── api-service.ts │ │ ├── functions │ │ │ ├── index.ts │ │ │ └── submit-job.ts │ │ └── index.ts │ └── index.ts ├── store │ ├── hooks.ts │ ├── slices │ │ └── settingsSlice.ts │ └── store.ts ├── styles │ ├── globals.css │ └── themes.ts └── types │ ├── element.ts │ ├── index.ts │ ├── job.ts │ ├── message.ts │ └── result.ts ├── start.sh ├── supervisord.conf ├── tailwind.config.js ├── tsconfig.json └── yarn.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | npm-debug.log 3 
| Dockerfile 4 | .dockerignore -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: 'Bug reporting ' 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/actions/push-to-helm/action.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Helm Chart 2 | description: Publish a Helm chart to a target repository 3 | 4 | inputs: 5 | app-repo-token: 6 | required: true 7 | description: "The token for the target repository" 8 | 9 | runs: 10 | using: 'composite' 11 | steps: 12 | - name: Checkout app repo 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Helm 16 | uses: azure/setup-helm@v3 17 | 18 | - name: Package Helm chart 19 | run: | 20 | mkdir -p packaged 21 | helm package helm -d packaged 22 | shell: bash 23 | 24 | - name: Clone target Helm repo 25 | run: | 26 | git clone https://github.com/jaypyles/helm.git target-repo 27 | cd target-repo 28 | git config user.name "github-actions" 29 | git config user.email "github-actions@github.com" 30 | git fetch origin gh-pages # Fetch gh-pages explicitly 31 | git checkout gh-pages # Checkout gh-pages branch 32 | git pull origin gh-pages # Pull latest changes from gh-pages 33 | shell: bash 34 | 35 | - name: Copy package and update index 36 | run: | 37 | APP_NAME="scraperr" 38 | mkdir -p target-repo/charts/$APP_NAME 39 | cp packaged/*.tgz target-repo/charts/$APP_NAME/ 40 | cd target-repo/charts/$APP_NAME 41 | helm repo index . 
--url https://jaypyles.github.io/helm/charts/$APP_NAME 42 | shell: bash 43 | 44 | - name: Commit and push to target repo 45 | run: | 46 | cd target-repo 47 | git add charts/ 48 | git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes" 49 | git push https://x-access-token:${{ inputs.app-repo-token }}@github.com/jaypyles/helm.git gh-pages 50 | shell: bash -------------------------------------------------------------------------------- /.github/actions/run-cypress-tests/action.yaml: -------------------------------------------------------------------------------- 1 | name: Run Cypress Tests 2 | 3 | description: Run Cypress tests 4 | 5 | runs: 6 | using: "composite" 7 | steps: 8 | - name: Checkout code 9 | uses: actions/checkout@v4 10 | 11 | - name: Setup Node 12 | uses: actions/setup-node@v4 13 | with: 14 | node-version: 22 15 | 16 | - name: Setup Docker project 17 | shell: bash 18 | run: make build-ci up-ci 19 | 20 | - name: Install dependencies 21 | shell: bash 22 | run: yarn install 23 | 24 | - name: Wait for frontend to be ready 25 | shell: bash 26 | run: | 27 | for i in {1..10}; do 28 | curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0 29 | echo "Waiting for frontend to be ready... attempt $i" 30 | sleep 1 31 | done 32 | echo "Frontend failed to be ready after 10 retries" 33 | exit 1 34 | 35 | - name: Wait for backend to be ready 36 | shell: bash 37 | run: | 38 | for i in {1..10}; do 39 | curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0 40 | echo "Waiting for backend to be ready... attempt $i" 41 | sleep 1 42 | done 43 | echo "Backend failed to be ready after 10 retries" 44 | exit 1 45 | 46 | - name: Show backend logs on failure 47 | if: failure() 48 | shell: bash 49 | run: | 50 | echo "== Docker Containers ==" 51 | docker ps -a 52 | echo "== Backend Logs ==" 53 | docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs" 54 | 55 | - name: Run Cypress tests 56 | shell: bash 57 | run: npm run cy:run 58 | 59 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image 2 | on: 3 | workflow_dispatch: 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v4 11 | 12 | - name: Get version from helm chart 13 | run: | 14 | VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ') 15 | echo "VERSION=$VERSION" >> $GITHUB_ENV 16 | echo "Version is $VERSION" 17 | 18 | - name: Login to Docker Hub 19 | uses: docker/login-action@v3 20 | with: 21 | username: ${{ secrets.DOCKERHUB_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_TOKEN }} 23 | 24 | - name: Set up Docker Buildx 25 | uses: docker/setup-buildx-action@v3 26 | 27 | - name: Build and push frontend 28 | uses: docker/build-push-action@v5 29 | with: 30 | context: . 31 | file: ./docker/frontend/Dockerfile 32 | push: true 33 | tags: | 34 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest 35 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }} 36 | 37 | - name: Build and push api 38 | uses: docker/build-push-action@v5 39 | with: 40 | context: . 
41 | file: ./docker/api/Dockerfile 42 | push: true 43 | tags: | 44 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest 45 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }} 46 | 47 | push-helm-chart: 48 | runs-on: ubuntu-latest 49 | needs: 50 | - build 51 | steps: 52 | - uses: actions/checkout@v4 53 | 54 | - name: Push Helm Chart 55 | uses: ./.github/actions/push-to-helm 56 | with: 57 | app-repo-token: ${{ secrets.GPAT_TOKEN }} 58 | 59 | success-message: 60 | runs-on: ubuntu-latest 61 | needs: 62 | - build 63 | - push-helm-chart 64 | steps: 65 | - name: Send Discord Message 66 | uses: jaypyles/discord-webhook-action@v1.0.0 67 | with: 68 | webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} 69 | content: "Scraperr Successfully Built Docker Images" 70 | username: "Scraperr CI" 71 | embed-title: "✅ Deployment Status" 72 | embed-description: "Scraperr successfully built docker images." 73 | embed-color: 3066993 # Green 74 | embed-footer-text: "Scraperr CI" 75 | embed-timestamp: ${{ github.event.head_commit.timestamp }} 76 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | pull_request: 9 | types: [opened, synchronize, reopened] 10 | 11 | workflow_dispatch: 12 | 13 | jobs: 14 | unit-tests: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | 20 | - name: Set env 21 | run: echo "ENV=test" >> $GITHUB_ENV 22 | 23 | - name: Install pdm 24 | run: pip install pdm 25 | 26 | - name: Install project dependencies 27 | run: pdm install 28 | 29 | - name: Install playwright 30 | run: pdm run playwright install 31 | 32 | - name: Run tests 33 | run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests 34 | 35 | cypress-tests: 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | - uses: ./.github/actions/run-cypress-tests 40 | 41 | success-message: 42 | runs-on: ubuntu-latest 43 | needs: 44 | - unit-tests 45 | - cypress-tests 46 | steps: 47 | - name: Send Discord Message 48 | uses: jaypyles/discord-webhook-action@v1.0.0 49 | with: 50 | webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} 51 | content: "Scraperr Successfully Passed Tests" 52 | username: "Scraperr CI" 53 | embed-title: "✅ Deployment Status" 54 | embed-description: "Scraperr successfully passed all tests." 
55 | embed-color: 3066993 # Green 56 | embed-footer-text: "Scraperr CI" 57 | embed-timestamp: ${{ github.event.head_commit.timestamp }} 58 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | *.yaml 2 | *.yml 3 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.12 2 | -------------------------------------------------------------------------------- /FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: ["https://www.buymeacoffee.com/jaypyles"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Jayden Pyles 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml 4 | COMPOSE_PROD = docker compose -f docker-compose.yml 5 | 6 | .PHONY: help logs deps build build-force pull up up-dev down setup deploy build-ci up-ci cypress-start 7 | 8 | help: 9 | @echo "Usage:" 10 | @echo " make logs - Check Docker container logs" 11 | @echo " make deps - Build frontend assets" 12 | @echo " make build - Build Docker images" 13 | @echo " make build-force - Build Docker images without using the cache" 14 | @echo " make pull - Pull Docker images" 15 | @echo " make up - Start production environment" 16 | @echo " make up-dev - Start development environment" 17 | @echo " make down - Stop and remove containers, networks, images, and volumes" 18 | @echo " make setup - Setup server with dependencies and clone repo" 19 | @echo " make deploy - Deploy site onto server" 20 | @echo " make cypress-start - Start Cypress" 21 | @echo "" 22 | 23 | logs: 24 | docker compose logs -f 25 | 26 | deps: 27 | pdm install 28 | npm install 29 | npm run build 30 | 31 | build: 32 | $(COMPOSE_DEV) build 33 | 34 | build-force: 35 | $(COMPOSE_DEV) build --no-cache 36 | 37 | pull: 38 | docker compose pull 39 | 40 | up: 41 | $(COMPOSE_PROD) up -d --force-recreate 42 | 43 | up-dev: 44 | $(COMPOSE_DEV) up -d --force-recreate 45 | 46 | down: 47 | $(COMPOSE_DEV) down 48 | $(COMPOSE_PROD) down 49 | 50 | setup: 51 | ansible-playbook -i ./ansible/inventory.yaml ./ansible/setup.yaml 52 | 53 | deploy: 54 | ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v 55 | 56 | build-ci: 57 | docker compose -f docker-compose.yml -f docker-compose.dev.yml build 58 | 59 | up-ci: 60 | docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate 61 | 62 | cypress-start: 63 | DISPLAY=:0 npx cypress open -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | Scraperr Logo 3 | 4 | **A powerful self-hosted web scraping solution** 5 | 6 |
7 | MongoDB 8 | FastAPI 9 | Next JS 10 | TailwindCSS 11 |
12 |
13 | 14 | ## 📋 Overview 15 | 16 | Scrape websites without writing a single line of code. 17 | 18 | > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information. 19 | 20 |
21 | Scraperr Main Interface 22 |
23 | 24 | ## ✨ Key Features 25 | 26 | - **XPath-Based Extraction**: Precisely target page elements 27 | - **Queue Management**: Submit and manage multiple scraping jobs 28 | - **Domain Spidering**: Option to scrape all pages within the same domain 29 | - **Custom Headers**: Add JSON headers to your scraping requests 30 | - **Media Downloads**: Automatically download images, videos, and other media 31 | - **Results Visualization**: View scraped data in a structured table format 32 | - **Data Export**: Export your results in Markdown and CSV formats 33 | - **Notification Channels**: Send completion notifications through various channels 34 | 35 | ## 🚀 Getting Started 36 | 37 | ### Docker 38 | 39 | ```bash 40 | make up 41 | ``` 42 | 43 | ### Helm 44 | 45 | > Refer to the docs for Helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment 46 | 47 | ## ⚖️ Legal and Ethical Guidelines 48 | 49 | When using Scraperr, please remember to: 50 | 51 | 1. **Respect `robots.txt`**: Always check a website's `robots.txt` file to verify which pages permit scraping 52 | 2. **Terms of Service**: Adhere to each website's Terms of Service regarding data extraction 53 | 3. **Rate Limiting**: Implement reasonable delays between requests to avoid overloading servers 54 | 55 | > **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool. 56 | 57 | ## 💬 Join the Community 58 | 59 | Get support, report bugs, and chat with other users and contributors. 60 | 61 | 👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK) 62 | 63 | ## 📄 License 64 | 65 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 66 | 67 | ## 👏 Contributions 68 | 69 | Development made easier with the [webapp template](https://github.com/jaypyles/webapp-template). 70 | 71 | To get started, simply run `make build up-dev`.
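For reference, here is a sketch of what these Make targets expand to, based on the `COMPOSE_DEV` and `COMPOSE_PROD` variables defined in this repository's Makefile (the compose file names below are taken from the Makefile, not independently verified against your checkout):

```bash
# Production stack (make up): base compose file only
docker compose -f docker-compose.yml up -d --force-recreate

# Development stack (make build up-dev): base file plus local dev overrides
docker compose -f docker-compose.yml -f docker-compose.dev.local.yml build
docker compose -f docker-compose.yml -f docker-compose.dev.local.yml up -d --force-recreate
```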
-------------------------------------------------------------------------------- /api/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/__init__.py -------------------------------------------------------------------------------- /api/backend/ai/agent/actions.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import TypedDict 2 | 3 | 4 | class Action(TypedDict): 5 | type: str 6 | url: str 7 | -------------------------------------------------------------------------------- /api/backend/ai/agent/agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Any 3 | 4 | from camoufox import AsyncCamoufox 5 | from playwright.async_api import Page 6 | 7 | from api.backend.ai.agent.utils import ( 8 | capture_elements, 9 | convert_to_markdown, 10 | parse_response, 11 | ) 12 | 13 | from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key 14 | 15 | from api.backend.ai.agent.prompts import ( 16 | ELEMENT_EXTRACTION_PROMPT, 17 | EXTRACT_ELEMENTS_PROMPT, 18 | ) 19 | 20 | from api.backend.job.scraping.collect_media import collect_media 21 | from api.backend.worker.logger import LOG 22 | 23 | from api.backend.job.scraping.add_custom import add_custom_items 24 | 25 | from api.backend.models import CapturedElement 26 | 27 | 28 | ask_ai = ask_open_ai if open_ai_key else ask_ollama 29 | 30 | 31 | async def scrape_with_agent(agent_job: dict[str, Any]): 32 | LOG.info(f"Starting work for agent job: {agent_job}") 33 | pages = set() 34 | proxy = None 35 | if agent_job["job_options"]["proxies"]: 36 | proxy = random.choice(agent_job["job_options"]["proxies"]) 37 | LOG.info(f"Using proxy: {proxy}") 38 | # Pass the chosen proxy to the browser launch (assumes Camoufox's Playwright-style proxy dict) 39 | async with AsyncCamoufox(headless=True, proxy={"server": proxy} if proxy else None) as browser: 40 | page: Page = await browser.new_page() 41 | 42 | await add_custom_items( 43 | agent_job["url"], 44 | page, 45 | agent_job["job_options"]["custom_cookies"], 46 | agent_job["job_options"]["custom_headers"], 47 | ) 48 | 49 | try: 50 | await page.set_viewport_size({"width": 1920, "height": 1080}) 51 | await page.goto(agent_job["url"], timeout=60000) 52 | 53 | if agent_job["job_options"]["collect_media"]: 54 | await collect_media(agent_job["id"], page) 55 | 56 | html_content = await page.content() 57 | markdown_content = convert_to_markdown(html_content) 58 | 59 | response = await ask_ai( 60 | ELEMENT_EXTRACTION_PROMPT.format( 61 | extraction_prompt=EXTRACT_ELEMENTS_PROMPT, 62 | webpage=markdown_content, 63 | prompt=agent_job["prompt"], 64 | ) 65 | ) 66 | 67 | xpaths = parse_response(response) 68 | 69 | captured_elements = await capture_elements(page, xpaths) 70 | 71 | final_url = page.url 72 | 73 | pages.add((html_content, final_url)) 74 | finally: 75 | await page.close() 76 | await browser.close() 77 | 78 | name_to_elements = {} 79 | 80 | # Group captured elements by name (they all come from the final page state) 81 | for element in captured_elements: 82 | if element.name not in name_to_elements: 83 | name_to_elements[element.name] = [] 84 | 85 | name_to_elements[element.name].append(element) 86 | 87 | scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [ 88 | { 89 | url: name_to_elements, 90 | } 91 | for _, url in pages 92 | ] 93 | 94 | return scraped_elements 95 | -------------------------------------------------------------------------------- /api/backend/ai/agent/prompts.py:
-------------------------------------------------------------------------------- 1 | EXTRACT_ELEMENTS_PROMPT = """ 2 | You are an assistant that extracts XPath expressions from webpages. 3 | 4 | You will receive HTML content in markdown format. 5 | 6 | Each element in the markdown has its XPath shown above it in a path like: 7 | 8 | 9 | Respond only with a list of general XPath expressions inside `...` tags. 10 | 11 | You will also decide what to do next. If no decision is available, return nothing for that section. 12 | """ 13 | 14 | ELEMENT_EXTRACTION_PROMPT = """ 15 | {extraction_prompt} 16 | 17 | **Guidelines:** 18 | - Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`. 19 | - Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`. 20 | - Do **not** chain multiple elements deeply (e.g., `//div/span/a`). 21 | - Use XPaths further down the tree when possible. 22 | - Do not include any extra explanation or text. 23 | - One XPath is acceptable if that's all that's needed. 24 | - Try to limit it to 1-3 XPaths. 25 | - Include a name for each XPath. 26 | 27 | 28 | - USE THE SIMPLEST XPATHS POSSIBLE. 29 | - USE THE MOST GENERAL XPATHS POSSIBLE. 30 | - AVOID OVERLY SPECIFIC OR DEEP XPATHS. 31 | - PREFER XPATHS THAT MATCH ALL INSTANCES OF THE TARGET CONTENT. 32 | 33 | 34 | **Example Format:** 35 | ```xml 36 | 37 | - : 38 | - : 39 | - : 40 | - : 41 | - : 42 | - etc 43 | 44 | 45 | 46 | 47 | - //a[@href='next_page_url'] 48 | 49 | 50 | ``` 51 | 52 | **Input webpage:** 53 | {webpage} 54 | 55 | **Target content:** 56 | {prompt} 57 | 58 | """ 59 | -------------------------------------------------------------------------------- /api/backend/ai/ai_router.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import logging 3 | from collections.abc import Iterable, AsyncGenerator 4 | 5 | # PDM 6 | from fastapi import APIRouter 7 | from fastapi.responses import JSONResponse, StreamingResponse 8 | from openai.types.chat import ChatCompletionMessageParam 9 | 10 | # LOCAL 11 | from ollama import Message 12 | from api.backend.models import AI 13 | 14 | from api.backend.ai.clients import ( 15 | llama_client, 16 | llama_model, 17 | openai_client, 18 | open_ai_model, 19 | open_ai_key, 20 | ) 21 | 22 | 23 | LOG = logging.getLogger(__name__) 24 | 25 | ai_router = APIRouter() 26 | 27 | 28 | async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]: 29 | if llama_client and llama_model: 30 | try: 31 | async for part in await llama_client.chat( 32 | model=llama_model, messages=chat_messages, stream=True 33 | ): 34 | yield part["message"]["content"] 35 | except Exception as e: 36 | LOG.error(f"Error during chat: {e}") 37 | yield "An error occurred while processing your request." 38 | 39 | 40 | async def openai_chat( 41 | chat_messages: Iterable[ChatCompletionMessageParam], 42 | ) -> AsyncGenerator[str, None]: 43 | if openai_client and not open_ai_model: 44 | LOG.error("OpenAI model is not set") 45 | yield "An error occurred while processing your request." 46 | 47 | if not openai_client: 48 | LOG.error("OpenAI client is not set") 49 | yield "An error occurred while processing your request."
50 | 51 | if openai_client and open_ai_model: 52 | try: 53 | response = openai_client.chat.completions.create( 54 | model=open_ai_model, messages=chat_messages, stream=True 55 | ) 56 | for part in response: 57 | yield part.choices[0].delta.content or "" 58 | except Exception as e: 59 | LOG.error(f"Error during OpenAI chat: {e}") 60 | yield "An error occurred while processing your request." 61 | 62 | 63 | chat_function = llama_chat if llama_client else openai_chat 64 | 65 | 66 | @ai_router.post("/ai") 67 | async def ai(c: AI): 68 | return StreamingResponse( 69 | chat_function(chat_messages=c.messages), media_type="text/plain" 70 | ) 71 | 72 | 73 | @ai_router.get("/ai/check") 74 | async def check(): 75 | return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)}) 76 | -------------------------------------------------------------------------------- /api/backend/ai/clients.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from openai import OpenAI 4 | from ollama import AsyncClient 5 | 6 | 7 | # Load environment variables 8 | open_ai_key = os.getenv("OPENAI_KEY") 9 | open_ai_model = os.getenv("OPENAI_MODEL") 10 | llama_url = os.getenv("OLLAMA_URL") 11 | llama_model = os.getenv("OLLAMA_MODEL") 12 | 13 | # Initialize clients 14 | openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None 15 | llama_client = AsyncClient(host=llama_url) if llama_url else None 16 | 17 | 18 | async def ask_open_ai(prompt: str) -> str: 19 | if not openai_client: 20 | raise ValueError("OpenAI client not initialized") 21 | 22 | response = openai_client.chat.completions.create( 23 | model=open_ai_model or "gpt-4.1-mini", 24 | messages=[{"role": "user", "content": prompt}], 25 | ) 26 | 27 | return response.choices[0].message.content or "" 28 | 29 | 30 | async def ask_ollama(prompt: str) -> str: 31 | if not llama_client: 32 | raise ValueError("Ollama client not initialized") 33 | 34 | response = await llama_client.chat( 35 | model=llama_model or "", messages=[{"role": "user", "content": prompt}] 36 | ) 37 | 38 | return response.message.content or "" 39 | -------------------------------------------------------------------------------- /api/backend/app.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import os 3 | import logging 4 | import apscheduler # type: ignore 5 | from contextlib import asynccontextmanager 6 | 7 | # PDM 8 | import apscheduler.schedulers 9 | import apscheduler.schedulers.background 10 | from fastapi import FastAPI, Request, status 11 | from fastapi.exceptions import RequestValidationError 12 | from fastapi.middleware.cors import CORSMiddleware 13 | 14 | # LOCAL 15 | from api.backend.ai.ai_router import ai_router 16 | from api.backend.auth.auth_router import auth_router 17 | from api.backend.utils import get_log_level 18 | from api.backend.routers.job_router import job_router 19 | from api.backend.routers.stats_router import stats_router 20 | from api.backend.database.startup import init_database 21 | from fastapi.responses import JSONResponse 22 | 23 | from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler 24 | from api.backend.scheduler import scheduler 25 | 26 | log_level = os.getenv("LOG_LEVEL") 27 | LOG_LEVEL = get_log_level(log_level) 28 | 29 | logging.basicConfig( 30 | level=LOG_LEVEL, 31 | format="%(levelname)s: %(asctime)s - %(name)s - %(message)s", 32 | handlers=[logging.StreamHandler()], 33 | ) 34 | 35 | LOG = 
logging.getLogger(__name__) 36 | 37 | 38 | @asynccontextmanager 39 | async def lifespan(app: FastAPI): 40 | # Startup 41 | LOG.info("Starting application...") 42 | 43 | init_database() 44 | 45 | LOG.info("Starting cron scheduler...") 46 | start_cron_scheduler(scheduler) 47 | scheduler.start() 48 | LOG.info("Cron scheduler started successfully") 49 | 50 | yield 51 | 52 | # Shutdown 53 | LOG.info("Shutting down application...") 54 | LOG.info("Stopping cron scheduler...") 55 | scheduler.shutdown(wait=False) # Set wait=False to not block shutdown 56 | LOG.info("Cron scheduler stopped") 57 | LOG.info("Application shutdown complete") 58 | 59 | 60 | app = FastAPI(title="api", root_path="/api", lifespan=lifespan) 61 | 62 | app.add_middleware( 63 | CORSMiddleware, 64 | allow_origins=["*"], 65 | allow_credentials=True, 66 | allow_methods=["*"], 67 | allow_headers=["*"], 68 | ) 69 | 70 | app.include_router(auth_router) 71 | app.include_router(ai_router) 72 | app.include_router(job_router) 73 | app.include_router(stats_router) 74 | 75 | 76 | @app.exception_handler(RequestValidationError) 77 | async def validation_exception_handler(request: Request, exc: RequestValidationError): 78 | exc_str = f"{exc}".replace("\n", " ").replace(" ", " ") 79 | logging.error(f"{request}: {exc_str}") 80 | content = {"status_code": 10422, "message": exc_str, "data": None} 81 | return JSONResponse( 82 | content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY 83 | ) 84 | -------------------------------------------------------------------------------- /api/backend/auth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/auth/__init__.py -------------------------------------------------------------------------------- /api/backend/auth/auth_router.py: -------------------------------------------------------------------------------- 1 | # STL 2 | from datetime import timedelta 3 | import os 4 | 5 | # PDM 6 | from fastapi import Depends, APIRouter, HTTPException, status 7 | from fastapi.security import OAuth2PasswordRequestForm 8 | 9 | # LOCAL 10 | from api.backend.schemas import User, Token, UserCreate 11 | from api.backend.auth.auth_utils import ( 12 | ACCESS_TOKEN_EXPIRE_MINUTES, 13 | get_current_user, 14 | authenticate_user, 15 | get_password_hash, 16 | create_access_token, 17 | ) 18 | import logging 19 | 20 | from api.backend.database.common import update 21 | 22 | auth_router = APIRouter() 23 | 24 | LOG = logging.getLogger("auth_router") 25 | 26 | 27 | @auth_router.post("/auth/token", response_model=Token) 28 | async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): 29 | user = await authenticate_user(form_data.username, form_data.password) 30 | if not user: 31 | raise HTTPException( 32 | status_code=status.HTTP_401_UNAUTHORIZED, 33 | detail="Incorrect username or password", 34 | headers={"WWW-Authenticate": "Bearer"}, 35 | ) 36 | 37 | expire_minutes = ( 38 | int(ACCESS_TOKEN_EXPIRE_MINUTES) if ACCESS_TOKEN_EXPIRE_MINUTES else 60 39 | ) 40 | 41 | access_token_expires = timedelta(minutes=expire_minutes) 42 | access_token = create_access_token( 43 | data={"sub": user.email}, expires_delta=access_token_expires 44 | ) 45 | 46 | return {"access_token": access_token, "token_type": "bearer"} 47 | 48 | 49 | @auth_router.post("/auth/signup", response_model=User) 50 | async def create_user(user: UserCreate): 51 | hashed_password = 
get_password_hash(user.password) 52 | user_dict = user.model_dump() 53 | user_dict["hashed_password"] = hashed_password 54 | del user_dict["password"] 55 | 56 | query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" 57 | _ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"])) 58 | 59 | return user_dict 60 | 61 | 62 | @auth_router.get("/auth/users/me", response_model=User) 63 | async def read_users_me(current_user: User = Depends(get_current_user)): 64 | return current_user 65 | 66 | 67 | @auth_router.get("/auth/check") 68 | async def check_auth(): 69 | return { 70 | "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True", 71 | "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower() 72 | == "true", 73 | } 74 | -------------------------------------------------------------------------------- /api/backend/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | DATABASE_PATH = "data/database.db" 5 | RECORDINGS_DIR = Path("media/recordings") 6 | RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true" 7 | MEDIA_DIR = Path("media") 8 | MEDIA_TYPES = [ 9 | "audio", 10 | "documents", 11 | "images", 12 | "pdfs", 13 | "presentations", 14 | "spreadsheets", 15 | "videos", 16 | ] 17 | -------------------------------------------------------------------------------- /api/backend/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import insert, QUERIES, update 2 | 3 | __all__ = ["insert", "QUERIES", "update"] 4 | -------------------------------------------------------------------------------- /api/backend/database/common.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from typing import Any, Optional 3 | from api.backend.constants import DATABASE_PATH 4 | from api.backend.utils import format_json, format_sql_row_to_python 5 | from api.backend.database.schema import INIT_QUERY 6 | from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY 7 | import logging 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | def connect(): 13 | connection = sqlite3.connect(DATABASE_PATH) 14 | connection.set_trace_callback(print) 15 | cursor = connection.cursor() 16 | return cursor 17 | 18 | 19 | def insert(query: str, values: tuple[Any, ...]): 20 | connection = sqlite3.connect(DATABASE_PATH) 21 | cursor = connection.cursor() 22 | copy = list(values) 23 | format_json(copy) 24 | 25 | try: 26 | _ = cursor.execute(query, copy) 27 | connection.commit() 28 | except sqlite3.Error as e: 29 | LOG.error(f"An error occurred: {e}") 30 | finally: 31 | cursor.close() 32 | connection.close() 33 | 34 | 35 | def query(query: str, values: Optional[tuple[Any, ...]] = None): 36 | connection = sqlite3.connect(DATABASE_PATH) 37 | connection.row_factory = sqlite3.Row 38 | cursor = connection.cursor() 39 | rows = [] 40 | try: 41 | if values: 42 | _ = cursor.execute(query, values) 43 | else: 44 | _ = cursor.execute(query) 45 | 46 | rows = cursor.fetchall() 47 | 48 | finally: 49 | cursor.close() 50 | connection.close() 51 | 52 | formatted_rows: list[dict[str, Any]] = [] 53 | 54 | for row in rows: 55 | row = dict(row) 56 | formatted_row = format_sql_row_to_python(row) 57 | formatted_rows.append(formatted_row) 58 | 59 | return formatted_rows 60 | 61 | 62 | def update(query: str, values: Optional[tuple[Any, 
...]] = None): 63 | connection = sqlite3.connect(DATABASE_PATH) 64 | cursor = connection.cursor() 65 | 66 | copy = None 67 | 68 | if values: 69 | copy = list(values) 70 | format_json(copy) 71 | 72 | try: 73 | if copy: 74 | res = cursor.execute(query, copy) 75 | else: 76 | res = cursor.execute(query) 77 | connection.commit() 78 | return res.rowcount 79 | except sqlite3.Error as e: 80 | LOG.error(f"An error occurred: {e}") 81 | finally: 82 | cursor.close() 83 | connection.close() 84 | 85 | return 0 86 | 87 | 88 | QUERIES = { 89 | "init": INIT_QUERY, 90 | "insert_job": JOB_INSERT_QUERY, 91 | "delete_job": DELETE_JOB_QUERY, 92 | } 93 | -------------------------------------------------------------------------------- /api/backend/database/queries/__init__.py: -------------------------------------------------------------------------------- 1 | from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY 2 | 3 | __all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"] 4 | -------------------------------------------------------------------------------- /api/backend/database/queries/queries.py: -------------------------------------------------------------------------------- 1 | JOB_INSERT_QUERY = """ 2 | INSERT INTO jobs 3 | (id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt) 4 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 5 | """ 6 | 7 | DELETE_JOB_QUERY = """ 8 | DELETE FROM jobs WHERE id IN () 9 | """ 10 | -------------------------------------------------------------------------------- /api/backend/database/schema/__init__.py: -------------------------------------------------------------------------------- 1 | from .schema import INIT_QUERY 2 | 3 | __all__ = ["INIT_QUERY"] 4 | -------------------------------------------------------------------------------- /api/backend/database/schema/schema.py: -------------------------------------------------------------------------------- 1 | INIT_QUERY = """ 2 | CREATE TABLE IF NOT EXISTS jobs ( 3 | id STRING PRIMARY KEY NOT NULL, 4 | url STRING NOT NULL, 5 | elements JSON NOT NULL, 6 | user STRING, 7 | time_created DATETIME NOT NULL, 8 | result JSON NOT NULL, 9 | status STRING NOT NULL, 10 | chat JSON, 11 | job_options JSON 12 | ); 13 | 14 | CREATE TABLE IF NOT EXISTS users ( 15 | email STRING PRIMARY KEY NOT NULL, 16 | hashed_password STRING NOT NULL, 17 | full_name STRING, 18 | disabled BOOLEAN 19 | ); 20 | 21 | CREATE TABLE IF NOT EXISTS cron_jobs ( 22 | id STRING PRIMARY KEY NOT NULL, 23 | user_email STRING NOT NULL, 24 | job_id STRING NOT NULL, 25 | cron_expression STRING NOT NULL, 26 | time_created DATETIME NOT NULL, 27 | time_updated DATETIME NOT NULL, 28 | FOREIGN KEY (job_id) REFERENCES jobs(id) 29 | ); 30 | 31 | ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE; 32 | ALTER TABLE jobs ADD COLUMN prompt STRING; 33 | """ 34 | -------------------------------------------------------------------------------- /api/backend/database/startup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from api.backend.database.common import connect, QUERIES, insert 3 | import logging 4 | import sqlite3 5 | 6 | from api.backend.auth.auth_utils import get_password_hash 7 | 8 | LOG = logging.getLogger(__name__) 9 | 10 | 11 | def init_database(): 12 | cursor = connect() 13 | 14 | for query in QUERIES["init"].strip().split(";"): 15 | query = query.strip() 16 | if not query: 17 | continue 18 | 19 | try: 20 | LOG.info(f"Executing query: {query}") 21 | _ = 
cursor.execute(query) 22 | except sqlite3.OperationalError as e: 23 | if "duplicate column name" in str(e).lower(): 24 | LOG.warning(f"Skipping duplicate column error: {e}") 25 | continue 26 | else: 27 | LOG.error(f"Error executing query: {query}") 28 | raise 29 | 30 | if os.environ.get("REGISTRATION_ENABLED", "true").lower() == "false": 31 | default_user_email = os.environ.get("DEFAULT_USER_EMAIL") 32 | default_user_password = os.environ.get("DEFAULT_USER_PASSWORD") 33 | default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME") 34 | 35 | if ( 36 | not default_user_email 37 | or not default_user_password 38 | or not default_user_full_name 39 | ): 40 | LOG.error( 41 | "DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!" 42 | ) 43 | exit(1) 44 | 45 | query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" 46 | _ = insert( 47 | query, 48 | ( 49 | default_user_email, 50 | get_password_hash(default_user_password), 51 | default_user_full_name, 52 | ), 53 | ) 54 | 55 | cursor.close() 56 | -------------------------------------------------------------------------------- /api/backend/job/__init__.py: -------------------------------------------------------------------------------- 1 | from .job import ( 2 | insert, 3 | update_job, 4 | delete_jobs, 5 | get_jobs_per_day, 6 | get_queued_job, 7 | average_elements_per_link, 8 | ) 9 | 10 | __all__ = [ 11 | "insert", 12 | "update_job", 13 | "delete_jobs", 14 | "get_jobs_per_day", 15 | "get_queued_job", 16 | "average_elements_per_link", 17 | ] 18 | -------------------------------------------------------------------------------- /api/backend/job/cron_scheduling/cron_scheduling.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Any 3 | import uuid 4 | from api.backend.database.common import insert, query 5 | from api.backend.models import CronJob 6 | from apscheduler.schedulers.background import BackgroundScheduler # type: ignore 7 | from apscheduler.triggers.cron import CronTrigger # type: ignore 8 | 9 | from api.backend.job import insert as insert_job 10 | import logging 11 | 12 | LOG = logging.getLogger("Cron Scheduler") 13 | 14 | 15 | def insert_cron_job(cron_job: CronJob): 16 | query = """ 17 | INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated) 18 | VALUES (?, ?, ?, ?, ?, ?) 19 | """ 20 | values = ( 21 | cron_job.id, 22 | cron_job.user_email, 23 | cron_job.job_id, 24 | cron_job.cron_expression, 25 | cron_job.time_created, 26 | cron_job.time_updated, 27 | ) 28 | 29 | insert(query, values) 30 | 31 | return True 32 | 33 | 34 | def delete_cron_job(id: str, user_email: str): 35 | query = """ 36 | DELETE FROM cron_jobs 37 | WHERE id = ? AND user_email = ? 
38 | """ 39 | values = (id, user_email) 40 | insert(query, values) 41 | 42 | return True 43 | 44 | 45 | def get_cron_jobs(user_email: str): 46 | cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,)) 47 | 48 | return cron_jobs 49 | 50 | 51 | def get_all_cron_jobs(): 52 | cron_jobs = query("SELECT * FROM cron_jobs") 53 | 54 | return cron_jobs 55 | 56 | 57 | def insert_job_from_cron_job(job: dict[str, Any]): 58 | insert_job( 59 | { 60 | **job, 61 | "id": uuid.uuid4().hex, 62 | "status": "Queued", 63 | "result": "", 64 | "chat": None, 65 | "time_created": datetime.datetime.now(), 66 | "time_updated": datetime.datetime.now(), 67 | } 68 | ) 69 | 70 | 71 | def get_cron_job_trigger(cron_expression: str): 72 | expression_parts = cron_expression.split() 73 | 74 | if len(expression_parts) != 5: 75 | LOG.warning(f"Invalid cron expression: {cron_expression}") 76 | return None 77 | 78 | minute, hour, day, month, day_of_week = expression_parts 79 | 80 | return CronTrigger( 81 | minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week 82 | ) 83 | 84 | 85 | def start_cron_scheduler(scheduler: BackgroundScheduler): 86 | cron_jobs = get_all_cron_jobs() 87 | 88 | LOG.info(f"Cron jobs: {cron_jobs}") 89 | 90 | for job in cron_jobs: 91 | # Guard against invalid cron expressions and missing backing jobs 92 | trigger = get_cron_job_trigger(job["cron_expression"]) 93 | queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],)) 94 | 95 | if not trigger or not queried_job: 96 | LOG.warning(f"Skipping cron job {job['id']}: invalid expression or missing job") 97 | continue 98 | 99 | LOG.info(f"Adding job: {queried_job}") 100 | 101 | scheduler.add_job( 102 | insert_job_from_cron_job, 103 | trigger, 104 | id=job["id"], 105 | args=[queried_job[0]], 106 | ) 107 | -------------------------------------------------------------------------------- /api/backend/job/job.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import logging 3 | from typing import Any 4 | 5 | # LOCAL 6 | from api.backend.utils import format_list_for_query 7 | from api.backend.database.common import ( 8 | insert as common_insert, 9 | query as common_query, 10 | QUERIES, 11 | update as common_update, 12 | ) 13 | 14 | LOG = logging.getLogger(__name__) 15 | 16 | 17 | def insert(item: dict[str, Any]) -> None: 18 | common_insert( 19 | QUERIES["insert_job"], 20 | ( 21 | item["id"], 22 | item["url"], 23 | item["elements"], 24 | item["user"], 25 | item["time_created"], 26 | item["result"], 27 | item["status"], 28 | item["chat"], 29 | item["job_options"], 30 | item["agent_mode"], 31 | item["prompt"], 32 | ), 33 | ) 34 | LOG.info(f"Inserted item: {item}") 35 | 36 | 37 | async def get_queued_job(): 38 | query = ( 39 | "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1" 40 | ) 41 | res = common_query(query) 42 | LOG.info(f"Got queued job: {res}") 43 | return res[0] if res else None 44 | 45 | 46 | async def update_job(ids: list[str], field: str, value: Any): 47 | query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}" 48 | res = common_update(query, tuple([value] + ids)) 49 | LOG.info(f"Updated job: {res}") 50 | 51 | 52 | async def delete_jobs(jobs: list[str]): 53 | if not jobs: 54 | LOG.info("No jobs to delete.") 55 | return False 56 | 57 | query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}" 58 | res = common_update(query, tuple(jobs)) 59 | 60 | return res > 0 61 | 62 | 63 | async def average_elements_per_link(user: str): 64 | job_query = """ 65 | SELECT 66 | DATE(time_created) AS date, 67 | AVG(json_array_length(elements)) AS average_elements, 68 | COUNT(*) AS count 69 | FROM 70 | jobs 71 | WHERE 72 | status = 'Completed' AND user = ?
73 | GROUP BY 74 | DATE(time_created) 75 | ORDER BY 76 | date ASC; 77 | """ 78 | results = common_query(job_query, (user,)) 79 | 80 | return results 81 | 82 | 83 | async def get_jobs_per_day(user: str): 84 | job_query = """ 85 | SELECT 86 | DATE(time_created) AS date, 87 | COUNT(*) AS job_count 88 | FROM 89 | jobs 90 | WHERE 91 | status = 'Completed' AND user = ? 92 | GROUP BY 93 | DATE(time_created) 94 | ORDER BY 95 | date ASC; 96 | """ 97 | results = common_query(job_query, (user,)) 98 | 99 | return results 100 | -------------------------------------------------------------------------------- /api/backend/job/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .job_options import JobOptions 2 | 3 | __all__ = ["JobOptions"] 4 | -------------------------------------------------------------------------------- /api/backend/job/models/job_options.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Any, Optional 3 | from api.backend.job.models.site_map import SiteMap 4 | 5 | 6 | class FetchOptions(BaseModel): 7 | chat: Optional[bool] = None 8 | 9 | 10 | class JobOptions(BaseModel): 11 | multi_page_scrape: bool = False 12 | custom_headers: dict[str, Any] = {} 13 | proxies: list[str] = [] 14 | site_map: Optional[SiteMap] = None 15 | collect_media: bool = False 16 | custom_cookies: list[dict[str, Any]] = [] 17 | -------------------------------------------------------------------------------- /api/backend/job/models/site_map.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Literal 3 | 4 | 5 | class Action(BaseModel): 6 | type: Literal["click", "input"] 7 | xpath: str 8 | name: str 9 | input: str = "" 10 | do_once: bool = True 11 | 12 | 13 | class SiteMap(BaseModel): 14 | actions: list[Action] 15 | -------------------------------------------------------------------------------- /api/backend/job/scraping/add_custom.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | from urllib.parse import urlparse 3 | 4 | from playwright.async_api import Page, BrowserContext 5 | 6 | import logging 7 | 8 | LOG = logging.getLogger(__name__) 9 | 10 | 11 | async def add_custom_cookies( 12 | custom_cookies: list[dict[str, Any]], 13 | url: str, 14 | context: BrowserContext, 15 | ) -> None: 16 | parsed_url = urlparse(url) 17 | domain = parsed_url.netloc 18 | 19 | for cookie in custom_cookies: 20 | cookie_dict = { 21 | "name": cookie.get("name", "default_name"), 22 | "value": cookie.get("value", "default_value"), 23 | "domain": domain, 24 | "path": "/", 25 | } 26 | 27 | LOG.info(f"Adding cookie: {cookie_dict}") 28 | await context.add_cookies([cookie_dict]) # type: ignore 29 | 30 | 31 | async def add_custom_headers( 32 | custom_headers: dict[str, Any], 33 | page: Page, 34 | ) -> None: 35 | await page.set_extra_http_headers(custom_headers) 36 | 37 | 38 | async def add_custom_items( 39 | url: str, 40 | page: Page, 41 | cookies: Optional[list[dict[str, Any]]] = None, 42 | headers: Optional[dict[str, Any]] = None, 43 | ) -> None: 44 | if cookies: 45 | await add_custom_cookies(cookies, url, page.context) 46 | 47 | if headers: 48 | await add_custom_headers(headers, page) 49 | -------------------------------------------------------------------------------- /api/backend/job/scraping/scraping_utils.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set, Tuple 3 | from playwright.async_api import Page 4 | 5 | from api.backend.utils import LOG 6 | 7 | from api.backend.job.scraping.collect_media import collect_media as collect_media_utils 8 | 9 | 10 | async def scrape_content( 11 | id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool 12 | ) -> str: 13 | last_height = await page.evaluate("document.body.scrollHeight") 14 | 15 | while True: 16 | await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") 17 | await asyncio.sleep(3) 18 | new_height = await page.evaluate("document.body.scrollHeight") 19 | 20 | if new_height == last_height: 21 | break 22 | 23 | last_height = new_height 24 | 25 | html = await page.content() 26 | pages.add((html, page.url)) 27 | 28 | if collect_media: 29 | LOG.info("Collecting media") 30 | await collect_media_utils(id, page) 31 | 32 | return html 33 | 34 | 35 | def clean_format_characters(text: str) -> str: 36 | text = text.strip() 37 | text = text.replace("\n", " ") 38 | text = text.replace("\t", " ") 39 | text = text.replace("\r", " ") 40 | text = text.replace("\f", " ") 41 | text = text.replace("\v", " ") 42 | text = text.replace("\b", " ") 43 | text = text.replace("\a", " ") 44 | 45 | return text 46 | -------------------------------------------------------------------------------- /api/backend/job/site_mapping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/job/site_mapping/__init__.py -------------------------------------------------------------------------------- /api/backend/job/site_mapping/site_mapping.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | from copy import deepcopy 4 | from typing import Any 5 | 6 | from playwright.async_api import Page 7 | 8 | from api.backend.job.models.site_map import Action, SiteMap 9 | from api.backend.job.scraping.scraping_utils import scrape_content 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | 14 | def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]: 15 | """Clear all actions that have been clicked.""" 16 | cleared_site_map = deepcopy(site_map) 17 | cleared_site_map["actions"] = [ 18 | action for action in cleared_site_map["actions"] if not action["do_once"] 19 | ] 20 | 21 | return cleared_site_map 22 | 23 | 24 | async def handle_input(action: Action, page: Page) -> bool: 25 | try: 26 | element = page.locator(f"xpath={action.xpath}") 27 | LOG.info(f"Sending keys: {action.input} to element: {action.xpath}") 28 | await element.fill(action.input) 29 | return True 30 | except Exception as e: 31 | LOG.warning(f"Error handling input for xpath '{action.xpath}': {e}") 32 | return False 33 | 34 | 35 | async def handle_click(action: Action, page: Page) -> bool: 36 | try: 37 | element = page.locator(f"xpath={action.xpath}") 38 | LOG.info(f"Clicking element: {action.xpath}") 39 | await element.click() 40 | return True 41 | except Exception as e: 42 | LOG.warning(f"Error clicking element at xpath '{action.xpath}': {e}") 43 | return False 44 | 45 | 46 | ACTION_MAP = { 47 | "click": handle_click, 48 | "input": handle_input, 49 | } 50 | 51 | 52 | async def handle_site_mapping( 53 | id: str, 54 | site_map_dict: dict[str, Any], 55 | page: Page, 56 | pages: set[tuple[str, str]], 57 | 
collect_media: bool = False, 58 | ): 59 | site_map = SiteMap(**site_map_dict) 60 | 61 | for action in site_map.actions: 62 | action_handler = ACTION_MAP[action.type] 63 | success = await action_handler(action, page) 64 | 65 | if not success: 66 | return 67 | 68 | await asyncio.sleep(2) 69 | 70 | await scrape_content(id, page, pages, collect_media=collect_media) 71 | 72 | cleared_site_map_dict = clear_done_actions(site_map_dict) 73 | 74 | if cleared_site_map_dict["actions"]: 75 | await handle_site_mapping( 76 | id, cleared_site_map_dict, page, pages, collect_media=collect_media 77 | ) 78 | -------------------------------------------------------------------------------- /api/backend/job/utils/clean_job_format.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from api.backend.utils import clean_text 4 | 5 | 6 | def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]: 7 | """ 8 | Convert a single job to a dictionary format. 9 | """ 10 | headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"] 11 | 12 | cleaned_rows = [] 13 | 14 | for job in jobs: 15 | for res in job["result"]: 16 | for url, elements in res.items(): 17 | for element_name, values in elements.items(): 18 | for value in values: 19 | text = clean_text(value.get("text", "")).strip() 20 | if text: 21 | cleaned_rows.append( 22 | { 23 | "id": job.get("id", ""), 24 | "url": url, 25 | "element_name": element_name, 26 | "xpath": value.get("xpath", ""), 27 | "text": text, 28 | "user": job.get("user", ""), 29 | "time_created": job.get("time_created", ""), 30 | } 31 | ) 32 | 33 | return { 34 | "headers": headers, 35 | "rows": cleaned_rows, 36 | } 37 | -------------------------------------------------------------------------------- /api/backend/job/utils/stream_md_from_job_results.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from api.backend.utils import clean_text 4 | 5 | 6 | def stream_md_from_job_results(jobs: list[dict[str, Any]]): 7 | md = "# Job Results Summary\n\n" 8 | for i, job in enumerate(jobs, start=1): 9 | md += f"## Job #{i}\n" 10 | yield f"- **Job URL:** {job.get('url', 'N/A')}\n" 11 | yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n" 12 | yield f"- **ID:** {job.get('id', 'N/A')}\n" 13 | yield "### Extracted Results:\n" 14 | 15 | for res in job.get("result", []): 16 | for url, elements in res.items(): 17 | yield f"\n#### URL: {url}\n" 18 | for element_name, values in elements.items(): 19 | for value in values: 20 | text = clean_text(value.get("text", "")).strip() 21 | if text: 22 | yield f"- **Element:** `{element_name}`\n" 23 | yield f" - **Text:** {text}\n" 24 | yield "\n---\n" 25 | -------------------------------------------------------------------------------- /api/backend/models.py: -------------------------------------------------------------------------------- 1 | # STL 2 | from typing import Any, Literal, Optional, Union 3 | from datetime import datetime 4 | 5 | # LOCAL 6 | from api.backend.job.models.job_options import JobOptions 7 | 8 | # PDM 9 | import pydantic 10 | 11 | 12 | class Element(pydantic.BaseModel): 13 | name: str 14 | xpath: str 15 | url: Optional[str] = None 16 | 17 | 18 | class CapturedElement(pydantic.BaseModel): 19 | xpath: str 20 | text: str 21 | name: str 22 | 23 | 24 | class RetrieveScrapeJobs(pydantic.BaseModel): 25 | user: str 26 | 27 | 28 | class DownloadJob(pydantic.BaseModel): 29 | ids: list[str] 30 | 
job_format: Literal["csv", "md"] 31 | 32 | 33 | class DeleteScrapeJobs(pydantic.BaseModel): 34 | ids: list[str] 35 | 36 | 37 | class GetStatistics(pydantic.BaseModel): 38 | user: str 39 | 40 | 41 | class UpdateJobs(pydantic.BaseModel): 42 | ids: list[str] 43 | field: str 44 | value: Any 45 | 46 | 47 | class AI(pydantic.BaseModel): 48 | messages: list[Any] 49 | 50 | 51 | class Job(pydantic.BaseModel): 52 | id: Optional[str] = None 53 | url: str 54 | elements: list[Element] 55 | user: str = "" 56 | time_created: Optional[Union[datetime, str]] = None 57 | result: list[dict[str, dict[str, list[CapturedElement]]]] = [] 58 | job_options: JobOptions 59 | status: str = "Queued" 60 | chat: Optional[str] = None 61 | agent_mode: bool = False 62 | prompt: Optional[str] = None 63 | 64 | 65 | class CronJob(pydantic.BaseModel): 66 | id: Optional[str] = None 67 | user_email: str 68 | job_id: str 69 | cron_expression: str 70 | time_created: Optional[Union[datetime, str]] = None 71 | time_updated: Optional[Union[datetime, str]] = None 72 | 73 | 74 | class DeleteCronJob(pydantic.BaseModel): 75 | id: str 76 | user_email: str 77 | -------------------------------------------------------------------------------- /api/backend/routers/stats_router.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import logging 3 | 4 | # PDM 5 | from fastapi import APIRouter, Depends 6 | 7 | # LOCAL 8 | from api.backend.job import ( 9 | get_jobs_per_day, 10 | average_elements_per_link, 11 | ) 12 | from api.backend.auth.auth_utils import get_current_user 13 | from api.backend.schemas import User 14 | 15 | 16 | LOG = logging.getLogger(__name__) 17 | 18 | stats_router = APIRouter() 19 | 20 | 21 | @stats_router.get("/statistics/get-average-element-per-link") 22 | async def get_average_element_per_link(user: User = Depends(get_current_user)): 23 | return await average_elements_per_link(user.email) 24 | 25 | 26 | @stats_router.get("/statistics/get-average-jobs-per-day") 27 | async def average_jobs_per_day(user: User = Depends(get_current_user)): 28 | data = await get_jobs_per_day(user.email) 29 | return data 30 | -------------------------------------------------------------------------------- /api/backend/scheduler.py: -------------------------------------------------------------------------------- 1 | from apscheduler.schedulers.background import BackgroundScheduler # type: ignore 2 | 3 | scheduler = BackgroundScheduler() 4 | -------------------------------------------------------------------------------- /api/backend/schemas.py: -------------------------------------------------------------------------------- 1 | # STL 2 | from typing import Union, Literal, Optional 3 | 4 | # PDM 5 | from pydantic import EmailStr, BaseModel 6 | 7 | 8 | class Token(BaseModel): 9 | access_token: str 10 | token_type: str 11 | 12 | 13 | class TokenData(BaseModel): 14 | email: Optional[str] = None 15 | 16 | 17 | class User(BaseModel): 18 | email: Union[EmailStr, Literal[""]] 19 | full_name: Optional[str] = None 20 | disabled: Optional[bool] = None 21 | 22 | 23 | class UserInDB(User): 24 | hashed_password: str 25 | 26 | 27 | class UserCreate(BaseModel): 28 | email: EmailStr 29 | password: str 30 | full_name: Optional[str] = None 31 | -------------------------------------------------------------------------------- /api/backend/tests/factories/job_factory.py: -------------------------------------------------------------------------------- 1 | from api.backend.models import Element, Job, JobOptions, 
CapturedElement 2 | import uuid 3 | from faker import Faker 4 | 5 | fake = Faker() 6 | 7 | 8 | def create_job( 9 | job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={}) 10 | ): 11 | return Job( 12 | id=uuid.uuid4().hex, 13 | url="https://example.com", 14 | elements=[Element(name="test", xpath="xpath")], 15 | job_options=job_options, 16 | ) 17 | 18 | 19 | def create_completed_job() -> Job: 20 | return Job( 21 | id=uuid.uuid4().hex, 22 | url="http://example.com", 23 | elements=[ 24 | Element( 25 | name="element_name", 26 | xpath="//div", 27 | url="https://example.com", 28 | ) 29 | ], 30 | job_options=JobOptions(multi_page_scrape=False, custom_headers={}), 31 | user=fake.name(), 32 | time_created=fake.date(), 33 | result=[ 34 | { 35 | "https://example.com": { 36 | "element_name": [ 37 | CapturedElement( 38 | xpath="//div", text="example", name="element_name" 39 | ) 40 | ] 41 | } 42 | } 43 | ], 44 | ) 45 | -------------------------------------------------------------------------------- /api/backend/tests/job/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/tests/job/__init__.py -------------------------------------------------------------------------------- /api/backend/tests/job/test_download_job.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | from unittest.mock import AsyncMock, patch 4 | from api.backend.app import app 5 | from api.backend.models import DownloadJob 6 | from api.backend.tests.factories.job_factory import create_completed_job 7 | 8 | client = TestClient(app) 9 | 10 | mocked_job = create_completed_job().model_dump() 11 | mock_results = [mocked_job] 12 | mocked_random_int = 123456 13 | 14 | 15 | @pytest.mark.asyncio 16 | @patch("api.backend.routers.job_router.query") 17 | @patch("api.backend.routers.job_router.random.randint") 18 | async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock): 19 | # Ensure the mock returns immediately 20 | mock_query.return_value = mock_results 21 | mock_randint.return_value = mocked_random_int 22 | 23 | # Create a DownloadJob instance 24 | download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv") 25 | 26 | # Make a POST request to the /download endpoint 27 | response = client.post("/download", json=download_job.model_dump()) 28 | 29 | # Assertions 30 | assert response.status_code == 200 31 | assert response.headers["Content-Disposition"] == "attachment; filename=export.csv" 32 | 33 | # Check the content of the CSV 34 | csv_content = response.content.decode("utf-8") 35 | expected_csv = ( 36 | f'"id","url","element_name","xpath","text","user","time_created"\r\n' 37 | f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",' 38 | f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n' 39 | ) 40 | assert csv_content == expected_csv 41 | -------------------------------------------------------------------------------- /api/backend/tests/scraping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/tests/scraping/__init__.py -------------------------------------------------------------------------------- /api/backend/tests/scraping/test_scraping.py: 
--------------------------------------------------------------------------------
1 | import pytest
2 | import logging
3 | from typing import Dict
4 | from playwright.async_api import async_playwright, Cookie, Route
5 | from api.backend.job.scraping.add_custom import add_custom_items
6 |
7 | logging.basicConfig(level=logging.DEBUG)
8 | LOG = logging.getLogger(__name__)
9 |
10 |
11 | @pytest.mark.asyncio
12 | async def test_add_custom_items():
13 |     test_cookies = [{"name": "big", "value": "cookie"}]
14 |     test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
15 |
16 |     async with async_playwright() as p:
17 |         browser = await p.chromium.launch(headless=True)
18 |         context = await browser.new_context()
19 |         page = await context.new_page()
20 |
21 |         # Set up request interception
22 |         captured_headers: Dict[str, str] = {}
23 |
24 |         async def handle_route(route: Route) -> None:
25 |             nonlocal captured_headers
26 |             captured_headers = route.request.headers
27 |             await route.continue_()
28 |
29 |         await page.route("**/*", handle_route)
30 |
31 |         await add_custom_items(
32 |             url="http://example.com",
33 |             page=page,
34 |             cookies=test_cookies,
35 |             headers=test_headers,
36 |         )
37 |
38 |         # Navigate to example.com
39 |         await page.goto("http://example.com")
40 |
41 |         # Verify cookies were added
42 |         cookies: list[Cookie] = await page.context.cookies()
43 |         test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
44 |
45 |         assert test_cookie is not None
46 |         assert test_cookie.get("value") == "cookie"
47 |         assert test_cookie.get("path") == "/"  # Default path should be set
48 |         assert test_cookie.get("sameSite") == "Lax"  # Default sameSite should be set
49 |
50 |         # Verify headers were added
51 |         assert captured_headers.get("user-agent") == "test-agent"
52 |
53 |         await browser.close()
54 |
--------------------------------------------------------------------------------
/api/backend/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 | import logging
3 | import json
4 |
5 | LOG = logging.getLogger(__name__)
6 |
7 |
8 | def clean_text(text: str):
9 |     text = text.replace("\r\n", "\n")  # Normalize newlines
10 |     text = text.replace("\n", "\\n")  # Escape newlines
11 |     text = text.replace('"', '\\"')  # Escape double quotes
12 |     return text
13 |
14 |
15 | def get_log_level(level_name: Optional[str]) -> int:
16 |     level = logging.INFO
17 |
18 |     if level_name:
19 |         level_name = level_name.upper()
20 |         level = getattr(logging, level_name, logging.INFO)
21 |
22 |     return level
23 |
24 |
25 | def format_list_for_query(ids: list[str]):
26 |     return (
27 |         f"({','.join(['?' for _ in ids])})"  # SQL parameter placeholders, e.g. "(?,?,?)"
28 |     )
29 |
30 |
31 | def format_sql_row_to_python(row: dict[str, Any]):
32 |     new_row: dict[str, Any] = {}
33 |     for key, value in row.items():
34 |         if isinstance(value, str):
35 |             try:
36 |                 new_row[key] = json.loads(value)
37 |             except json.JSONDecodeError:
38 |                 new_row[key] = value
39 |         else:
40 |             new_row[key] = value
41 |
42 |     return new_row
43 |
44 |
45 | def format_json(items: list[Any]):
46 |     for idx, item in enumerate(items):
47 |         if isinstance(item, (dict, list)):
48 |             formatted_item = json.dumps(item)
49 |             items[idx] = formatted_item
50 |
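format_list_for_query and format_sql_row_to_python pair up when job rows are read back out of SQLite: the first renders a safe parameter-placeholder list for an IN clause, the second re-inflates JSON-encoded columns into Python objects. A short sketch of that flow (illustrative only; the "jobs" table name and the data/database.db path are assumptions for the example, not taken from the queries module):

import sqlite3

from api.backend.utils import format_list_for_query, format_sql_row_to_python

ids = ["a1", "b2", "c3"]

# format_list_for_query(ids) renders "(?,?,?)", so the ids are bound as
# parameters instead of being interpolated into the SQL string.
query = f"SELECT * FROM jobs WHERE id IN {format_list_for_query(ids)}"

conn = sqlite3.connect("data/database.db")
conn.row_factory = sqlite3.Row

rows = [dict(row) for row in conn.execute(query, ids).fetchall()]

# Columns stored as JSON strings (e.g. result payloads) are decoded;
# plain strings fail json.loads and are kept as-is.
jobs = [format_sql_row_to_python(row) for row in rows]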
for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)" 28 | ) 29 | 30 | 31 | def format_sql_row_to_python(row: dict[str, Any]): 32 | new_row: dict[str, Any] = {} 33 | for key, value in row.items(): 34 | if isinstance(value, str): 35 | try: 36 | new_row[key] = json.loads(value) 37 | except json.JSONDecodeError: 38 | new_row[key] = value 39 | else: 40 | new_row[key] = value 41 | 42 | return new_row 43 | 44 | 45 | def format_json(items: list[Any]): 46 | for idx, item in enumerate(items): 47 | if isinstance(item, (dict, list)): 48 | formatted_item = json.dumps(item) 49 | items[idx] = formatted_item 50 | -------------------------------------------------------------------------------- /api/backend/worker/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from api.backend.utils import get_log_level 5 | 6 | logging.basicConfig( 7 | level=get_log_level(os.getenv("LOG_LEVEL")), 8 | format="%(levelname)s: %(asctime)s - %(name)s - %(message)s", 9 | handlers=[logging.StreamHandler()], 10 | ) 11 | 12 | LOG = logging.getLogger(__name__) 13 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/discord_notification.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import requests 5 | 6 | from api.backend.worker.logger import LOG 7 | from api.backend.worker.post_job_complete.models import ( 8 | PostJobCompleteOptions, 9 | JOB_COLOR_MAP, 10 | ) 11 | 12 | 13 | def discord_notification(job: dict[str, Any], options: PostJobCompleteOptions): 14 | webhook_url = options["webhook_url"] 15 | scraperr_frontend_url = options["scraperr_frontend_url"] 16 | 17 | LOG.info(f"Sending discord notification to {webhook_url}") 18 | 19 | embed = { 20 | "title": "Job Completed", 21 | "description": "Scraping job has been completed.", 22 | "color": JOB_COLOR_MAP[job["status"]], 23 | "url": f"{scraperr_frontend_url}/jobs?search={job['id']}&type=id", 24 | "image": { 25 | "url": "https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png", 26 | }, 27 | "author": { 28 | "name": "Scraperr", 29 | "url": "https://github.com/jaypyles/Scraperr", 30 | }, 31 | "fields": [ 32 | { 33 | "name": "Status", 34 | "value": "Completed", 35 | "inline": True, 36 | }, 37 | { 38 | "name": "URL", 39 | "value": job["url"], 40 | "inline": True, 41 | }, 42 | { 43 | "name": "ID", 44 | "value": job["id"], 45 | "inline": False, 46 | }, 47 | { 48 | "name": "Options", 49 | "value": f"```json\n{json.dumps(job['job_options'], indent=4)}\n```", 50 | "inline": False, 51 | }, 52 | ], 53 | } 54 | 55 | payload = {"embeds": [embed]} 56 | requests.post(webhook_url, json=payload) 57 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/email_notifcation.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import ssl 3 | from email.mime.text import MIMEText 4 | from email.mime.multipart import MIMEMultipart 5 | import json 6 | from typing import Any 7 | 8 | from api.backend.worker.logger import LOG 9 | 10 | from api.backend.worker.post_job_complete.models import ( 11 | JOB_COLOR_MAP, 12 | PostJobCompleteOptions, 13 | ) 14 | 15 | 16 | def send_job_complete_email( 17 | job: dict[str, Any], 18 | options: PostJobCompleteOptions, 19 | ): 20 | status = job["status"] 21 | status_color = JOB_COLOR_MAP.get(status, 
22 |     job_url = job["url"]
23 |     job_id = job["id"]
24 |     job_options_json = json.dumps(job["job_options"], indent=4)
25 |     frontend_url = options["scraperr_frontend_url"]
26 |
27 |     subject = "📦 Job Completed - Scraperr Notification"
28 |
29 |     html = f"""
30 | <html>
31 | <body style="font-family: Arial, sans-serif; color: #222;">
32 | <h2 style="color: #{status_color:06x};">✅ Job Completed</h2>
33 | <p>Scraping job has been completed successfully.</p>
34 |
35 | <p>
36 | <img src="https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png" alt="Scraperr Logo" width="200" />
37 | </p>
38 |
39 | <h3>Job Info:</h3>
40 | <ul>
41 | <li><b>Status:</b> {status}</li>
42 | <li><b>URL:</b> {job_url}</li>
43 | <li><b>ID:</b> {job_id}</li>
44 | </ul>
45 |
46 | <h3>Options:</h3>
47 | <pre>
48 | {job_options_json}
49 |         </pre>
50 |
51 | <p>View your job here:</p>
52 | <a href="{frontend_url}/jobs?search={job_id}&type=id">Scraperr Job</a>
53 |
54 | <hr />
55 | <p style="font-size: 12px; color: #888;">Sent by Scraperr</p>
56 | </body></html>
57 | 58 | 59 | """ 60 | 61 | # Create email 62 | message = MIMEMultipart("alternative") 63 | message["From"] = options["email"] 64 | message["To"] = options["to"] 65 | message["Subject"] = subject 66 | message.attach( 67 | MIMEText( 68 | "Job completed. View this email in HTML format for full details.", "plain" 69 | ) 70 | ) 71 | message.attach(MIMEText(html, "html")) 72 | 73 | context = ssl.create_default_context() 74 | 75 | try: 76 | if options["use_tls"]: 77 | with smtplib.SMTP(options["smtp_host"], options["smtp_port"]) as server: 78 | server.starttls(context=context) 79 | server.login(options["smtp_user"], options["smtp_password"]) 80 | server.sendmail( 81 | from_addr=options["email"], 82 | to_addrs=options["to"], 83 | msg=message.as_string(), 84 | ) 85 | else: 86 | with smtplib.SMTP_SSL( 87 | options["smtp_host"], options["smtp_port"], context=context 88 | ) as server: 89 | server.login(options["smtp_user"], options["smtp_password"]) 90 | server.sendmail( 91 | from_addr=options["email"], 92 | to_addrs=options["to"], 93 | msg=message.as_string(), 94 | ) 95 | LOG.info("✅ Email sent successfully!") 96 | except Exception as e: 97 | LOG.error(f"❌ Failed to send email: {e}") 98 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/models.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict 2 | 3 | 4 | class PostJobCompleteOptions(TypedDict): 5 | channel: str 6 | webhook_url: str 7 | scraperr_frontend_url: str 8 | email: str 9 | to: str 10 | smtp_host: str 11 | smtp_port: int 12 | smtp_user: str 13 | smtp_password: str 14 | use_tls: bool 15 | 16 | 17 | JOB_COLOR_MAP = { 18 | "Queued": 0x0000FF, 19 | "Scraping": 0x0000FF, 20 | "Completed": 0x00FF00, 21 | "Failed": 0xFF0000, 22 | } 23 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/post_job_complete.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from api.backend.worker.post_job_complete.models import PostJobCompleteOptions 4 | from api.backend.worker.post_job_complete.email_notifcation import ( 5 | send_job_complete_email, 6 | ) 7 | from api.backend.worker.post_job_complete.discord_notification import ( 8 | discord_notification, 9 | ) 10 | 11 | 12 | async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions): 13 | if options["channel"] == "": 14 | return 15 | 16 | if not options.values(): 17 | return 18 | 19 | if options["channel"] == "discord": 20 | discord_notification(job, options) 21 | elif options["channel"] == "email": 22 | send_job_complete_email(job, options) 23 | else: 24 | raise ValueError(f"Invalid channel: {options['channel']}") 25 | -------------------------------------------------------------------------------- /cypress.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "cypress"; 2 | 3 | export default defineConfig({ 4 | e2e: { 5 | setupNodeEvents(on, config) {}, 6 | baseUrl: "http://localhost", 7 | }, 8 | }); 9 | -------------------------------------------------------------------------------- /cypress/e2e/authentication.cy.ts: -------------------------------------------------------------------------------- 1 | describe("Authentication", () => { 2 | it("should register", () => { 3 | cy.intercept("POST", "/api/signup").as("signup"); 4 | 5 | cy.visit("/").then(() 
=> { 6 | cy.get("button").contains("Login").click(); 7 | cy.url().should("include", "/login"); 8 | 9 | cy.get("form").should("be.visible"); 10 | cy.get("button") 11 | .contains("No Account? Sign up") 12 | .should("be.visible") 13 | .click(); 14 | 15 | cy.get("input[name='email']").type("test@test.com"); 16 | cy.get("input[name='password']").type("password"); 17 | cy.get("input[name='fullName']").type("John Doe"); 18 | cy.get("button[type='submit']").contains("Signup").click(); 19 | 20 | cy.wait("@signup").then((interception) => { 21 | if (!interception.response) { 22 | cy.log("No response received!"); 23 | throw new Error("signup request did not return a response"); 24 | } 25 | 26 | cy.log("Response status: " + interception.response.statusCode); 27 | cy.log("Response body: " + JSON.stringify(interception.response.body)); 28 | 29 | expect(interception.response.statusCode).to.eq(200); 30 | }); 31 | }); 32 | }); 33 | 34 | it("should login", () => { 35 | cy.intercept("POST", "/api/token").as("token"); 36 | 37 | cy.visit("/").then(() => { 38 | cy.get("button") 39 | .contains("Login") 40 | .click() 41 | .then(() => { 42 | cy.get("input[name='email']").type("test@test.com"); 43 | cy.get("input[name='password']").type("password"); 44 | cy.get("button[type='submit']").contains("Login").click(); 45 | 46 | cy.wait("@token").then((interception) => { 47 | if (!interception.response) { 48 | cy.log("No response received!"); 49 | throw new Error("token request did not return a response"); 50 | } 51 | 52 | cy.log("Response status: " + interception.response.statusCode); 53 | cy.log("Response body: " + JSON.stringify(interception.response.body)); 54 | 55 | expect(interception.response.statusCode).to.eq(200); 56 | }); 57 | }); 58 | }); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /cypress/e2e/navigation.cy.ts: -------------------------------------------------------------------------------- 1 | describe("General site navigation", () => { 2 | it("passes", () => { 3 | cy.visit("/"); 4 | }); 5 | }); 6 | -------------------------------------------------------------------------------- /cypress/e2e/submit-job.cy.ts: -------------------------------------------------------------------------------- 1 | describe.only("Job", () => { 2 | it("should create a job", () => { 3 | cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob"); 4 | 5 | cy.visit("/"); 6 | 7 | cy.get('[data-cy="url-input"]').type("https://example.com"); 8 | cy.get('[data-cy="name-field"]').type("example"); 9 | cy.get('[data-cy="xpath-field"]').type("//body"); 10 | cy.get('[data-cy="add-button"]').click(); 11 | 12 | cy.contains("Submit").click(); 13 | 14 | cy.wait("@submitScrapeJob").then((interception) => { 15 | if (!interception.response) { 16 | cy.log("No response received!"); 17 | cy.log("Request body: " + JSON.stringify(interception.request?.body)); 18 | throw new Error("submitScrapeJob request did not return a response"); 19 | } 20 | 21 | cy.log("Response status: " + interception.response.statusCode); 22 | cy.log("Response body: " + JSON.stringify(interception.response.body)); 23 | 24 | expect(interception.response.statusCode).to.eq(200); 25 | }); 26 | 27 | cy.get("li").contains("Jobs").click(); 28 | 29 | cy.contains("div", "https://example.com", { timeout: 10000 }).should( 30 | "exist" 31 | ); 32 | cy.contains("div", "Completed", { timeout: 20000 }).should("exist"); 33 | 34 | cy.get("tbody tr") 35 | .first() 36 | .within(() => { 37 | cy.get('input[type="checkbox"]').click(); 38 
| });
39 |
40 |     cy.get("[data-testid='DeleteIcon']").click();
41 |
42 |     cy.contains("div", "https://example.com", { timeout: 10000 }).should(
43 |       "not.exist"
44 |     );
45 |   });
46 |
47 |   it("should create a job with advanced options (media)", () => {
48 |     cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
49 |
50 |     cy.visit("/");
51 |
52 |     cy.get("button").contains("Advanced Job Options").click();
53 |
54 |     cy.get('[data-cy="collect-media-checkbox"]').click();
55 |     cy.get("body").type("{esc}");
56 |
57 |     cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
58 |     cy.get('[data-cy="name-field"]').type("example");
59 |     cy.get('[data-cy="xpath-field"]').type("//body");
60 |     cy.get('[data-cy="add-button"]').click();
61 |
62 |     cy.get("button").contains("Submit").click();
63 |
64 |     cy.get("li").contains("Jobs").click();
65 |
66 |     cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
67 |       "exist"
68 |     );
69 |
70 |     cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
71 |     cy.get("li").contains("Media").click();
72 |
73 |     cy.get("div[id='select-job']").click();
74 |     cy.get("li[role='option']").click();
75 |
76 |     cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
77 |
78 |     cy.get("li").contains("Jobs").click();
79 |
80 |     cy.get("tbody tr")
81 |       .first()
82 |       .within(() => {
83 |         cy.get('input[type="checkbox"]').click();
84 |       });
85 |
86 |     cy.get("[data-testid='DeleteIcon']").click();
87 |   });
88 | });
89 |
--------------------------------------------------------------------------------
/cypress/fixtures/example.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "Using fixtures to represent data",
3 |   "email": "hello@cypress.io",
4 |   "body": "Fixtures are a great way to mock data for responses to routes"
5 | }
6 |
--------------------------------------------------------------------------------
/cypress/support/commands.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="cypress" />
2 | // ***********************************************
3 | // This example commands.ts shows you how to
4 | // create various custom commands and overwrite
5 | // existing commands.
6 | //
7 | // For more comprehensive examples of custom
8 | // commands please read more here:
9 | // https://on.cypress.io/custom-commands
10 | // ***********************************************
11 | //
12 | //
13 | // -- This is a parent command --
14 | // Cypress.Commands.add('login', (email, password) => { ... })
15 | //
16 | //
17 | // -- This is a child command --
18 | // Cypress.Commands.add('drag', { prevSubject: 'element'}, (subject, options) => { ... })
19 | //
20 | //
21 | // -- This is a dual command --
22 | // Cypress.Commands.add('dismiss', { prevSubject: 'optional'}, (subject, options) => { ... })
23 | //
24 | //
25 | // -- This will overwrite an existing command --
26 | // Cypress.Commands.overwrite('visit', (originalFn, url, options) => { ... })
27 | //
28 | // declare global {
29 | //   namespace Cypress {
30 | //     interface Chainable {
31 | //       login(email: string, password: string): Chainable<void>
32 | //       drag(subject: string, options?: Partial<TypeOptions>): Chainable<Element>
33 | //       dismiss(subject: string, options?: Partial<TypeOptions>): Chainable<Element>
34 | //       visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element>
35 | //     }
36 | //   }
37 | // }
38 |
--------------------------------------------------------------------------------
/cypress/support/e2e.ts:
--------------------------------------------------------------------------------
1 | // ***********************************************************
2 | // This example support/e2e.ts is processed and
3 | // loaded automatically before your test files.
4 | //
5 | // This is a great place to put global configuration and
6 | // behavior that modifies Cypress.
7 | //
8 | // You can change the location of this file or turn off
9 | // automatically serving support files with the
10 | // 'supportFile' configuration option.
11 | //
12 | // You can read more here:
13 | // https://on.cypress.io/configuration
14 | // ***********************************************************
15 |
16 | // Import commands.js using ES2015 syntax:
17 | import './commands'
18 |
19 | // Alternatively you can use CommonJS syntax:
20 | // require('./commands')
--------------------------------------------------------------------------------
/docker-compose.dev.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 |   scraperr:
4 |     build:
5 |       context: .
6 |       dockerfile: docker/frontend/Dockerfile
7 |     command: ["npm", "run", "dev"]
8 |     volumes:
9 |       - "$PWD/src:/app/src"
10 |       - "$PWD/public:/app/public"
11 |       - "$PWD/next.config.mjs:/app/next.config.mjs"
12 |       - "$PWD/package.json:/app/package.json"
13 |       - "$PWD/package-lock.json:/app/package-lock.json"
14 |       - "$PWD/tsconfig.json:/app/tsconfig.json"
15 |   scraperr_api:
16 |     build:
17 |       context: .
18 | dockerfile: docker/api/Dockerfile 19 | environment: 20 | - LOG_LEVEL=INFO 21 | volumes: 22 | - "$PWD/api:/project/app/api" 23 | ports: 24 | - "5900:5900" 25 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | scraperr: 3 | image: jpyles0524/scraperr:latest 4 | container_name: scraperr 5 | command: ["npm", "run", "start"] 6 | environment: 7 | - NEXT_PUBLIC_API_URL=http://scraperr_api:8000 # your API URL 8 | - SERVER_URL=http://scraperr_api:8000 # your docker container API URL 9 | ports: 10 | - 80:3000 11 | networks: 12 | - web 13 | scraperr_api: 14 | init: True 15 | image: jpyles0524/scraperr_api:latest 16 | environment: 17 | - LOG_LEVEL=INFO 18 | container_name: scraperr_api 19 | ports: 20 | - 8000:8000 21 | volumes: 22 | - "$PWD/data:/project/app/data" 23 | - "$PWD/media:/project/app/media" 24 | networks: 25 | - web 26 | 27 | networks: 28 | web: 29 | -------------------------------------------------------------------------------- /docker/api/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build python dependencies 2 | FROM python:3.10.12-slim as pybuilder 3 | 4 | RUN apt-get update && \ 5 | apt-get install -y curl && \ 6 | apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \ 7 | curl -LsSf https://astral.sh/uv/install.sh | sh && \ 8 | apt-get remove -y curl && \ 9 | apt-get autoremove -y && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | RUN python -m pip --no-cache-dir install pdm 13 | RUN pdm config python.use_venv false 14 | 15 | WORKDIR /project/app 16 | COPY pyproject.toml pdm.lock /project/app/ 17 | 18 | RUN pdm install -v --frozen-lockfile 19 | 20 | RUN pdm run playwright install --with-deps 21 | 22 | RUN pdm run camoufox fetch 23 | 24 | COPY ./api/ /project/app/api 25 | 26 | ENV PYTHONPATH=/project/pkgs 27 | 28 | COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf 29 | 30 | EXPOSE 8000 31 | 32 | WORKDIR /project/app 33 | 34 | RUN mkdir -p /project/app/media 35 | RUN mkdir -p /project/app/data 36 | RUN touch /project/app/data/database.db 37 | 38 | EXPOSE 5900 39 | 40 | COPY start.sh /project/app/start.sh 41 | 42 | CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ] -------------------------------------------------------------------------------- /docker/frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build next dependencies 2 | FROM node:23.1-slim 3 | WORKDIR /app 4 | 5 | # Copy package files first to leverage Docker cache 6 | COPY package.json yarn.lock ./ 7 | 8 | # Install dependencies in a separate layer 9 | RUN yarn install --frozen-lockfile 10 | 11 | # Copy the rest of the application 12 | COPY tsconfig.json /app/tsconfig.json 13 | COPY tailwind.config.js /app/tailwind.config.js 14 | COPY next.config.mjs /app/next.config.mjs 15 | COPY postcss.config.js /app/postcss.config.js 16 | 17 | COPY public /app/public 18 | COPY src /app/src 19 | 20 | # Build the application 21 | RUN yarn build 22 | 23 | EXPOSE 3000 -------------------------------------------------------------------------------- /docs/chat_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/chat_page.png 
-------------------------------------------------------------------------------- /docs/docs_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/docs_page.png -------------------------------------------------------------------------------- /docs/job_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/job_page.png -------------------------------------------------------------------------------- /docs/log_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/log_page.png -------------------------------------------------------------------------------- /docs/login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/login.png -------------------------------------------------------------------------------- /docs/logo_picture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/logo_picture.png -------------------------------------------------------------------------------- /docs/main_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/main_page.png -------------------------------------------------------------------------------- /docs/stats_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/stats_page.png -------------------------------------------------------------------------------- /helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: scraperr 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. 
This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 1.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /helm/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: scraperr 6 | spec: 7 | replicas: {{ .Values.replicaCount }} 8 | selector: 9 | matchLabels: 10 | app: scraperr 11 | template: 12 | metadata: 13 | labels: 14 | app: scraperr 15 | spec: 16 | containers: 17 | - name: scraperr 18 | {{ if .Values.scraperr.image.repository }} 19 | image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}" 20 | {{ else }} 21 | image: "{{ .Chart.Name }}:{{ .Chart.Version }}" 22 | {{ end }} 23 | imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }} 24 | command: {{ .Values.scraperr.containerCommand | toJson }} 25 | ports: 26 | - containerPort: {{ .Values.scraperr.containerPort }} 27 | env: {{ toYaml .Values.scraperr.env | nindent 12 }} 28 | 29 | --- 30 | apiVersion: apps/v1 31 | kind: Deployment 32 | metadata: 33 | name: scraperr-api 34 | spec: 35 | replicas: {{ .Values.replicaCount }} 36 | selector: 37 | matchLabels: 38 | app: scraperr-api 39 | template: 40 | metadata: 41 | labels: 42 | app: scraperr-api 43 | spec: 44 | containers: 45 | - name: scraperr-api 46 | {{ if .Values.scraperrApi.image.repository }} 47 | image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}" 48 | {{ else }} 49 | image: "{{ .Chart.Name }}:{{ .Chart.Version }}" 50 | {{ end }} 51 | imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }} 52 | ports: 53 | - containerPort: {{ .Values.scraperrApi.containerPort }} 54 | env: {{ toYaml .Values.scraperrApi.env | nindent 12 }} 55 | volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }} 56 | volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 12 }} -------------------------------------------------------------------------------- /helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: scraperr 6 | spec: 7 | type: {{ .Values.scraperr.serviceType }} 8 | selector: 9 | app: scraperr 10 | ports: 11 | {{- range .Values.scraperr.ports }} 12 | - port: {{ .port }} 13 | targetPort: {{ .targetPort }} 14 | {{- if .nodePort }} 15 | nodePort: {{ .nodePort }} 16 | {{- end }} 17 | protocol: {{ .protocol | default "TCP" }} 18 | {{- end }} 19 | 20 | --- 21 | apiVersion: v1 22 | kind: Service 23 | metadata: 24 | name: scraperr-api 25 | spec: 26 | type: {{ .Values.scraperrApi.serviceType }} 27 | selector: 28 | app: scraperr-api 29 | ports: 30 | {{- range .Values.scraperrApi.ports }} 31 | - port: {{ .port }} 32 | targetPort: {{ .targetPort }} 33 | {{- if .nodePort }} 34 | nodePort: {{ .nodePort }} 35 | {{- end }} 36 | protocol: {{ .protocol | default "TCP" }} 37 | {{- end }} 38 | 
--------------------------------------------------------------------------------
/helm/values.yaml:
--------------------------------------------------------------------------------
1 | scraperr:
2 |   image:
3 |     repository: jpyles0524/scraperr
4 |     tag: latest
5 |     pullPolicy: IfNotPresent
6 |   containerCommand: ["npm", "run", "start"]
7 |   containerPort: 3000
8 |   serviceType: NodePort
9 |   ports:
10 |     - port: 80
11 |       targetPort: 3000
12 |       nodePort: 32300
13 |       protocol: TCP
14 |   env:
15 |     - name: NEXT_PUBLIC_API_URL
16 |       value: "http://scraperr-api:8000"
17 |     - name: SERVER_URL
18 |       value: "http://scraperr-api:8000"
19 |
20 | scraperrApi:
21 |   image:
22 |     repository: jpyles0524/scraperr_api
23 |     tag: latest
24 |     pullPolicy: IfNotPresent
25 |   containerPort: 8000
26 |   serviceType: ClusterIP
27 |   ports:
28 |     - port: 8000
29 |       targetPort: 8000
30 |       protocol: TCP
31 |   env:
32 |     - name: LOG_LEVEL
33 |       value: "INFO"
34 |   volumeMounts:
35 |     - name: data
36 |       mountPath: /project/app/data
37 |     - name: media
38 |       mountPath: /project/app/media
39 |   volumes:
40 |     - name: data
41 |       hostPath:
42 |         path: /data/scraperr/data
43 |         type: DirectoryOrCreate
44 |     - name: media
45 |       hostPath:
46 |         path: /data/scraperr/media
47 | replicaCount: 1
--------------------------------------------------------------------------------
/next-env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="next" />
2 | /// <reference types="next/image-types/global" />
3 |
4 | // NOTE: This file should not be edited
5 | // see https://nextjs.org/docs/basic-features/typescript for more information.
6 |
--------------------------------------------------------------------------------
/next.config.mjs:
--------------------------------------------------------------------------------
1 | import dotenv from "dotenv";
2 | dotenv.config();
3 |
4 | /** @type {import('next').NextConfig} */
5 | const nextConfig = {
6 |   distDir: "./dist",
7 |   images: { unoptimized: true },
8 |   env: {
9 |     DOMAIN: `${process.env.NEXT_PUBLIC_API_PATH}`,
10 |   },
11 | };
12 |
13 | export default nextConfig;
14 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "webapp-template",
3 |   "version": "0.1.0",
4 |   "private": true,
5 |   "dependencies": {
6 |     "@auth0/auth0-react": "^2.2.4",
7 |     "@auth0/nextjs-auth0": "^3.5.0",
8 |     "@chakra-ui/react": "^2.8.2",
9 |     "@emotion/react": "^11.11.4",
10 |     "@emotion/styled": "^11.11.5",
11 |     "@fontsource/roboto": "^5.0.13",
12 |     "@minchat/react-chat-ui": "^0.16.2",
13 |     "@mui/icons-material": "^5.15.3",
14 |     "@mui/material": "^5.16.0",
15 |     "@reduxjs/toolkit": "^2.8.2",
16 |     "@testing-library/jest-dom": "^5.16.5",
17 |     "@testing-library/react": "^13.4.0",
18 |     "@testing-library/user-event": "^13.5.0",
19 |     "@types/react": "^18.3.21",
20 |     "axios": "^1.7.2",
21 |     "bootstrap": "^5.3.0",
22 |     "chart.js": "^4.4.3",
23 |     "cookie": "^0.6.0",
24 |     "dotenv": "^16.5.0",
25 |     "framer-motion": "^4.1.17",
26 |     "js-cookie": "^3.0.5",
27 |     "next": "^14.2.4",
28 |     "next-auth": "^4.24.7",
29 |     "nookies": "^2.5.2",
30 |     "react": "^18.3.1",
31 |     "react-bootstrap": "^2.8.0",
32 |     "react-dom": "^18.3.1",
33 |     "react-markdown": "^9.0.0",
34 |     "react-modal-image": "^2.6.0",
35 |     "react-redux": "^9.2.0",
36 |     "react-router": "^6.14.1",
37 |     "react-router-dom": "^6.14.1",
38 |     "react-spinners": "^0.14.1",
39 |     "redux-persist": "^6.0.0",
40 |     "typescript": "^4.9.5",
41 |     "web-vitals": "^2.1.4"
42 |   },
43 |   "scripts": {
44 |     "dev": "yarn
next dev", 45 | "build": "yarn next build", 46 | "start": "yarn next start", 47 | "serve": "serve -s ./dist", 48 | "cy:open": "cypress open", 49 | "cy:run": "cypress run" 50 | }, 51 | "eslintConfig": { 52 | "extends": [ 53 | "react-app", 54 | "react-app/jest" 55 | ] 56 | }, 57 | "browserslist": { 58 | "production": [ 59 | ">0.2%", 60 | "not dead", 61 | "not op_mini all" 62 | ], 63 | "development": [ 64 | "last 1 chrome version", 65 | "last 1 firefox version", 66 | "last 1 safari version" 67 | ] 68 | }, 69 | "devDependencies": { 70 | "@types/cypress": "^1.1.6", 71 | "@types/js-cookie": "^3.0.6", 72 | "autoprefixer": "^10.4.21", 73 | "cypress": "^13.17.0", 74 | "eslint": "^9.26.0", 75 | "postcss": "^8.5.3", 76 | "tailwindcss": "^3.3.5" 77 | }, 78 | "overrides": { 79 | "react-refresh": "0.11.0" 80 | }, 81 | "resolutions": { 82 | "postcss": "^8.4.31" 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | }; 7 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/public/favicon.ico -------------------------------------------------------------------------------- /public/images/scraperr_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/public/images/scraperr_logo.png -------------------------------------------------------------------------------- /public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "web-scrape" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }] 6 | dependencies = [ 7 | "uvicorn>=0.30.1", 8 | "fastapi>=0.111.0", 9 | "boto3>=1.34.140", 10 | "python-dotenv>=1.0.1", 11 | "boto3-stubs[essential]>=1.34.140", 12 | "asyncio>=3.4.3", 13 | "aiohttp>=3.9.5", 14 | "bs4>=0.0.2", 15 | "lxml[html_clean]>=5.2.2", 16 | "lxml-stubs>=0.5.1", 17 | "fake-useragent>=1.5.1", 18 | "requests-html>=0.10.0", 19 | "webdriver-manager>=4.0.1", 20 | "pydantic[email]>=2.9.2", 21 | "pandas>=2.2.2", 22 | "openpyxl>=3.1.5", 23 | 
"xlsxwriter>=3.2.0", 24 | "python-keycloak>=4.2.0", 25 | "fastapi-keycloak>=1.0.11", 26 | "pymongo>=4.8.0", 27 | "motor[asyncio]>=3.5.0", 28 | "python-jose[cryptography]>=3.3.0", 29 | "passlib[bcrypt]>=1.7.4", 30 | "selenium-wire>=5.1.0", 31 | "blinker<1.8.0", 32 | "setuptools>=71.0.4", 33 | "docker>=7.1.0", 34 | "ollama>=0.3.0", 35 | "openai>=1.37.1", 36 | "exceptiongroup>=1.2.2", 37 | "Faker>=30.6.0", 38 | "pytest-asyncio>=0.24.0", 39 | "python-multipart>=0.0.1", 40 | "bcrypt==4.0.1", 41 | "apscheduler>=3.11.0", 42 | "playwright>=1.52.0", 43 | "camoufox>=0.4.11", 44 | "html2text>=2025.4.15", 45 | ] 46 | requires-python = ">=3.10" 47 | readme = "README.md" 48 | license = { text = "MIT" } 49 | 50 | [tool.pdm] 51 | distribution = true 52 | 53 | [tool.pdm.dev-dependencies] 54 | dev = ["ipython>=8.26.0", "pytest>=8.3.3"] 55 | [tool.pyright] 56 | include = ["./api/backend/"] 57 | exclude = ["**/node_modules", "**/__pycache__"] 58 | ignore = [] 59 | defineConstant = { DEBUG = true } 60 | stubPath = "" 61 | 62 | # Type checking strictness 63 | typeCheckingMode = "strict" # Enables strict type checking mode 64 | reportPrivateUsage = "none" 65 | reportMissingTypeStubs = "none" 66 | reportUntypedFunctionDecorator = "error" 67 | reportUntypedClassDecorator = "error" 68 | reportUntypedBaseClass = "error" 69 | reportInvalidTypeVarUse = "error" 70 | reportUnnecessaryTypeIgnoreComment = "information" 71 | reportUnknownVariableType = "none" 72 | reportUnknownMemberType = "none" 73 | reportUnknownParameterType = "none" 74 | 75 | # Additional checks 76 | reportImplicitStringConcatenation = "error" 77 | reportInvalidStringEscapeSequence = "error" 78 | reportMissingImports = "error" 79 | reportMissingModuleSource = "error" 80 | reportOptionalCall = "error" 81 | reportOptionalIterable = "error" 82 | reportOptionalMemberAccess = "error" 83 | reportOptionalOperand = "error" 84 | reportOptionalSubscript = "error" 85 | reportTypedDictNotRequiredAccess = "error" 86 | 87 | # Function return type checking 88 | reportIncompleteStub = "error" 89 | reportIncompatibleMethodOverride = "error" 90 | reportInvalidStubStatement = "error" 91 | reportInconsistentOverload = "error" 92 | 93 | # Misc settings 94 | pythonVersion = "3.10" # Matches your Python version from pyproject.toml 95 | strictListInference = true 96 | strictDictionaryInference = true 97 | strictSetInference = true 98 | 99 | 100 | [tool.isort] 101 | length_sort = "1" 102 | profile = "black" 103 | sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" 104 | import_heading_stdlib = "STL" 105 | import_heading_thirdparty = "PDM" 106 | import_heading_firstparty = "LOCAL" 107 | import_heading_localfolder = "LOCAL" 108 | -------------------------------------------------------------------------------- /src/components/ai/Chat.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | 3 | export const Chat = () => { 4 | return

<div>Chat</div>

; 5 | };
6 |
--------------------------------------------------------------------------------
/src/components/ai/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./Chat";
2 | export * from "./JobSelector";
3 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/advanced-job-options.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Link, Typography } from "@mui/material";
2 | import { SetStateAction, Dispatch, useState } from "react";
3 | import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
4 | import { RawJobOptions } from "@/types";
5 |
6 | export type AdvancedJobOptionsProps = {
7 |   jobOptions: RawJobOptions;
8 |   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
9 |   multiPageScrapeEnabled?: boolean;
10 | };
11 |
12 | export const AdvancedJobOptions = ({
13 |   jobOptions,
14 |   setJobOptions,
15 |   multiPageScrapeEnabled = true,
16 | }: AdvancedJobOptionsProps) => {
17 |   const [open, setOpen] = useState(false);
18 |   return (
19 |     <Box>
20 |       <Link
21 |         component="button"
22 |         underline="none"
23 |         onClick={() => setOpen(true)}
24 |         sx={{
25 |           textDecoration: "none",
26 |           color: "primary.main",
27 |           "&:hover": {
28 |             color: "primary.dark",
29 |             textDecoration: "underline",
30 |           },
31 |           paddingLeft: 1,
32 |           display: "inline-flex",
33 |           alignItems: "center",
34 |           gap: 0.5,
35 |         }}
36 |       >
37 |         <Typography variant="body2">Advanced Job Options</Typography>
38 |       </Link>
39 |       <AdvancedJobOptionsDialog
40 |         open={open}
41 |         onClose={() => setOpen(false)}
42 |         jobOptions={jobOptions}
43 |         setJobOptions={setJobOptions}
44 |         multiPageScrapeEnabled={multiPageScrapeEnabled}
45 |       />
46 |     </Box>
47 |   );
48 | };
49 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/dialog/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./advanced-job-options-dialog";
2 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./advanced-job-options";
2 |
--------------------------------------------------------------------------------
/src/components/common/csv-table/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./csv-table";
2 |
--------------------------------------------------------------------------------
/src/components/common/disabled/disabled.tsx:
--------------------------------------------------------------------------------
1 | import { Box } from "@mui/material";
2 |
3 | export type DisabledProps = {
4 |   message: string;
5 | };
6 |
7 | export const Disabled = ({ message }: DisabledProps) => {
8 |   return (
9 |     <Box
10 |       sx={{
11 |         display: "flex",
12 |         justifyContent: "center",
13 |         alignItems: "center",
14 |         height: "100%",
15 |       }}
16 |     >
17 |       <Box
18 |         sx={{
19 |           color: "text.secondary",
20 |           fontSize: "1.25rem",
21 |           textAlign: "center",
22 |           p: 4,
23 |         }}
24 |       >
25 |         {message}
26 |       </Box>
27 |     </Box>
28 |   );
29 | };
30 |
--------------------------------------------------------------------------------
/src/components/common/disabled/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./disabled";
2 |
--------------------------------------------------------------------------------
/src/components/common/expanded-table-input/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./expanded-table-input";
2 |
--------------------------------------------------------------------------------
/src/components/common/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./nav-drawer";
2 |
--------------------------------------------------------------------------------
/src/components/common/job-download-dialog/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./job-download-dialog";
2 |
--------------------------------------------------------------------------------
/src/components/common/job-download-dialog/job-download-dialog.tsx:
--------------------------------------------------------------------------------
1 | import {
2 |   Dialog,
3 |   DialogTitle,
4 |   DialogContent,
5 |   DialogActions,
6 |   Button,
7 |   FormControl,
8 |   RadioGroup,
9 |   FormControlLabel,
10 |   Radio,
11 |   FormLabel,
12 |   Typography,
13 |   Box,
14 | } from "@mui/material";
15 | import { useState } from "react";
16 |
17 | export type JobDownloadDialogProps = {
18 |   open: boolean;
19 |   onClose: () => void;
20 |   ids: string[];
21 | };
22 |
23 | export const JobDownloadDialog = ({
24 |   open,
25 |   onClose,
26 |   ids,
27 | }: JobDownloadDialogProps) => {
28 |   const [jobFormat, setJobFormat] = useState("csv");
29 |   const handleDownload = async () => {
30 |     const response = await fetch("/api/download", {
31 |       method: "POST",
32 |       headers: { "Content-Type": "application/json" },
33 |       body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
34 |     });
35 |
36 |     if (response.ok) {
37 |       const blob = await response.blob();
38 |       const url = window.URL.createObjectURL(blob);
39 |       const a = document.createElement("a");
40 |       a.style.display = "none";
41 |       a.href = url;
42 |       a.download = `job_${ids[0]}.${jobFormat}`;
43 |       document.body.appendChild(a);
44 |       a.click();
45 |       window.URL.revokeObjectURL(url);
46 |       document.body.removeChild(a);
47 |     } else {
48 |       console.error("Failed to download the file.");
49 |     }
50 |   };
51 |
52 |   return (
53 |     <Dialog open={open} onClose={onClose}>
54 |       <DialogTitle>Download Job</DialogTitle>
55 |       <DialogContent>
56 |         <Box sx={{ mb: 2 }}>
57 |           <Typography>
58 |             You are about to download {ids.length} job(s). Please select the
59 |             format that you would like to download them in.
60 |           </Typography>
61 |         </Box>
62 |         <FormControl>
63 |           <FormLabel
64 |             id="job-format-label"
65 |             sx={{
66 |               color: "text.primary",
67 |               fontWeight: 500,
68 |               mb: 1,
69 |             }}
70 |           >
71 |             Format
72 |           </FormLabel>
73 |           <RadioGroup
74 |             aria-labelledby="job-format-label"
75 |             name="job-format"
76 |             value={jobFormat}
77 |             onChange={(e) => setJobFormat(e.target.value)}
78 |           >
79 |             <FormControlLabel value="csv" control={<Radio />} label="CSV" />
80 |             <FormControlLabel
81 |               value="md"
82 |               control={<Radio />}
83 |               label="Markdown"
84 |             />
85 |           </RadioGroup>
86 |         </FormControl>
87 |       </DialogContent>
88 |       <DialogActions>
89 |         <Button onClick={handleDownload} variant="contained">
90 |           Download
91 |         </Button>
92 |       </DialogActions>
93 |     </Dialog>
94 |   );
95 | };
96 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/audio/audio-viewer.tsx:
--------------------------------------------------------------------------------
1 |
2 | import { Box, Typography } from "@mui/material";
3 |
4 | interface AudioViewerProps {
5 |   mediaUrl: string;
6 |   selectedMedia: string;
7 |   onError: () => void;
8 | }
9 |
10 | export const AudioViewer = ({
11 |   mediaUrl,
12 |   selectedMedia,
13 |   onError,
14 | }: AudioViewerProps) => {
15 |   return (
16 |     <Box
17 |       sx={{
18 |         display: "flex",
19 |         flexDirection: "column",
20 |         alignItems: "center",
21 |         justifyContent: "center",
22 |         height: "100%",
23 |         gap: 2,
24 |       }}
25 |     >
26 |       <Typography variant="h6">{selectedMedia}</Typography>
27 |       <audio
28 |         controls
29 |         src={mediaUrl}
30 |         onError={onError}
31 |         style={{
32 |           width: "80%",
33 |           maxWidth: "600px",
34 |         }}
35 |       >
36 |         Your browser does not support the audio element.
37 |       </audio>
38 |     </Box>
39 |   );
40 | };
41 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/audio/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./audio-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/image/image-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, useTheme } from "@mui/material";
2 |
3 | export const ImageViewer = ({
4 |   mediaUrl,
5 |   selectedMedia,
6 | }: {
7 |   mediaUrl: string;
8 |   selectedMedia: string;
9 | }) => {
10 |   const theme = useTheme();
11 |   return (
12 |     <Box
13 |       sx={{
14 |         display: "flex",
15 |         justifyContent: "center",
16 |         alignItems: "center",
17 |         height: "100%",
18 |         backgroundColor: theme.palette.background.default,
19 |         padding: 2,
20 |       }}
21 |     >
22 |       <img
23 |         alt={selectedMedia}
24 |         src={mediaUrl}
25 |         loading="lazy"
26 |         style={{
27 |           maxWidth: "100%",
28 |           maxHeight: "100%",
29 |           objectFit: "contain",
30 |           borderRadius: "4px",
31 |         }}
32 |       />
33 |     </Box>
34 |   );
35 | };
36 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/image/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./image-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./media-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/media-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Typography } from "@mui/material";
2 | import { ImageViewer } from "./image";
3 | import { VideoViewer } from "./video";
4 | import { AudioViewer } from "./audio";
5 | import { PDFViewer } from "./pdf-viewer";
6 |
7 | interface MediaViewerProps {
8 |   selectedMedia: string;
9 |   activeTab: string;
10 |   getMediaUrl: (fileName: string) => string;
11 |   onError: (error: string) => void;
12 | }
13 |
14 | export const MediaViewer = ({
15 |   selectedMedia,
16 |   activeTab,
17 |   getMediaUrl,
18 |   onError,
19 | }: MediaViewerProps) => {
20 |   if (!selectedMedia) {
21 |     return (
22 |       <Box
23 |         sx={{
24 |           display: "flex",
25 |           justifyContent: "center",
26 |           alignItems: "center",
27 |           height: "100%",
28 |         }}
29 |       >
30 |         <Typography variant="body1" color="text.secondary">
31 |           Select a file to view
32 |         </Typography>
33 |       </Box>
34 |     );
35 |   }
36 |
37 |   const mediaUrl = getMediaUrl(selectedMedia);
38 |
39 |   switch (activeTab) {
40 |     case "images":
41 |       return <ImageViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
42 |     case "videos":
43 |       return (
44 |         <VideoViewer
45 |           mediaUrl={mediaUrl}
46 |           onError={() => onError("Error loading video")}
47 |         />
48 |       );
49 |     case "audio":
50 |       return (
51 |         <AudioViewer
52 |           mediaUrl={mediaUrl}
53 |           selectedMedia={selectedMedia}
54 |           onError={() => onError("Error loading audio")}
55 |         />
56 |       );
57 |     case "pdfs":
58 |       return <PDFViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
59 |     default:
60 |       return (
61 |         <Box
62 |           sx={{
63 |             display: "flex",
64 |             justifyContent: "center",
65 |             alignItems: "center",
66 |             height: "100%",
67 |           }}
68 |         >
69 |           <Typography variant="body1" color="text.secondary">
70 |             {selectedMedia} - Download this file to view it
71 |           </Typography>
72 |         </Box>
73 |       );
74 |   }
75 | };
76 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/pdf-viewer/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./pdf-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/pdf-viewer/pdf-viewer.tsx: -------------------------------------------------------------------------------- 1 | import { Box, useTheme } from "@mui/material"; 2 | 3 | interface PDFViewerProps { 4 | mediaUrl: string; 5 | selectedMedia: string; 6 | } 7 | 8 | export const PDFViewer = ({ mediaUrl, selectedMedia }: PDFViewerProps) => { 9 | const theme = useTheme(); 10 | 11 | return ( 12 | 20 |