├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md ├── actions │ ├── push-to-helm │ │ └── action.yaml │ └── run-cypress-tests │ │ └── action.yaml └── workflows │ ├── docker-image.yml │ └── unit-tests.yml ├── .gitignore ├── .prettierignore ├── .python-version ├── FUNDING.yml ├── LICENSE ├── Makefile ├── README.md ├── api └── backend │ ├── __init__.py │ ├── ai │ ├── agent │ │ ├── actions.py │ │ ├── agent.py │ │ ├── prompts.py │ │ └── utils.py │ ├── ai_router.py │ └── clients.py │ ├── app.py │ ├── auth │ ├── __init__.py │ ├── auth_router.py │ └── auth_utils.py │ ├── constants.py │ ├── database │ ├── __init__.py │ ├── common.py │ ├── queries │ │ ├── __init__.py │ │ └── queries.py │ ├── schema │ │ ├── __init__.py │ │ └── schema.py │ └── startup.py │ ├── job │ ├── __init__.py │ ├── cron_scheduling │ │ └── cron_scheduling.py │ ├── job.py │ ├── models │ │ ├── __init__.py │ │ ├── job_options.py │ │ └── site_map.py │ ├── scraping │ │ ├── add_custom.py │ │ ├── collect_media.py │ │ └── scraping_utils.py │ ├── site_mapping │ │ ├── __init__.py │ │ └── site_mapping.py │ └── utils │ │ ├── clean_job_format.py │ │ └── stream_md_from_job_results.py │ ├── models.py │ ├── routers │ ├── job_router.py │ └── stats_router.py │ ├── scheduler.py │ ├── schemas.py │ ├── scraping.py │ ├── tests │ ├── factories │ │ └── job_factory.py │ ├── job │ │ ├── __init__.py │ │ └── test_download_job.py │ └── scraping │ │ ├── __init__.py │ │ └── test_scraping.py │ ├── utils.py │ └── worker │ ├── job_worker.py │ ├── logger.py │ └── post_job_complete │ ├── discord_notification.py │ ├── email_notifcation.py │ ├── models.py │ └── post_job_complete.py ├── cypress.config.ts ├── cypress ├── e2e │ ├── authentication.cy.ts │ ├── navigation.cy.ts │ └── submit-job.cy.ts ├── fixtures │ └── example.json └── support │ ├── commands.ts │ └── e2e.ts ├── docker-compose.dev.yml ├── docker-compose.yml ├── docker ├── api │ └── Dockerfile └── frontend │ └── Dockerfile ├── docs ├── chat_page.png ├── docs_page.png ├── job_page.png ├── log_page.png ├── login.png ├── logo_picture.png ├── main_page.png └── stats_page.png ├── helm ├── .helmignore ├── Chart.yaml ├── templates │ ├── deployment.yaml │ └── service.yaml └── values.yaml ├── next-env.d.ts ├── next.config.mjs ├── package.json ├── pdm.lock ├── postcss.config.js ├── public ├── favicon.ico ├── images │ └── scraperr_logo.png ├── manifest.json └── robots.txt ├── pyproject.toml ├── src ├── components │ ├── ai │ │ ├── Chat.tsx │ │ ├── JobSelector.tsx │ │ └── index.ts │ ├── common │ │ ├── advanced-job-options │ │ │ ├── advanced-job-options.tsx │ │ │ ├── dialog │ │ │ │ ├── advanced-job-options-dialog.tsx │ │ │ │ └── index.ts │ │ │ └── index.ts │ │ ├── csv-table │ │ │ ├── csv-table.tsx │ │ │ └── index.ts │ │ ├── disabled │ │ │ ├── disabled.tsx │ │ │ └── index.ts │ │ ├── expanded-table-input │ │ │ ├── expanded-table-input.tsx │ │ │ └── index.ts │ │ ├── index.ts │ │ ├── job-download-dialog │ │ │ ├── index.ts │ │ │ └── job-download-dialog.tsx │ │ ├── media-viewer │ │ │ ├── audio │ │ │ │ ├── audio-viewer.tsx │ │ │ │ └── index.ts │ │ │ ├── image │ │ │ │ ├── image-viewer.tsx │ │ │ │ └── index.ts │ │ │ ├── index.ts │ │ │ ├── media-viewer.tsx │ │ │ ├── pdf-viewer │ │ │ │ ├── index.ts │ │ │ │ └── pdf-viewer.tsx │ │ │ ├── tile-grid-view │ │ │ │ ├── index.ts │ │ │ │ └── tile-grid-view.tsx │ │ │ └── video │ │ │ │ ├── index.ts │ │ │ │ └── video-viewer.tsx │ │ └── nav-drawer │ │ │ ├── index.ts │ │ │ ├── nav-drawer.module.css │ │ │ ├── nav-drawer.tsx │ │ │ ├── nav-item │ │ │ ├── index.ts │ │ │ └── 
nav-item.tsx │ │ │ ├── nav-items │ │ │ ├── index.ts │ │ │ └── nav-items.tsx │ │ │ └── user-control │ │ │ ├── index.ts │ │ │ ├── logged-in-control │ │ │ ├── index.ts │ │ │ ├── logged-in-control.module.css │ │ │ └── logged-in-control.tsx │ │ │ ├── logged-out-control │ │ │ ├── index.ts │ │ │ ├── logged-out-control.module.css │ │ │ └── logged-out-control.tsx │ │ │ ├── user-control.module.css │ │ │ └── user-control.tsx │ ├── jobs │ │ ├── Favorites.tsx │ │ ├── JobQueue.tsx │ │ ├── JobTable.tsx │ │ └── index.tsx │ ├── nav │ │ └── quick-settings │ │ │ ├── index.ts │ │ │ ├── quick-settings.module.css │ │ │ └── quick-settings.tsx │ ├── pages │ │ ├── agent │ │ │ ├── agent.tsx │ │ │ └── index.ts │ │ ├── chat │ │ │ └── chat.tsx │ │ ├── cron-jobs │ │ │ ├── create-cron-jobs │ │ │ │ ├── create-cron-jobs.tsx │ │ │ │ └── index.ts │ │ │ ├── cron-jobs.module.css │ │ │ ├── cron-jobs.tsx │ │ │ ├── get-server-side-props.ts │ │ │ └── index.ts │ │ ├── home │ │ │ ├── home.tsx │ │ │ └── index.ts │ │ ├── job │ │ │ └── csv │ │ │ │ └── id │ │ │ │ ├── get-server-side-props.ts │ │ │ │ ├── id.tsx │ │ │ │ └── index.ts │ │ ├── media │ │ │ └── id │ │ │ │ ├── id.tsx │ │ │ │ └── index.ts │ │ └── recordings │ │ │ └── id │ │ │ ├── id.tsx │ │ │ └── index.ts │ └── submit │ │ ├── index.ts │ │ └── job-submitter │ │ ├── element-table │ │ ├── element-table.tsx │ │ └── index.ts │ │ ├── index.ts │ │ ├── job-submitter-header │ │ ├── index.ts │ │ ├── job-submitter-header.module.css │ │ └── job-submitter-header.tsx │ │ ├── job-submitter-input │ │ ├── index.ts │ │ ├── job-submitter-input.module.css │ │ └── job-submitter-input.tsx │ │ ├── job-submitter-options │ │ ├── index.ts │ │ └── job-submitter-options.tsx │ │ ├── job-submitter.tsx │ │ ├── provider.tsx │ │ └── site-map │ │ ├── index.ts │ │ ├── site-map-input │ │ ├── index.ts │ │ ├── site-map-input.module.css │ │ └── site-map-input.tsx │ │ └── site-map.tsx ├── contexts │ └── AuthContext.tsx ├── declaration.d.ts ├── lib │ ├── constants.ts │ ├── helpers │ │ ├── index.ts │ │ ├── parse-job-options.ts │ │ ├── parse-json-to-entries.ts │ │ └── validate-url.ts │ ├── hooks │ │ └── use-advanced-job-options │ │ │ ├── index.ts │ │ │ └── use-advanced-job-options.ts │ ├── index.ts │ └── utils.ts ├── pages │ ├── _app.tsx │ ├── _document.tsx │ ├── agent.tsx │ ├── api │ │ ├── ai │ │ │ └── index.ts │ │ ├── check.ts │ │ ├── delete-cron-job.ts │ │ ├── delete.ts │ │ ├── download.ts │ │ ├── get-average-element-per-link.ts │ │ ├── get-average-jobs-per-day.ts │ │ ├── job │ │ │ └── [id].ts │ │ ├── logs.ts │ │ ├── me.ts │ │ ├── media │ │ │ ├── get-media.ts │ │ │ └── index.ts │ │ ├── recordings │ │ │ └── [id].ts │ │ ├── retrieve.ts │ │ ├── schedule-cron-job.ts │ │ ├── signup.ts │ │ ├── submit-scrape-job.ts │ │ ├── token.ts │ │ └── update.ts │ ├── chat.tsx │ ├── cron-jobs.tsx │ ├── index.tsx │ ├── job │ │ └── csv │ │ │ └── [id].tsx │ ├── jobs.tsx │ ├── login.tsx │ ├── media │ │ └── index.tsx │ ├── recordings │ │ └── index.tsx │ └── statistics.tsx ├── services │ ├── api-service │ │ ├── api-service.ts │ │ ├── functions │ │ │ ├── index.ts │ │ │ └── submit-job.ts │ │ └── index.ts │ └── index.ts ├── store │ ├── hooks.ts │ ├── slices │ │ └── settingsSlice.ts │ └── store.ts ├── styles │ ├── globals.css │ └── themes.ts └── types │ ├── element.ts │ ├── index.ts │ ├── job.ts │ ├── message.ts │ └── result.ts ├── start.sh ├── supervisord.conf ├── tailwind.config.js ├── tsconfig.json └── yarn.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | npm-debug.log 3 
| Dockerfile 4 | .dockerignore -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: 'Bug reporting ' 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/actions/push-to-helm/action.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Helm Chart 2 | description: Publish a Helm chart to a target repository 3 | 4 | inputs: 5 | app-repo-token: 6 | required: true 7 | description: "The token for the target repository" 8 | 9 | runs: 10 | using: 'composite' 11 | steps: 12 | - name: Checkout app repo 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Helm 16 | uses: azure/setup-helm@v3 17 | 18 | - name: Package Helm chart 19 | run: | 20 | mkdir -p packaged 21 | helm package helm -d packaged 22 | shell: bash 23 | 24 | - name: Clone target Helm repo 25 | run: | 26 | git clone https://github.com/jaypyles/helm.git target-repo 27 | cd target-repo 28 | git config user.name "github-actions" 29 | git config user.email "github-actions@github.com" 30 | git fetch origin gh-pages # Fetch gh-pages explicitly 31 | git checkout gh-pages # Checkout gh-pages branch 32 | git pull origin gh-pages # Pull latest changes from gh-pages 33 | shell: bash 34 | 35 | - name: Copy package and update index 36 | run: | 37 | APP_NAME="scraperr" 38 | mkdir -p target-repo/charts/$APP_NAME 39 | cp packaged/*.tgz target-repo/charts/$APP_NAME/ 40 | cd target-repo/charts/$APP_NAME 41 | helm repo index . 
--url https://jaypyles.github.io/helm/charts/$APP_NAME 42 | shell: bash 43 | 44 | - name: Commit and push to target repo 45 | run: | 46 | cd target-repo 47 | git add charts/ 48 | git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes" 49 | git push https://x-access-token:${{ inputs.app-repo-token }}@github.com/jaypyles/helm.git gh-pages 50 | shell: bash -------------------------------------------------------------------------------- /.github/actions/run-cypress-tests/action.yaml: -------------------------------------------------------------------------------- 1 | name: Run Cypress Tests 2 | 3 | description: Run Cypress tests 4 | 5 | runs: 6 | using: "composite" 7 | steps: 8 | - name: Checkout code 9 | uses: actions/checkout@v4 10 | 11 | - name: Setup Node 12 | uses: actions/setup-node@v4 13 | with: 14 | node-version: 22 15 | 16 | - name: Setup Docker project 17 | shell: bash 18 | run: make build-ci up-ci 19 | 20 | - name: Install dependencies 21 | shell: bash 22 | run: yarn install 23 | 24 | - name: Wait for frontend to be ready 25 | shell: bash 26 | run: | 27 | for i in {1..10}; do 28 | curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0 29 | echo "Waiting for frontend to be ready... attempt $i" 30 | sleep 1 31 | done 32 | echo "Frontend failed to be ready after 10 retries" 33 | exit 1 34 | 35 | - name: Wait for backend to be ready 36 | shell: bash 37 | run: | 38 | for i in {1..10}; do 39 | curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0 40 | echo "Waiting for backend to be ready... attempt $i" 41 | sleep 1 42 | done 43 | echo "Backend failed to be ready after 10 retries" 44 | exit 1 45 | 46 | - name: Show backend logs on failure 47 | if: failure() 48 | shell: bash 49 | run: | 50 | echo "== Docker Containers ==" 51 | docker ps -a 52 | echo "== Backend Logs ==" 53 | docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs" 54 | 55 | - name: Run Cypress tests 56 | shell: bash 57 | run: npm run cy:run 58 | 59 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image 2 | on: 3 | workflow_dispatch: 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v4 11 | 12 | - name: Get version from helm chart 13 | run: | 14 | VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ') 15 | echo "VERSION=$VERSION" >> $GITHUB_ENV 16 | echo "Version is $VERSION" 17 | 18 | - name: Login to Docker Hub 19 | uses: docker/login-action@v3 20 | with: 21 | username: ${{ secrets.DOCKERHUB_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_TOKEN }} 23 | 24 | - name: Set up Docker Buildx 25 | uses: docker/setup-buildx-action@v3 26 | 27 | - name: Build and push frontend 28 | uses: docker/build-push-action@v5 29 | with: 30 | context: . 31 | file: ./docker/frontend/Dockerfile 32 | push: true 33 | tags: | 34 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest 35 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }} 36 | 37 | - name: Build and push api 38 | uses: docker/build-push-action@v5 39 | with: 40 | context: . 
41 | file: ./docker/api/Dockerfile 42 | push: true 43 | tags: | 44 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest 45 | ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }} 46 | 47 | push-helm-chart: 48 | runs-on: ubuntu-latest 49 | needs: 50 | - build 51 | steps: 52 | - uses: actions/checkout@v4 53 | 54 | - name: Push Helm Chart 55 | uses: ./.github/actions/push-to-helm 56 | with: 57 | app-repo-token: ${{ secrets.GPAT_TOKEN }} 58 | 59 | success-message: 60 | runs-on: ubuntu-latest 61 | needs: 62 | - build 63 | - push-helm-chart 64 | steps: 65 | - name: Send Discord Message 66 | uses: jaypyles/discord-webhook-action@v1.0.0 67 | with: 68 | webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} 69 | content: "Scraperr Successfully Built Docker Images" 70 | username: "Scraperr CI" 71 | embed-title: "✅ Deployment Status" 72 | embed-description: "Scraperr successfully built docker images." 73 | embed-color: 3066993 # Green 74 | embed-footer-text: "Scraperr CI" 75 | embed-timestamp: ${{ github.event.head_commit.timestamp }} 76 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | pull_request: 9 | types: [opened, synchronize, reopened] 10 | 11 | workflow_dispatch: 12 | 13 | jobs: 14 | unit-tests: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | 20 | - name: Set env 21 | run: echo "ENV=test" >> $GITHUB_ENV 22 | 23 | - name: Install pdm 24 | run: pip install pdm 25 | 26 | - name: Install project dependencies 27 | run: pdm install 28 | 29 | - name: Install playwright 30 | run: pdm run playwright install 31 | 32 | - name: Run tests 33 | run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests 34 | 35 | cypress-tests: 36 | runs-on: ubuntu-latest 37 | steps: 38 | - uses: actions/checkout@v4 39 | - uses: ./.github/actions/run-cypress-tests 40 | 41 | success-message: 42 | runs-on: ubuntu-latest 43 | needs: 44 | - unit-tests 45 | - cypress-tests 46 | steps: 47 | - name: Send Discord Message 48 | uses: jaypyles/discord-webhook-action@v1.0.0 49 | with: 50 | webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} 51 | content: "Scraperr Successfully Passed Tests" 52 | username: "Scraperr CI" 53 | embed-title: "✅ Deployment Status" 54 | embed-description: "Scraperr successfully passed all tests." 
55 | embed-color: 3066993 # Green 56 | embed-footer-text: "Scraperr CI" 57 | embed-timestamp: ${{ github.event.head_commit.timestamp }} 58 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | *.yaml 2 | *.yml 3 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.12 2 | -------------------------------------------------------------------------------- /FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: ["https://www.buymeacoffee.com/jaypyles"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Jayden Pyles 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml 4 | COMPOSE_PROD = docker compose -f docker-compose.yml 5 | 6 | .PHONY: help logs deps build build-force pull up up-dev down setup deploy build-ci up-ci cypress-start 7 | 8 | help: 9 | @echo "Usage:" 10 | @echo " make logs - Check Docker container logs" 11 | @echo " make deps - Build frontend assets" 12 | @echo " make build - Build Docker images" 13 | @echo " make build-force - Build Docker images without using the cache" 14 | @echo " make pull - Pull Docker images" 15 | @echo " make up - Start production environment" 16 | @echo " make up-dev - Start development environment" 17 | @echo " make down - Stop and remove containers, networks, images, and volumes" 18 | @echo " make setup - Setup server with dependencies and clone repo" 19 | @echo " make deploy - Deploy site onto server" 20 | @echo " make cypress-start - Start Cypress" 21 | @echo "" 22 | 23 | logs: 24 | docker compose logs -f 25 | 26 | deps: 27 | pdm install 28 | npm install 29 | npm run build 30 | 31 | build: 32 | $(COMPOSE_DEV) build 33 | 34 | build-force: 35 | $(COMPOSE_DEV) build --no-cache 36 | 37 | pull: 38 | docker compose pull 39 | 40 | up: 41 | $(COMPOSE_PROD) up -d --force-recreate 42 | 43 | up-dev: 44 | $(COMPOSE_DEV) up -d --force-recreate 45 | 46 | down: 47 | $(COMPOSE_DEV) down 48 | $(COMPOSE_PROD) down 49 | 50 | setup: 51 | ansible-playbook -i ./ansible/inventory.yaml ./ansible/setup.yaml 52 | 53 | deploy: 54 | ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v 55 | 56 | build-ci: 57 | docker compose -f docker-compose.yml -f docker-compose.dev.yml build 58 | 59 | up-ci: 60 | docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate 61 | 62 | cypress-start: 63 | DISPLAY=:0 npx cypress open -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | Scraperr Logo 3 | 4 | **A powerful self-hosted web scraping solution** 5 | 6 |
7 | MongoDB 8 | FastAPI 9 | Next JS 10 | TailwindCSS 11 |
12 |
13 | 14 | ## 📋 Overview 15 | 16 | Scrape websites without writing a single line of code. 17 | 18 | > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information. 19 | 20 |
21 | Scraperr Main Interface 22 |
23 | 24 | ## ✨ Key Features 25 | 26 | - **XPath-Based Extraction**: Precisely target page elements 27 | - **Queue Management**: Submit and manage multiple scraping jobs 28 | - **Domain Spidering**: Option to scrape all pages within the same domain 29 | - **Custom Headers**: Add JSON headers to your scraping requests 30 | - **Media Downloads**: Automatically download images, videos, and other media 31 | - **Results Visualization**: View scraped data in a structured table format 32 | - **Data Export**: Export your results in Markdown and CSV formats 33 | - **Notification Channels**: Send completion notifications through various channels 34 | 35 | ## 🚀 Getting Started 36 | 37 | ### Docker 38 | 39 | ```bash 40 | make up 41 | ``` 42 | 43 | ### Helm 44 | 45 | > Refer to the docs for Helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment 46 | 47 | ## ⚖️ Legal and Ethical Guidelines 48 | 49 | When using Scraperr, please remember to: 50 | 51 | 1. **Respect `robots.txt`**: Always check a website's `robots.txt` file to verify which pages permit scraping 52 | 2. **Terms of Service**: Adhere to each website's Terms of Service regarding data extraction 53 | 3. **Rate Limiting**: Implement reasonable delays between requests to avoid overloading servers 54 | 55 | > **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool. 56 | 57 | ## 💬 Join the Community 58 | 59 | Get support, report bugs, and chat with other users and contributors. 60 | 61 | 👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK) 62 | 63 | ## 📄 License 64 | 65 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 66 | 67 | ## 👏 Contributions 68 | 69 | Development made easier with the [webapp template](https://github.com/jaypyles/webapp-template). 70 | 71 | To get started, simply run `make build up-dev`.
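For reference, here is a sketch of what these Make targets expand to, based on the `COMPOSE_DEV` and `COMPOSE_PROD` variables defined in this repository's Makefile (the compose file names below are taken from the Makefile, not independently verified against your checkout):

```bash
# Production stack (make up): base compose file only
docker compose -f docker-compose.yml up -d --force-recreate

# Development stack (make build up-dev): base file plus local dev overrides
docker compose -f docker-compose.yml -f docker-compose.dev.local.yml build
docker compose -f docker-compose.yml -f docker-compose.dev.local.yml up -d --force-recreate
```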
-------------------------------------------------------------------------------- /api/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/__init__.py -------------------------------------------------------------------------------- /api/backend/ai/agent/actions.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import TypedDict 2 | 3 | 4 | class Action(TypedDict): 5 | type: str 6 | url: str 7 | -------------------------------------------------------------------------------- /api/backend/ai/agent/agent.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Any 3 | 4 | from camoufox import AsyncCamoufox 5 | from playwright.async_api import Page 6 | 7 | from api.backend.ai.agent.utils import ( 8 | capture_elements, 9 | convert_to_markdown, 10 | parse_response, 11 | ) 12 | 13 | from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key 14 | 15 | from api.backend.ai.agent.prompts import ( 16 | ELEMENT_EXTRACTION_PROMPT, 17 | EXTRACT_ELEMENTS_PROMPT, 18 | ) 19 | 20 | from api.backend.job.scraping.collect_media import collect_media 21 | from api.backend.worker.logger import LOG 22 | 23 | from api.backend.job.scraping.add_custom import add_custom_items 24 | 25 | from api.backend.models import CapturedElement 26 | 27 | 28 | ask_ai = ask_open_ai if open_ai_key else ask_ollama 29 | 30 | 31 | async def scrape_with_agent(agent_job: dict[str, Any]): 32 | LOG.info(f"Starting work for agent job: {agent_job}") 33 | pages = set() 34 | proxy = None 35 | if agent_job["job_options"]["proxies"]: 36 | proxy = random.choice(agent_job["job_options"]["proxies"]) 37 | LOG.info(f"Using proxy: {proxy}") 38 | # Pass the chosen proxy to the browser launch (assumes Camoufox's Playwright-style proxy dict) 39 | async with AsyncCamoufox(headless=True, proxy={"server": proxy} if proxy else None) as browser: 40 | page: Page = await browser.new_page() 41 | 42 | await add_custom_items( 43 | agent_job["url"], 44 | page, 45 | agent_job["job_options"]["custom_cookies"], 46 | agent_job["job_options"]["custom_headers"], 47 | ) 48 | 49 | try: 50 | await page.set_viewport_size({"width": 1920, "height": 1080}) 51 | await page.goto(agent_job["url"], timeout=60000) 52 | 53 | if agent_job["job_options"]["collect_media"]: 54 | await collect_media(agent_job["id"], page) 55 | 56 | html_content = await page.content() 57 | markdown_content = convert_to_markdown(html_content) 58 | 59 | response = await ask_ai( 60 | ELEMENT_EXTRACTION_PROMPT.format( 61 | extraction_prompt=EXTRACT_ELEMENTS_PROMPT, 62 | webpage=markdown_content, 63 | prompt=agent_job["prompt"], 64 | ) 65 | ) 66 | 67 | xpaths = parse_response(response) 68 | 69 | captured_elements = await capture_elements(page, xpaths) 70 | 71 | final_url = page.url 72 | 73 | pages.add((html_content, final_url)) 74 | finally: 75 | await page.close() 76 | await browser.close() 77 | 78 | name_to_elements = {} 79 | 80 | # Group captured elements by name (they all come from the final page state) 81 | for element in captured_elements: 82 | if element.name not in name_to_elements: 83 | name_to_elements[element.name] = [] 84 | 85 | name_to_elements[element.name].append(element) 86 | 87 | scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [ 88 | { 89 | url: name_to_elements, 90 | } 91 | for _, url in pages 92 | ] 93 | 94 | return scraped_elements 95 | -------------------------------------------------------------------------------- /api/backend/ai/agent/prompts.py:
-------------------------------------------------------------------------------- 1 | EXTRACT_ELEMENTS_PROMPT = """ 2 | You are an assistant that extracts XPath expressions from webpages. 3 | 4 | You will receive HTML content in markdown format. 5 | 6 | Each element in the markdown has its XPath shown above it in a path like: 7 | 8 | 9 | Respond only with a list of general XPath expressions inside `...` tags. 10 | 11 | You will also decide what to do next. If no decision is available, return nothing for that section. 12 | """ 13 | 14 | ELEMENT_EXTRACTION_PROMPT = """ 15 | {extraction_prompt} 16 | 17 | **Guidelines:** 18 | - Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`. 19 | - Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`. 20 | - Do **not** chain multiple elements deeply (e.g., `//div/span/a`). 21 | - Use XPaths further down the tree when possible. 22 | - Do not include any extra explanation or text. 23 | - One XPath is acceptable if that's all that's needed. 24 | - Try to limit it to 1-3 XPaths. 25 | - Include a name for each XPath. 26 | 27 | 28 | - USE THE SIMPLEST XPATHS POSSIBLE. 29 | - USE THE MOST GENERAL XPATHS POSSIBLE. 30 | - AVOID OVERLY SPECIFIC OR DEEP XPATHS. 31 | - PREFER XPATHS THAT MATCH ALL INSTANCES OF THE TARGET CONTENT. 32 | 33 | 34 | **Example Format:** 35 | ```xml 36 | 37 | - : 38 | - : 39 | - : 40 | - : 41 | - : 42 | - etc 43 | 44 | 45 | 46 | 47 | - //a[@href='next_page_url'] 48 | 49 | 50 | ``` 51 | 52 | **Input webpage:** 53 | {webpage} 54 | 55 | **Target content:** 56 | {prompt} 57 | 58 | """ 59 | -------------------------------------------------------------------------------- /api/backend/ai/ai_router.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import logging 3 | from collections.abc import Iterable, AsyncGenerator 4 | 5 | # PDM 6 | from fastapi import APIRouter 7 | from fastapi.responses import JSONResponse, StreamingResponse 8 | from openai.types.chat import ChatCompletionMessageParam 9 | 10 | # LOCAL 11 | from ollama import Message 12 | from api.backend.models import AI 13 | 14 | from api.backend.ai.clients import ( 15 | llama_client, 16 | llama_model, 17 | openai_client, 18 | open_ai_model, 19 | open_ai_key, 20 | ) 21 | 22 | 23 | LOG = logging.getLogger(__name__) 24 | 25 | ai_router = APIRouter() 26 | 27 | 28 | async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]: 29 | if llama_client and llama_model: 30 | try: 31 | async for part in await llama_client.chat( 32 | model=llama_model, messages=chat_messages, stream=True 33 | ): 34 | yield part["message"]["content"] 35 | except Exception as e: 36 | LOG.error(f"Error during chat: {e}") 37 | yield "An error occurred while processing your request." 38 | 39 | 40 | async def openai_chat( 41 | chat_messages: Iterable[ChatCompletionMessageParam], 42 | ) -> AsyncGenerator[str, None]: 43 | if openai_client and not open_ai_model: 44 | LOG.error("OpenAI model is not set") 45 | yield "An error occurred while processing your request." 46 | 47 | if not openai_client: 48 | LOG.error("OpenAI client is not set") 49 | yield "An error occurred while processing your request."
50 | 51 | if openai_client and open_ai_model: 52 | try: 53 | response = openai_client.chat.completions.create( 54 | model=open_ai_model, messages=chat_messages, stream=True 55 | ) 56 | for part in response: 57 | yield part.choices[0].delta.content or "" 58 | except Exception as e: 59 | LOG.error(f"Error during OpenAI chat: {e}") 60 | yield "An error occurred while processing your request." 61 | 62 | 63 | chat_function = llama_chat if llama_client else openai_chat 64 | 65 | 66 | @ai_router.post("/ai") 67 | async def ai(c: AI): 68 | return StreamingResponse( 69 | chat_function(chat_messages=c.messages), media_type="text/plain" 70 | ) 71 | 72 | 73 | @ai_router.get("/ai/check") 74 | async def check(): 75 | return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)}) 76 | -------------------------------------------------------------------------------- /api/backend/ai/clients.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from openai import OpenAI 4 | from ollama import AsyncClient 5 | 6 | 7 | # Load environment variables 8 | open_ai_key = os.getenv("OPENAI_KEY") 9 | open_ai_model = os.getenv("OPENAI_MODEL") 10 | llama_url = os.getenv("OLLAMA_URL") 11 | llama_model = os.getenv("OLLAMA_MODEL") 12 | 13 | # Initialize clients 14 | openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None 15 | llama_client = AsyncClient(host=llama_url) if llama_url else None 16 | 17 | 18 | async def ask_open_ai(prompt: str) -> str: 19 | if not openai_client: 20 | raise ValueError("OpenAI client not initialized") 21 | 22 | response = openai_client.chat.completions.create( 23 | model=open_ai_model or "gpt-4.1-mini", 24 | messages=[{"role": "user", "content": prompt}], 25 | ) 26 | 27 | return response.choices[0].message.content or "" 28 | 29 | 30 | async def ask_ollama(prompt: str) -> str: 31 | if not llama_client: 32 | raise ValueError("Ollama client not initialized") 33 | 34 | response = await llama_client.chat( 35 | model=llama_model or "", messages=[{"role": "user", "content": prompt}] 36 | ) 37 | 38 | return response.message.content or "" 39 | -------------------------------------------------------------------------------- /api/backend/app.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import os 3 | import logging 4 | import apscheduler # type: ignore 5 | from contextlib import asynccontextmanager 6 | 7 | # PDM 8 | import apscheduler.schedulers 9 | import apscheduler.schedulers.background 10 | from fastapi import FastAPI, Request, status 11 | from fastapi.exceptions import RequestValidationError 12 | from fastapi.middleware.cors import CORSMiddleware 13 | 14 | # LOCAL 15 | from api.backend.ai.ai_router import ai_router 16 | from api.backend.auth.auth_router import auth_router 17 | from api.backend.utils import get_log_level 18 | from api.backend.routers.job_router import job_router 19 | from api.backend.routers.stats_router import stats_router 20 | from api.backend.database.startup import init_database 21 | from fastapi.responses import JSONResponse 22 | 23 | from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler 24 | from api.backend.scheduler import scheduler 25 | 26 | log_level = os.getenv("LOG_LEVEL") 27 | LOG_LEVEL = get_log_level(log_level) 28 | 29 | logging.basicConfig( 30 | level=LOG_LEVEL, 31 | format="%(levelname)s: %(asctime)s - %(name)s - %(message)s", 32 | handlers=[logging.StreamHandler()], 33 | ) 34 | 35 | LOG = 
logging.getLogger(__name__) 36 | 37 | 38 | @asynccontextmanager 39 | async def lifespan(app: FastAPI): 40 | # Startup 41 | LOG.info("Starting application...") 42 | 43 | init_database() 44 | 45 | LOG.info("Starting cron scheduler...") 46 | start_cron_scheduler(scheduler) 47 | scheduler.start() 48 | LOG.info("Cron scheduler started successfully") 49 | 50 | yield 51 | 52 | # Shutdown 53 | LOG.info("Shutting down application...") 54 | LOG.info("Stopping cron scheduler...") 55 | scheduler.shutdown(wait=False) # Set wait=False to not block shutdown 56 | LOG.info("Cron scheduler stopped") 57 | LOG.info("Application shutdown complete") 58 | 59 | 60 | app = FastAPI(title="api", root_path="/api", lifespan=lifespan) 61 | 62 | app.add_middleware( 63 | CORSMiddleware, 64 | allow_origins=["*"], 65 | allow_credentials=True, 66 | allow_methods=["*"], 67 | allow_headers=["*"], 68 | ) 69 | 70 | app.include_router(auth_router) 71 | app.include_router(ai_router) 72 | app.include_router(job_router) 73 | app.include_router(stats_router) 74 | 75 | 76 | @app.exception_handler(RequestValidationError) 77 | async def validation_exception_handler(request: Request, exc: RequestValidationError): 78 | exc_str = f"{exc}".replace("\n", " ").replace(" ", " ") 79 | logging.error(f"{request}: {exc_str}") 80 | content = {"status_code": 10422, "message": exc_str, "data": None} 81 | return JSONResponse( 82 | content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY 83 | ) 84 | -------------------------------------------------------------------------------- /api/backend/auth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/auth/__init__.py -------------------------------------------------------------------------------- /api/backend/auth/auth_router.py: -------------------------------------------------------------------------------- 1 | # STL 2 | from datetime import timedelta 3 | import os 4 | 5 | # PDM 6 | from fastapi import Depends, APIRouter, HTTPException, status 7 | from fastapi.security import OAuth2PasswordRequestForm 8 | 9 | # LOCAL 10 | from api.backend.schemas import User, Token, UserCreate 11 | from api.backend.auth.auth_utils import ( 12 | ACCESS_TOKEN_EXPIRE_MINUTES, 13 | get_current_user, 14 | authenticate_user, 15 | get_password_hash, 16 | create_access_token, 17 | ) 18 | import logging 19 | 20 | from api.backend.database.common import update 21 | 22 | auth_router = APIRouter() 23 | 24 | LOG = logging.getLogger("auth_router") 25 | 26 | 27 | @auth_router.post("/auth/token", response_model=Token) 28 | async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): 29 | user = await authenticate_user(form_data.username, form_data.password) 30 | if not user: 31 | raise HTTPException( 32 | status_code=status.HTTP_401_UNAUTHORIZED, 33 | detail="Incorrect username or password", 34 | headers={"WWW-Authenticate": "Bearer"}, 35 | ) 36 | 37 | expire_minutes = ( 38 | int(ACCESS_TOKEN_EXPIRE_MINUTES) if ACCESS_TOKEN_EXPIRE_MINUTES else 60 39 | ) 40 | 41 | access_token_expires = timedelta(minutes=expire_minutes) 42 | access_token = create_access_token( 43 | data={"sub": user.email}, expires_delta=access_token_expires 44 | ) 45 | 46 | return {"access_token": access_token, "token_type": "bearer"} 47 | 48 | 49 | @auth_router.post("/auth/signup", response_model=User) 50 | async def create_user(user: UserCreate): 51 | hashed_password = 
get_password_hash(user.password) 52 | user_dict = user.model_dump() 53 | user_dict["hashed_password"] = hashed_password 54 | del user_dict["password"] 55 | 56 | query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" 57 | _ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"])) 58 | 59 | return user_dict 60 | 61 | 62 | @auth_router.get("/auth/users/me", response_model=User) 63 | async def read_users_me(current_user: User = Depends(get_current_user)): 64 | return current_user 65 | 66 | 67 | @auth_router.get("/auth/check") 68 | async def check_auth(): 69 | return { 70 | "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True", 71 | "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower() 72 | == "true", 73 | } 74 | -------------------------------------------------------------------------------- /api/backend/constants.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | DATABASE_PATH = "data/database.db" 5 | RECORDINGS_DIR = Path("media/recordings") 6 | RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true" 7 | MEDIA_DIR = Path("media") 8 | MEDIA_TYPES = [ 9 | "audio", 10 | "documents", 11 | "images", 12 | "pdfs", 13 | "presentations", 14 | "spreadsheets", 15 | "videos", 16 | ] 17 | -------------------------------------------------------------------------------- /api/backend/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import insert, QUERIES, update 2 | 3 | __all__ = ["insert", "QUERIES", "update"] 4 | -------------------------------------------------------------------------------- /api/backend/database/common.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from typing import Any, Optional 3 | from api.backend.constants import DATABASE_PATH 4 | from api.backend.utils import format_json, format_sql_row_to_python 5 | from api.backend.database.schema import INIT_QUERY 6 | from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY 7 | import logging 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | def connect(): 13 | connection = sqlite3.connect(DATABASE_PATH) 14 | connection.set_trace_callback(print) 15 | cursor = connection.cursor() 16 | return cursor 17 | 18 | 19 | def insert(query: str, values: tuple[Any, ...]): 20 | connection = sqlite3.connect(DATABASE_PATH) 21 | cursor = connection.cursor() 22 | copy = list(values) 23 | format_json(copy) 24 | 25 | try: 26 | _ = cursor.execute(query, copy) 27 | connection.commit() 28 | except sqlite3.Error as e: 29 | LOG.error(f"An error occurred: {e}") 30 | finally: 31 | cursor.close() 32 | connection.close() 33 | 34 | 35 | def query(query: str, values: Optional[tuple[Any, ...]] = None): 36 | connection = sqlite3.connect(DATABASE_PATH) 37 | connection.row_factory = sqlite3.Row 38 | cursor = connection.cursor() 39 | rows = [] 40 | try: 41 | if values: 42 | _ = cursor.execute(query, values) 43 | else: 44 | _ = cursor.execute(query) 45 | 46 | rows = cursor.fetchall() 47 | 48 | finally: 49 | cursor.close() 50 | connection.close() 51 | 52 | formatted_rows: list[dict[str, Any]] = [] 53 | 54 | for row in rows: 55 | row = dict(row) 56 | formatted_row = format_sql_row_to_python(row) 57 | formatted_rows.append(formatted_row) 58 | 59 | return formatted_rows 60 | 61 | 62 | def update(query: str, values: Optional[tuple[Any, 
...]] = None): 63 | connection = sqlite3.connect(DATABASE_PATH) 64 | cursor = connection.cursor() 65 | 66 | copy = None 67 | 68 | if values: 69 | copy = list(values) 70 | format_json(copy) 71 | 72 | try: 73 | if copy: 74 | res = cursor.execute(query, copy) 75 | else: 76 | res = cursor.execute(query) 77 | connection.commit() 78 | return res.rowcount 79 | except sqlite3.Error as e: 80 | LOG.error(f"An error occurred: {e}") 81 | finally: 82 | cursor.close() 83 | connection.close() 84 | 85 | return 0 86 | 87 | 88 | QUERIES = { 89 | "init": INIT_QUERY, 90 | "insert_job": JOB_INSERT_QUERY, 91 | "delete_job": DELETE_JOB_QUERY, 92 | } 93 | -------------------------------------------------------------------------------- /api/backend/database/queries/__init__.py: -------------------------------------------------------------------------------- 1 | from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY 2 | 3 | __all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"] 4 | -------------------------------------------------------------------------------- /api/backend/database/queries/queries.py: -------------------------------------------------------------------------------- 1 | JOB_INSERT_QUERY = """ 2 | INSERT INTO jobs 3 | (id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt) 4 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 5 | """ 6 | 7 | DELETE_JOB_QUERY = """ 8 | DELETE FROM jobs WHERE id IN () 9 | """ 10 | -------------------------------------------------------------------------------- /api/backend/database/schema/__init__.py: -------------------------------------------------------------------------------- 1 | from .schema import INIT_QUERY 2 | 3 | __all__ = ["INIT_QUERY"] 4 | -------------------------------------------------------------------------------- /api/backend/database/schema/schema.py: -------------------------------------------------------------------------------- 1 | INIT_QUERY = """ 2 | CREATE TABLE IF NOT EXISTS jobs ( 3 | id STRING PRIMARY KEY NOT NULL, 4 | url STRING NOT NULL, 5 | elements JSON NOT NULL, 6 | user STRING, 7 | time_created DATETIME NOT NULL, 8 | result JSON NOT NULL, 9 | status STRING NOT NULL, 10 | chat JSON, 11 | job_options JSON 12 | ); 13 | 14 | CREATE TABLE IF NOT EXISTS users ( 15 | email STRING PRIMARY KEY NOT NULL, 16 | hashed_password STRING NOT NULL, 17 | full_name STRING, 18 | disabled BOOLEAN 19 | ); 20 | 21 | CREATE TABLE IF NOT EXISTS cron_jobs ( 22 | id STRING PRIMARY KEY NOT NULL, 23 | user_email STRING NOT NULL, 24 | job_id STRING NOT NULL, 25 | cron_expression STRING NOT NULL, 26 | time_created DATETIME NOT NULL, 27 | time_updated DATETIME NOT NULL, 28 | FOREIGN KEY (job_id) REFERENCES jobs(id) 29 | ); 30 | 31 | ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE; 32 | ALTER TABLE jobs ADD COLUMN prompt STRING; 33 | """ 34 | -------------------------------------------------------------------------------- /api/backend/database/startup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from api.backend.database.common import connect, QUERIES, insert 3 | import logging 4 | import sqlite3 5 | 6 | from api.backend.auth.auth_utils import get_password_hash 7 | 8 | LOG = logging.getLogger(__name__) 9 | 10 | 11 | def init_database(): 12 | cursor = connect() 13 | 14 | for query in QUERIES["init"].strip().split(";"): 15 | query = query.strip() 16 | if not query: 17 | continue 18 | 19 | try: 20 | LOG.info(f"Executing query: {query}") 21 | _ = 
cursor.execute(query) 22 | except sqlite3.OperationalError as e: 23 | if "duplicate column name" in str(e).lower(): 24 | LOG.warning(f"Skipping duplicate column error: {e}") 25 | continue 26 | else: 27 | LOG.error(f"Error executing query: {query}") 28 | raise 29 | 30 | if os.environ.get("REGISTRATION_ENABLED", "true").lower() == "false": 31 | default_user_email = os.environ.get("DEFAULT_USER_EMAIL") 32 | default_user_password = os.environ.get("DEFAULT_USER_PASSWORD") 33 | default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME") 34 | 35 | if ( 36 | not default_user_email 37 | or not default_user_password 38 | or not default_user_full_name 39 | ): 40 | LOG.error( 41 | "DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!" 42 | ) 43 | exit(1) 44 | 45 | query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" 46 | _ = insert( 47 | query, 48 | ( 49 | default_user_email, 50 | get_password_hash(default_user_password), 51 | default_user_full_name, 52 | ), 53 | ) 54 | 55 | cursor.close() 56 | -------------------------------------------------------------------------------- /api/backend/job/__init__.py: -------------------------------------------------------------------------------- 1 | from .job import ( 2 | insert, 3 | update_job, 4 | delete_jobs, 5 | get_jobs_per_day, 6 | get_queued_job, 7 | average_elements_per_link, 8 | ) 9 | 10 | __all__ = [ 11 | "insert", 12 | "update_job", 13 | "delete_jobs", 14 | "get_jobs_per_day", 15 | "get_queued_job", 16 | "average_elements_per_link", 17 | ] 18 | -------------------------------------------------------------------------------- /api/backend/job/cron_scheduling/cron_scheduling.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Any 3 | import uuid 4 | from api.backend.database.common import insert, query 5 | from api.backend.models import CronJob 6 | from apscheduler.schedulers.background import BackgroundScheduler # type: ignore 7 | from apscheduler.triggers.cron import CronTrigger # type: ignore 8 | 9 | from api.backend.job import insert as insert_job 10 | import logging 11 | 12 | LOG = logging.getLogger("Cron Scheduler") 13 | 14 | 15 | def insert_cron_job(cron_job: CronJob): 16 | query = """ 17 | INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated) 18 | VALUES (?, ?, ?, ?, ?, ?) 19 | """ 20 | values = ( 21 | cron_job.id, 22 | cron_job.user_email, 23 | cron_job.job_id, 24 | cron_job.cron_expression, 25 | cron_job.time_created, 26 | cron_job.time_updated, 27 | ) 28 | 29 | insert(query, values) 30 | 31 | return True 32 | 33 | 34 | def delete_cron_job(id: str, user_email: str): 35 | query = """ 36 | DELETE FROM cron_jobs 37 | WHERE id = ? AND user_email = ? 
38 | """ 39 | values = (id, user_email) 40 | insert(query, values) 41 | 42 | return True 43 | 44 | 45 | def get_cron_jobs(user_email: str): 46 | cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,)) 47 | 48 | return cron_jobs 49 | 50 | 51 | def get_all_cron_jobs(): 52 | cron_jobs = query("SELECT * FROM cron_jobs") 53 | 54 | return cron_jobs 55 | 56 | 57 | def insert_job_from_cron_job(job: dict[str, Any]): 58 | insert_job( 59 | { 60 | **job, 61 | "id": uuid.uuid4().hex, 62 | "status": "Queued", 63 | "result": "", 64 | "chat": None, 65 | "time_created": datetime.datetime.now(), 66 | "time_updated": datetime.datetime.now(), 67 | } 68 | ) 69 | 70 | 71 | def get_cron_job_trigger(cron_expression: str): 72 | expression_parts = cron_expression.split() 73 | 74 | if len(expression_parts) != 5: 75 | LOG.warning(f"Invalid cron expression: {cron_expression}") 76 | return None 77 | 78 | minute, hour, day, month, day_of_week = expression_parts 79 | 80 | return CronTrigger( 81 | minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week 82 | ) 83 | 84 | 85 | def start_cron_scheduler(scheduler: BackgroundScheduler): 86 | cron_jobs = get_all_cron_jobs() 87 | 88 | LOG.info(f"Cron jobs: {cron_jobs}") 89 | 90 | for job in cron_jobs: 91 | # Guard against invalid cron expressions and missing backing jobs 92 | trigger = get_cron_job_trigger(job["cron_expression"]) 93 | queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],)) 94 | 95 | if not trigger or not queried_job: 96 | LOG.warning(f"Skipping cron job {job['id']}: invalid expression or missing job") 97 | continue 98 | 99 | LOG.info(f"Adding job: {queried_job}") 100 | 101 | scheduler.add_job( 102 | insert_job_from_cron_job, 103 | trigger, 104 | id=job["id"], 105 | args=[queried_job[0]], 106 | ) 107 | -------------------------------------------------------------------------------- /api/backend/job/job.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import logging 3 | from typing import Any 4 | 5 | # LOCAL 6 | from api.backend.utils import format_list_for_query 7 | from api.backend.database.common import ( 8 | insert as common_insert, 9 | query as common_query, 10 | QUERIES, 11 | update as common_update, 12 | ) 13 | 14 | LOG = logging.getLogger(__name__) 15 | 16 | 17 | def insert(item: dict[str, Any]) -> None: 18 | common_insert( 19 | QUERIES["insert_job"], 20 | ( 21 | item["id"], 22 | item["url"], 23 | item["elements"], 24 | item["user"], 25 | item["time_created"], 26 | item["result"], 27 | item["status"], 28 | item["chat"], 29 | item["job_options"], 30 | item["agent_mode"], 31 | item["prompt"], 32 | ), 33 | ) 34 | LOG.info(f"Inserted item: {item}") 35 | 36 | 37 | async def get_queued_job(): 38 | query = ( 39 | "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1" 40 | ) 41 | res = common_query(query) 42 | LOG.info(f"Got queued job: {res}") 43 | return res[0] if res else None 44 | 45 | 46 | async def update_job(ids: list[str], field: str, value: Any): 47 | query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}" 48 | res = common_update(query, tuple([value] + ids)) 49 | LOG.info(f"Updated job: {res}") 50 | 51 | 52 | async def delete_jobs(jobs: list[str]): 53 | if not jobs: 54 | LOG.info("No jobs to delete.") 55 | return False 56 | 57 | query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}" 58 | res = common_update(query, tuple(jobs)) 59 | 60 | return res > 0 61 | 62 | 63 | async def average_elements_per_link(user: str): 64 | job_query = """ 65 | SELECT 66 | DATE(time_created) AS date, 67 | AVG(json_array_length(elements)) AS average_elements, 68 | COUNT(*) AS count 69 | FROM 70 | jobs 71 | WHERE 72 | status = 'Completed' AND user = ?
73 | GROUP BY 74 | DATE(time_created) 75 | ORDER BY 76 | date ASC; 77 | """ 78 | results = common_query(job_query, (user,)) 79 | 80 | return results 81 | 82 | 83 | async def get_jobs_per_day(user: str): 84 | job_query = """ 85 | SELECT 86 | DATE(time_created) AS date, 87 | COUNT(*) AS job_count 88 | FROM 89 | jobs 90 | WHERE 91 | status = 'Completed' AND user = ? 92 | GROUP BY 93 | DATE(time_created) 94 | ORDER BY 95 | date ASC; 96 | """ 97 | results = common_query(job_query, (user,)) 98 | 99 | return results 100 | -------------------------------------------------------------------------------- /api/backend/job/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .job_options import JobOptions 2 | 3 | __all__ = ["JobOptions"] 4 | -------------------------------------------------------------------------------- /api/backend/job/models/job_options.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Any, Optional 3 | from api.backend.job.models.site_map import SiteMap 4 | 5 | 6 | class FetchOptions(BaseModel): 7 | chat: Optional[bool] = None 8 | 9 | 10 | class JobOptions(BaseModel): 11 | multi_page_scrape: bool = False 12 | custom_headers: dict[str, Any] = {} 13 | proxies: list[str] = [] 14 | site_map: Optional[SiteMap] = None 15 | collect_media: bool = False 16 | custom_cookies: list[dict[str, Any]] = [] 17 | -------------------------------------------------------------------------------- /api/backend/job/models/site_map.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Literal 3 | 4 | 5 | class Action(BaseModel): 6 | type: Literal["click", "input"] 7 | xpath: str 8 | name: str 9 | input: str = "" 10 | do_once: bool = True 11 | 12 | 13 | class SiteMap(BaseModel): 14 | actions: list[Action] 15 | -------------------------------------------------------------------------------- /api/backend/job/scraping/add_custom.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | from urllib.parse import urlparse 3 | 4 | from playwright.async_api import Page, BrowserContext 5 | 6 | import logging 7 | 8 | LOG = logging.getLogger(__name__) 9 | 10 | 11 | async def add_custom_cookies( 12 | custom_cookies: list[dict[str, Any]], 13 | url: str, 14 | context: BrowserContext, 15 | ) -> None: 16 | parsed_url = urlparse(url) 17 | domain = parsed_url.netloc 18 | 19 | for cookie in custom_cookies: 20 | cookie_dict = { 21 | "name": cookie.get("name", "default_name"), 22 | "value": cookie.get("value", "default_value"), 23 | "domain": domain, 24 | "path": "/", 25 | } 26 | 27 | LOG.info(f"Adding cookie: {cookie_dict}") 28 | await context.add_cookies([cookie_dict]) # type: ignore 29 | 30 | 31 | async def add_custom_headers( 32 | custom_headers: dict[str, Any], 33 | page: Page, 34 | ) -> None: 35 | await page.set_extra_http_headers(custom_headers) 36 | 37 | 38 | async def add_custom_items( 39 | url: str, 40 | page: Page, 41 | cookies: Optional[list[dict[str, Any]]] = None, 42 | headers: Optional[dict[str, Any]] = None, 43 | ) -> None: 44 | if cookies: 45 | await add_custom_cookies(cookies, url, page.context) 46 | 47 | if headers: 48 | await add_custom_headers(headers, page) 49 | -------------------------------------------------------------------------------- /api/backend/job/scraping/scraping_utils.py: 
-------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set, Tuple 3 | from playwright.async_api import Page 4 | 5 | from api.backend.utils import LOG 6 | 7 | from api.backend.job.scraping.collect_media import collect_media as collect_media_utils 8 | 9 | 10 | async def scrape_content( 11 | id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool 12 | ) -> str: 13 | last_height = await page.evaluate("document.body.scrollHeight") 14 | 15 | while True: 16 | await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") 17 | await asyncio.sleep(3) 18 | new_height = await page.evaluate("document.body.scrollHeight") 19 | 20 | if new_height == last_height: 21 | break 22 | 23 | last_height = new_height 24 | 25 | html = await page.content() 26 | pages.add((html, page.url)) 27 | 28 | if collect_media: 29 | LOG.info("Collecting media") 30 | await collect_media_utils(id, page) 31 | 32 | return html 33 | 34 | 35 | def clean_format_characters(text: str) -> str: 36 | text = text.strip() 37 | text = text.replace("\n", " ") 38 | text = text.replace("\t", " ") 39 | text = text.replace("\r", " ") 40 | text = text.replace("\f", " ") 41 | text = text.replace("\v", " ") 42 | text = text.replace("\b", " ") 43 | text = text.replace("\a", " ") 44 | 45 | return text 46 | -------------------------------------------------------------------------------- /api/backend/job/site_mapping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/job/site_mapping/__init__.py -------------------------------------------------------------------------------- /api/backend/job/site_mapping/site_mapping.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import asyncio 3 | from copy import deepcopy 4 | from typing import Any 5 | 6 | from playwright.async_api import Page 7 | 8 | from api.backend.job.models.site_map import Action, SiteMap 9 | from api.backend.job.scraping.scraping_utils import scrape_content 10 | 11 | LOG = logging.getLogger(__name__) 12 | 13 | 14 | def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]: 15 | """Clear all actions that have been clicked.""" 16 | cleared_site_map = deepcopy(site_map) 17 | cleared_site_map["actions"] = [ 18 | action for action in cleared_site_map["actions"] if not action["do_once"] 19 | ] 20 | 21 | return cleared_site_map 22 | 23 | 24 | async def handle_input(action: Action, page: Page) -> bool: 25 | try: 26 | element = page.locator(f"xpath={action.xpath}") 27 | LOG.info(f"Sending keys: {action.input} to element: {action.xpath}") 28 | await element.fill(action.input) 29 | return True 30 | except Exception as e: 31 | LOG.warning(f"Error handling input for xpath '{action.xpath}': {e}") 32 | return False 33 | 34 | 35 | async def handle_click(action: Action, page: Page) -> bool: 36 | try: 37 | element = page.locator(f"xpath={action.xpath}") 38 | LOG.info(f"Clicking element: {action.xpath}") 39 | await element.click() 40 | return True 41 | except Exception as e: 42 | LOG.warning(f"Error clicking element at xpath '{action.xpath}': {e}") 43 | return False 44 | 45 | 46 | ACTION_MAP = { 47 | "click": handle_click, 48 | "input": handle_input, 49 | } 50 | 51 | 52 | async def handle_site_mapping( 53 | id: str, 54 | site_map_dict: dict[str, Any], 55 | page: Page, 56 | pages: set[tuple[str, str]], 57 | 
collect_media: bool = False, 58 | ): 59 | site_map = SiteMap(**site_map_dict) 60 | 61 | for action in site_map.actions: 62 | action_handler = ACTION_MAP[action.type] 63 | success = await action_handler(action, page) 64 | 65 | if not success: 66 | return 67 | 68 | await asyncio.sleep(2) 69 | 70 | await scrape_content(id, page, pages, collect_media=collect_media) 71 | 72 | cleared_site_map_dict = clear_done_actions(site_map_dict) 73 | 74 | if cleared_site_map_dict["actions"]: 75 | await handle_site_mapping( 76 | id, cleared_site_map_dict, page, pages, collect_media=collect_media 77 | ) 78 | -------------------------------------------------------------------------------- /api/backend/job/utils/clean_job_format.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from api.backend.utils import clean_text 4 | 5 | 6 | def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]: 7 | """ 8 | Convert a single job to a dictionary format. 9 | """ 10 | headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"] 11 | 12 | cleaned_rows = [] 13 | 14 | for job in jobs: 15 | for res in job["result"]: 16 | for url, elements in res.items(): 17 | for element_name, values in elements.items(): 18 | for value in values: 19 | text = clean_text(value.get("text", "")).strip() 20 | if text: 21 | cleaned_rows.append( 22 | { 23 | "id": job.get("id", ""), 24 | "url": url, 25 | "element_name": element_name, 26 | "xpath": value.get("xpath", ""), 27 | "text": text, 28 | "user": job.get("user", ""), 29 | "time_created": job.get("time_created", ""), 30 | } 31 | ) 32 | 33 | return { 34 | "headers": headers, 35 | "rows": cleaned_rows, 36 | } 37 | -------------------------------------------------------------------------------- /api/backend/job/utils/stream_md_from_job_results.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from api.backend.utils import clean_text 4 | 5 | 6 | def stream_md_from_job_results(jobs: list[dict[str, Any]]): 7 | md = "# Job Results Summary\n\n" 8 | for i, job in enumerate(jobs, start=1): 9 | md += f"## Job #{i}\n" 10 | yield f"- **Job URL:** {job.get('url', 'N/A')}\n" 11 | yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n" 12 | yield f"- **ID:** {job.get('id', 'N/A')}\n" 13 | yield "### Extracted Results:\n" 14 | 15 | for res in job.get("result", []): 16 | for url, elements in res.items(): 17 | yield f"\n#### URL: {url}\n" 18 | for element_name, values in elements.items(): 19 | for value in values: 20 | text = clean_text(value.get("text", "")).strip() 21 | if text: 22 | yield f"- **Element:** `{element_name}`\n" 23 | yield f" - **Text:** {text}\n" 24 | yield "\n---\n" 25 | -------------------------------------------------------------------------------- /api/backend/models.py: -------------------------------------------------------------------------------- 1 | # STL 2 | from typing import Any, Literal, Optional, Union 3 | from datetime import datetime 4 | 5 | # LOCAL 6 | from api.backend.job.models.job_options import JobOptions 7 | 8 | # PDM 9 | import pydantic 10 | 11 | 12 | class Element(pydantic.BaseModel): 13 | name: str 14 | xpath: str 15 | url: Optional[str] = None 16 | 17 | 18 | class CapturedElement(pydantic.BaseModel): 19 | xpath: str 20 | text: str 21 | name: str 22 | 23 | 24 | class RetrieveScrapeJobs(pydantic.BaseModel): 25 | user: str 26 | 27 | 28 | class DownloadJob(pydantic.BaseModel): 29 | ids: list[str] 30 | 
job_format: Literal["csv", "md"] 31 | 32 | 33 | class DeleteScrapeJobs(pydantic.BaseModel): 34 | ids: list[str] 35 | 36 | 37 | class GetStatistics(pydantic.BaseModel): 38 | user: str 39 | 40 | 41 | class UpdateJobs(pydantic.BaseModel): 42 | ids: list[str] 43 | field: str 44 | value: Any 45 | 46 | 47 | class AI(pydantic.BaseModel): 48 | messages: list[Any] 49 | 50 | 51 | class Job(pydantic.BaseModel): 52 | id: Optional[str] = None 53 | url: str 54 | elements: list[Element] 55 | user: str = "" 56 | time_created: Optional[Union[datetime, str]] = None 57 | result: list[dict[str, dict[str, list[CapturedElement]]]] = [] 58 | job_options: JobOptions 59 | status: str = "Queued" 60 | chat: Optional[str] = None 61 | agent_mode: bool = False 62 | prompt: Optional[str] = None 63 | 64 | 65 | class CronJob(pydantic.BaseModel): 66 | id: Optional[str] = None 67 | user_email: str 68 | job_id: str 69 | cron_expression: str 70 | time_created: Optional[Union[datetime, str]] = None 71 | time_updated: Optional[Union[datetime, str]] = None 72 | 73 | 74 | class DeleteCronJob(pydantic.BaseModel): 75 | id: str 76 | user_email: str 77 | -------------------------------------------------------------------------------- /api/backend/routers/stats_router.py: -------------------------------------------------------------------------------- 1 | # STL 2 | import logging 3 | 4 | # PDM 5 | from fastapi import APIRouter, Depends 6 | 7 | # LOCAL 8 | from api.backend.job import ( 9 | get_jobs_per_day, 10 | average_elements_per_link, 11 | ) 12 | from api.backend.auth.auth_utils import get_current_user 13 | from api.backend.schemas import User 14 | 15 | 16 | LOG = logging.getLogger(__name__) 17 | 18 | stats_router = APIRouter() 19 | 20 | 21 | @stats_router.get("/statistics/get-average-element-per-link") 22 | async def get_average_element_per_link(user: User = Depends(get_current_user)): 23 | return await average_elements_per_link(user.email) 24 | 25 | 26 | @stats_router.get("/statistics/get-average-jobs-per-day") 27 | async def average_jobs_per_day(user: User = Depends(get_current_user)): 28 | data = await get_jobs_per_day(user.email) 29 | return data 30 | -------------------------------------------------------------------------------- /api/backend/scheduler.py: -------------------------------------------------------------------------------- 1 | from apscheduler.schedulers.background import BackgroundScheduler # type: ignore 2 | 3 | scheduler = BackgroundScheduler() 4 | -------------------------------------------------------------------------------- /api/backend/schemas.py: -------------------------------------------------------------------------------- 1 | # STL 2 | from typing import Union, Literal, Optional 3 | 4 | # PDM 5 | from pydantic import EmailStr, BaseModel 6 | 7 | 8 | class Token(BaseModel): 9 | access_token: str 10 | token_type: str 11 | 12 | 13 | class TokenData(BaseModel): 14 | email: Optional[str] = None 15 | 16 | 17 | class User(BaseModel): 18 | email: Union[EmailStr, Literal[""]] 19 | full_name: Optional[str] = None 20 | disabled: Optional[bool] = None 21 | 22 | 23 | class UserInDB(User): 24 | hashed_password: str 25 | 26 | 27 | class UserCreate(BaseModel): 28 | email: EmailStr 29 | password: str 30 | full_name: Optional[str] = None 31 | -------------------------------------------------------------------------------- /api/backend/tests/factories/job_factory.py: -------------------------------------------------------------------------------- 1 | from api.backend.models import Element, Job, JobOptions, 
CapturedElement 2 | import uuid 3 | from faker import Faker 4 | 5 | fake = Faker() 6 | 7 | 8 | def create_job( 9 | job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={}) 10 | ): 11 | return Job( 12 | id=uuid.uuid4().hex, 13 | url="https://example.com", 14 | elements=[Element(name="test", xpath="xpath")], 15 | job_options=job_options, 16 | ) 17 | 18 | 19 | def create_completed_job() -> Job: 20 | return Job( 21 | id=uuid.uuid4().hex, 22 | url="http://example.com", 23 | elements=[ 24 | Element( 25 | name="element_name", 26 | xpath="//div", 27 | url="https://example.com", 28 | ) 29 | ], 30 | job_options=JobOptions(multi_page_scrape=False, custom_headers={}), 31 | user=fake.name(), 32 | time_created=fake.date(), 33 | result=[ 34 | { 35 | "https://example.com": { 36 | "element_name": [ 37 | CapturedElement( 38 | xpath="//div", text="example", name="element_name" 39 | ) 40 | ] 41 | } 42 | } 43 | ], 44 | ) 45 | -------------------------------------------------------------------------------- /api/backend/tests/job/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/tests/job/__init__.py -------------------------------------------------------------------------------- /api/backend/tests/job/test_download_job.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from fastapi.testclient import TestClient 3 | from unittest.mock import AsyncMock, patch 4 | from api.backend.app import app 5 | from api.backend.models import DownloadJob 6 | from api.backend.tests.factories.job_factory import create_completed_job 7 | 8 | client = TestClient(app) 9 | 10 | mocked_job = create_completed_job().model_dump() 11 | mock_results = [mocked_job] 12 | mocked_random_int = 123456 13 | 14 | 15 | @pytest.mark.asyncio 16 | @patch("api.backend.routers.job_router.query") 17 | @patch("api.backend.routers.job_router.random.randint") 18 | async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock): 19 | # Ensure the mock returns immediately 20 | mock_query.return_value = mock_results 21 | mock_randint.return_value = mocked_random_int 22 | 23 | # Create a DownloadJob instance 24 | download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv") 25 | 26 | # Make a POST request to the /download endpoint 27 | response = client.post("/download", json=download_job.model_dump()) 28 | 29 | # Assertions 30 | assert response.status_code == 200 31 | assert response.headers["Content-Disposition"] == "attachment; filename=export.csv" 32 | 33 | # Check the content of the CSV 34 | csv_content = response.content.decode("utf-8") 35 | expected_csv = ( 36 | f'"id","url","element_name","xpath","text","user","time_created"\r\n' 37 | f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",' 38 | f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n' 39 | ) 40 | assert csv_content == expected_csv 41 | -------------------------------------------------------------------------------- /api/backend/tests/scraping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/api/backend/tests/scraping/__init__.py -------------------------------------------------------------------------------- /api/backend/tests/scraping/test_scraping.py: 
--------------------------------------------------------------------------------
1 | import pytest
2 | import logging
3 | from typing import Dict
4 | from playwright.async_api import async_playwright, Cookie, Route
5 | from api.backend.job.scraping.add_custom import add_custom_items
6 |
7 | logging.basicConfig(level=logging.DEBUG)
8 | LOG = logging.getLogger(__name__)
9 |
10 |
11 | @pytest.mark.asyncio
12 | async def test_add_custom_items():
13 |     test_cookies = [{"name": "big", "value": "cookie"}]
14 |     test_headers = {"User-Agent": "test-agent", "Accept": "application/json"}
15 |
16 |     async with async_playwright() as p:
17 |         browser = await p.chromium.launch(headless=True)
18 |         context = await browser.new_context()
19 |         page = await context.new_page()
20 |
21 |         # Set up request interception
22 |         captured_headers: Dict[str, str] = {}
23 |
24 |         async def handle_route(route: Route) -> None:
25 |             nonlocal captured_headers
26 |             captured_headers = route.request.headers
27 |             await route.continue_()
28 |
29 |         await page.route("**/*", handle_route)
30 |
31 |         await add_custom_items(
32 |             url="http://example.com",
33 |             page=page,
34 |             cookies=test_cookies,
35 |             headers=test_headers,
36 |         )
37 |
38 |         # Navigate to example.com
39 |         await page.goto("http://example.com")
40 |
41 |         # Verify cookies were added
42 |         cookies: list[Cookie] = await page.context.cookies()
43 |         test_cookie = next((c for c in cookies if c.get("name") == "big"), None)
44 |
45 |         assert test_cookie is not None
46 |         assert test_cookie.get("value") == "cookie"
47 |         assert test_cookie.get("path") == "/"  # Default path should be set
48 |         assert test_cookie.get("sameSite") == "Lax"  # Default sameSite should be set
49 |
50 |         # Verify headers were added
51 |         assert captured_headers.get("user-agent") == "test-agent"
52 |
53 |         await browser.close()
54 |
--------------------------------------------------------------------------------
/api/backend/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 | import logging
3 | import json
4 |
5 | LOG = logging.getLogger(__name__)
6 |
7 |
8 | def clean_text(text: str):
9 |     text = text.replace("\r\n", "\n")  # Normalize newlines
10 |     text = text.replace("\n", "\\n")  # Escape newlines
11 |     text = text.replace('"', '\\"')  # Escape double quotes
12 |     return text
13 |
14 |
15 | def get_log_level(level_name: Optional[str]) -> int:
16 |     level = logging.INFO
17 |
18 |     if level_name:
19 |         level_name = level_name.upper()
20 |         level = getattr(logging, level_name, logging.INFO)
21 |
22 |     return level
23 |
24 |
25 | def format_list_for_query(ids: list[str]):
26 |     return (
27 |         f"({','.join(['?' for _ in ids])})"  # SQL parameter placeholders, e.g. "(?,?,?)"
28 |     )
29 |
30 |
31 | def format_sql_row_to_python(row: dict[str, Any]):
32 |     new_row: dict[str, Any] = {}
33 |     for key, value in row.items():
34 |         if isinstance(value, str):
35 |             try:
36 |                 new_row[key] = json.loads(value)
37 |             except json.JSONDecodeError:
38 |                 new_row[key] = value
39 |         else:
40 |             new_row[key] = value
41 |
42 |     return new_row
43 |
44 |
45 | def format_json(items: list[Any]):
46 |     for idx, item in enumerate(items):
47 |         if isinstance(item, (dict, list)):
48 |             formatted_item = json.dumps(item)
49 |             items[idx] = formatted_item
50 |
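format_list_for_query and format_sql_row_to_python pair up when job rows are read back out of SQLite: the first renders a safe parameter-placeholder list for an IN clause, the second re-inflates JSON-encoded columns into Python objects. A short sketch of that flow (illustrative only; the "jobs" table name and the data/database.db path are assumptions for the example, not taken from the queries module):

import sqlite3

from api.backend.utils import format_list_for_query, format_sql_row_to_python

ids = ["a1", "b2", "c3"]

# format_list_for_query(ids) renders "(?,?,?)", so the ids are bound as
# parameters instead of being interpolated into the SQL string.
query = f"SELECT * FROM jobs WHERE id IN {format_list_for_query(ids)}"

conn = sqlite3.connect("data/database.db")
conn.row_factory = sqlite3.Row

rows = [dict(row) for row in conn.execute(query, ids).fetchall()]

# Columns stored as JSON strings (e.g. result payloads) are decoded;
# plain strings fail json.loads and are kept as-is.
jobs = [format_sql_row_to_python(row) for row in rows]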
for _ in ids])})" # Returns placeholders, e.g., "(?, ?, ?)" 28 | ) 29 | 30 | 31 | def format_sql_row_to_python(row: dict[str, Any]): 32 | new_row: dict[str, Any] = {} 33 | for key, value in row.items(): 34 | if isinstance(value, str): 35 | try: 36 | new_row[key] = json.loads(value) 37 | except json.JSONDecodeError: 38 | new_row[key] = value 39 | else: 40 | new_row[key] = value 41 | 42 | return new_row 43 | 44 | 45 | def format_json(items: list[Any]): 46 | for idx, item in enumerate(items): 47 | if isinstance(item, (dict, list)): 48 | formatted_item = json.dumps(item) 49 | items[idx] = formatted_item 50 | -------------------------------------------------------------------------------- /api/backend/worker/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from api.backend.utils import get_log_level 5 | 6 | logging.basicConfig( 7 | level=get_log_level(os.getenv("LOG_LEVEL")), 8 | format="%(levelname)s: %(asctime)s - %(name)s - %(message)s", 9 | handlers=[logging.StreamHandler()], 10 | ) 11 | 12 | LOG = logging.getLogger(__name__) 13 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/discord_notification.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | 4 | import requests 5 | 6 | from api.backend.worker.logger import LOG 7 | from api.backend.worker.post_job_complete.models import ( 8 | PostJobCompleteOptions, 9 | JOB_COLOR_MAP, 10 | ) 11 | 12 | 13 | def discord_notification(job: dict[str, Any], options: PostJobCompleteOptions): 14 | webhook_url = options["webhook_url"] 15 | scraperr_frontend_url = options["scraperr_frontend_url"] 16 | 17 | LOG.info(f"Sending discord notification to {webhook_url}") 18 | 19 | embed = { 20 | "title": "Job Completed", 21 | "description": "Scraping job has been completed.", 22 | "color": JOB_COLOR_MAP[job["status"]], 23 | "url": f"{scraperr_frontend_url}/jobs?search={job['id']}&type=id", 24 | "image": { 25 | "url": "https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png", 26 | }, 27 | "author": { 28 | "name": "Scraperr", 29 | "url": "https://github.com/jaypyles/Scraperr", 30 | }, 31 | "fields": [ 32 | { 33 | "name": "Status", 34 | "value": "Completed", 35 | "inline": True, 36 | }, 37 | { 38 | "name": "URL", 39 | "value": job["url"], 40 | "inline": True, 41 | }, 42 | { 43 | "name": "ID", 44 | "value": job["id"], 45 | "inline": False, 46 | }, 47 | { 48 | "name": "Options", 49 | "value": f"```json\n{json.dumps(job['job_options'], indent=4)}\n```", 50 | "inline": False, 51 | }, 52 | ], 53 | } 54 | 55 | payload = {"embeds": [embed]} 56 | requests.post(webhook_url, json=payload) 57 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/email_notifcation.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import ssl 3 | from email.mime.text import MIMEText 4 | from email.mime.multipart import MIMEMultipart 5 | import json 6 | from typing import Any 7 | 8 | from api.backend.worker.logger import LOG 9 | 10 | from api.backend.worker.post_job_complete.models import ( 11 | JOB_COLOR_MAP, 12 | PostJobCompleteOptions, 13 | ) 14 | 15 | 16 | def send_job_complete_email( 17 | job: dict[str, Any], 18 | options: PostJobCompleteOptions, 19 | ): 20 | status = job["status"] 21 | status_color = JOB_COLOR_MAP.get(status, 
22 |     job_url = job["url"]
23 |     job_id = job["id"]
24 |     job_options_json = json.dumps(job["job_options"], indent=4)
25 |     frontend_url = options["scraperr_frontend_url"]
26 |
27 |     subject = "📦 Job Completed - Scraperr Notification"
28 |
29 |     html = f"""
30 | <html>
31 | <body style="font-family: Arial, sans-serif; color: #222;">
32 | <h2 style="color: #{status_color:06x};">✅ Job Completed</h2>
33 | <p>Scraping job has been completed successfully.</p>
34 |
35 | <p>
36 | <img src="https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png" alt="Scraperr Logo" width="200" />
37 | </p>
38 |
39 | <h3>Job Info:</h3>
40 | <ul>
41 | <li><b>Status:</b> {status}</li>
42 | <li><b>URL:</b> {job_url}</li>
43 | <li><b>ID:</b> {job_id}</li>
44 | </ul>
45 |
46 | <h3>Options:</h3>
47 | <pre>
48 | {job_options_json}
49 |         </pre>
50 |
51 | <p>View your job here:</p>
52 | <a href="{frontend_url}/jobs?search={job_id}&type=id">Scraperr Job</a>
53 |
54 | <hr />
55 | <p style="font-size: 12px; color: #888;">Sent by Scraperr</p>
56 | </body></html>
57 | 58 | 59 | """ 60 | 61 | # Create email 62 | message = MIMEMultipart("alternative") 63 | message["From"] = options["email"] 64 | message["To"] = options["to"] 65 | message["Subject"] = subject 66 | message.attach( 67 | MIMEText( 68 | "Job completed. View this email in HTML format for full details.", "plain" 69 | ) 70 | ) 71 | message.attach(MIMEText(html, "html")) 72 | 73 | context = ssl.create_default_context() 74 | 75 | try: 76 | if options["use_tls"]: 77 | with smtplib.SMTP(options["smtp_host"], options["smtp_port"]) as server: 78 | server.starttls(context=context) 79 | server.login(options["smtp_user"], options["smtp_password"]) 80 | server.sendmail( 81 | from_addr=options["email"], 82 | to_addrs=options["to"], 83 | msg=message.as_string(), 84 | ) 85 | else: 86 | with smtplib.SMTP_SSL( 87 | options["smtp_host"], options["smtp_port"], context=context 88 | ) as server: 89 | server.login(options["smtp_user"], options["smtp_password"]) 90 | server.sendmail( 91 | from_addr=options["email"], 92 | to_addrs=options["to"], 93 | msg=message.as_string(), 94 | ) 95 | LOG.info("✅ Email sent successfully!") 96 | except Exception as e: 97 | LOG.error(f"❌ Failed to send email: {e}") 98 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/models.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict 2 | 3 | 4 | class PostJobCompleteOptions(TypedDict): 5 | channel: str 6 | webhook_url: str 7 | scraperr_frontend_url: str 8 | email: str 9 | to: str 10 | smtp_host: str 11 | smtp_port: int 12 | smtp_user: str 13 | smtp_password: str 14 | use_tls: bool 15 | 16 | 17 | JOB_COLOR_MAP = { 18 | "Queued": 0x0000FF, 19 | "Scraping": 0x0000FF, 20 | "Completed": 0x00FF00, 21 | "Failed": 0xFF0000, 22 | } 23 | -------------------------------------------------------------------------------- /api/backend/worker/post_job_complete/post_job_complete.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from api.backend.worker.post_job_complete.models import PostJobCompleteOptions 4 | from api.backend.worker.post_job_complete.email_notifcation import ( 5 | send_job_complete_email, 6 | ) 7 | from api.backend.worker.post_job_complete.discord_notification import ( 8 | discord_notification, 9 | ) 10 | 11 | 12 | async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions): 13 | if options["channel"] == "": 14 | return 15 | 16 | if not options.values(): 17 | return 18 | 19 | if options["channel"] == "discord": 20 | discord_notification(job, options) 21 | elif options["channel"] == "email": 22 | send_job_complete_email(job, options) 23 | else: 24 | raise ValueError(f"Invalid channel: {options['channel']}") 25 | -------------------------------------------------------------------------------- /cypress.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "cypress"; 2 | 3 | export default defineConfig({ 4 | e2e: { 5 | setupNodeEvents(on, config) {}, 6 | baseUrl: "http://localhost", 7 | }, 8 | }); 9 | -------------------------------------------------------------------------------- /cypress/e2e/authentication.cy.ts: -------------------------------------------------------------------------------- 1 | describe("Authentication", () => { 2 | it("should register", () => { 3 | cy.intercept("POST", "/api/signup").as("signup"); 4 | 5 | cy.visit("/").then(() 
=> { 6 | cy.get("button").contains("Login").click(); 7 | cy.url().should("include", "/login"); 8 | 9 | cy.get("form").should("be.visible"); 10 | cy.get("button") 11 | .contains("No Account? Sign up") 12 | .should("be.visible") 13 | .click(); 14 | 15 | cy.get("input[name='email']").type("test@test.com"); 16 | cy.get("input[name='password']").type("password"); 17 | cy.get("input[name='fullName']").type("John Doe"); 18 | cy.get("button[type='submit']").contains("Signup").click(); 19 | 20 | cy.wait("@signup").then((interception) => { 21 | if (!interception.response) { 22 | cy.log("No response received!"); 23 | throw new Error("signup request did not return a response"); 24 | } 25 | 26 | cy.log("Response status: " + interception.response.statusCode); 27 | cy.log("Response body: " + JSON.stringify(interception.response.body)); 28 | 29 | expect(interception.response.statusCode).to.eq(200); 30 | }); 31 | }); 32 | }); 33 | 34 | it("should login", () => { 35 | cy.intercept("POST", "/api/token").as("token"); 36 | 37 | cy.visit("/").then(() => { 38 | cy.get("button") 39 | .contains("Login") 40 | .click() 41 | .then(() => { 42 | cy.get("input[name='email']").type("test@test.com"); 43 | cy.get("input[name='password']").type("password"); 44 | cy.get("button[type='submit']").contains("Login").click(); 45 | 46 | cy.wait("@token").then((interception) => { 47 | if (!interception.response) { 48 | cy.log("No response received!"); 49 | throw new Error("token request did not return a response"); 50 | } 51 | 52 | cy.log("Response status: " + interception.response.statusCode); 53 | cy.log("Response body: " + JSON.stringify(interception.response.body)); 54 | 55 | expect(interception.response.statusCode).to.eq(200); 56 | }); 57 | }); 58 | }); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /cypress/e2e/navigation.cy.ts: -------------------------------------------------------------------------------- 1 | describe("General site navigation", () => { 2 | it("passes", () => { 3 | cy.visit("/"); 4 | }); 5 | }); 6 | -------------------------------------------------------------------------------- /cypress/e2e/submit-job.cy.ts: -------------------------------------------------------------------------------- 1 | describe.only("Job", () => { 2 | it("should create a job", () => { 3 | cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob"); 4 | 5 | cy.visit("/"); 6 | 7 | cy.get('[data-cy="url-input"]').type("https://example.com"); 8 | cy.get('[data-cy="name-field"]').type("example"); 9 | cy.get('[data-cy="xpath-field"]').type("//body"); 10 | cy.get('[data-cy="add-button"]').click(); 11 | 12 | cy.contains("Submit").click(); 13 | 14 | cy.wait("@submitScrapeJob").then((interception) => { 15 | if (!interception.response) { 16 | cy.log("No response received!"); 17 | cy.log("Request body: " + JSON.stringify(interception.request?.body)); 18 | throw new Error("submitScrapeJob request did not return a response"); 19 | } 20 | 21 | cy.log("Response status: " + interception.response.statusCode); 22 | cy.log("Response body: " + JSON.stringify(interception.response.body)); 23 | 24 | expect(interception.response.statusCode).to.eq(200); 25 | }); 26 | 27 | cy.get("li").contains("Jobs").click(); 28 | 29 | cy.contains("div", "https://example.com", { timeout: 10000 }).should( 30 | "exist" 31 | ); 32 | cy.contains("div", "Completed", { timeout: 20000 }).should("exist"); 33 | 34 | cy.get("tbody tr") 35 | .first() 36 | .within(() => { 37 | cy.get('input[type="checkbox"]').click(); 38 
| });
39 |
40 |     cy.get("[data-testid='DeleteIcon']").click();
41 |
42 |     cy.contains("div", "https://example.com", { timeout: 10000 }).should(
43 |       "not.exist"
44 |     );
45 |   });
46 |
47 |   it("should create a job with advanced options (media)", () => {
48 |     cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
49 |
50 |     cy.visit("/");
51 |
52 |     cy.get("button").contains("Advanced Job Options").click();
53 |
54 |     cy.get('[data-cy="collect-media-checkbox"]').click();
55 |     cy.get("body").type("{esc}");
56 |
57 |     cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
58 |     cy.get('[data-cy="name-field"]').type("example");
59 |     cy.get('[data-cy="xpath-field"]').type("//body");
60 |     cy.get('[data-cy="add-button"]').click();
61 |
62 |     cy.get("button").contains("Submit").click();
63 |
64 |     cy.get("li").contains("Jobs").click();
65 |
66 |     cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
67 |       "exist"
68 |     );
69 |
70 |     cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
71 |     cy.get("li").contains("Media").click();
72 |
73 |     cy.get("div[id='select-job']").click();
74 |     cy.get("li[role='option']").click();
75 |
76 |     cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
77 |
78 |     cy.get("li").contains("Jobs").click();
79 |
80 |     cy.get("tbody tr")
81 |       .first()
82 |       .within(() => {
83 |         cy.get('input[type="checkbox"]').click();
84 |       });
85 |
86 |     cy.get("[data-testid='DeleteIcon']").click();
87 |   });
88 | });
89 |
--------------------------------------------------------------------------------
/cypress/fixtures/example.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "Using fixtures to represent data",
3 |   "email": "hello@cypress.io",
4 |   "body": "Fixtures are a great way to mock data for responses to routes"
5 | }
6 |
--------------------------------------------------------------------------------
/cypress/support/commands.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="cypress" />
2 | // ***********************************************
3 | // This example commands.ts shows you how to
4 | // create various custom commands and overwrite
5 | // existing commands.
6 | //
7 | // For more comprehensive examples of custom
8 | // commands please read more here:
9 | // https://on.cypress.io/custom-commands
10 | // ***********************************************
11 | //
12 | //
13 | // -- This is a parent command --
14 | // Cypress.Commands.add('login', (email, password) => { ... })
15 | //
16 | //
17 | // -- This is a child command --
18 | // Cypress.Commands.add('drag', { prevSubject: 'element'}, (subject, options) => { ... })
19 | //
20 | //
21 | // -- This is a dual command --
22 | // Cypress.Commands.add('dismiss', { prevSubject: 'optional'}, (subject, options) => { ... })
23 | //
24 | //
25 | // -- This will overwrite an existing command --
26 | // Cypress.Commands.overwrite('visit', (originalFn, url, options) => { ... })
27 | //
28 | // declare global {
29 | //   namespace Cypress {
30 | //     interface Chainable {
31 | //       login(email: string, password: string): Chainable<void>
32 | //       drag(subject: string, options?: Partial<TypeOptions>): Chainable<Element>
33 | //       dismiss(subject: string, options?: Partial<TypeOptions>): Chainable<Element>
34 | //       visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element>
35 | //     }
36 | //   }
37 | // }
38 |
--------------------------------------------------------------------------------
/cypress/support/e2e.ts:
--------------------------------------------------------------------------------
1 | // ***********************************************************
2 | // This example support/e2e.ts is processed and
3 | // loaded automatically before your test files.
4 | //
5 | // This is a great place to put global configuration and
6 | // behavior that modifies Cypress.
7 | //
8 | // You can change the location of this file or turn off
9 | // automatically serving support files with the
10 | // 'supportFile' configuration option.
11 | //
12 | // You can read more here:
13 | // https://on.cypress.io/configuration
14 | // ***********************************************************
15 |
16 | // Import commands.js using ES2015 syntax:
17 | import './commands'
18 |
19 | // Alternatively you can use CommonJS syntax:
20 | // require('./commands')
--------------------------------------------------------------------------------
/docker-compose.dev.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 |   scraperr:
4 |     build:
5 |       context: .
6 |       dockerfile: docker/frontend/Dockerfile
7 |     command: ["npm", "run", "dev"]
8 |     volumes:
9 |       - "$PWD/src:/app/src"
10 |       - "$PWD/public:/app/public"
11 |       - "$PWD/next.config.mjs:/app/next.config.mjs"
12 |       - "$PWD/package.json:/app/package.json"
13 |       - "$PWD/package-lock.json:/app/package-lock.json"
14 |       - "$PWD/tsconfig.json:/app/tsconfig.json"
15 |   scraperr_api:
16 |     build:
17 |       context: .
18 | dockerfile: docker/api/Dockerfile 19 | environment: 20 | - LOG_LEVEL=INFO 21 | volumes: 22 | - "$PWD/api:/project/app/api" 23 | ports: 24 | - "5900:5900" 25 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | scraperr: 3 | image: jpyles0524/scraperr:latest 4 | container_name: scraperr 5 | command: ["npm", "run", "start"] 6 | environment: 7 | - NEXT_PUBLIC_API_URL=http://scraperr_api:8000 # your API URL 8 | - SERVER_URL=http://scraperr_api:8000 # your docker container API URL 9 | ports: 10 | - 80:3000 11 | networks: 12 | - web 13 | scraperr_api: 14 | init: True 15 | image: jpyles0524/scraperr_api:latest 16 | environment: 17 | - LOG_LEVEL=INFO 18 | container_name: scraperr_api 19 | ports: 20 | - 8000:8000 21 | volumes: 22 | - "$PWD/data:/project/app/data" 23 | - "$PWD/media:/project/app/media" 24 | networks: 25 | - web 26 | 27 | networks: 28 | web: 29 | -------------------------------------------------------------------------------- /docker/api/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build python dependencies 2 | FROM python:3.10.12-slim as pybuilder 3 | 4 | RUN apt-get update && \ 5 | apt-get install -y curl && \ 6 | apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \ 7 | curl -LsSf https://astral.sh/uv/install.sh | sh && \ 8 | apt-get remove -y curl && \ 9 | apt-get autoremove -y && \ 10 | rm -rf /var/lib/apt/lists/* 11 | 12 | RUN python -m pip --no-cache-dir install pdm 13 | RUN pdm config python.use_venv false 14 | 15 | WORKDIR /project/app 16 | COPY pyproject.toml pdm.lock /project/app/ 17 | 18 | RUN pdm install -v --frozen-lockfile 19 | 20 | RUN pdm run playwright install --with-deps 21 | 22 | RUN pdm run camoufox fetch 23 | 24 | COPY ./api/ /project/app/api 25 | 26 | ENV PYTHONPATH=/project/pkgs 27 | 28 | COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf 29 | 30 | EXPOSE 8000 31 | 32 | WORKDIR /project/app 33 | 34 | RUN mkdir -p /project/app/media 35 | RUN mkdir -p /project/app/data 36 | RUN touch /project/app/data/database.db 37 | 38 | EXPOSE 5900 39 | 40 | COPY start.sh /project/app/start.sh 41 | 42 | CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ] -------------------------------------------------------------------------------- /docker/frontend/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build next dependencies 2 | FROM node:23.1-slim 3 | WORKDIR /app 4 | 5 | # Copy package files first to leverage Docker cache 6 | COPY package.json yarn.lock ./ 7 | 8 | # Install dependencies in a separate layer 9 | RUN yarn install --frozen-lockfile 10 | 11 | # Copy the rest of the application 12 | COPY tsconfig.json /app/tsconfig.json 13 | COPY tailwind.config.js /app/tailwind.config.js 14 | COPY next.config.mjs /app/next.config.mjs 15 | COPY postcss.config.js /app/postcss.config.js 16 | 17 | COPY public /app/public 18 | COPY src /app/src 19 | 20 | # Build the application 21 | RUN yarn build 22 | 23 | EXPOSE 3000 -------------------------------------------------------------------------------- /docs/chat_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/chat_page.png 
-------------------------------------------------------------------------------- /docs/docs_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/docs_page.png -------------------------------------------------------------------------------- /docs/job_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/job_page.png -------------------------------------------------------------------------------- /docs/log_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/log_page.png -------------------------------------------------------------------------------- /docs/login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/login.png -------------------------------------------------------------------------------- /docs/logo_picture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/logo_picture.png -------------------------------------------------------------------------------- /docs/main_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/main_page.png -------------------------------------------------------------------------------- /docs/stats_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/docs/stats_page.png -------------------------------------------------------------------------------- /helm/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: scraperr 3 | description: A Helm chart for Kubernetes 4 | 5 | # A chart can be either an 'application' or a 'library' chart. 6 | # 7 | # Application charts are a collection of templates that can be packaged into versioned archives 8 | # to be deployed. 9 | # 10 | # Library charts provide useful utilities or functions for the chart developer. They're included as 11 | # a dependency of application charts to inject those utilities and functions into the rendering 12 | # pipeline. Library charts do not define any templates and therefore cannot be deployed. 13 | type: application 14 | 15 | # This is the chart version. 
This version number should be incremented each time you make changes 16 | # to the chart and its templates, including the app version. 17 | # Versions are expected to follow Semantic Versioning (https://semver.org/) 18 | version: 1.1.0 19 | 20 | # This is the version number of the application being deployed. This version number should be 21 | # incremented each time you make changes to the application. Versions are not expected to 22 | # follow Semantic Versioning. They should reflect the version the application is using. 23 | # It is recommended to use it with quotes. 24 | appVersion: "1.16.0" 25 | -------------------------------------------------------------------------------- /helm/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: scraperr 6 | spec: 7 | replicas: {{ .Values.replicaCount }} 8 | selector: 9 | matchLabels: 10 | app: scraperr 11 | template: 12 | metadata: 13 | labels: 14 | app: scraperr 15 | spec: 16 | containers: 17 | - name: scraperr 18 | {{ if .Values.scraperr.image.repository }} 19 | image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}" 20 | {{ else }} 21 | image: "{{ .Chart.Name }}:{{ .Chart.Version }}" 22 | {{ end }} 23 | imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }} 24 | command: {{ .Values.scraperr.containerCommand | toJson }} 25 | ports: 26 | - containerPort: {{ .Values.scraperr.containerPort }} 27 | env: {{ toYaml .Values.scraperr.env | nindent 12 }} 28 | 29 | --- 30 | apiVersion: apps/v1 31 | kind: Deployment 32 | metadata: 33 | name: scraperr-api 34 | spec: 35 | replicas: {{ .Values.replicaCount }} 36 | selector: 37 | matchLabels: 38 | app: scraperr-api 39 | template: 40 | metadata: 41 | labels: 42 | app: scraperr-api 43 | spec: 44 | containers: 45 | - name: scraperr-api 46 | {{ if .Values.scraperrApi.image.repository }} 47 | image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}" 48 | {{ else }} 49 | image: "{{ .Chart.Name }}:{{ .Chart.Version }}" 50 | {{ end }} 51 | imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }} 52 | ports: 53 | - containerPort: {{ .Values.scraperrApi.containerPort }} 54 | env: {{ toYaml .Values.scraperrApi.env | nindent 12 }} 55 | volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }} 56 | volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 12 }} -------------------------------------------------------------------------------- /helm/templates/service.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: v1 3 | kind: Service 4 | metadata: 5 | name: scraperr 6 | spec: 7 | type: {{ .Values.scraperr.serviceType }} 8 | selector: 9 | app: scraperr 10 | ports: 11 | {{- range .Values.scraperr.ports }} 12 | - port: {{ .port }} 13 | targetPort: {{ .targetPort }} 14 | {{- if .nodePort }} 15 | nodePort: {{ .nodePort }} 16 | {{- end }} 17 | protocol: {{ .protocol | default "TCP" }} 18 | {{- end }} 19 | 20 | --- 21 | apiVersion: v1 22 | kind: Service 23 | metadata: 24 | name: scraperr-api 25 | spec: 26 | type: {{ .Values.scraperrApi.serviceType }} 27 | selector: 28 | app: scraperr-api 29 | ports: 30 | {{- range .Values.scraperrApi.ports }} 31 | - port: {{ .port }} 32 | targetPort: {{ .targetPort }} 33 | {{- if .nodePort }} 34 | nodePort: {{ .nodePort }} 35 | {{- end }} 36 | protocol: {{ .protocol | default "TCP" }} 37 | {{- end }} 38 | 
--------------------------------------------------------------------------------
/helm/values.yaml:
--------------------------------------------------------------------------------
1 | scraperr:
2 |   image:
3 |     repository: jpyles0524/scraperr
4 |     tag: latest
5 |     pullPolicy: IfNotPresent
6 |   containerCommand: ["npm", "run", "start"]
7 |   containerPort: 3000
8 |   serviceType: NodePort
9 |   ports:
10 |     - port: 80
11 |       targetPort: 3000
12 |       nodePort: 32300
13 |       protocol: TCP
14 |   env:
15 |     - name: NEXT_PUBLIC_API_URL
16 |       value: "http://scraperr-api:8000"
17 |     - name: SERVER_URL
18 |       value: "http://scraperr-api:8000"
19 |
20 | scraperrApi:
21 |   image:
22 |     repository: jpyles0524/scraperr_api
23 |     tag: latest
24 |     pullPolicy: IfNotPresent
25 |   containerPort: 8000
26 |   serviceType: ClusterIP
27 |   ports:
28 |     - port: 8000
29 |       targetPort: 8000
30 |       protocol: TCP
31 |   env:
32 |     - name: LOG_LEVEL
33 |       value: "INFO"
34 |   volumeMounts:
35 |     - name: data
36 |       mountPath: /project/app/data
37 |     - name: media
38 |       mountPath: /project/app/media
39 |   volumes:
40 |     - name: data
41 |       hostPath:
42 |         path: /data/scraperr/data
43 |         type: DirectoryOrCreate
44 |     - name: media
45 |       hostPath:
46 |         path: /data/scraperr/media
47 | replicaCount: 1
--------------------------------------------------------------------------------
/next-env.d.ts:
--------------------------------------------------------------------------------
1 | /// <reference types="next" />
2 | /// <reference types="next/image-types/global" />
3 |
4 | // NOTE: This file should not be edited
5 | // see https://nextjs.org/docs/basic-features/typescript for more information.
6 |
--------------------------------------------------------------------------------
/next.config.mjs:
--------------------------------------------------------------------------------
1 | import dotenv from "dotenv";
2 | dotenv.config();
3 |
4 | /** @type {import('next').NextConfig} */
5 | const nextConfig = {
6 |   distDir: "./dist",
7 |   images: { unoptimized: true },
8 |   env: {
9 |     DOMAIN: `${process.env.NEXT_PUBLIC_API_PATH}`,
10 |   },
11 | };
12 |
13 | export default nextConfig;
14 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "webapp-template",
3 |   "version": "0.1.0",
4 |   "private": true,
5 |   "dependencies": {
6 |     "@auth0/auth0-react": "^2.2.4",
7 |     "@auth0/nextjs-auth0": "^3.5.0",
8 |     "@chakra-ui/react": "^2.8.2",
9 |     "@emotion/react": "^11.11.4",
10 |     "@emotion/styled": "^11.11.5",
11 |     "@fontsource/roboto": "^5.0.13",
12 |     "@minchat/react-chat-ui": "^0.16.2",
13 |     "@mui/icons-material": "^5.15.3",
14 |     "@mui/material": "^5.16.0",
15 |     "@reduxjs/toolkit": "^2.8.2",
16 |     "@testing-library/jest-dom": "^5.16.5",
17 |     "@testing-library/react": "^13.4.0",
18 |     "@testing-library/user-event": "^13.5.0",
19 |     "@types/react": "^18.3.21",
20 |     "axios": "^1.7.2",
21 |     "bootstrap": "^5.3.0",
22 |     "chart.js": "^4.4.3",
23 |     "cookie": "^0.6.0",
24 |     "dotenv": "^16.5.0",
25 |     "framer-motion": "^4.1.17",
26 |     "js-cookie": "^3.0.5",
27 |     "next": "^14.2.4",
28 |     "next-auth": "^4.24.7",
29 |     "nookies": "^2.5.2",
30 |     "react": "^18.3.1",
31 |     "react-bootstrap": "^2.8.0",
32 |     "react-dom": "^18.3.1",
33 |     "react-markdown": "^9.0.0",
34 |     "react-modal-image": "^2.6.0",
35 |     "react-redux": "^9.2.0",
36 |     "react-router": "^6.14.1",
37 |     "react-router-dom": "^6.14.1",
38 |     "react-spinners": "^0.14.1",
39 |     "redux-persist": "^6.0.0",
40 |     "typescript": "^4.9.5",
41 |     "web-vitals": "^2.1.4"
42 |   },
43 |   "scripts": {
44 |     "dev": "yarn
next dev", 45 | "build": "yarn next build", 46 | "start": "yarn next start", 47 | "serve": "serve -s ./dist", 48 | "cy:open": "cypress open", 49 | "cy:run": "cypress run" 50 | }, 51 | "eslintConfig": { 52 | "extends": [ 53 | "react-app", 54 | "react-app/jest" 55 | ] 56 | }, 57 | "browserslist": { 58 | "production": [ 59 | ">0.2%", 60 | "not dead", 61 | "not op_mini all" 62 | ], 63 | "development": [ 64 | "last 1 chrome version", 65 | "last 1 firefox version", 66 | "last 1 safari version" 67 | ] 68 | }, 69 | "devDependencies": { 70 | "@types/cypress": "^1.1.6", 71 | "@types/js-cookie": "^3.0.6", 72 | "autoprefixer": "^10.4.21", 73 | "cypress": "^13.17.0", 74 | "eslint": "^9.26.0", 75 | "postcss": "^8.5.3", 76 | "tailwindcss": "^3.3.5" 77 | }, 78 | "overrides": { 79 | "react-refresh": "0.11.0" 80 | }, 81 | "resolutions": { 82 | "postcss": "^8.4.31" 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | }; 7 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/public/favicon.ico -------------------------------------------------------------------------------- /public/images/scraperr_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaypyles/Scraperr/d4edb9d93efae4c6c80371ac1bf8128bde98b01c/public/images/scraperr_logo.png -------------------------------------------------------------------------------- /public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "web-scrape" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }] 6 | dependencies = [ 7 | "uvicorn>=0.30.1", 8 | "fastapi>=0.111.0", 9 | "boto3>=1.34.140", 10 | "python-dotenv>=1.0.1", 11 | "boto3-stubs[essential]>=1.34.140", 12 | "asyncio>=3.4.3", 13 | "aiohttp>=3.9.5", 14 | "bs4>=0.0.2", 15 | "lxml[html_clean]>=5.2.2", 16 | "lxml-stubs>=0.5.1", 17 | "fake-useragent>=1.5.1", 18 | "requests-html>=0.10.0", 19 | "webdriver-manager>=4.0.1", 20 | "pydantic[email]>=2.9.2", 21 | "pandas>=2.2.2", 22 | "openpyxl>=3.1.5", 23 | 
"xlsxwriter>=3.2.0", 24 | "python-keycloak>=4.2.0", 25 | "fastapi-keycloak>=1.0.11", 26 | "pymongo>=4.8.0", 27 | "motor[asyncio]>=3.5.0", 28 | "python-jose[cryptography]>=3.3.0", 29 | "passlib[bcrypt]>=1.7.4", 30 | "selenium-wire>=5.1.0", 31 | "blinker<1.8.0", 32 | "setuptools>=71.0.4", 33 | "docker>=7.1.0", 34 | "ollama>=0.3.0", 35 | "openai>=1.37.1", 36 | "exceptiongroup>=1.2.2", 37 | "Faker>=30.6.0", 38 | "pytest-asyncio>=0.24.0", 39 | "python-multipart>=0.0.1", 40 | "bcrypt==4.0.1", 41 | "apscheduler>=3.11.0", 42 | "playwright>=1.52.0", 43 | "camoufox>=0.4.11", 44 | "html2text>=2025.4.15", 45 | ] 46 | requires-python = ">=3.10" 47 | readme = "README.md" 48 | license = { text = "MIT" } 49 | 50 | [tool.pdm] 51 | distribution = true 52 | 53 | [tool.pdm.dev-dependencies] 54 | dev = ["ipython>=8.26.0", "pytest>=8.3.3"] 55 | [tool.pyright] 56 | include = ["./api/backend/"] 57 | exclude = ["**/node_modules", "**/__pycache__"] 58 | ignore = [] 59 | defineConstant = { DEBUG = true } 60 | stubPath = "" 61 | 62 | # Type checking strictness 63 | typeCheckingMode = "strict" # Enables strict type checking mode 64 | reportPrivateUsage = "none" 65 | reportMissingTypeStubs = "none" 66 | reportUntypedFunctionDecorator = "error" 67 | reportUntypedClassDecorator = "error" 68 | reportUntypedBaseClass = "error" 69 | reportInvalidTypeVarUse = "error" 70 | reportUnnecessaryTypeIgnoreComment = "information" 71 | reportUnknownVariableType = "none" 72 | reportUnknownMemberType = "none" 73 | reportUnknownParameterType = "none" 74 | 75 | # Additional checks 76 | reportImplicitStringConcatenation = "error" 77 | reportInvalidStringEscapeSequence = "error" 78 | reportMissingImports = "error" 79 | reportMissingModuleSource = "error" 80 | reportOptionalCall = "error" 81 | reportOptionalIterable = "error" 82 | reportOptionalMemberAccess = "error" 83 | reportOptionalOperand = "error" 84 | reportOptionalSubscript = "error" 85 | reportTypedDictNotRequiredAccess = "error" 86 | 87 | # Function return type checking 88 | reportIncompleteStub = "error" 89 | reportIncompatibleMethodOverride = "error" 90 | reportInvalidStubStatement = "error" 91 | reportInconsistentOverload = "error" 92 | 93 | # Misc settings 94 | pythonVersion = "3.10" # Matches your Python version from pyproject.toml 95 | strictListInference = true 96 | strictDictionaryInference = true 97 | strictSetInference = true 98 | 99 | 100 | [tool.isort] 101 | length_sort = "1" 102 | profile = "black" 103 | sections = "STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" 104 | import_heading_stdlib = "STL" 105 | import_heading_thirdparty = "PDM" 106 | import_heading_firstparty = "LOCAL" 107 | import_heading_localfolder = "LOCAL" 108 | -------------------------------------------------------------------------------- /src/components/ai/Chat.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | 3 | export const Chat = () => { 4 | return

<div>Chat</div>

; 5 | };
6 |
--------------------------------------------------------------------------------
/src/components/ai/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./Chat";
2 | export * from "./JobSelector";
3 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/advanced-job-options.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Link, Typography } from "@mui/material";
2 | import { SetStateAction, Dispatch, useState } from "react";
3 | import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog";
4 | import { RawJobOptions } from "@/types";
5 |
6 | export type AdvancedJobOptionsProps = {
7 |   jobOptions: RawJobOptions;
8 |   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>;
9 |   multiPageScrapeEnabled?: boolean;
10 | };
11 |
12 | export const AdvancedJobOptions = ({
13 |   jobOptions,
14 |   setJobOptions,
15 |   multiPageScrapeEnabled = true,
16 | }: AdvancedJobOptionsProps) => {
17 |   const [open, setOpen] = useState(false);
18 |   return (
19 |     <Box>
20 |       <Link
21 |         component="button"
22 |         underline="none"
23 |         onClick={() => setOpen(true)}
24 |         sx={{
25 |           textDecoration: "none",
26 |           color: "primary.main",
27 |           "&:hover": {
28 |             color: "primary.dark",
29 |             textDecoration: "underline",
30 |           },
31 |           paddingLeft: 1,
32 |           display: "inline-flex",
33 |           alignItems: "center",
34 |           gap: 0.5,
35 |         }}
36 |       >
37 |         <Typography variant="body2">Advanced Job Options</Typography>
38 |       </Link>
39 |       <AdvancedJobOptionsDialog
40 |         open={open}
41 |         onClose={() => setOpen(false)}
42 |         jobOptions={jobOptions}
43 |         setJobOptions={setJobOptions}
44 |         multiPageScrapeEnabled={multiPageScrapeEnabled}
45 |       />
46 |     </Box>
47 |   );
48 | };
49 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/dialog/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./advanced-job-options-dialog";
2 |
--------------------------------------------------------------------------------
/src/components/common/advanced-job-options/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./advanced-job-options";
2 |
--------------------------------------------------------------------------------
/src/components/common/csv-table/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./csv-table";
2 |
--------------------------------------------------------------------------------
/src/components/common/disabled/disabled.tsx:
--------------------------------------------------------------------------------
1 | import { Box } from "@mui/material";
2 |
3 | export type DisabledProps = {
4 |   message: string;
5 | };
6 |
7 | export const Disabled = ({ message }: DisabledProps) => {
8 |   return (
9 |     <Box
10 |       sx={{
11 |         display: "flex",
12 |         justifyContent: "center",
13 |         alignItems: "center",
14 |         height: "100%",
15 |       }}
16 |     >
17 |       <Box
18 |         sx={{
19 |           color: "text.secondary",
20 |           fontSize: "1.25rem",
21 |           textAlign: "center",
22 |           p: 4,
23 |         }}
24 |       >
25 |         {message}
26 |       </Box>
27 |     </Box>
28 |   );
29 | };
30 |
--------------------------------------------------------------------------------
/src/components/common/disabled/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./disabled";
2 |
--------------------------------------------------------------------------------
/src/components/common/expanded-table-input/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./expanded-table-input";
2 |
--------------------------------------------------------------------------------
/src/components/common/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./nav-drawer";
2 |
--------------------------------------------------------------------------------
/src/components/common/job-download-dialog/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./job-download-dialog";
2 |
--------------------------------------------------------------------------------
/src/components/common/job-download-dialog/job-download-dialog.tsx:
--------------------------------------------------------------------------------
1 | import {
2 |   Dialog,
3 |   DialogTitle,
4 |   DialogContent,
5 |   DialogActions,
6 |   Button,
7 |   FormControl,
8 |   RadioGroup,
9 |   FormControlLabel,
10 |   Radio,
11 |   FormLabel,
12 |   Typography,
13 |   Box,
14 | } from "@mui/material";
15 | import { useState } from "react";
16 |
17 | export type JobDownloadDialogProps = {
18 |   open: boolean;
19 |   onClose: () => void;
20 |   ids: string[];
21 | };
22 |
23 | export const JobDownloadDialog = ({
24 |   open,
25 |   onClose,
26 |   ids,
27 | }: JobDownloadDialogProps) => {
28 |   const [jobFormat, setJobFormat] = useState("csv");
29 |   const handleDownload = async () => {
30 |     const response = await fetch("/api/download", {
31 |       method: "POST",
32 |       headers: { "Content-Type": "application/json" },
33 |       body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
34 |     });
35 |
36 |     if (response.ok) {
37 |       const blob = await response.blob();
38 |       const url = window.URL.createObjectURL(blob);
39 |       const a = document.createElement("a");
40 |       a.style.display = "none";
41 |       a.href = url;
42 |       a.download = `job_${ids[0]}.${jobFormat}`;
43 |       document.body.appendChild(a);
44 |       a.click();
45 |       window.URL.revokeObjectURL(url);
46 |       document.body.removeChild(a);
47 |     } else {
48 |       console.error("Failed to download the file.");
49 |     }
50 |   };
51 |
52 |   return (
53 |     <Dialog open={open} onClose={onClose}>
54 |       <DialogTitle>Download Job</DialogTitle>
55 |       <DialogContent>
56 |         <Box sx={{ mb: 2 }}>
57 |           <Typography>
58 |             You are about to download {ids.length} job(s). Please select the
59 |             format that you would like to download them in.
60 |           </Typography>
61 |         </Box>
62 |         <FormControl>
63 |           <FormLabel
64 |             id="job-format-label"
65 |             sx={{
66 |               color: "text.primary",
67 |               fontWeight: 500,
68 |               mb: 1,
69 |             }}
70 |           >
71 |             Format
72 |           </FormLabel>
73 |           <RadioGroup
74 |             aria-labelledby="job-format-label"
75 |             name="job-format"
76 |             value={jobFormat}
77 |             onChange={(e) => setJobFormat(e.target.value)}
78 |           >
79 |             <FormControlLabel value="csv" control={<Radio />} label="CSV" />
80 |             <FormControlLabel
81 |               value="md"
82 |               control={<Radio />}
83 |               label="Markdown"
84 |             />
85 |           </RadioGroup>
86 |         </FormControl>
87 |       </DialogContent>
88 |       <DialogActions>
89 |         <Button onClick={handleDownload} variant="contained">
90 |           Download
91 |         </Button>
92 |       </DialogActions>
93 |     </Dialog>
94 |   );
95 | };
96 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/audio/audio-viewer.tsx:
--------------------------------------------------------------------------------
1 |
2 | import { Box, Typography } from "@mui/material";
3 |
4 | interface AudioViewerProps {
5 |   mediaUrl: string;
6 |   selectedMedia: string;
7 |   onError: () => void;
8 | }
9 |
10 | export const AudioViewer = ({
11 |   mediaUrl,
12 |   selectedMedia,
13 |   onError,
14 | }: AudioViewerProps) => {
15 |   return (
16 |     <Box
17 |       sx={{
18 |         display: "flex",
19 |         flexDirection: "column",
20 |         alignItems: "center",
21 |         justifyContent: "center",
22 |         height: "100%",
23 |         gap: 2,
24 |       }}
25 |     >
26 |       <Typography variant="h6">{selectedMedia}</Typography>
27 |       <audio
28 |         controls
29 |         src={mediaUrl}
30 |         onError={onError}
31 |         style={{
32 |           width: "80%",
33 |           maxWidth: "600px",
34 |         }}
35 |       >
36 |         Your browser does not support the audio element.
37 |       </audio>
38 |     </Box>
39 |   );
40 | };
41 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/audio/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./audio-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/image/image-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, useTheme } from "@mui/material";
2 |
3 | export const ImageViewer = ({
4 |   mediaUrl,
5 |   selectedMedia,
6 | }: {
7 |   mediaUrl: string;
8 |   selectedMedia: string;
9 | }) => {
10 |   const theme = useTheme();
11 |   return (
12 |     <Box
13 |       sx={{
14 |         display: "flex",
15 |         justifyContent: "center",
16 |         alignItems: "center",
17 |         height: "100%",
18 |         backgroundColor: theme.palette.background.default,
19 |         padding: 2,
20 |       }}
21 |     >
22 |       <img
23 |         alt={selectedMedia}
24 |         src={mediaUrl}
25 |         loading="lazy"
26 |         style={{
27 |           maxWidth: "100%",
28 |           maxHeight: "100%",
29 |           objectFit: "contain",
30 |           borderRadius: "4px",
31 |         }}
32 |       />
33 |     </Box>
34 |   );
35 | };
36 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/image/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./image-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./media-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/media-viewer.tsx:
--------------------------------------------------------------------------------
1 | import { Box, Typography } from "@mui/material";
2 | import { ImageViewer } from "./image";
3 | import { VideoViewer } from "./video";
4 | import { AudioViewer } from "./audio";
5 | import { PDFViewer } from "./pdf-viewer";
6 |
7 | interface MediaViewerProps {
8 |   selectedMedia: string;
9 |   activeTab: string;
10 |   getMediaUrl: (fileName: string) => string;
11 |   onError: (error: string) => void;
12 | }
13 |
14 | export const MediaViewer = ({
15 |   selectedMedia,
16 |   activeTab,
17 |   getMediaUrl,
18 |   onError,
19 | }: MediaViewerProps) => {
20 |   if (!selectedMedia) {
21 |     return (
22 |       <Box
23 |         sx={{
24 |           display: "flex",
25 |           justifyContent: "center",
26 |           alignItems: "center",
27 |           height: "100%",
28 |         }}
29 |       >
30 |         <Typography variant="body1" color="text.secondary">
31 |           Select a file to view
32 |         </Typography>
33 |       </Box>
34 |     );
35 |   }
36 |
37 |   const mediaUrl = getMediaUrl(selectedMedia);
38 |
39 |   switch (activeTab) {
40 |     case "images":
41 |       return <ImageViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
42 |     case "videos":
43 |       return (
44 |         <VideoViewer
45 |           mediaUrl={mediaUrl}
46 |           onError={() => onError("Error loading video")}
47 |         />
48 |       );
49 |     case "audio":
50 |       return (
51 |         <AudioViewer
52 |           mediaUrl={mediaUrl}
53 |           selectedMedia={selectedMedia}
54 |           onError={() => onError("Error loading audio")}
55 |         />
56 |       );
57 |     case "pdfs":
58 |       return <PDFViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />;
59 |     default:
60 |       return (
61 |         <Box
62 |           sx={{
63 |             display: "flex",
64 |             justifyContent: "center",
65 |             alignItems: "center",
66 |             height: "100%",
67 |           }}
68 |         >
69 |           <Typography variant="body1" color="text.secondary">
70 |             {selectedMedia} - Download this file to view it
71 |           </Typography>
72 |         </Box>
73 |       );
74 |   }
75 | };
76 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/pdf-viewer/index.ts:
--------------------------------------------------------------------------------
1 | export * from "./pdf-viewer";
2 |
--------------------------------------------------------------------------------
/src/components/common/media-viewer/pdf-viewer/pdf-viewer.tsx: -------------------------------------------------------------------------------- 1 | import { Box, useTheme } from "@mui/material"; 2 | 3 | interface PDFViewerProps { 4 | mediaUrl: string; 5 | selectedMedia: string; 6 | } 7 | 8 | export const PDFViewer = ({ mediaUrl, selectedMedia }: PDFViewerProps) => { 9 | const theme = useTheme(); 10 | 11 | return ( 12 | 20 |