├── .env ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── API_CHANGELOG.md ├── ARCHITECTURE_OVERVIEW.md ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEPLOYMENT.md ├── Dockerfile ├── FAQ.md ├── GLOSSARY.md ├── LICENSE.md ├── Makefile ├── PERFORMANCE_TUNING.md ├── README.md ├── ROADMAP.md ├── SECURITY.md ├── SECURITY_GUIDELINES.md ├── SUPPORT.md ├── UPGRADE_GUIDE.md ├── app ├── .env ├── __init__.py ├── api │ ├── deps.py │ ├── endpoints │ │ └── example.py │ └── routes │ │ └── jobs.py ├── cache.py ├── config.py ├── core │ ├── config.py │ ├── config_bridge.py │ ├── log_filters.py │ └── logging_config.py ├── db │ ├── crud.py │ └── database.py ├── exceptions.py ├── main.py ├── middleware │ ├── __init__.py │ ├── api_key_auth.py │ ├── rate_limiter.py │ └── request_logger.py ├── models.py ├── models │ ├── __init__.py │ ├── health_models.py │ └── job_models.py ├── routes │ ├── __init__.py │ ├── api.py │ └── health.py ├── services │ ├── __init__.py │ ├── background_service.py │ ├── external_service.py │ └── job_service.py └── utils │ ├── auth_health.py │ ├── env_debugger.py │ ├── error_handlers.py │ ├── logging_config.py │ ├── logging_docs.py │ └── validation_helpers.py ├── docker-compose.dev.yml ├── docker-compose.yml ├── examples └── api_usage.py ├── main.py ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── check_auth.py ├── check_config_consistency.py ├── check_env.py ├── confirm_env.sh ├── debug_env_conflicts.py ├── debug_env_load_order.sh ├── docker-entrypoint.sh ├── increment_version.py ├── load_local_env.py ├── load_test.py ├── make_scripts_executable.sh ├── set_log_level.sh └── verify_env_loading.py └── tests ├── __init__.py ├── conftest.py └── test_api.py /.env: -------------------------------------------------------------------------------- 1 | # Default configuration values for JobSpy Docker API 2 | # This file is committed to version control 3 | # For local overrides, use .env.local which is not committed 4 | 5 | # Application Settings 6 | # Set to true to enable debug logging (including health checks) 7 | DEBUG=false 8 | # Or set specific log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) 9 | LOG_LEVEL=INFO 10 | ENVIRONMENT=production 11 | 12 | # API Security (use placeholder values in committed .env) 13 | ENABLE_API_KEY_AUTH=false 14 | API_KEYS= 15 | API_KEY_HEADER_NAME=x-api-key 16 | 17 | # Rate Limiting 18 | RATE_LIMIT_ENABLED=false 19 | RATE_LIMIT_REQUESTS=100 20 | RATE_LIMIT_TIMEFRAME=3600 21 | 22 | # Proxy Configuration 23 | DEFAULT_PROXIES= 24 | CA_CERT_PATH= 25 | 26 | # JobSpy Default Settings 27 | DEFAULT_SITE_NAMES=indeed,linkedin,zip_recruiter,glassdoor,google,bayt,naukri 28 | DEFAULT_RESULTS_WANTED=20 29 | DEFAULT_DISTANCE=50 30 | DEFAULT_DESCRIPTION_FORMAT=markdown 31 | DEFAULT_COUNTRY_INDEED=USA 32 | 33 | # Caching 34 | ENABLE_CACHE=false 35 | CACHE_EXPIRY=3600 36 | 37 | # CORS 38 | CORS_ORIGINS=* 39 | 40 | # Health Endpoints 41 | ENABLE_HEALTH_ENDPOINTS=true 42 | ENABLE_DETAILED_HEALTH=true 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | --- 8 | 9 | ## Describe the bug 10 | A clear and concise description of what the bug is. 
11 | 12 | ## To Reproduce 13 | Steps to reproduce the behavior: 14 | 1. Start the application with '...' 15 | 2. Send request to '...' 16 | 3. Check response '...' 17 | 4. See error 18 | 19 | ## Expected behavior 20 | A clear and concise description of what you expected to happen. 21 | 22 | ## Screenshots 23 | If applicable, add screenshots to help explain your problem. 24 | 25 | ## Environment: 26 | - OS: [e.g. Ubuntu 20.04] 27 | - Deployment Method: [e.g. Docker, Docker Compose, direct] 28 | - Version: [e.g. 1.0.0] 29 | 30 | ## Configuration 31 | ``` 32 | # Redacted configuration or environment variables you're using 33 | ENABLE_API_KEY_AUTH=true 34 | # ... 35 | ``` 36 | 37 | ## Additional context 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI/CD Pipeline 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python 3.13 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.13' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install -r requirements.txt 22 | pip install pytest pytest-cov 23 | - name: Run tests 24 | run: | 25 | pytest --cov=app tests/ 26 | 27 | build: 28 | needs: test 29 | runs-on: ubuntu-latest 30 | if: github.event_name == 'push' && github.ref == 'refs/heads/main' 31 | steps: 32 | - uses: actions/checkout@v3 33 | - name: Remove existing test container 34 | run: | 35 | # if a container named jobspy-api-test exists, remove it 36 | if [ "$(docker ps -a -q -f name=jobspy-api-test)" ]; then 37 | docker rm -f jobspy-api-test 38 | fi 39 | - name: Build Docker image 40 | run: docker build -t jobspy-api . 41 | - name: Run Docker container 42 | run: docker run -d -p 8000:8000 --name jobspy-api-test jobspy-api 43 | - name: Test Docker container 44 | run: | 45 | sleep 5 # Wait for container to start 46 | curl -f http://localhost:8000/health || exit 1 47 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Pipeline 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | build_and_test: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.13 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.13' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install -r requirements.txt 21 | pip install pytest pytest-cov 22 | - name: Run tests 23 | run: pytest --cov=app tests/ 24 | 25 | build_and_publish_docker: 26 | needs: build_and_test 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v3 30 | 31 | - name: Set environment variables 32 | run: | 33 | echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV 34 | 35 | - name: Login to Docker Hub 36 | uses: docker/login-action@v2 37 | with: 38 | username: ${{ secrets.DOCKERHUB_USERNAME }} 39 | password: ${{ secrets.DOCKERHUB_TOKEN }} 40 | 41 | - name: Build and push Docker image 42 | uses: docker/build-push-action@v4 43 | with: 44 | context: . 
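          # NOTE: multi-platform builds with buildx normally assume that
          # docker/setup-qemu-action and docker/setup-buildx-action steps run
          # earlier in this job; add them before the login step if the
          # linux/arm64 build fails on a hosted runner.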
45 | push: true 46 | platforms: linux/amd64,linux/arm64 47 | tags: | 48 | ${{ secrets.DOCKERHUB_USERNAME }}/jobspy-api:latest 49 | ${{ secrets.DOCKERHUB_USERNAME }}/jobspy-api:${{ env.VERSION }} 50 | 51 | create_github_release: 52 | needs: build_and_publish_docker 53 | runs-on: ubuntu-latest 54 | permissions: 55 | contents: write 56 | steps: 57 | - uses: actions/checkout@v3 58 | with: 59 | fetch-depth: 0 60 | 61 | - name: Generate release notes 62 | run: | 63 | VERSION=${GITHUB_REF#refs/tags/v} 64 | PREVIOUS_TAG=$(git tag --sort=-version:refname | head -n 2 | tail -n 1) 65 | if [ -z "$PREVIOUS_TAG" ]; then 66 | echo "## Changes in $VERSION" > release_notes.md 67 | echo "* First official release" >> release_notes.md 68 | else 69 | echo "## Changes in $VERSION since $PREVIOUS_TAG" > release_notes.md 70 | git log --pretty=format:"* %s (%h)" $PREVIOUS_TAG..HEAD >> release_notes.md 71 | fi 72 | 73 | - name: Create GitHub Release 74 | uses: softprops/action-gh-release@v1 75 | with: 76 | body_path: release_notes.md 77 | draft: false 78 | prerelease: false 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | .env.local 3 | .env.*.local 4 | 5 | # Python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | *.so 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # Unit test / coverage reports 29 | htmlcov/ 30 | .tox/ 31 | .nox/ 32 | .coverage 33 | .coverage.* 34 | .cache 35 | nosetests.xml 36 | coverage.xml 37 | *.cover 38 | .hypothesis/ 39 | .pytest_cache/ 40 | 41 | # Jupyter Notebook 42 | .ipynb_checkpoints 43 | 44 | # Environments 45 | .env.local 46 | .venv 47 | env/ 48 | venv/ 49 | ENV/ 50 | env.bak/ 51 | venv.bak/ 52 | 53 | # IDE specific files 54 | .idea/ 55 | .vscode/ 56 | *.swp 57 | *.swo 58 | .DS_Store 59 | 60 | # Project specific 61 | logs/ 62 | *.log 63 | temp/ 64 | jobs.csv 65 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | 10 | - repo: https://github.com/pycqa/isort 11 | rev: 5.12.0 12 | hooks: 13 | - id: isort 14 | 15 | - repo: https://github.com/psf/black 16 | rev: 23.3.0 17 | hooks: 18 | - id: black 19 | 20 | - repo: https://github.com/pycqa/flake8 21 | rev: 6.0.0 22 | hooks: 23 | - id: flake8 24 | additional_dependencies: [flake8-docstrings] 25 | -------------------------------------------------------------------------------- /API_CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # API Changelog 2 | 3 | All notable changes to the JobSpy Docker API. 4 | 5 | ## [Unreleased] 6 | - (pending changes) 7 | 8 | ## [1.0.0] – 2025‑04‑28 9 | ### Added 10 | - `GET /api/v1/search_jobs` 11 | - `x-api-key` authentication 12 | - Rate limiting & caching 13 | ... 
14 | -------------------------------------------------------------------------------- /ARCHITECTURE_OVERVIEW.md: -------------------------------------------------------------------------------- 1 | # Architecture Overview 2 | 3 | ``` 4 | Client → API Gateway (FastAPI) → JobSpy Library → External Job APIs 5 | ↓ 6 | Cache (Redis/File) 7 | ↓ 8 | Logs/Monitoring 9 | ``` 10 | 11 | ## Components 12 | - FastAPI application (`app/main.py`) 13 | - Caching layer (in‑memory or Redis) 14 | - Rate limiter middleware 15 | - Docker / Kubernetes deployment 16 | 17 | ## Data Flow 18 | 1. Client request with API key. 19 | 2. Check rate limit & cache. 20 | 3. Scrape job sites via JobSpy. 21 | 4. Return JSON/CSV response. 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to the JobSpy Docker API will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | ### Added 11 | - Placeholder for upcoming changes 12 | 13 | ## [1.0.1] - 2023-11-30 14 | 15 | ### Fixed 16 | - Fixed CSV export functionality when using `format=csv` parameter 17 | - Fixed `DEFAULT_COUNTRY_INDEED` environment variable not being used as fallback when parameter is not provided 18 | - Fixed `site_name=all` being rejected as an invalid option 19 | - Fixed `ENABLE_API_KEY_AUTH` defaulting to disabled instead of enabled when not specified 20 | 21 | 22 | 23 | ## [1.0.0] - 2025-04-28 24 | 25 | ### Added 26 | - Initial release of JobSpy Docker API 27 | - Comprehensive job search across multiple platforms 28 | - API Key Authentication system 29 | - Rate limiting capabilities 30 | - Response caching 31 | - Proxy support 32 | - Customizable default search parameters 33 | - CORS support 34 | - Health check endpoints 35 | - Comprehensive logging 36 | 37 | ### Changed 38 | - N/A (initial release) 39 | 40 | ### Fixed 41 | - N/A (initial release) 42 | 43 | [Unreleased]: https://github.com/username/job-spy-fastapi/compare/v1.0.0...HEAD 44 | [1.0.1]: https://github.com/username/job-spy-fastapi/releases/tag/v1.0.1 45 | [1.0.0]: https://github.com/username/job-spy-fastapi/releases/tag/v1.0.0 46 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | All contributors and users of this project are expected to adhere to the following Code of Conduct. 4 | 5 | ## Our Pledge 6 | Be respectful, considerate, and collaborative. 7 | 8 | ## Our Standards 9 | - **Respectful Communication**: No harassment, discrimination, or abusive language. 10 | - **Inclusivity**: Welcome participants of all backgrounds and identities. 11 | - **Constructive Feedback**: Critique ideas, not people. 12 | 13 | ## Enforcement 14 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project maintainers. Violations may result in removal from the project. 15 | 16 | For full details, see our [Contributor Covenant](https://www.contributor-covenant.org/). 
17 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | Thank you for considering contributing to JobSpy API!
4 | 
5 | ## How to Contribute
6 | 1. Fork the repo and create a feature branch.
7 | 2. Write tests for new features or bug fixes.
8 | 3. Follow the existing code style: Black, isort, flake8.
9 | 4. Commit messages: Use the imperative mood and reference issue numbers.
10 | 5. Open a PR against `main` with a clear description.
11 | 
12 | ## Code Review
13 | - Ensure CI passes: tests, lint, formatting.
14 | - Address review feedback promptly.
15 | 
16 | ## Issue Reporting
17 | - Search existing issues before opening a new one.
18 | - Provide steps to reproduce, environment details, and logs.
19 | 
--------------------------------------------------------------------------------
/DEPLOYMENT.md:
--------------------------------------------------------------------------------
1 | # Deployment Guide
2 | 
3 | ## Production (Docker Compose)
4 | 1. Review the committed `.env` defaults and fill in your values (keep real secrets out of version control).
5 | 2. `docker-compose up -d`
6 | 3. Verify: `docker-compose logs jobspy-api`
7 | 
8 | ## Standalone Docker
9 | ```bash
10 | docker build -t jobspy-api:latest .
11 | docker run -d -p 8000:8000 \
12 | --env-file .env \
13 | jobspy-api:latest
14 | ```
15 | 
16 | ## Kubernetes (example)
17 | - Define Deployment, Service, ConfigMap for env vars.
18 | - Mount `ca_cert` as a Secret if needed.
19 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.13-slim
2 | 
3 | WORKDIR /app
4 | 
5 | LABEL maintainer="Shannon Atkinson"
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
33 | @echo " make version-patch - Increment patch version (1.0.0 -> 1.0.1)"
34 | @echo " make version-minor - Increment minor version (1.0.0 -> 1.1.0)"
35 | @echo " make version-major - Increment major version (1.0.0 -> 2.0.0)"
36 | 
37 | install:
38 | pip install -r requirements.txt
39 | pip install pytest pytest-cov pylint
40 | 
41 | run:
42 | uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
43 | 
44 | test:
45 | pytest --cov=app tests/
46 | 
47 | lint:
48 | pylint app/
49 | 
50 | docker-build:
51 | docker build -t jobspy-api .
52 | 
53 | docker-buildx:
54 | @echo "Building multi-arch Docker image (linux/amd64,linux/arm64)..."
55 | @python -c "from app import __version__; print(f'Current version: {__version__}')"
56 | @VERSION=$$(python -c "from app import __version__; print(__version__)") && \
57 | docker buildx build --platform linux/amd64,linux/arm64 \
58 | -t jobspy-api:$$VERSION -t jobspy-api:latest \
59 | --load . 
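# NOTE: at the time of writing, `docker buildx build --load` can only import a
# single-platform image into the local Docker engine; true multi-arch images
# generally have to be pushed to a registry instead (see docker-pushx below).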
60 | 61 | docker-run: 62 | docker run -p 8000:8000 jobspy-api 63 | 64 | docker-compose-up: 65 | docker-compose up -d 66 | 67 | docker-compose-down: 68 | docker-compose down 69 | 70 | docker-compose-dev: 71 | docker-compose -f docker-compose.dev.yml up 72 | 73 | # Combined commands 74 | dev: 75 | docker-compose -f docker-compose.dev.yml up --build 76 | 77 | prod: 78 | docker-compose build 79 | docker-compose up -d 80 | 81 | clean-start: 82 | docker-compose down -v 83 | docker-compose rm -f 84 | docker-compose build --no-cache 85 | docker-compose up -d 86 | 87 | update: 88 | pip install -U -r requirements.txt 89 | pip install -U -r requirements-dev.txt 90 | docker-compose build --no-cache 91 | docker-compose up -d 92 | 93 | test-and-build: 94 | pytest --cov=app tests/ && docker-compose build 95 | 96 | ci: 97 | pytest --cov=app tests/ 98 | docker-compose build 99 | docker-compose up -d 100 | 101 | logs: 102 | docker-compose logs -f 103 | 104 | restart: 105 | docker-compose restart 106 | 107 | rebuild: 108 | docker-compose down 109 | docker-compose build 110 | docker-compose up -d 111 | 112 | check-env: 113 | python scripts/check_env.py 114 | 115 | debug-docker: 116 | docker-compose run --rm jobspy-api python /app/scripts/check_env.py 117 | 118 | docker-push: 119 | @echo "Building and pushing Docker image to Docker Hub..." 120 | @python -c "from app import __version__; print(f'Current version: {__version__}')" 121 | @VERSION=$$(python -c "from app import __version__; print(__version__)") && \ 122 | echo "Building version $$VERSION" && \ 123 | docker build -t jobspy-api:$$VERSION -t jobspy-api:latest . && \ 124 | echo "Enter your Docker Hub username:" && \ 125 | read DOCKER_USER && \ 126 | docker tag jobspy-api:$$VERSION $$DOCKER_USER/jobspy-api:$$VERSION && \ 127 | docker tag jobspy-api:latest $$DOCKER_USER/jobspy-api:latest && \ 128 | docker push $$DOCKER_USER/jobspy-api:$$VERSION && \ 129 | docker push $$DOCKER_USER/jobspy-api:latest && \ 130 | echo "Successfully pushed version $$VERSION to Docker Hub" 131 | 132 | docker-pushx: 133 | @echo "Building and pushing multi-arch Docker image to Docker Hub..." 134 | @python -c "from app import __version__; print(f'Current version: {__version__}')" 135 | @VERSION=$$(python -c "from app import __version__; print(__version__)") && \ 136 | echo "Enter your Docker Hub username:" && \ 137 | read DOCKER_USER && \ 138 | docker buildx build --platform linux/amd64,linux/arm64 \ 139 | -t $$DOCKER_USER/jobspy-api:$$VERSION -t $$DOCKER_USER/jobspy-api:latest \ 140 | --push . 141 | 142 | version: 143 | @python -c "from app import __version__; print(f'Current version: {__version__}')" 144 | 145 | version-patch: 146 | @python scripts/increment_version.py patch 147 | @$(MAKE) version 148 | 149 | version-minor: 150 | @python scripts/increment_version.py minor 151 | @$(MAKE) version 152 | 153 | version-major: 154 | @python scripts/increment_version.py major 155 | @$(MAKE) version 156 | -------------------------------------------------------------------------------- /PERFORMANCE_TUNING.md: -------------------------------------------------------------------------------- 1 | # Performance Tuning 2 | 3 | ## Caching 4 | - Adjust `CACHE_EXPIRY` via env var. 5 | - Use Redis by mounting external cache. 6 | 7 | ## Concurrency 8 | - Increase FastAPI workers: `uvicorn --workers 4`. 9 | - Use Gunicorn with Uvicorn workers in production. 10 | 11 | ## Logging 12 | - Set `LOG_LEVEL=INFO` or `DEBUG` as needed. 13 | - Rotate logs by mounting a log driver. 
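
For the Gunicorn-with-Uvicorn-workers setup mentioned under Concurrency, a
minimal `gunicorn.conf.py` sketch (the file name and worker count are
illustrative assumptions; tune them per host):

```python
# gunicorn.conf.py -- hypothetical example config, not shipped with this repo
import multiprocessing

bind = "0.0.0.0:8000"
workers = multiprocessing.cpu_count() * 2 + 1   # common starting heuristic
worker_class = "uvicorn.workers.UvicornWorker"  # run FastAPI under Gunicorn
timeout = 60
```

Start it with `gunicorn app.main:app -c gunicorn.conf.py`.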
14 | 15 | ## Monitoring 16 | - Expose `/metrics` with Prometheus. 17 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Project Roadmap 2 | 3 | ## v1.x (Current) 4 | - Stable multi‑site job search 5 | - Docker image & Compose support 6 | - Caching, rate limiting, API key auth 7 | 8 | ## v2.0 9 | - GraphQL endpoint 10 | - OAuth2 integration for platform APIs 11 | - Plugin architecture for new job sources 12 | 13 | ## Future 14 | - Frontend dashboard 15 | - Analytics & metrics export 16 | - Enterprise features: SSO, team quotas 17 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | We currently support the following versions with security updates: 6 | 7 | | Version | Supported | 8 | | ------- | ------------------ | 9 | | 1.0.x | :white_check_mark: | 10 | | < 1.0 | :x: | 11 | 12 | ## Reporting a Vulnerability 13 | 14 | We take the security of JobSpy Docker API seriously. If you believe you've found a security vulnerability, please follow these steps: 15 | 16 | 1. **Do Not** disclose the vulnerability publicly 17 | 2. **Do Not** open a public GitHub issue 18 | 19 | Instead, please email us at [security@example.com](mailto:security@example.com) with: 20 | 21 | - Description of the vulnerability 22 | - Steps to reproduce 23 | - Potential impact 24 | - Any suggestions for remediation 25 | 26 | We will acknowledge receipt of your report within 48 hours and provide an estimated timeline for a fix. We'll keep you informed of our progress. 27 | 28 | ## Security Measures 29 | 30 | - API key authentication (when enabled) 31 | - Rate limiting capabilities 32 | - Regular dependency updates 33 | - Input validation 34 | - Safe error handling 35 | 36 | ## Security Best Practices for Users 37 | 38 | 1. **API Keys**: When using API key authentication, follow best practices: 39 | - Use unique keys for different use cases 40 | - Rotate keys regularly 41 | - Only share keys securely 42 | 43 | 2. **Environment Variables**: Never commit real API keys to version control 44 | - Use `.env.local` for local development 45 | - Use secure methods for production deployment 46 | 47 | 3. **Rate Limiting**: Enable rate limiting in production 48 | - Adjust limits according to your expected usage 49 | 50 | 4. **Regular Updates**: Update to the latest version regularly 51 | 52 | ## Disclosure Policy 53 | 54 | When we receive a security bug report, we will: 55 | 56 | 1. Confirm the vulnerability 57 | 2. Determine its impact and severity 58 | 3. Develop and test a fix 59 | 4. Release a patched version 60 | 5. Acknowledge your contribution (unless you prefer to remain anonymous) 61 | -------------------------------------------------------------------------------- /SECURITY_GUIDELINES.md: -------------------------------------------------------------------------------- 1 | # Security Guidelines 2 | 3 | ## API Authentication 4 | - Use `x-api-key` header; rotate keys regularly. 5 | - Enforce HTTPS/TLS. 6 | 7 | ## Secrets Management 8 | - Don’t commit `.env` with real keys. 9 | - Use Docker secrets or Kubernetes Secrets. 10 | 11 | ## Dependencies 12 | - Regularly run `pip install -U`. 13 | - Audit with `safety` or `dependabot`. 14 | 15 | ## Vulnerability Reporting 16 | - See `SECURITY.md` for reporting process. 
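
As a concrete illustration of the `x-api-key` flow above, a minimal client
sketch (the URL, key source, and query parameters are placeholders; load real
keys from the environment, never from source):

```python
import os

import requests  # any HTTP client works; requests is assumed here

API_URL = "http://localhost:8000/api/v1/search_jobs"  # placeholder host
API_KEY = os.environ["JOBSPY_API_KEY"]  # hypothetical env var name

resp = requests.get(
    API_URL,
    headers={"x-api-key": API_KEY},  # header name matches API_KEY_HEADER_NAME
    params={"search_term": "python developer", "location": "Remote"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["count"])
```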
17 | 
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # Support
2 | 
3 | ## Reporting Issues
4 | - Open an issue at [GitHub Issues](https://github.com/username/jobspy-api/issues).
5 | 
6 | ## Contact
7 | - For security: email security@example.com
8 | - For general queries: use the issue tracker.
9 | 
10 | ## Community
11 | - Join our Slack channel (link).
12 | - Follow updates on Twitter @jobspy_api.
13 | 
--------------------------------------------------------------------------------
/UPGRADE_GUIDE.md:
--------------------------------------------------------------------------------
1 | # Upgrade Guide
2 | 
3 | ## From v1.x to v2.0
4 | 1. Review breaking changes in `API_CHANGELOG.md`.
5 | 2. Update Docker image tag to `username/jobspy-api:2.0.0`.
6 | 3. Migrate environment vars: new OAuth2 settings.
7 | 4. Run integration tests against staging.
8 | 
9 | ## Patch Releases
10 | ```bash
11 | docker-compose pull && make docker-compose-down && make docker-compose-up
12 | ```
13 | 
--------------------------------------------------------------------------------
/app/.env:
--------------------------------------------------------------------------------
1 | # ...existing code...
2 | 
3 | # API Documentation
4 | ENABLE_SWAGGER_UI=true
5 | ENABLE_REDOC=true
6 | SWAGGER_UI_PATH=/docs
7 | REDOC_PATH=/redoc
8 | 
9 | # ...existing code...
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
1 | """JobSpy Docker API application package."""
2 | __version__ = "1.0.1"
3 | 
--------------------------------------------------------------------------------
/app/api/deps.py:
--------------------------------------------------------------------------------
1 | from fastapi import Depends, HTTPException, status, Request
2 | from fastapi.security import APIKeyHeader
3 | from typing import Optional
4 | import logging
5 | 
6 | from app.core.config import settings
7 | from app.config import settings as app_settings
8 | 
9 | logger = logging.getLogger(__name__)
10 | api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
11 | 
12 | async def get_api_key(request: Request, api_key: Optional[str] = Depends(api_key_header)):
13 | # Log detailed information about the authentication attempt
14 | logger.debug(f"API Key authentication check - core API_KEY configured: {bool(settings.API_KEY)}")
15 | logger.debug(f"Request path: {request.url.path}")
16 | logger.debug(f"API Key in request: {'Present' if api_key else 'Missing'}")
17 | 
18 | # Check both authentication systems for consistency
19 | # First check app.core.config settings
20 | if not settings.API_KEY:
21 | logger.debug("No API key configured in core settings, checking app settings")
22 | 
23 | # Then check app.config settings
24 | if not app_settings.ENABLE_API_KEY_AUTH or not app_settings.API_KEYS:
25 | logger.debug("Authentication disabled or no API keys configured in app settings")
26 | return None
27 | 
28 | # App settings require auth but no core setting, issue a warning
29 | logger.warning("Inconsistent config: API_KEY auth enabled in app settings but not in core settings")
30 | 
31 | # At this point, some form of authentication is required
32 | # Check if API key is missing
33 | if not api_key:
34 | logger.warning(f"API key is missing in request to {request.url.path}")
35 | raise HTTPException(
36 | 
status_code=status.HTTP_403_FORBIDDEN, 37 | detail="Missing API Key", 38 | ) 39 | 40 | # Check against core config API key if configured 41 | if settings.API_KEY and api_key != settings.API_KEY: 42 | # Fall back to checking against app config API keys 43 | if not (app_settings.API_KEYS and api_key in app_settings.API_KEYS): 44 | logger.warning(f"Invalid API key provided in request to {request.url.path}") 45 | raise HTTPException( 46 | status_code=status.HTTP_403_FORBIDDEN, 47 | detail="Invalid API Key", 48 | ) 49 | 50 | logger.debug("Valid API key provided, authentication successful") 51 | return api_key 52 | -------------------------------------------------------------------------------- /app/api/endpoints/example.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends 2 | from sqlalchemy.orm import Session 3 | 4 | from app import crud, schemas 5 | from app.api.dependencies import get_db 6 | from app.core.logging_config import get_logger 7 | 8 | logger = get_logger("api.endpoints.example") 9 | 10 | router = APIRouter() 11 | 12 | @router.get("/items/") 13 | async def get_items(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)): 14 | logger.debug(f"Fetching items with skip={skip}, limit={limit}") 15 | try: 16 | items = crud.get_items(db, skip=skip, limit=limit) 17 | logger.debug(f"Successfully retrieved {len(items)} items") 18 | return items 19 | except Exception as e: 20 | logger.exception(f"Error retrieving items: {str(e)}") 21 | raise 22 | 23 | @router.post("/items/") 24 | async def create_item(item: schemas.ItemCreate, db: Session = Depends(get_db)): 25 | logger.debug(f"Creating new item: {item.dict()}") 26 | try: 27 | db_item = crud.create_item(db=db, item=item) 28 | logger.info(f"Successfully created item with id: {db_item.id}") 29 | return db_item 30 | except Exception as e: 31 | logger.exception(f"Error creating item: {str(e)}") 32 | raise -------------------------------------------------------------------------------- /app/api/routes/jobs.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends, Query 2 | from app.api.deps import get_api_key 3 | 4 | router = APIRouter() 5 | 6 | @router.get("/search_jobs") 7 | async def search_jobs( 8 | api_key: str = Depends(get_api_key) 9 | ): 10 | pass -------------------------------------------------------------------------------- /app/cache.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Dict, Any, Optional, Tuple 3 | import hashlib 4 | import json 5 | import pandas as pd 6 | from app.config import settings 7 | 8 | class JobSearchCache: 9 | def __init__(self): 10 | self.cache: Dict[str, Tuple[float, pd.DataFrame]] = {} 11 | self.enabled = settings.ENABLE_CACHE 12 | self.expiry = settings.CACHE_EXPIRY 13 | 14 | def _generate_key(self, params: Dict[str, Any]) -> str: 15 | """Generate a cache key from the search parameters""" 16 | # Sort the dictionary to ensure consistent keys 17 | sorted_params = {k: params[k] for k in sorted(params.keys())} 18 | param_str = json.dumps(sorted_params, sort_keys=True) 19 | return hashlib.md5(param_str.encode()).hexdigest() 20 | 21 | def get(self, params: Dict[str, Any]) -> Optional[pd.DataFrame]: 22 | """Get cached results if they exist and are not expired""" 23 | if not self.enabled: 24 | return None 25 | 26 | key = self._generate_key(params) 27 | if key not in self.cache: 28 | return 
None 29 | 30 | timestamp, df = self.cache[key] 31 | if time.time() - timestamp > self.expiry: 32 | # Cache expired 33 | del self.cache[key] 34 | return None 35 | 36 | return df 37 | 38 | def set(self, params: Dict[str, Any], df: pd.DataFrame) -> None: 39 | """Cache search results""" 40 | if not self.enabled: 41 | return 42 | 43 | key = self._generate_key(params) 44 | self.cache[key] = (time.time(), df) 45 | 46 | def clear(self) -> None: 47 | """Clear all cached data""" 48 | self.cache.clear() 49 | 50 | def cleanup_expired(self) -> None: 51 | """Remove expired cache entries""" 52 | current_time = time.time() 53 | expired_keys = [ 54 | key for key, (timestamp, _) in self.cache.items() 55 | if current_time - timestamp > self.expiry 56 | ] 57 | for key in expired_keys: 58 | del self.cache[key] 59 | 60 | # Initialize global cache 61 | cache = JobSearchCache() 62 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | """Configuration settings for the JobSpy Docker API.""" 2 | import os 3 | from typing import List, Optional, Any, Dict, Tuple 4 | import logging 5 | 6 | # Try to load .env files - will be ignored if python-dotenv is not installed 7 | try: 8 | from dotenv import load_dotenv, find_dotenv 9 | # Load only .env by default 10 | dotenv_file = find_dotenv(".env") 11 | if dotenv_file: 12 | load_dotenv(dotenv_file) 13 | 14 | # .env.local is not loaded by default anymore 15 | # If you need to load it, do so explicitly in your code 16 | except ImportError: 17 | pass 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | class Settings: 22 | """Simple settings class that loads values from environment variables.""" 23 | 24 | def __init__(self): 25 | # Track the source of each setting 26 | self.setting_sources = {} 27 | 28 | # API Security 29 | self.API_KEYS, self.API_KEYS_SOURCE = self._get_setting_with_source( 30 | "API_KEYS", "", self._parse_list 31 | ) 32 | self.API_KEY_HEADER_NAME, self.API_KEY_HEADER_NAME_SOURCE = self._get_setting_with_source( 33 | "API_KEY_HEADER_NAME", "x-api-key" 34 | ) 35 | self.ENABLE_API_KEY_AUTH, self.ENABLE_API_KEY_AUTH_SOURCE = self._get_setting_with_source( 36 | "ENABLE_API_KEY_AUTH", "false", self._parse_bool 37 | ) 38 | 39 | # Rate Limiting 40 | self.RATE_LIMIT_ENABLED, self.RATE_LIMIT_ENABLED_SOURCE = self._get_setting_with_source( 41 | "RATE_LIMIT_ENABLED", "false", self._parse_bool 42 | ) 43 | self.RATE_LIMIT_REQUESTS, self.RATE_LIMIT_REQUESTS_SOURCE = self._get_setting_with_source( 44 | "RATE_LIMIT_REQUESTS", "100", int 45 | ) 46 | self.RATE_LIMIT_TIMEFRAME, self.RATE_LIMIT_TIMEFRAME_SOURCE = self._get_setting_with_source( 47 | "RATE_LIMIT_TIMEFRAME", "3600", int 48 | ) 49 | 50 | # Proxy Configuration 51 | self.DEFAULT_PROXIES, self.DEFAULT_PROXIES_SOURCE = self._get_setting_with_source( 52 | "DEFAULT_PROXIES", "", self._parse_list 53 | ) 54 | self.CA_CERT_PATH, self.CA_CERT_PATH_SOURCE = self._get_setting_with_source( 55 | "CA_CERT_PATH", None 56 | ) 57 | 58 | # JobSpy Default Settings 59 | default_sites = "indeed,linkedin,zip_recruiter,glassdoor,google,bayt,naukri" 60 | self.DEFAULT_SITE_NAMES, self.DEFAULT_SITE_NAMES_SOURCE = self._get_setting_with_source( 61 | "DEFAULT_SITE_NAMES", default_sites, self._parse_list 62 | ) 63 | self.DEFAULT_RESULTS_WANTED, self.DEFAULT_RESULTS_WANTED_SOURCE = self._get_setting_with_source( 64 | "DEFAULT_RESULTS_WANTED", "20", int 65 | ) 66 | self.DEFAULT_DISTANCE, self.DEFAULT_DISTANCE_SOURCE = 
self._get_setting_with_source( 67 | "DEFAULT_DISTANCE", "50", int 68 | ) 69 | self.DEFAULT_DESCRIPTION_FORMAT, self.DEFAULT_DESCRIPTION_FORMAT_SOURCE = self._get_setting_with_source( 70 | "DEFAULT_DESCRIPTION_FORMAT", "markdown" 71 | ) 72 | self.DEFAULT_COUNTRY_INDEED, self.DEFAULT_COUNTRY_INDEED_SOURCE = self._get_setting_with_source( 73 | "DEFAULT_COUNTRY_INDEED", None 74 | ) 75 | 76 | # Caching 77 | self.ENABLE_CACHE, self.ENABLE_CACHE_SOURCE = self._get_setting_with_source( 78 | "ENABLE_CACHE", "false", self._parse_bool 79 | ) 80 | self.CACHE_EXPIRY, self.CACHE_EXPIRY_SOURCE = self._get_setting_with_source( 81 | "CACHE_EXPIRY", "3600", int 82 | ) 83 | 84 | # Logging 85 | self.LOG_LEVEL, self.LOG_LEVEL_SOURCE = self._get_setting_with_source( 86 | "LOG_LEVEL", "INFO" 87 | ) 88 | self.ENVIRONMENT, self.ENVIRONMENT_SOURCE = self._get_setting_with_source( 89 | "ENVIRONMENT", "production" 90 | ) 91 | 92 | # CORS 93 | self.CORS_ORIGINS, self.CORS_ORIGINS_SOURCE = self._get_setting_with_source( 94 | "CORS_ORIGINS", "*", self._parse_list 95 | ) 96 | 97 | # Health Endpoints 98 | self.ENABLE_HEALTH_ENDPOINTS, self.ENABLE_HEALTH_ENDPOINTS_SOURCE = self._get_setting_with_source( 99 | "ENABLE_HEALTH_ENDPOINTS", "true", self._parse_bool 100 | ) 101 | self.ENABLE_DETAILED_HEALTH, self.ENABLE_DETAILED_HEALTH_SOURCE = self._get_setting_with_source( 102 | "ENABLE_DETAILED_HEALTH", "true", self._parse_bool 103 | ) 104 | 105 | # API Documentation 106 | self.ENABLE_SWAGGER_UI, self.ENABLE_SWAGGER_UI_SOURCE = self._get_setting_with_source( 107 | "ENABLE_SWAGGER_UI", "true", self._parse_bool 108 | ) 109 | self.ENABLE_REDOC, self.ENABLE_REDOC_SOURCE = self._get_setting_with_source( 110 | "ENABLE_REDOC", "true", self._parse_bool 111 | ) 112 | self.SWAGGER_UI_PATH, self.SWAGGER_UI_PATH_SOURCE = self._get_setting_with_source( 113 | "SWAGGER_UI_PATH", "/docs" 114 | ) 115 | self.REDOC_PATH, self.REDOC_PATH_SOURCE = self._get_setting_with_source( 116 | "REDOC_PATH", "/redoc" 117 | ) 118 | 119 | # Fix configuration inconsistencies 120 | self._fix_configuration_inconsistencies() 121 | 122 | def _get_setting_with_source(self, key: str, default_value: Any, 123 | parser_func=None) -> Tuple[Any, str]: 124 | """Get a setting value and its source.""" 125 | if key in os.environ: 126 | value = os.environ[key] 127 | source = f"environment variable ({value})" 128 | else: 129 | value = default_value 130 | source = f"default value ({value})" 131 | 132 | # Apply parser if provided 133 | if parser_func and value is not None: 134 | value = parser_func(value) 135 | 136 | # Log loading for critical settings 137 | critical_settings = ["ENABLE_API_KEY_AUTH", "API_KEYS", "RATE_LIMIT_ENABLED", "ENABLE_CACHE"] 138 | if key in critical_settings: 139 | logger.debug(f"Setting {key}={value} loaded from {source}") 140 | 141 | return value, source 142 | 143 | def _fix_configuration_inconsistencies(self): 144 | """Fix any inconsistencies in configuration.""" 145 | # If API keys are configured but auth is disabled, log a warning 146 | if self.API_KEYS and not self.ENABLE_API_KEY_AUTH: 147 | logger.warning("API keys are configured but authentication is disabled. 
This may lead to security issues.") 148 | 149 | def _parse_bool(self, value: Any) -> bool: 150 | """Parse a boolean from a string or any value.""" 151 | if isinstance(value, bool): 152 | return value 153 | if isinstance(value, str): 154 | return value.lower() in ("yes", "true", "t", "1", "on") 155 | return bool(value) 156 | 157 | def _parse_list(self, value: Any) -> List[str]: 158 | """Parse a comma-separated list from a string.""" 159 | if not value: 160 | return [] 161 | if isinstance(value, list): 162 | return [str(item) for item in value if item] 163 | if isinstance(value, str): 164 | return [item.strip() for item in value.split(",") if item.strip()] 165 | return [] 166 | 167 | def get_all_settings(self) -> Dict[str, Dict[str, Any]]: 168 | """Get all settings with their sources, useful for debugging.""" 169 | settings_with_sources = {} 170 | for key in dir(self): 171 | if key.isupper() and not key.endswith("_SOURCE"): 172 | source_attr = f"{key}_SOURCE" 173 | source = getattr(self, source_attr) if hasattr(self, source_attr) else "unknown" 174 | settings_with_sources[key] = { 175 | "value": getattr(self, key), 176 | "source": source 177 | } 178 | return settings_with_sources 179 | 180 | # Create a global settings instance 181 | settings = Settings() 182 | -------------------------------------------------------------------------------- /app/core/config.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | from typing import Optional 3 | 4 | class Settings(BaseSettings): 5 | PROJECT_NAME: str = "Job Spy FastAPI" 6 | API_V1_STR: str = "/api/v1" 7 | DATABASE_URL: Optional[str] = None # Made optional with default None 8 | API_KEY: Optional[str] = None 9 | 10 | # Logging settings 11 | LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL 12 | LOG_TO_FILE: bool = True 13 | LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 14 | 15 | @property 16 | def get_log_level(self): 17 | """Convert string log level to logging module level""" 18 | import logging 19 | return getattr(logging, self.LOG_LEVEL) 20 | 21 | class Config: 22 | case_sensitive = True 23 | 24 | settings = Settings() -------------------------------------------------------------------------------- /app/core/config_bridge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bridge between app.core.config and app.config to ensure consistent settings. 3 | This module synchronizes API key settings between the two config modules. 4 | """ 5 | import logging 6 | 7 | from app.config import settings as app_settings 8 | from app.core.config import settings as core_settings 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def sync_api_key_settings(): 13 | """ 14 | Synchronize API key settings between core.config and main config. 15 | This ensures that authentication works consistently. 
16 | """ 17 | # If core API_KEY is set but app API_KEYS is not, add the core key to app settings 18 | if core_settings.API_KEY and not app_settings.API_KEYS: 19 | logger.debug("Syncing core API_KEY to app API_KEYS") 20 | app_settings.API_KEYS = [core_settings.API_KEY] 21 | 22 | # If app has API_KEYS but core doesn't have API_KEY, set the first app key as core key 23 | if app_settings.API_KEYS and not core_settings.API_KEY: 24 | logger.debug("Setting core API_KEY from app API_KEYS") 25 | # We can't actually modify core_settings.API_KEY directly, but we can log a warning 26 | logger.warning("Cannot sync app API_KEYS to core API_KEY - core settings are immutable") 27 | 28 | # Log configuration status 29 | auth_enabled = bool(core_settings.API_KEY) or (app_settings.ENABLE_API_KEY_AUTH and bool(app_settings.API_KEYS)) 30 | logger.info(f"Authentication enabled: {auth_enabled}") 31 | logger.debug(f"Core API_KEY configured: {bool(core_settings.API_KEY)}") 32 | logger.debug(f"App API_KEYS configured: {bool(app_settings.API_KEYS)}") 33 | 34 | # Run synchronization when module is imported 35 | sync_api_key_settings() 36 | -------------------------------------------------------------------------------- /app/core/log_filters.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class HealthCheckFilter(logging.Filter): 4 | """Filter out health check requests from logs""" 5 | 6 | def __init__(self, path="/health"): 7 | super().__init__() 8 | self.path = path 9 | 10 | def filter(self, record): 11 | # Check if the record has a message attribute and contains the health check path 12 | if hasattr(record, 'message'): 13 | return self.path not in record.message 14 | # For records that haven't been formatted yet, check the raw message 15 | if hasattr(record, 'msg') and isinstance(record.msg, str): 16 | return self.path not in record.msg 17 | return True 18 | -------------------------------------------------------------------------------- /app/core/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from logging.handlers import RotatingFileHandler 5 | from pathlib import Path 6 | 7 | def setup_logging(log_level=None): 8 | """Configure logging for the application""" 9 | 10 | # Determine log level from environment or parameter 11 | if log_level is None: 12 | env_level = os.getenv("LOG_LEVEL", "INFO").upper() 13 | log_level = getattr(logging, env_level, logging.INFO) 14 | 15 | # Create logs directory if it doesn't exist 16 | log_dir = Path("logs") 17 | log_dir.mkdir(exist_ok=True) 18 | 19 | # Configure root logger 20 | logger = logging.getLogger() 21 | logger.setLevel(log_level) 22 | 23 | # Clear existing handlers to avoid duplicate logs 24 | if logger.handlers: 25 | logger.handlers.clear() 26 | 27 | # Console handler 28 | console_handler = logging.StreamHandler(sys.stdout) 29 | console_handler.setLevel(log_level) 30 | console_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 31 | console_handler.setFormatter(console_formatter) 32 | 33 | # File handler with rotation (10MB max, keep 5 backups) 34 | file_handler = RotatingFileHandler( 35 | "logs/app.log", 36 | maxBytes=10*1024*1024, 37 | backupCount=5 38 | ) 39 | file_handler.setLevel(log_level) 40 | file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 41 | file_handler.setFormatter(file_formatter) 42 | 43 | # Add handlers to logger 44 | 
logger.addHandler(console_handler)
45 | logger.addHandler(file_handler)
46 | 
47 | # Set Uvicorn's access logger to a higher level to reduce noise
48 | uvicorn_access = logging.getLogger("uvicorn.access")
49 | if log_level == logging.DEBUG:
50 | uvicorn_access.setLevel(logging.INFO)  # Show access logs in debug mode, but not health checks
51 | else:
52 | uvicorn_access.setLevel(logging.WARNING)  # Only show warnings and errors otherwise
53 | 
54 | # Return the configured logger
55 | return logger
56 | 
57 | def get_logger(name):
58 | """Get a named logger"""
59 | return logging.getLogger(name)
60 | 
--------------------------------------------------------------------------------
/app/db/crud.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.orm import Session
2 | 
3 | from app import models, schemas
4 | from app.core.logging_config import get_logger
5 | 
6 | logger = get_logger("db.crud")
7 | 
8 | def get_items(db: Session, skip: int = 0, limit: int = 100):
9 | logger.debug(f"DB query: get_items(skip={skip}, limit={limit})")
10 | try:
11 | result = db.query(models.Item).offset(skip).limit(limit).all()
12 | logger.debug(f"DB query successful, returned {len(result)} records")
13 | return result
14 | except Exception as e:
15 | logger.exception(f"DB query failed: {str(e)}")
16 | raise
17 | 
18 | def create_item(db: Session, item: schemas.ItemCreate):
19 | logger.debug(f"DB operation: create_item with data: {item.dict()}")
20 | try:
21 | db_item = models.Item(**item.dict())
22 | db.add(db_item)
23 | db.commit()
24 | db.refresh(db_item)
25 | logger.debug(f"DB operation successful, created item with id: {db_item.id}")
26 | return db_item
27 | except Exception as e:
28 | db.rollback()
29 | logger.exception(f"DB operation failed, rolling back: {str(e)}")
30 | raise
--------------------------------------------------------------------------------
/app/db/database.py:
--------------------------------------------------------------------------------
1 | """Database configuration and session management."""
2 | from sqlalchemy import create_engine
3 | from sqlalchemy.ext.declarative import declarative_base
4 | from sqlalchemy.orm import sessionmaker
5 | import logging
6 | from app.core.config import settings  # DATABASE_URL is defined on the core Settings class
7 | 
8 | logger = logging.getLogger(__name__)
9 | 
10 | # Get database URL from settings or use SQLite in-memory if not configured
11 | SQLALCHEMY_DATABASE_URL = settings.DATABASE_URL or "sqlite:///:memory:"
12 | 
13 | # Only create engine if a database URL is provided
14 | if SQLALCHEMY_DATABASE_URL and SQLALCHEMY_DATABASE_URL != "sqlite:///:memory:":
15 | engine = create_engine(SQLALCHEMY_DATABASE_URL)
16 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
17 | else:
18 | logger.warning("No DATABASE_URL configured, database functionality will be limited")
19 | engine = None
20 | SessionLocal = None
21 | 
22 | Base = declarative_base()
23 | 
24 | def get_db():
25 | """Get a database session."""
26 | if SessionLocal is None:
27 | raise RuntimeError("Database not configured. 
Set DATABASE_URL in environment.") 28 | 29 | db = SessionLocal() 30 | try: 31 | yield db 32 | finally: 33 | db.close() 34 | -------------------------------------------------------------------------------- /app/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exception classes for JobSpy Docker API.""" 2 | from fastapi import HTTPException 3 | from starlette.status import HTTP_429_TOO_MANY_REQUESTS, HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR 4 | 5 | class JobSpyAPIException(HTTPException): 6 | """Base exception for JobSpy Docker API.""" 7 | def __init__(self, status_code: int, detail: str, headers: dict = None): 8 | super().__init__(status_code=status_code, detail=detail, headers=headers) 9 | 10 | class RateLimitExceeded(JobSpyAPIException): 11 | """Exception raised when rate limit is exceeded.""" 12 | def __init__(self, detail: str = "Rate limit exceeded"): 13 | super().__init__(status_code=HTTP_429_TOO_MANY_REQUESTS, detail=detail) 14 | 15 | class InvalidSearchParameters(JobSpyAPIException): 16 | """Exception raised when search parameters are invalid.""" 17 | def __init__(self, detail: str = "Invalid search parameters"): 18 | super().__init__(status_code=HTTP_400_BAD_REQUEST, detail=detail) 19 | 20 | class JobSearchError(JobSpyAPIException): 21 | """Exception raised when job search fails.""" 22 | def __init__(self, detail: str = "Job search failed"): 23 | super().__init__(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=detail) 24 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import logging 4 | import os 5 | import time 6 | import uuid 7 | from typing import List, Optional, Union 8 | 9 | from contextlib import asynccontextmanager 10 | 11 | from fastapi import FastAPI, Request, Query 12 | from fastapi.exceptions import RequestValidationError 13 | from fastapi.middleware.cors import CORSMiddleware 14 | from fastapi.responses import JSONResponse, StreamingResponse 15 | from starlette.exceptions import HTTPException as StarletteHTTPException 16 | 17 | from app.cache import cache 18 | from app.config import settings 19 | from app.core import config_bridge 20 | from app.core.logging_config import get_logger, setup_logging 21 | from app.middleware.rate_limiter import RateLimitMiddleware 22 | from app.middleware.request_logger import RequestLoggerMiddleware, log_request_middleware 23 | from app.routes import api, health 24 | from app.utils.env_debugger import log_environment_settings 25 | from app.utils.error_handlers import ( 26 | general_exception_handler, 27 | http_exception_handler, 28 | validation_exception_handler, 29 | ) 30 | 31 | # Determine log level from environment - priority to "LOG_LEVEL" over "DEBUG" flag for consistency 32 | log_level_name = os.getenv("LOG_LEVEL", "INFO").upper() 33 | try: 34 | log_level = getattr(logging, log_level_name) 35 | except AttributeError: 36 | print(f"WARNING: Invalid LOG_LEVEL: {log_level_name}, using INFO") 37 | log_level = logging.INFO 38 | 39 | # Setup logging with determined level 40 | setup_logging(log_level) 41 | logger = get_logger("main") 42 | 43 | logger.info(f"Starting application with log level: {log_level_name}") 44 | 45 | # Set Uvicorn's access logger to WARNING to avoid logging health checks 46 | logging.getLogger("uvicorn.access").setLevel(logging.WARNING) 47 | 48 | SUPPORTED_SITES = ["indeed", 
"linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"] 49 | 50 | def get_env_bool(var_name, default=True): 51 | val = os.getenv(var_name) 52 | if val is None: 53 | return default 54 | return str(val).lower() in ("1", "true", "yes", "on") 55 | 56 | @asynccontextmanager 57 | async def lifespan(app: FastAPI): 58 | # Startup: Initialize services, connections, etc. 59 | logger.info("Starting up JobSpy Docker API") 60 | 61 | # Log environment variables to help debugging 62 | log_environment_settings() 63 | 64 | # Yield control to the application 65 | yield 66 | 67 | # Shutdown: Clean up resources 68 | logger.info("Shutting down JobSpy Docker API") 69 | cache.clear() 70 | 71 | # Create FastAPI app with enhanced documentation 72 | app = FastAPI( 73 | title="JobSpy Docker API", 74 | description=""" 75 | # JobSpy Docker API 76 | 77 | An API for searching jobs across multiple platforms including LinkedIn, Indeed, Glassdoor, Google, ZipRecruiter, Bayt, and Naukri. 78 | 79 | ## Authentication 80 | 81 | All API endpoints require an API key to be passed in the `x-api-key` header. 82 | 83 | ## Rate Limiting 84 | 85 | Requests are limited based on your API key. The default limit is 100 requests per hour. 86 | 87 | ## Caching 88 | 89 | Results are cached for 1 hour by default to improve performance and reduce load on job board sites. 90 | """, 91 | version="1.0.0", 92 | lifespan=lifespan, 93 | # Configure docs endpoints based on settings 94 | docs_url=settings.SWAGGER_UI_PATH if settings.ENABLE_SWAGGER_UI else None, 95 | redoc_url=settings.REDOC_PATH if settings.ENABLE_REDOC else None, 96 | openapi_tags=[ 97 | { 98 | "name": "Jobs", 99 | "description": "Operations related to job searching", 100 | }, 101 | { 102 | "name": "Health", 103 | "description": "API health check endpoints", 104 | }, 105 | { 106 | "name": "Info", 107 | "description": "General API information", 108 | }, 109 | ], 110 | swagger_ui_parameters={"defaultModelsExpandDepth": -1}, 111 | ) 112 | 113 | @app.on_event("startup") 114 | async def startup_event(): 115 | logger.info("Starting up Job Spy FastAPI application") 116 | 117 | # Set API key auth 118 | global ENABLE_API_KEY_AUTH 119 | ENABLE_API_KEY_AUTH = get_env_bool("ENABLE_API_KEY_AUTH", default=True) 120 | if ENABLE_API_KEY_AUTH: 121 | logger.info("API key authentication is enabled") 122 | else: 123 | logger.warning("API key authentication is disabled. 
Set ENABLE_API_KEY_AUTH=true to enable.") 124 | 125 | # Additional startup logic 126 | 127 | @app.on_event("shutdown") 128 | async def shutdown_event(): 129 | logger.info("Shutting down Job Spy FastAPI application") 130 | # Additional shutdown logic can be added here 131 | 132 | # Add CORS middleware 133 | app.add_middleware( 134 | CORSMiddleware, 135 | allow_origins=settings.CORS_ORIGINS, 136 | allow_credentials=True, 137 | allow_methods=["*"], 138 | allow_headers=["*"], 139 | ) 140 | 141 | # Add rate limiting middleware 142 | app.add_middleware(RateLimitMiddleware) 143 | 144 | # Add request logging middleware 145 | app.add_middleware(RequestLoggerMiddleware) 146 | 147 | # Add exception handlers 148 | app.add_exception_handler(RequestValidationError, validation_exception_handler) 149 | app.add_exception_handler(StarletteHTTPException, http_exception_handler) 150 | app.add_exception_handler(Exception, general_exception_handler) 151 | 152 | # Add request timing and logging middleware 153 | @app.middleware("http") 154 | async def log_requests(request: Request, call_next): 155 | start_time = time.time() 156 | 157 | # Generate request ID for tracking 158 | request_id = str(uuid.uuid4()) 159 | logger.debug(f"Request {request_id} started: {request.method} {request.url.path}") 160 | 161 | try: 162 | response = await call_next(request) 163 | process_time = time.time() - start_time 164 | logger.debug( 165 | f"Request {request_id} completed: {request.method} {request.url.path} " 166 | f"- Status: {response.status_code} - Time: {process_time:.3f}s" 167 | ) 168 | response.headers["X-Process-Time"] = str(process_time) 169 | return response 170 | except Exception as e: 171 | logger.exception(f"Request {request_id} failed: {str(e)}") 172 | raise 173 | 174 | # Include routers 175 | app.include_router(api.router, prefix="/api/v1", tags=["Jobs"]) 176 | app.include_router(health.router, tags=["Health"]) 177 | 178 | @app.get("/", tags=["Info"]) 179 | def read_root(): 180 | return { 181 | "message": "Welcome to JobSpy Docker API!", 182 | "docs_url": "/docs", 183 | "api_root": "/api/v1", 184 | "health_check": "/health" 185 | } 186 | 187 | # Add health check endpoint with minimal logging 188 | @app.get("/health") 189 | async def health_check(): 190 | """Health check endpoint for monitoring systems""" 191 | # Only log health checks in debug mode 192 | if logger.isEnabledFor(logging.DEBUG): 193 | logger.debug("Health check requested") 194 | return {"status": "healthy"} 195 | 196 | @app.get("/api/v1/search_jobs") 197 | async def search_jobs( 198 | site_name: Union[List[str], str] = Query(default=None, description="Job sites to search on"), 199 | search_term: Optional[str] = Query(None, description="Job search term"), 200 | google_search_term: Optional[str] = Query(None, description="Search term for Google jobs"), 201 | location: Optional[str] = Query(None, description="Job location"), 202 | distance: Optional[int] = Query(None, description="Distance in miles"), 203 | job_type: Optional[str] = Query(None, description="Job type (fulltime, parttime, internship, contract)"), 204 | is_remote: Optional[bool] = Query(None, description="Remote job filter"), 205 | results_wanted: Optional[int] = Query(None, description="Number of results per site"), 206 | hours_old: Optional[int] = Query(None, description="Filter by hours since posting"), 207 | easy_apply: Optional[bool] = Query(None, description="Filter for easy apply jobs"), 208 | description_format: Optional[str] = Query(None, description="Format of job description"), 
209 | offset: Optional[int] = Query(None, description="Offset for pagination"), 210 | verbose: Optional[int] = Query(None, description="Controls verbosity"), 211 | linkedin_fetch_description: Optional[bool] = Query(None, description="Fetch full LinkedIn descriptions"), 212 | country_indeed: Optional[str] = Query(None, description="Country filter for Indeed & Glassdoor"), 213 | enforce_annual_salary: Optional[bool] = Query(None, description="Convert wages to annual salary"), 214 | format: str = Query("json", description="Output format: json or csv"), 215 | paginate: bool = Query(False, description="Enable pagination"), 216 | page: int = Query(1, description="Page number when pagination is enabled"), 217 | page_size: int = Query(10, ge=1, le=100, description="Results per page when pagination is enabled"), 218 | ): 219 | try: 220 | # Handle site_name=all explicitly 221 | if site_name is None: 222 | site_name = SUPPORTED_SITES 223 | elif isinstance(site_name, str): 224 | if site_name.lower() == "all": 225 | site_name = SUPPORTED_SITES 226 | else: 227 | site_name = [site_name] 228 | elif isinstance(site_name, list): 229 | if any(s.lower() == "all" for s in site_name): 230 | site_name = SUPPORTED_SITES 231 | 232 | # Use env default for country_indeed if not provided 233 | if country_indeed is None: 234 | country_indeed = os.getenv("DEFAULT_COUNTRY_INDEED", "USA") 235 | logger.debug(f"Using default country_indeed from environment: {country_indeed}") 236 | 237 | # Call your existing job scraping code 238 | # ...existing job scraping code... 239 | 240 | # This is a placeholder - replace with your actual jobs data 241 | jobs_data = [] # Replace this with your actual jobs_data 242 | 243 | # Format conversion and response 244 | if format.lower() == "csv": 245 | logger.debug("Returning CSV format") 246 | if not jobs_data: 247 | output = io.StringIO() 248 | writer = csv.writer(output) 249 | writer.writerow(["No results"]) 250 | output.seek(0) 251 | return StreamingResponse( 252 | output, 253 | media_type="text/csv", 254 | headers={"Content-Disposition": "attachment; filename=jobs.csv"} 255 | ) 256 | 257 | output = io.StringIO() 258 | writer = csv.DictWriter(output, fieldnames=jobs_data[0].keys()) 259 | writer.writeheader() 260 | writer.writerows(jobs_data) 261 | output.seek(0) 262 | return StreamingResponse( 263 | output, 264 | media_type="text/csv", 265 | headers={"Content-Disposition": "attachment; filename=jobs.csv"} 266 | ) 267 | 268 | # Default: JSON response 269 | return { 270 | "count": len(jobs_data), 271 | "jobs": jobs_data 272 | } 273 | 274 | except Exception as e: 275 | logger.exception(f"Error in search_jobs: {str(e)}") 276 | raise 277 | 278 | # API key auth default logic (at app startup or dependency) 279 | ENABLE_API_KEY_AUTH = get_env_bool("ENABLE_API_KEY_AUTH", default=True) 280 | if not ENABLE_API_KEY_AUTH: 281 | import warnings 282 | warnings.warn("API key authentication is disabled. 
Set ENABLE_API_KEY_AUTH=true to enable.")
283 | 
284 | if __name__ == "__main__":
285 |     import uvicorn
286 |     uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
287 | 
-------------------------------------------------------------------------------- /app/middleware/__init__.py: --------------------------------------------------------------------------------
1 | """Middleware components for the JobSpy Docker API."""
2 | 
-------------------------------------------------------------------------------- /app/middleware/api_key_auth.py: --------------------------------------------------------------------------------
1 | from fastapi import HTTPException, Depends
2 | from fastapi.security import APIKeyHeader
3 | from starlette.status import HTTP_403_FORBIDDEN
4 | from typing import Optional
5 | import logging
6 | 
7 | from app.config import settings
8 | 
9 | logger = logging.getLogger(__name__)
10 | api_key_header = APIKeyHeader(name=settings.API_KEY_HEADER_NAME, auto_error=False)
11 | 
12 | async def get_api_key(api_key_header: Optional[str] = Depends(api_key_header)):
13 |     """
14 |     Dependency that checks if the API key is valid.
15 |     Allows requests without authentication if:
16 |     1. API key authentication is disabled, or
17 |     2. No API keys are configured, or
18 |     3. API keys list is empty
19 |     """
20 |     # Log detailed info about settings for debugging
21 |     logger.debug(f"API key auth enabled: {settings.ENABLE_API_KEY_AUTH}")
22 |     logger.debug(f"API keys configured: {bool(settings.API_KEYS)}")
23 | 
24 |     # Skip authentication if it's disabled or no keys are configured
25 |     if not settings.ENABLE_API_KEY_AUTH or not settings.API_KEYS:
26 |         logger.debug("Skipping API key validation - auth disabled or no keys configured")
27 |         return True
28 | 
29 |     # At this point, auth is enabled and keys are configured, so require a key
30 |     if not api_key_header:
31 |         logger.warning(f"Missing API key in request, auth enabled with {len(settings.API_KEYS)} configured keys")
32 |         raise HTTPException(
33 |             status_code=HTTP_403_FORBIDDEN,
34 |             detail="Missing API Key"
35 |         )
36 | 
37 |     if api_key_header not in settings.API_KEYS:
38 |         logger.warning("Invalid API key provided")
39 |         raise HTTPException(
40 |             status_code=HTTP_403_FORBIDDEN,
41 |             detail="Invalid API Key"
42 |         )
43 | 
44 |     logger.debug("Valid API key provided")
45 |     return True
46 | 
-------------------------------------------------------------------------------- /app/middleware/rate_limiter.py: --------------------------------------------------------------------------------
1 | import time
2 | from collections import defaultdict
3 | from typing import DefaultDict, List
4 | 
5 | from fastapi import Request, status
6 | from starlette.middleware.base import BaseHTTPMiddleware
7 | from starlette.responses import JSONResponse
8 | 
9 | from app.config import settings
10 | 
11 | class RateLimitMiddleware(BaseHTTPMiddleware):
12 |     def __init__(self, app):
13 |         super().__init__(app)
14 |         self.rate_limits: DefaultDict[str, List[float]] = defaultdict(list)
15 |         self.enabled = settings.RATE_LIMIT_ENABLED
16 |         self.max_requests = settings.RATE_LIMIT_REQUESTS
17 |         self.timeframe = settings.RATE_LIMIT_TIMEFRAME
18 | 
19 |     async def dispatch(self, request: Request, call_next):
20 |         if not self.enabled:
21 |             return await call_next(request)
22 | 
23 |         # Get client identifier (use API key if available, otherwise IP)
24 |         client_identifier = request.headers.get(settings.API_KEY_HEADER_NAME, request.client.host)
25 | 
26 |         # Check rate limit
27 |         current_time = time.time()
28 | 
29 |         # Clean up old request timestamps
30 |         self.rate_limits[client_identifier] = [
31 |             timestamp for timestamp in self.rate_limits[client_identifier]
32 |             if current_time - timestamp < self.timeframe
33 |         ]
34 | 
35 |         # Check if rate limit exceeded
36 |         if len(self.rate_limits[client_identifier]) >= self.max_requests:
37 |             reset_time = min(self.rate_limits[client_identifier]) + self.timeframe - current_time
38 |             headers = {"X-RateLimit-Reset": str(int(reset_time))}
39 |             # Return the 429 directly: an HTTPException raised inside this middleware would bypass the app's exception handlers and surface as a 500
40 |             return JSONResponse(
41 |                 status_code=status.HTTP_429_TOO_MANY_REQUESTS,
42 |                 content={"detail": f"Rate limit exceeded. Maximum {self.max_requests} requests per {self.timeframe} seconds."},
43 |                 headers=headers
44 |             )
45 | 
46 |         # Add current request timestamp
47 |         self.rate_limits[client_identifier].append(current_time)
48 | 
49 |         # Process the request
50 |         response = await call_next(request)
51 | 
52 |         # Add rate limit headers
53 |         remaining = self.max_requests - len(self.rate_limits[client_identifier])
54 |         response.headers["X-RateLimit-Limit"] = str(self.max_requests)
55 |         response.headers["X-RateLimit-Remaining"] = str(remaining)
56 | 
57 |         return response
58 | 
-------------------------------------------------------------------------------- /app/middleware/request_logger.py: --------------------------------------------------------------------------------
1 | """Middleware for logging requests and responses."""
2 | import json
3 | import logging
4 | import time
5 | from fastapi import Request, Response
6 | from starlette.middleware.base import BaseHTTPMiddleware
7 | from starlette.types import ASGIApp
8 | 
9 | logger = logging.getLogger("app.middleware.request_logger")
10 | 
11 | # Paths to exclude from detailed logging
12 | EXCLUDED_PATHS = ["/health", "/metrics"]
13 | 
14 | # Paths that should only be logged in debug mode
15 | DEBUG_ONLY_PATHS = ["/health", "/metrics"]
16 | 
17 | # Add the missing function
18 | async def log_request_middleware(request: Request, call_next):
19 |     """Function-based middleware for logging requests.
20 | Simpler alternative to the RequestLoggerMiddleware class.""" 21 | # Generate a unique request ID 22 | request_id = request.headers.get("X-Request-ID", f"req_{time.time()}") 23 | 24 | # Get path 25 | path = request.url.path 26 | 27 | # Only log health checks and monitoring endpoints in debug mode 28 | should_log = True 29 | if path in DEBUG_ONLY_PATHS: 30 | should_log = logger.isEnabledFor(logging.DEBUG) 31 | 32 | if should_log: 33 | # Log the request 34 | client_host = request.client.host if request.client else "unknown" 35 | logger.info(f"Request {request_id}: {request.method} {request.url} from {client_host}") 36 | 37 | # Process the request and measure timing 38 | start_time = time.time() 39 | response = await call_next(request) 40 | process_time = time.time() - start_time 41 | 42 | if should_log: 43 | # Log the response 44 | logger.info(f"Response {request_id}: {response.status_code} in {process_time:.4f} seconds") 45 | 46 | # Add custom headers 47 | response.headers["X-Request-ID"] = request_id 48 | response.headers["X-Process-Time"] = f"{process_time:.4f}" 49 | 50 | return response 51 | 52 | class RequestLoggerMiddleware(BaseHTTPMiddleware): 53 | def __init__(self, app: ASGIApp): 54 | super().__init__(app) 55 | 56 | async def dispatch(self, request: Request, call_next): 57 | # Generate a unique request ID 58 | request_id = request.headers.get("X-Request-ID", f"req_{time.time()}") 59 | 60 | # Get path and method 61 | path = request.url.path 62 | method = request.method # Move method extraction here, outside the condition 63 | 64 | # Only log health checks and monitoring endpoints in debug mode 65 | should_log = True 66 | if path in DEBUG_ONLY_PATHS: 67 | should_log = logger.isEnabledFor(logging.DEBUG) 68 | 69 | if should_log: 70 | # Log the request 71 | client_host = request.client.host if request.client else "unknown" 72 | url = str(request.url) 73 | 74 | logger.info(f"Request {request_id}: {method} {url} from {client_host}") 75 | 76 | # Get body if it's a POST/PUT 77 | if method in ["POST", "PUT"]: 78 | try: 79 | # Store the request body for logging 80 | body = await request.body() 81 | await self._log_request_body(request_id, body) 82 | 83 | # Need to create a new Request with the body because the original was consumed 84 | request = Request( 85 | scope=request.scope, 86 | receive=self._receive_with_body(body) 87 | ) 88 | except Exception as e: 89 | logger.warning(f"Failed to log request body: {str(e)}") 90 | 91 | # Process the request and measure timing 92 | start_time = time.time() 93 | response = await call_next(request) 94 | process_time = time.time() - start_time 95 | 96 | if should_log: 97 | # Log the response 98 | status_code = response.status_code 99 | logger.info(f"Response {request_id}: {status_code} in {process_time:.4f} seconds") 100 | 101 | # Add custom headers 102 | response.headers["X-Request-ID"] = request_id 103 | response.headers["X-Process-Time"] = f"{process_time:.4f}" 104 | 105 | return response 106 | 107 | async def _log_request_body(self, request_id: str, body: bytes): 108 | """Log the request body in a safe manner.""" 109 | try: 110 | # Only log if body is not too large 111 | if len(body) > 1000: 112 | logger.debug(f"Request {request_id} body: [too large to log]") 113 | return 114 | 115 | # Try to parse as JSON 116 | json_body = json.loads(body) 117 | # Mask sensitive fields 118 | self._mask_sensitive_fields(json_body) 119 | logger.debug(f"Request {request_id} body: {json.dumps(json_body)}") 120 | except: 121 | # Not JSON, log as string (truncated if 
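needed)
            # e.g. a 600-byte plain-text payload is logged as its first 200 characters followed by "..."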
122 |             body_str = body.decode('utf-8', errors='replace')
123 |             if len(body_str) > 200:
124 |                 body_str = body_str[:200] + "..."
125 |             logger.debug(f"Request {request_id} body: {body_str}")
126 | 
127 |     def _mask_sensitive_fields(self, data):
128 |         """Mask sensitive fields in the request data."""
129 |         if not isinstance(data, dict):
130 |             return
131 | 
132 |         # List of fields to mask
133 |         sensitive_fields = ["password", "token", "api_key", "secret", "credit_card"]
134 | 
135 |         for key in data:
136 |             if isinstance(data[key], dict):
137 |                 self._mask_sensitive_fields(data[key])
138 |             elif isinstance(data[key], list):
139 |                 for item in data[key]:
140 |                     if isinstance(item, dict):
141 |                         self._mask_sensitive_fields(item)
142 |             elif any(sensitive in key.lower() for sensitive in sensitive_fields):
143 |                 data[key] = "********"
144 | 
145 |     def _receive_with_body(self, body: bytes):
146 |         """Create a new receive callable that replays the stored body. Kept as a plain (non-async) method: its return value is passed directly as the request's receive callable, so it must not be a coroutine."""
147 |         async def receive():
148 |             return {"type": "http.request", "body": body}
149 |         return receive
150 | 
-------------------------------------------------------------------------------- /app/models.py: --------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Union
2 | 
3 | from pydantic import BaseModel, Field
4 | 
5 | class JobSearchParams(BaseModel):
6 |     site_name: Union[List[str], str] = Field(
7 |         default_factory=lambda: ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
8 |         description="Job sites to search on",
9 |     )
10 |     search_term: Optional[str] = Field(default=None, description="Job search term")
11 |     google_search_term: Optional[str] = Field(default=None, description="Search term for Google jobs")
12 |     location: Optional[str] = Field(default=None, description="Job location")
13 |     distance: Optional[int] = Field(default=50, description="Distance in miles")
14 |     job_type: Optional[str] = Field(default=None, description="Job type (fulltime, parttime, internship, contract)")
15 |     proxies: Optional[List[str]] = Field(default=None, description="Proxies in format ['user:pass@host:port', 'localhost']")
16 |     is_remote: Optional[bool] = Field(default=None, description="Remote job filter")
17 |     results_wanted: Optional[int] = Field(default=20, description="Number of results per site")
18 |     hours_old: Optional[int] = Field(default=None, description="Filter by hours since posting")
19 |     easy_apply: Optional[bool] = Field(default=None, description="Filter for easy apply jobs")
20 |     description_format: Optional[str] = Field(default="markdown", description="Format of job description")
21 |     offset: Optional[int] = Field(default=0, description="Offset for pagination")
22 |     verbose: Optional[int] = Field(default=2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)")
23 |     linkedin_fetch_description: Optional[bool] = Field(default=False, description="Fetch full LinkedIn descriptions")
24 |     linkedin_company_ids: Optional[List[int]] = Field(default=None, description="LinkedIn company IDs to filter by")
25 |     country_indeed: Optional[str] = Field(default=None, description="Country filter for Indeed & Glassdoor")
26 |     enforce_annual_salary: Optional[bool] = Field(default=False, description="Convert wages to annual salary")
27 |     ca_cert: Optional[str] = Field(default=None, description="Path to CA Certificate file for proxies")
28 | 
29 | class JobResponse(BaseModel):
30 |     count: int
31 |     jobs: List[Dict[str, Any]]
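    # True when the results were served from the in-memory cache rather than a fresh scrape (see app/cache.py)
32 |     cached: 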
bool = False 33 | 34 | class PaginatedJobResponse(BaseModel): 35 | count: int 36 | total_pages: int 37 | current_page: int 38 | page_size: int 39 | jobs: List[Dict[str, Any]] 40 | cached: bool = False 41 | next_page: Optional[str] = None 42 | previous_page: Optional[str] = None 43 | 44 | class HealthCheck(BaseModel): 45 | status: str = "ok" 46 | version: str = "1.0.0" 47 | -------------------------------------------------------------------------------- /app/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Models for the JobSpy Docker API.""" 2 | from .health_models import HealthCheck, DetailedHealthCheck 3 | from .job_models import JobSearchParams, JobResponse, PaginatedJobResponse 4 | 5 | # Re-export all models 6 | __all__ = [ 7 | "HealthCheck", 8 | "DetailedHealthCheck", 9 | "JobSearchParams", 10 | "JobResponse", 11 | "PaginatedJobResponse" 12 | ] 13 | -------------------------------------------------------------------------------- /app/models/health_models.py: -------------------------------------------------------------------------------- 1 | """Models for health check endpoints.""" 2 | from pydantic import BaseModel, Field 3 | from typing import Dict, List, Any, Optional 4 | 5 | class HealthCheck(BaseModel): 6 | """Health check response model with detailed information.""" 7 | status: str = "ok" 8 | version: str = "1.0.0" 9 | environment: str = "production" 10 | log_level: str = "INFO" 11 | auth: Optional[Dict[str, Any]] = None 12 | rate_limiting: Optional[Dict[str, Any]] = None 13 | cache: Optional[Dict[str, Any]] = None 14 | config: Optional[Dict[str, Any]] = None 15 | health_endpoints: Optional[Dict[str, bool]] = None 16 | timestamp: Optional[float] = None 17 | 18 | class DetailedHealthCheck(BaseModel): 19 | """Placeholder for detailed health check model.""" 20 | status: str = "ok" 21 | version: str = "1.0.0" 22 | -------------------------------------------------------------------------------- /app/models/job_models.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Union 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | class JobSearchParams(BaseModel): 6 | site_name: Union[List[str], str] = Field( 7 | default_factory=lambda: ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"], 8 | description="Job sites to search on", 9 | ) 10 | search_term: Optional[str] = Field(default=None, description="Job search term") 11 | google_search_term: Optional[str] = Field(default=None, description="Search term for Google jobs") 12 | location: Optional[str] = Field(default=None, description="Job location") 13 | distance: Optional[int] = Field(default=50, description="Distance in miles") 14 | job_type: Optional[str] = Field(default=None, description="Job type (fulltime, parttime, internship, contract)") 15 | proxies: Optional[List[str]] = Field(default=None, description="Proxies in format ['user:pass@host:port', 'localhost']") 16 | is_remote: Optional[bool] = Field(default=None, description="Remote job filter") 17 | results_wanted: Optional[int] = Field(default=20, description="Number of results per site") 18 | hours_old: Optional[int] = Field(default=None, description="Filter by hours since posting") 19 | easy_apply: Optional[bool] = Field(default=None, description="Filter for easy apply jobs") 20 | description_format: Optional[str] = Field(default="markdown", description="Format of job description") 21 | offset: Optional[int] 
= Field(default=0, description="Offset for pagination") 22 | verbose: Optional[int] = Field(default=2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)") 23 | linkedin_fetch_description: Optional[bool] = Field(default=False, description="Fetch full LinkedIn descriptions") 24 | linkedin_company_ids: Optional[List[int]] = Field(default=None, description="LinkedIn company IDs to filter by") 25 | country_indeed: Optional[str] = Field(default=None, description="Country filter for Indeed & Glassdoor") 26 | enforce_annual_salary: Optional[bool] = Field(default=False, description="Convert wages to annual salary") 27 | ca_cert: Optional[str] = Field(default=None, description="Path to CA Certificate file for proxies") 28 | 29 | class JobResponse(BaseModel): 30 | count: int 31 | jobs: List[Dict[str, Any]] 32 | cached: bool = False 33 | 34 | class PaginatedJobResponse(BaseModel): 35 | count: int 36 | total_pages: int 37 | current_page: int 38 | page_size: int 39 | jobs: List[Dict[str, Any]] 40 | cached: bool = False 41 | next_page: Optional[str] = None 42 | previous_page: Optional[str] = None 43 | -------------------------------------------------------------------------------- /app/routes/__init__.py: -------------------------------------------------------------------------------- 1 | """API route handlers.""" 2 | -------------------------------------------------------------------------------- /app/routes/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends, Query, HTTPException, Request 2 | from typing import List, Optional, Union 3 | import logging 4 | import time 5 | import uuid 6 | import traceback 7 | 8 | from app.models import JobSearchParams, JobResponse, PaginatedJobResponse 9 | from app.config import settings 10 | from app.middleware.api_key_auth import get_api_key 11 | from app.services.job_service import JobService 12 | from app.utils.validation_helpers import VALID_PARAMETERS, get_parameter_suggestion, generate_error_suggestions 13 | 14 | router = APIRouter() 15 | logger = logging.getLogger(__name__) 16 | 17 | SUPPORTED_COUNTRIES_INDEED = { 18 | "Argentina", "Australia", "Austria", "Bahrain", "Belgium", "Brazil", "Canada", "Chile", "China", "Colombia", 19 | "Costa Rica", "Czech Republic", "Denmark", "Ecuador", "Egypt", "Finland", "France", "Germany", "Greece", 20 | "Hong Kong", "Hungary", "India", "Indonesia", "Ireland", "Israel", "Italy", "Japan", "Kuwait", "Luxembourg", 21 | "Malaysia", "Mexico", "Morocco", "Netherlands", "New Zealand", "Nigeria", "Norway", "Oman", "Pakistan", 22 | "Panama", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Saudi Arabia", "Singapore", 23 | "South Africa", "South Korea", "Spain", "Sweden", "Switzerland", "Taiwan", "Thailand", "Turkey", "Ukraine", 24 | "United Arab Emirates", "UK", "USA", "Uruguay", "Venezuela", "Vietnam" 25 | } 26 | 27 | def validate_job_search_params( 28 | site_name, 29 | country_indeed, 30 | hours_old, 31 | job_type, 32 | is_remote, 33 | easy_apply, 34 | description_format=None, 35 | verbose=None, 36 | page=None, 37 | page_size=None, 38 | paginate=None, 39 | endpoint="search_jobs" 40 | ): 41 | # Normalize site names 42 | snames = [s.lower() for s in site_name] if site_name else [] 43 | # Supported country validation for Indeed/Glassdoor 44 | if ("indeed" in snames or "glassdoor" in snames): 45 | if not country_indeed: 46 | raise HTTPException( 47 | status_code=400, 48 | detail={ 49 | "error": "Missing required 
parameter", 50 | "parameter": "country_indeed", 51 | "message": "country_indeed is required when searching Indeed or Glassdoor.", 52 | "suggestion": "Specify a supported country using the country_indeed parameter. See documentation for valid values." 53 | } 54 | ) 55 | if country_indeed not in SUPPORTED_COUNTRIES_INDEED: 56 | raise HTTPException( 57 | status_code=400, 58 | detail={ 59 | "error": "Invalid country_indeed value", 60 | "invalid_value": country_indeed, 61 | "valid_countries": sorted(SUPPORTED_COUNTRIES_INDEED), 62 | "suggestion": "Use one of the supported country names exactly as listed in the documentation." 63 | } 64 | ) 65 | # Parameter conflict logic for Indeed 66 | if "indeed" in snames: 67 | conflict_params = [] 68 | if hours_old is not None: 69 | if (job_type is not None or is_remote is not None) or (easy_apply is not None): 70 | conflict_params = ["hours_old", "job_type/is_remote", "easy_apply"] 71 | elif (job_type is not None or is_remote is not None) and easy_apply is not None: 72 | conflict_params = ["job_type/is_remote", "easy_apply"] 73 | if conflict_params: 74 | raise HTTPException( 75 | status_code=400, 76 | detail={ 77 | "error": "Parameter conflict for Indeed", 78 | "conflicting_parameters": conflict_params, 79 | "message": ( 80 | "Indeed searches only support one of the following at a time: " 81 | "hours_old, (job_type & is_remote), or easy_apply." 82 | ), 83 | "suggestion": ( 84 | "Remove one or more of these parameters so that only one group is used per search. " 85 | "See documentation for details." 86 | ) 87 | } 88 | ) 89 | # Parameter conflict logic for LinkedIn 90 | if "linkedin" in snames: 91 | if hours_old is not None and easy_apply is not None: 92 | raise HTTPException( 93 | status_code=400, 94 | detail={ 95 | "error": "Parameter conflict for LinkedIn", 96 | "conflicting_parameters": ["hours_old", "easy_apply"], 97 | "message": ( 98 | "LinkedIn searches only support one of the following at a time: hours_old or easy_apply." 99 | ), 100 | "suggestion": ( 101 | "Remove either hours_old or easy_apply from your search parameters." 
102 | ) 103 | } 104 | ) 105 | 106 | # --- General parameter validation --- 107 | errors = [] 108 | # site_name 109 | if site_name: 110 | for s in site_name: 111 | if s not in VALID_PARAMETERS["site_name"]: 112 | errors.append(get_parameter_suggestion("site_name", s)) 113 | # job_type 114 | if job_type and job_type not in VALID_PARAMETERS["job_type"]: 115 | errors.append(get_parameter_suggestion("job_type", job_type)) 116 | # description_format 117 | if description_format and description_format not in VALID_PARAMETERS["description_format"]: 118 | errors.append(get_parameter_suggestion("description_format", description_format)) 119 | # verbose 120 | if verbose is not None and verbose not in VALID_PARAMETERS["verbose"]: 121 | errors.append(get_parameter_suggestion("verbose", verbose)) 122 | # page_size 123 | if page_size is not None and (page_size < 1 or page_size > 100): 124 | errors.append(get_parameter_suggestion("page_size", page_size)) 125 | # paginate 126 | if paginate is not None and paginate not in [True, False, 0, 1]: 127 | errors.append(get_parameter_suggestion("paginate", paginate)) 128 | # page 129 | if page is not None and page < 1: 130 | errors.append(get_parameter_suggestion("page", page)) 131 | # If any errors, raise with all suggestions 132 | if errors: 133 | raise HTTPException( 134 | status_code=400, 135 | detail={ 136 | "error": "Invalid parameter(s)", 137 | "suggestions": errors, 138 | } 139 | ) 140 | 141 | @router.get("/search_jobs", response_model=Union[JobResponse, PaginatedJobResponse], dependencies=[Depends(get_api_key)]) 142 | async def search_jobs( 143 | request: Request, 144 | # Pagination parameters 145 | paginate: bool = Query(False, description="Enable pagination"), 146 | page: int = Query(1, ge=1, description="Page number (if pagination enabled)"), 147 | page_size: int = Query(10, ge=1, le=100, description="Items per page (if pagination enabled)"), 148 | 149 | # Basic search parameters 150 | site_name: List[str] = Query(default=None, description="Job sites to search on"), 151 | search_term: str = Query(None, description="Job search term"), 152 | google_search_term: Optional[str] = Query(None, description="Search term for Google jobs"), 153 | location: str = Query(None, description="Job location"), 154 | distance: int = Query(None, description="Distance in miles"), 155 | 156 | # Job filters 157 | job_type: Optional[str] = Query(None, description="Job type (fulltime, parttime, internship, contract)"), 158 | is_remote: Optional[bool] = Query(None, description="Remote job filter"), 159 | hours_old: Optional[int] = Query(None, description="Filter by hours since posting"), 160 | easy_apply: Optional[bool] = Query(None, description="Filter for easy apply jobs"), 161 | 162 | # Advanced parameters 163 | results_wanted: int = Query(None, description="Number of results per site"), 164 | description_format: str = Query(None, description="Format of job description"), 165 | offset: int = Query(None, description="Offset for pagination"), 166 | verbose: int = Query(None, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)"), 167 | linkedin_fetch_description: bool = Query(None, description="Fetch full LinkedIn descriptions"), 168 | linkedin_company_ids: Optional[List[int]] = Query(None, description="LinkedIn company IDs to filter by"), 169 | country_indeed: Optional[str] = Query(None, description="Country filter for Indeed & Glassdoor"), 170 | enforce_annual_salary: bool = Query(None, description="Convert wages to annual salary"), 171 | ): 172 | 
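    # A minimal usage sketch (illustrative host and values; the x-api-key header is only needed when ENABLE_API_KEY_AUTH is on and keys are configured):
    #   curl -H "x-api-key: YOUR_KEY" \
    #     "http://localhost:8000/api/v1/search_jobs?site_name=indeed&search_term=python&location=Boston&country_indeed=USA&paginate=true&page=1&page_size=10"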
""" 173 | Search for jobs across multiple platforms with optional pagination. 174 | 175 | If paginate=True, returns paginated results with next/previous page links. 176 | Otherwise, returns all results in a single response. 177 | """ 178 | request_id = str(uuid.uuid4()) 179 | start_time = time.time() 180 | 181 | validate_job_search_params( 182 | site_name=site_name, 183 | country_indeed=country_indeed, 184 | hours_old=hours_old, 185 | job_type=job_type, 186 | is_remote=is_remote, 187 | easy_apply=easy_apply, 188 | description_format=description_format, 189 | verbose=verbose, 190 | page=page, 191 | page_size=page_size, 192 | paginate=paginate, 193 | ) 194 | 195 | # Validate site_name values 196 | if site_name: 197 | invalid_sites = [site for site in site_name if site not in VALID_PARAMETERS["site_name"]] 198 | if invalid_sites: 199 | suggestions = [get_parameter_suggestion("site_name", site) for site in invalid_sites] 200 | raise HTTPException( 201 | status_code=400, 202 | detail={ 203 | "error": "Invalid job site name(s)", 204 | "invalid_values": invalid_sites, 205 | "valid_sites": VALID_PARAMETERS["site_name"], 206 | "suggestions": suggestions 207 | } 208 | ) 209 | 210 | # Validate job_type 211 | if job_type and job_type not in VALID_PARAMETERS["job_type"]: 212 | suggestion = get_parameter_suggestion("job_type", job_type) 213 | raise HTTPException( 214 | status_code=400, 215 | detail={ 216 | "error": "Invalid job type", 217 | "invalid_value": job_type, 218 | "valid_types": VALID_PARAMETERS["job_type"], 219 | "suggestion": suggestion 220 | } 221 | ) 222 | 223 | # Validate description_format 224 | if description_format and description_format not in VALID_PARAMETERS["description_format"]: 225 | suggestion = get_parameter_suggestion("description_format", description_format) 226 | raise HTTPException( 227 | status_code=400, 228 | detail={ 229 | "error": "Invalid description format", 230 | "invalid_value": description_format, 231 | "valid_formats": VALID_PARAMETERS["description_format"], 232 | "suggestion": suggestion 233 | } 234 | ) 235 | 236 | # Create parameters object with all search parameters 237 | params = JobSearchParams( 238 | site_name=site_name if site_name else settings.DEFAULT_SITE_NAMES, 239 | search_term=search_term, 240 | google_search_term=google_search_term, 241 | location=location, 242 | distance=distance if distance is not None else settings.DEFAULT_DISTANCE, 243 | job_type=job_type, 244 | proxies=settings.DEFAULT_PROXIES if settings.DEFAULT_PROXIES else None, 245 | is_remote=is_remote, 246 | results_wanted=results_wanted if results_wanted is not None else settings.DEFAULT_RESULTS_WANTED, 247 | hours_old=hours_old, 248 | easy_apply=easy_apply, 249 | description_format=description_format if description_format else settings.DEFAULT_DESCRIPTION_FORMAT, 250 | offset=offset if offset is not None else 0, 251 | verbose=verbose if verbose is not None else 2, 252 | linkedin_fetch_description=linkedin_fetch_description if linkedin_fetch_description is not None else False, 253 | linkedin_company_ids=linkedin_company_ids, 254 | country_indeed=country_indeed if country_indeed else settings.DEFAULT_COUNTRY_INDEED, 255 | enforce_annual_salary=enforce_annual_salary if enforce_annual_salary is not None else False, 256 | ca_cert=settings.CA_CERT_PATH, 257 | ) 258 | 259 | logger.info(f"Request {request_id}: Starting job search with parameters: {params.dict(exclude_none=True)}") 260 | 261 | try: 262 | # Execute the search 263 | jobs_df, is_cached = 
JobService.search_jobs(params.dict(exclude_none=True)) 264 | 265 | # Return results - either paginated or all at once 266 | if paginate: 267 | # Calculate pagination 268 | total_items = len(jobs_df) 269 | total_pages = (total_items + page_size - 1) // page_size if total_items > 0 else 1 270 | 271 | # Validate page number 272 | if page > total_pages and total_pages > 0: 273 | raise HTTPException( 274 | status_code=404, 275 | detail={ 276 | "error": f"Page {page} not found", 277 | "total_pages": total_pages, 278 | "suggestion": f"Use a page number between 1 and {total_pages}" 279 | } 280 | ) 281 | 282 | # Apply pagination 283 | start_idx = (page - 1) * page_size 284 | end_idx = min(start_idx + page_size, total_items) 285 | paginated_df = jobs_df.iloc[start_idx:end_idx] if total_items > 0 else jobs_df 286 | 287 | # Generate next/previous page URLs 288 | base_url = str(request.url).split("?")[0] 289 | query_params = dict(request.query_params) 290 | 291 | next_page = None 292 | if page < total_pages: 293 | query_params["page"] = str(page + 1) 294 | next_page = f"{base_url}?{'&'.join([f'{k}={v}' for k, v in query_params.items()])}" 295 | 296 | previous_page = None 297 | if page > 1: 298 | query_params["page"] = str(page - 1) 299 | previous_page = f"{base_url}?{'&'.join([f'{k}={v}' for k, v in query_params.items()])}" 300 | 301 | # Convert DataFrame to dictionary format 302 | jobs_list = paginated_df.to_dict('records') if not paginated_df.empty else [] 303 | 304 | end_time = time.time() 305 | logger.info(f"Request {request_id}: Completed in {end_time - start_time:.2f} seconds. Found {total_items} jobs, returning page {page}/{total_pages}") 306 | 307 | return { 308 | "count": total_items, 309 | "total_pages": total_pages, 310 | "current_page": page, 311 | "page_size": page_size, 312 | "jobs": jobs_list, 313 | "cached": is_cached, 314 | "next_page": next_page, 315 | "previous_page": previous_page 316 | } 317 | else: 318 | # Return all results without pagination 319 | jobs_list = jobs_df.to_dict('records') if not jobs_df.empty else [] 320 | 321 | end_time = time.time() 322 | logger.info(f"Request {request_id}: Completed in {end_time - start_time:.2f} seconds. Found {len(jobs_list)} jobs") 323 | 324 | return { 325 | "count": len(jobs_list), 326 | "jobs": jobs_list, 327 | "cached": is_cached 328 | } 329 | except Exception as e: 330 | if isinstance(e, HTTPException): 331 | raise e 332 | 333 | logger.error(f"Request {request_id}: Error scraping jobs: {str(e)}") 334 | logger.debug(traceback.format_exc()) 335 | 336 | # Provide more helpful error details 337 | error_message = str(e) 338 | suggestion = "Try simplifying your search or using fewer job sites" 339 | 340 | if "proxy" in error_message.lower(): 341 | suggestion = "Check your proxy configuration or try without a proxy" 342 | elif "timeout" in error_message.lower(): 343 | suggestion = "The request timed out. Try reducing the number of job sites or results_wanted" 344 | elif "captcha" in error_message.lower(): 345 | suggestion = "A CAPTCHA was encountered. 
Try using a different proxy or reduce request frequency" 346 | 347 | raise HTTPException( 348 | status_code=500, 349 | detail={ 350 | "error": "Error scraping jobs", 351 | "message": error_message, 352 | "suggestion": suggestion 353 | } 354 | ) 355 | 356 | @router.post("/search_jobs", response_model=Union[JobResponse, PaginatedJobResponse], dependencies=[Depends(get_api_key)]) 357 | async def search_jobs_post( 358 | params: JobSearchParams, 359 | request: Request, 360 | ): 361 | """ 362 | Search for jobs across multiple platforms using POST method. 363 | """ 364 | request_id = str(uuid.uuid4()) 365 | start_time = time.time() 366 | 367 | validate_job_search_params( 368 | site_name=params.site_name if isinstance(params.site_name, list) else [params.site_name], 369 | country_indeed=params.country_indeed, 370 | hours_old=params.hours_old, 371 | job_type=params.job_type, 372 | is_remote=params.is_remote, 373 | easy_apply=params.easy_apply, 374 | description_format=params.description_format, 375 | verbose=params.verbose, 376 | page=getattr(params, "page", None), 377 | page_size=getattr(params, "page_size", None), 378 | paginate=getattr(params, "paginate", None), 379 | ) 380 | 381 | logger.info(f"Request {request_id}: Starting job search with parameters: {params.dict(exclude_none=True)}") 382 | 383 | try: 384 | # Execute the search 385 | jobs_df, is_cached = JobService.search_jobs(params.dict(exclude_none=True)) 386 | 387 | # Return all results without pagination 388 | jobs_list = jobs_df.to_dict('records') if not jobs_df.empty else [] 389 | 390 | end_time = time.time() 391 | logger.info(f"Request {request_id}: Completed in {end_time - start_time:.2f} seconds. Found {len(jobs_list)} jobs") 392 | 393 | return { 394 | "count": len(jobs_list), 395 | "jobs": jobs_list, 396 | "cached": is_cached 397 | } 398 | except Exception as e: 399 | if isinstance(e, HTTPException): 400 | raise e 401 | 402 | logger.error(f"Request {request_id}: Error scraping jobs: {str(e)}") 403 | logger.debug(traceback.format_exc()) 404 | 405 | # Provide more helpful error details 406 | error_message = str(e) 407 | suggestion = "Try simplifying your search or using fewer job sites" 408 | 409 | if "proxy" in error_message.lower(): 410 | suggestion = "Check your proxy configuration or try without a proxy" 411 | elif "timeout" in error_message.lower(): 412 | suggestion = "The request timed out. Try reducing the number of job sites or results_wanted" 413 | elif "captcha" in error_message.lower(): 414 | suggestion = "A CAPTCHA was encountered. 
Try using a different proxy or reduce request frequency" 415 | 416 | raise HTTPException( 417 | status_code=500, 418 | detail={ 419 | "error": "Error scraping jobs", 420 | "message": error_message, 421 | "suggestion": suggestion 422 | } 423 | ) 424 | -------------------------------------------------------------------------------- /app/routes/health.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Request, Depends, HTTPException, status 2 | from app.models import HealthCheck 3 | from app.core.config import settings as core_settings 4 | from app.config import settings as app_settings 5 | import logging 6 | import os 7 | import platform 8 | import time 9 | from app.utils.auth_health import check_auth_configuration 10 | 11 | router = APIRouter() 12 | logger = logging.getLogger(__name__) 13 | 14 | # Create a dependency to check if health endpoints are enabled 15 | async def verify_health_enabled(): 16 | """Verify that health endpoints are enabled via configuration.""" 17 | if not app_settings.ENABLE_HEALTH_ENDPOINTS: 18 | raise HTTPException( 19 | status_code=status.HTTP_404_NOT_FOUND, 20 | detail="Health endpoints are disabled" 21 | ) 22 | return True 23 | 24 | @router.get("/health", response_model=HealthCheck, tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 25 | async def health_check(): 26 | """ 27 | Health check endpoint to verify the API is running correctly and return system status 28 | """ 29 | # Get authentication status 30 | auth_status = check_auth_configuration() 31 | 32 | # Build response with all the requested information 33 | return HealthCheck( 34 | status="ok", 35 | version="1.0.0", 36 | environment=app_settings.ENVIRONMENT, 37 | log_level=app_settings.LOG_LEVEL, 38 | auth={ 39 | "enabled": app_settings.ENABLE_API_KEY_AUTH, 40 | "api_keys_configured": bool(app_settings.API_KEYS), 41 | "api_keys_count": len(app_settings.API_KEYS) if app_settings.API_KEYS else 0, 42 | "inconsistent": auth_status["inconsistent_config"], 43 | }, 44 | rate_limiting={ 45 | "enabled": app_settings.RATE_LIMIT_ENABLED, 46 | "requests_limit": app_settings.RATE_LIMIT_REQUESTS, 47 | "timeframe_seconds": app_settings.RATE_LIMIT_TIMEFRAME, 48 | }, 49 | cache={ 50 | "enabled": app_settings.ENABLE_CACHE, 51 | "expiry_seconds": app_settings.CACHE_EXPIRY, 52 | }, 53 | health_endpoints={ 54 | "enabled": app_settings.ENABLE_HEALTH_ENDPOINTS, 55 | "detailed_health": app_settings.ENABLE_DETAILED_HEALTH, 56 | }, 57 | config={ 58 | "default_site_names": app_settings.DEFAULT_SITE_NAMES, 59 | "default_results_wanted": app_settings.DEFAULT_RESULTS_WANTED, 60 | "default_distance": app_settings.DEFAULT_DISTANCE, 61 | "default_description_format": app_settings.DEFAULT_DESCRIPTION_FORMAT, 62 | "default_country_indeed": app_settings.DEFAULT_COUNTRY_INDEED, 63 | }, 64 | timestamp=time.time() 65 | ) 66 | 67 | @router.get("/ping", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 68 | async def ping(): 69 | """ 70 | Simple ping endpoint for load balancers and monitoring 71 | """ 72 | return {"status": "ok"} 73 | 74 | @router.get("/auth-status", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 75 | async def auth_status(request: Request): 76 | """ 77 | Diagnostic endpoint to check authentication settings 78 | """ 79 | logger.info("Auth status endpoint called") 80 | 81 | # Check if the request has the API key header 82 | api_key_header_name = "X-API-Key" 83 | api_key_in_request = 
request.headers.get(api_key_header_name) 84 | 85 | return { 86 | "api_key_configured": bool(core_settings.API_KEY), 87 | "api_key_header_name": api_key_header_name, 88 | "api_key_in_request": bool(api_key_in_request), 89 | "authentication_enabled": bool(core_settings.API_KEY), 90 | "environment": core_settings.ENVIRONMENT if hasattr(core_settings, "ENVIRONMENT") else app_settings.ENVIRONMENT 91 | } 92 | 93 | @router.get("/api-config", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 94 | async def api_config(): 95 | """ 96 | Diagnostic endpoint to check API configuration settings 97 | """ 98 | logger.info("API configuration endpoint called") 99 | 100 | # Only provide detailed info if it's enabled 101 | if not app_settings.ENABLE_DETAILED_HEALTH: 102 | return { 103 | "status": "ok", 104 | "message": "Detailed health information is disabled. Enable with ENABLE_DETAILED_HEALTH=true" 105 | } 106 | 107 | # Build comprehensive config information 108 | system_info = { 109 | "platform": platform.platform(), 110 | "python_version": platform.python_version(), 111 | } 112 | 113 | # Configuration information 114 | config = { 115 | "environment": app_settings.ENVIRONMENT, 116 | "log_level": app_settings.LOG_LEVEL, 117 | "authentication": { 118 | "enabled": app_settings.ENABLE_API_KEY_AUTH, 119 | "api_keys_configured": bool(app_settings.API_KEYS), 120 | "api_keys_count": len(app_settings.API_KEYS) if app_settings.API_KEYS else 0, 121 | "header_name": app_settings.API_KEY_HEADER_NAME, 122 | }, 123 | "rate_limiting": { 124 | "enabled": app_settings.RATE_LIMIT_ENABLED, 125 | "requests_limit": app_settings.RATE_LIMIT_REQUESTS, 126 | "timeframe_seconds": app_settings.RATE_LIMIT_TIMEFRAME, 127 | }, 128 | "caching": { 129 | "enabled": app_settings.ENABLE_CACHE, 130 | "expiry_seconds": app_settings.CACHE_EXPIRY, 131 | }, 132 | "health_endpoints": { 133 | "enabled": app_settings.ENABLE_HEALTH_ENDPOINTS, 134 | "detailed_health": app_settings.ENABLE_DETAILED_HEALTH, 135 | }, 136 | } 137 | 138 | return { 139 | "status": "ok", 140 | "system": system_info, 141 | "config": config, 142 | "timestamp": time.time() 143 | } 144 | 145 | @router.get("/config-sources", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 146 | async def config_sources(): 147 | """ 148 | Diagnostic endpoint to view the source of each configuration setting 149 | """ 150 | logger.info("Configuration sources endpoint called") 151 | 152 | # Only provide detailed info if it's enabled 153 | if not app_settings.ENABLE_DETAILED_HEALTH: 154 | return { 155 | "status": "ok", 156 | "message": "Detailed health information is disabled. 
Enable with ENABLE_DETAILED_HEALTH=true" 157 | } 158 | 159 | # Get all settings with their sources 160 | settings_with_sources = app_settings.get_all_settings() 161 | 162 | # Format for output, focusing on key settings 163 | important_settings = [ 164 | "ENABLE_API_KEY_AUTH", "API_KEYS", "RATE_LIMIT_ENABLED", 165 | "ENABLE_CACHE", "ENVIRONMENT", "LOG_LEVEL" 166 | ] 167 | 168 | focused_settings = {k: settings_with_sources[k] for k in important_settings if k in settings_with_sources} 169 | 170 | # Check for configuration inconsistencies 171 | auth_status = check_auth_configuration() 172 | inconsistencies = [] 173 | 174 | if auth_status["inconsistent_config"]: 175 | inconsistencies.extend(auth_status["recommendations"]) 176 | 177 | return { 178 | "status": "ok", 179 | "key_settings": focused_settings, 180 | "all_settings": settings_with_sources, 181 | "inconsistencies": inconsistencies, 182 | "timestamp": time.time() 183 | } 184 | -------------------------------------------------------------------------------- /app/services/__init__.py: -------------------------------------------------------------------------------- 1 | """Business logic services.""" 2 | -------------------------------------------------------------------------------- /app/services/background_service.py: -------------------------------------------------------------------------------- 1 | """Background job processing for JobSpy Docker API.""" 2 | import asyncio 3 | from typing import Dict, Any, Optional 4 | import uuid 5 | import logging 6 | from datetime import datetime 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | # Simple in-memory job storage (would use a database in production) 11 | jobs = {} 12 | 13 | async def process_job_async(job_id: str, search_function, params: Dict[str, Any]): 14 | """Process a job asynchronously.""" 15 | try: 16 | logger.info(f"Starting background job {job_id}") 17 | jobs[job_id]["status"] = "running" 18 | 19 | # Execute the search 20 | result, is_cached = await asyncio.to_thread(search_function, params) 21 | 22 | # Store result 23 | jobs[job_id]["status"] = "completed" 24 | jobs[job_id]["result"] = result 25 | jobs[job_id]["is_cached"] = is_cached 26 | jobs[job_id]["completed_at"] = datetime.now().isoformat() 27 | 28 | logger.info(f"Completed background job {job_id}") 29 | except Exception as e: 30 | logger.error(f"Error processing job {job_id}: {str(e)}") 31 | jobs[job_id]["status"] = "failed" 32 | jobs[job_id]["error"] = str(e) 33 | 34 | def create_background_job(search_function, params: Dict[str, Any]) -> str: 35 | """Create a new background job.""" 36 | job_id = str(uuid.uuid4()) 37 | jobs[job_id] = { 38 | "id": job_id, 39 | "status": "pending", 40 | "created_at": datetime.now().isoformat(), 41 | "params": params, 42 | } 43 | 44 | # Start the background task 45 | asyncio.create_task(process_job_async(job_id, search_function, params)) 46 | 47 | return job_id 48 | 49 | def get_job_status(job_id: str) -> Optional[Dict[str, Any]]: 50 | """Get the status of a job.""" 51 | return jobs.get(job_id) 52 | -------------------------------------------------------------------------------- /app/services/external_service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import time 3 | import httpx 4 | from app.core.logging_config import get_logger 5 | 6 | logger = get_logger("services.external_service") 7 | 8 | 9 | async def fetch_data_from_external_api(url: str, params: dict = None): 10 | start_time = time.time() 11 | request_id = 
str(uuid.uuid4())
12 | 
13 |     logger.debug(f"External API request {request_id} started: GET {url} - Params: {params}")
14 | 
15 |     try:
16 |         async with httpx.AsyncClient() as client:
17 |             response = await client.get(url, params=params)
18 | 
19 |         elapsed_time = time.time() - start_time
20 |         logger.debug(
21 |             f"External API request {request_id} completed: GET {url} - "
22 |             f"Status: {response.status_code} - Time: {elapsed_time:.3f}s"
23 |         )
24 | 
25 |         response.raise_for_status()
26 |         return response.json()
27 |     except httpx.HTTPStatusError as e:
28 |         elapsed_time = time.time() - start_time
29 |         logger.error(
30 |             f"External API request {request_id} failed with status {e.response.status_code}: "
31 |             f"GET {url} - Time: {elapsed_time:.3f}s - Response: {e.response.text}"
32 |         )
33 |         raise
34 |     except Exception as e:
35 |         elapsed_time = time.time() - start_time
36 |         logger.exception(
37 |             f"External API request {request_id} failed: GET {url} - "
38 |             f"Time: {elapsed_time:.3f}s - Error: {str(e)}"
39 |         )
40 |         raise
-------------------------------------------------------------------------------- /app/services/job_service.py: --------------------------------------------------------------------------------
1 | """Job search service layer."""
2 | from typing import Any, Dict, Tuple
3 | import pandas as pd
4 | from jobspy import scrape_jobs
5 | import logging
6 | 
7 | from app.config import settings
8 | from app.cache import cache
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | class JobService:
13 |     """Service for interacting with JobSpy library."""
14 | 
15 |     @staticmethod
16 |     def search_jobs(params: Dict[str, Any]) -> Tuple[pd.DataFrame, bool]:
17 |         """
18 |         Execute a job search using the JobSpy library.
19 | 
20 |         Args:
21 |             params: Dictionary of search parameters
22 | 
23 |         Returns:
24 |             Tuple of (DataFrame containing job results, True if the results came from the cache)
25 |         """
26 |         # Apply default proxies from env if none provided
27 |         if params.get('proxies') is None and settings.DEFAULT_PROXIES:
28 |             params['proxies'] = settings.DEFAULT_PROXIES
29 | 
30 |         # Apply default CA cert path if none provided
31 |         if params.get('ca_cert') is None and settings.CA_CERT_PATH:
32 |             params['ca_cert'] = settings.CA_CERT_PATH
33 | 
34 |         # Apply default country_indeed if none provided
35 |         if params.get('country_indeed') is None and settings.DEFAULT_COUNTRY_INDEED:
36 |             params['country_indeed'] = settings.DEFAULT_COUNTRY_INDEED
37 | 
38 |         # Check cache first
39 |         cached_results = cache.get(params)
40 |         if cached_results is not None:
41 |             logger.info(f"Returning cached results with {len(cached_results)} jobs")
42 |             return cached_results, True
43 | 
44 |         # Execute search
45 |         jobs_df = scrape_jobs(**params)
46 | 
47 |         # Cache the results
48 |         cache.set(params, jobs_df)
49 | 
50 |         return jobs_df, False
51 | 
52 |     @staticmethod
53 |     def filter_jobs(jobs_df: pd.DataFrame, filters: Dict[str, Any]) -> pd.DataFrame:
54 |         """Filter job results based on criteria."""
55 |         filtered_df = jobs_df.copy()
56 | 
57 |         # Filter by salary range
58 |         if 'min_salary' in filters and filters['min_salary'] is not None:
59 |             # Convert to numeric first to handle comparison properly
60 |             filtered_df = filtered_df[filtered_df['MIN_AMOUNT'].astype(float) >= float(filters['min_salary'])]
61 | 
62 |         if 'max_salary' in filters and filters['max_salary'] is not None:
63 |             filtered_df = filtered_df[filtered_df['MAX_AMOUNT'].astype(float) <= float(filters['max_salary'])]
64 | 
65 |         # Filter by company
66 |         if 'company' in filters and filters['company']:
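            # Case-insensitive substring match; na=False treats rows with a missing company name as non-matches
67 |             filtered_df = 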
filtered_df[filtered_df['COMPANY'].str.contains(filters['company'], case=False, na=False)] 68 | 69 | # Filter by job type 70 | if 'job_type' in filters and filters['job_type']: 71 | filtered_df = filtered_df[filtered_df['JOB_TYPE'] == filters['job_type']] 72 | 73 | # Filter by location 74 | if 'city' in filters and filters['city']: 75 | filtered_df = filtered_df[filtered_df['CITY'].str.contains(filters['city'], case=False, na=False)] 76 | 77 | if 'state' in filters and filters['state']: 78 | filtered_df = filtered_df[filtered_df['STATE'].str.contains(filters['state'], case=False, na=False)] 79 | 80 | # Filter by keyword in title 81 | if 'title_keywords' in filters and filters['title_keywords']: 82 | filtered_df = filtered_df[filtered_df['TITLE'].str.contains(filters['title_keywords'], case=False, na=False)] 83 | 84 | return filtered_df 85 | 86 | @staticmethod 87 | def sort_jobs(jobs_df: pd.DataFrame, sort_by: str, sort_order: str = 'desc') -> pd.DataFrame: 88 | """Sort job results by specified field.""" 89 | if not sort_by or sort_by not in jobs_df.columns: 90 | return jobs_df 91 | 92 | ascending = sort_order.lower() != 'desc' 93 | return jobs_df.sort_values(by=sort_by, ascending=ascending) 94 | -------------------------------------------------------------------------------- /app/utils/auth_health.py: -------------------------------------------------------------------------------- 1 | """Utility functions for checking authentication health.""" 2 | import logging 3 | from typing import Dict, Any 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def check_auth_configuration() -> Dict[str, Any]: 8 | """ 9 | Check the authentication configuration and return status details. 10 | This helps diagnose authentication issues by checking all relevant settings. 11 | """ 12 | # Import here to avoid circular imports 13 | from app.core.config import settings as core_settings 14 | from app.config import settings as app_settings 15 | 16 | # Check core settings 17 | core_api_key_set = bool(core_settings.API_KEY) 18 | 19 | # Check app settings 20 | app_auth_enabled = app_settings.ENABLE_API_KEY_AUTH 21 | app_keys_configured = bool(app_settings.API_KEYS) 22 | app_keys_count = len(app_settings.API_KEYS) 23 | 24 | # Check for configuration inconsistencies 25 | inconsistent_config = (app_keys_configured and not app_auth_enabled) 26 | 27 | # Generate recommendations 28 | recommendations = [] 29 | if inconsistent_config: 30 | recommendations.append( 31 | "API keys are configured but authentication is disabled. Consider enabling ENABLE_API_KEY_AUTH." 32 | ) 33 | logger.warning("API keys are configured but authentication is disabled. 
This may lead to unexpected behavior.") 34 | 35 | # Determine if authentication is needed based on both configs 36 | auth_required = core_api_key_set or (app_auth_enabled and app_keys_configured) 37 | 38 | # Log configuration sources 39 | logger.debug(f"API keys loaded from: {app_settings.API_KEYS_SOURCE}") 40 | logger.debug(f"Auth enabled setting loaded from: {app_settings.ENABLE_API_KEY_AUTH_SOURCE}") 41 | 42 | return { 43 | "auth_required": auth_required, 44 | "core_settings": { 45 | "api_key_configured": core_api_key_set, 46 | }, 47 | "app_settings": { 48 | "auth_enabled": app_auth_enabled, 49 | "api_keys_configured": app_keys_configured, 50 | "api_keys_count": app_keys_count, 51 | "header_name": app_settings.API_KEY_HEADER_NAME, 52 | "api_keys_source": app_settings.API_KEYS_SOURCE, 53 | "auth_enabled_source": app_settings.ENABLE_API_KEY_AUTH_SOURCE, 54 | }, 55 | "inconsistent_config": inconsistent_config, 56 | "recommendations": recommendations 57 | } 58 | -------------------------------------------------------------------------------- /app/utils/env_debugger.py: -------------------------------------------------------------------------------- 1 | """Utility to debug environment variable loading.""" 2 | import logging 3 | import os 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def log_environment_settings(): 8 | """ 9 | Log all environment variables relevant to application configuration. 10 | This helps diagnose when environment variables aren't being loaded correctly. 11 | """ 12 | env_vars = { 13 | "API_KEYS": os.getenv("API_KEYS", "[not set]"), 14 | "ENABLE_API_KEY_AUTH": os.getenv("ENABLE_API_KEY_AUTH", "[not set]"), 15 | "API_KEY_HEADER_NAME": os.getenv("API_KEY_HEADER_NAME", "[not set]"), 16 | "RATE_LIMIT_ENABLED": os.getenv("RATE_LIMIT_ENABLED", "[not set]"), 17 | "RATE_LIMIT_REQUESTS": os.getenv("RATE_LIMIT_REQUESTS", "[not set]"), 18 | "RATE_LIMIT_TIMEFRAME": os.getenv("RATE_LIMIT_TIMEFRAME", "[not set]"), 19 | "DEFAULT_PROXIES": os.getenv("DEFAULT_PROXIES", "[not set]"), 20 | "DEFAULT_SITE_NAMES": os.getenv("DEFAULT_SITE_NAMES", "[not set]"), 21 | "ENABLE_CACHE": os.getenv("ENABLE_CACHE", "[not set]"), 22 | "CACHE_EXPIRY": os.getenv("CACHE_EXPIRY", "[not set]"), 23 | "ENVIRONMENT": os.getenv("ENVIRONMENT", "[not set]"), 24 | "LOG_LEVEL": os.getenv("LOG_LEVEL", "[not set]"), 25 | } 26 | 27 | # Mask sensitive values 28 | if env_vars["API_KEYS"] != "[not set]": 29 | env_vars["API_KEYS"] = "****[MASKED]****" 30 | 31 | # Log all relevant environment variables 32 | logger.info("Environment variables loaded:") 33 | for key, value in env_vars.items(): 34 | logger.info(f" {key}={value}") 35 | 36 | return env_vars 37 | -------------------------------------------------------------------------------- /app/utils/error_handlers.py: -------------------------------------------------------------------------------- 1 | """Error handling utilities for the API.""" 2 | from fastapi import Request, status 3 | from fastapi.responses import JSONResponse 4 | from fastapi.exceptions import RequestValidationError 5 | from starlette.exceptions import HTTPException as StarletteHTTPException 6 | import logging 7 | 8 | from app.utils.validation_helpers import generate_error_suggestions, get_parameter_suggestion 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | async def validation_exception_handler(request: Request, exc: RequestValidationError): 13 | """Handle validation errors in a consistent way.""" 14 | error_details = [] 15 | for error in exc.errors(): 16 | error_details.append({ 17 | 
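            # Each entry flattens one FastAPI/Pydantic error dict; e.g. a page_size above its le=100 bound yields loc ("query", "page_size") and msg "ensure this value is less than or equal to 100"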
"location": error["loc"], 18 | "message": error["msg"], 19 | "type": error["type"] 20 | }) 21 | 22 | logger.warning(f"Validation error: {error_details}") 23 | 24 | # Generate helpful suggestions 25 | suggestions = generate_error_suggestions(error_details) 26 | 27 | return JSONResponse( 28 | status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, 29 | content={ 30 | "error": "Validation Error", 31 | "details": error_details, 32 | "path": request.url.path, 33 | "suggestions": suggestions, 34 | "documentation_url": "/docs" 35 | } 36 | ) 37 | 38 | async def http_exception_handler(request: Request, exc: StarletteHTTPException): 39 | """Handle HTTP exceptions with consistent response format.""" 40 | logger.warning(f"HTTP exception: {exc.status_code} - {exc.detail}") 41 | 42 | # Create a content object with standard fields 43 | content = { 44 | "error": "Request Error", 45 | "status_code": exc.status_code, 46 | "message": exc.detail, 47 | "path": request.url.path 48 | } 49 | 50 | # Add suggestions for common errors 51 | if exc.status_code == status.HTTP_403_FORBIDDEN: 52 | if "API Key" in exc.detail: 53 | content["suggestions"] = [{ 54 | "parameter": "x-api-key", 55 | "message": "Missing or invalid API key", 56 | "suggestion": "Include a valid API key in the x-api-key header", 57 | "documentation_url": "/docs#section/Authentication" 58 | }] 59 | elif exc.status_code == status.HTTP_404_NOT_FOUND: 60 | if "Page" in exc.detail and "not found" in exc.detail: 61 | content["suggestions"] = [{ 62 | "parameter": "page", 63 | "message": "Page number out of range", 64 | "suggestion": "Use a page number within the available range", 65 | }] 66 | 67 | return JSONResponse( 68 | status_code=exc.status_code, 69 | content=content, 70 | headers=exc.headers 71 | ) 72 | 73 | async def general_exception_handler(request: Request, exc: Exception): 74 | """Handle all other exceptions with consistent response format.""" 75 | logger.error(f"Unhandled exception: {str(exc)}", exc_info=True) 76 | 77 | # Create basic error response 78 | content = { 79 | "error": "Server Error", 80 | "message": str(exc), 81 | "path": request.url.path 82 | } 83 | 84 | # Add suggestions based on exception type or message 85 | if "scrape_jobs" in str(exc): 86 | content["suggestions"] = [{ 87 | "message": "Error occurred during job scraping", 88 | "suggestion": "Check your search parameters and try again with fewer job boards or results", 89 | "troubleshooting": "Try using only one job site at a time (e.g., site_name=linkedin)" 90 | }] 91 | 92 | return JSONResponse( 93 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, 94 | content=content 95 | ) 96 | -------------------------------------------------------------------------------- /app/utils/logging_config.py: -------------------------------------------------------------------------------- 1 | """Logging configuration for JobSpy Docker API.""" 2 | import logging 3 | import logging.config 4 | import os 5 | from pathlib import Path 6 | 7 | def setup_logging(log_level: str = "INFO"): 8 | """Configure logging for the application.""" 9 | log_config = { 10 | "version": 1, 11 | "disable_existing_loggers": False, 12 | "formatters": { 13 | "default": { 14 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 15 | "datefmt": "%Y-%m-%d %H:%M:%S", 16 | }, 17 | "json": { 18 | "()": "pythonjsonlogger.jsonlogger.JsonFormatter", 19 | "format": "%(asctime)s %(name)s %(levelname)s %(message)s", 20 | }, 21 | }, 22 | "handlers": { 23 | "console": { 24 | "class": "logging.StreamHandler", 25 | "formatter": 
"default", 26 | "level": log_level, 27 | }, 28 | }, 29 | "loggers": { 30 | "": {"level": log_level, "handlers": ["console"], "propagate": True}, 31 | "app": {"level": log_level, "handlers": ["console"], "propagate": False}, 32 | "uvicorn": {"level": log_level, "handlers": ["console"], "propagate": False}, 33 | }, 34 | } 35 | 36 | # Create logs directory if it doesn't exist 37 | logs_dir = Path("logs") 38 | logs_dir.mkdir(exist_ok=True) 39 | 40 | # Add file handler if not in development mode 41 | if os.environ.get("ENVIRONMENT", "development") != "development": 42 | log_config["handlers"]["file"] = { 43 | "class": "logging.handlers.RotatingFileHandler", 44 | "formatter": "json", 45 | "filename": "logs/app.log", 46 | "maxBytes": 10485760, # 10MB 47 | "backupCount": 5, 48 | "level": log_level, 49 | } 50 | log_config["loggers"][""]["handlers"].append("file") 51 | log_config["loggers"]["app"]["handlers"].append("file") 52 | 53 | logging.config.dictConfig(log_config) 54 | -------------------------------------------------------------------------------- /app/utils/logging_docs.py: -------------------------------------------------------------------------------- 1 | """Documentation for logging levels and troubleshooting.""" 2 | 3 | LOGGING_LEVELS = { 4 | "DEBUG": { 5 | "level": 10, 6 | "description": "Detailed information, typically of interest only when diagnosing problems", 7 | "use_case": "Shows detailed flow of the application, including variable values and decision points", 8 | "shows_auth_errors": True, 9 | "environment": "Development" 10 | }, 11 | "INFO": { 12 | "level": 20, 13 | "description": "Confirmation that things are working as expected", 14 | "use_case": "Normal operation events like startup, shutdown, or successful requests", 15 | "shows_auth_errors": False, 16 | "environment": "Development/Production" 17 | }, 18 | "WARNING": { 19 | "level": 30, 20 | "description": "Indication that something unexpected happened, or may happen in the near future", 21 | "use_case": "Non-critical issues like deprecation notices or improper usage", 22 | "shows_auth_errors": True, 23 | "environment": "Development/Production" 24 | }, 25 | "ERROR": { 26 | "level": 40, 27 | "description": "Due to a more serious problem, the software has not been able to perform some function", 28 | "use_case": "Exception handling and error conditions that should be investigated", 29 | "shows_auth_errors": True, 30 | "environment": "Development/Production" 31 | }, 32 | "CRITICAL": { 33 | "level": 50, 34 | "description": "A very serious error, indicating that the program itself may be unable to continue running", 35 | "use_case": "Application crashes and severe system issues", 36 | "shows_auth_errors": True, 37 | "environment": "Development/Production" 38 | } 39 | } 40 | 41 | def get_appropriate_level_for_issue(issue_type): 42 | """Get the appropriate logging level for different issue types.""" 43 | issue_levels = { 44 | "auth": ["DEBUG", "WARNING"], 45 | "api_key": ["DEBUG", "WARNING"], 46 | "request_validation": ["DEBUG", "WARNING"], 47 | "server_error": ["ERROR", "CRITICAL"], 48 | "rate_limit": ["WARNING"], 49 | "performance": ["DEBUG", "INFO"] 50 | } 51 | return issue_levels.get(issue_type, ["DEBUG"]) 52 | 53 | def get_troubleshooting_tips(): 54 | """Get troubleshooting tips for common issues.""" 55 | return { 56 | "authentication_issues": [ 57 | "Check if API_KEY is set in your environment or .env file", 58 | "Verify your requests include the X-API-Key header with the correct value", 59 | "Try the /auth-status endpoint 
to check current authentication settings", 60 | "Set LOG_LEVEL=DEBUG to see detailed authentication logging" 61 | ], 62 | "missing_api_key_error": [ 63 | "This error occurs when API_KEY is configured but not included in your request", 64 | "Either add the X-API-Key header to your request or remove the API_KEY from your settings" 65 | ], 66 | "invalid_api_key_error": [ 67 | "This error occurs when the API key in your request doesn't match the configured value", 68 | "Check the API_KEY value in your environment or .env file" 69 | ], 70 | "server_errors": [ 71 | "Check the application logs for details about the error", 72 | "Ensure all required environment variables are set", 73 | "Verify the application has appropriate permissions" 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /app/utils/validation_helpers.py: -------------------------------------------------------------------------------- 1 | """Utility functions for parameter validation and providing helpful error messages.""" 2 | from typing import Any, Dict, List, Tuple 3 | 4 | # Define valid values for different parameters 5 | VALID_PARAMETERS = { 6 | "site_name": ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"], 7 | "job_type": ["fulltime", "parttime", "internship", "contract"], 8 | "description_format": ["markdown", "html"], 9 | "verbose": [0, 1, 2], 10 | "page_size": list(range(1, 101)), 11 | "paginate": [True, False], 12 | } 13 | 14 | # Define parameter type information to improve error messages 15 | PARAMETER_TYPES = { 16 | "site_name": "string or list", 17 | "search_term": "string", 18 | "location": "string", 19 | "distance": "integer", 20 | "job_type": "string", 21 | "is_remote": "boolean", 22 | "results_wanted": "integer", 23 | "hours_old": "integer", 24 | "linkedin_fetch_description": "boolean", 25 | "linkedin_company_ids": "list of integers", 26 | "country_indeed": "string", 27 | "enforce_annual_salary": "boolean", 28 | "description_format": "string", 29 | "offset": "integer", 30 | "easy_apply": "boolean", 31 | "page": "integer", 32 | "page_size": "integer", 33 | "paginate": "boolean", 34 | } 35 | 36 | # Parameter descriptions for helpful error messages 37 | PARAMETER_DESCRIPTIONS = { 38 | "site_name": "Job sites to search on (e.g., indeed, linkedin)", 39 | "search_term": "Job search term (e.g., 'software engineer')", 40 | "location": "Job location (e.g., 'San Francisco, CA')", 41 | "distance": "Distance in miles (default: 50)", 42 | "job_type": "Type of job (e.g., fulltime, parttime)", 43 | "is_remote": "Whether to include remote jobs (true or false)", 44 | "results_wanted": "Number of job results per site", 45 | "hours_old": "Filter jobs by hours since posting", 46 | "linkedin_fetch_description": "Fetch full LinkedIn descriptions", 47 | "linkedin_company_ids": "LinkedIn company IDs to filter by", 48 | "country_indeed": "Country filter for Indeed & Glassdoor", 49 | "enforce_annual_salary": "Convert wages to annual salary", 50 | "description_format": "Format of job description (markdown, html)", 51 | "offset": "Offset for pagination", 52 | "easy_apply": "Filter for easy apply jobs", 53 | "page": "Page number for paginated results", 54 | "page_size": "Number of results per page", 55 | "paginate": "Enable pagination", 56 | } 57 | 58 | # Parameter limitations and notes 59 | PARAMETER_LIMITATIONS = { 60 | "hours_old": "Cannot be used with job_type, is_remote, or easy_apply for Indeed searches", 61 | "easy_apply": "Cannot be used with hours_old 
for LinkedIn and Indeed searches", 62 | "job_type": "Cannot be used with hours_old for Indeed searches when combined with is_remote", 63 | "page_size": "Must be between 1 and 100", 64 | } 65 | 66 | def get_parameter_suggestion(param_name: str, invalid_value: Any = None) -> Dict[str, Any]: 67 | """Generate helpful suggestions for invalid parameters.""" 68 | suggestion = { 69 | "parameter": param_name, 70 | "message": f"Invalid value for {param_name}", 71 | } 72 | 73 | # Add information about the parameter type 74 | if param_name in PARAMETER_TYPES: 75 | suggestion["expected_type"] = PARAMETER_TYPES[param_name] 76 | 77 | # Add description if available 78 | if param_name in PARAMETER_DESCRIPTIONS: 79 | suggestion["description"] = PARAMETER_DESCRIPTIONS[param_name] 80 | 81 | # Add valid values if available 82 | if param_name in VALID_PARAMETERS: 83 | suggestion["valid_values"] = VALID_PARAMETERS[param_name] 84 | 85 | # Add limitations if applicable 86 | if param_name in PARAMETER_LIMITATIONS: 87 | suggestion["limitation"] = PARAMETER_LIMITATIONS[param_name] 88 | 89 | # Add specific suggestions based on the parameter 90 | if param_name == "site_name" and invalid_value: 91 | suggestion["message"] = f"'{invalid_value}' is not a valid job site" 92 | suggestion["suggestion"] = f"Use one or more of the valid job sites: {', '.join(VALID_PARAMETERS['site_name'])}" 93 | elif param_name == "job_type" and invalid_value: 94 | suggestion["message"] = f"'{invalid_value}' is not a valid job type" 95 | suggestion["suggestion"] = f"Use one of: {', '.join(VALID_PARAMETERS['job_type'])}" 96 | elif param_name == "description_format" and invalid_value: 97 | suggestion["message"] = f"'{invalid_value}' is not a valid description format" 98 | suggestion["suggestion"] = f"Use one of: {', '.join(VALID_PARAMETERS['description_format'])}" 99 | elif param_name == "verbose" and invalid_value is not None: 100 | suggestion["message"] = f"'{invalid_value}' is not a valid verbosity level" 101 | suggestion["suggestion"] = f"Use one of: {', '.join(map(str, VALID_PARAMETERS['verbose']))}" 102 | elif param_name == "page_size" and invalid_value is not None: 103 | suggestion["message"] = f"'{invalid_value}' is not a valid page size" 104 | suggestion["suggestion"] = "Page size must be between 1 and 100" 105 | elif param_name == "paginate" and invalid_value is not None: 106 | suggestion["message"] = f"'{invalid_value}' is not a valid value for paginate" 107 | suggestion["suggestion"] = "Use true or false" 108 | 109 | return suggestion 110 | 111 | def extract_validation_location(error_location: Tuple) -> str: 112 | """Extract the parameter name from the error location tuple.""" 113 | if len(error_location) > 1: 114 | return error_location[1] 115 | return str(error_location[0]) 116 | 117 | def generate_error_suggestions(validation_errors: List[Dict]) -> List[Dict]: 118 | """Generate helpful suggestions for validation errors.""" 119 | suggestions = [] 120 | 121 | for error in validation_errors: 122 | error_type = error.get("type", "") 123 | error_loc = error.get("location", []) 124 | 125 | if not error_loc: 126 | continue 127 | 128 | param_name = extract_validation_location(error_loc) 129 | invalid_value = None 130 | 131 | # For value errors, extract the invalid value if possible 132 | if "value_error" in error_type and "message" in error: 133 | # Try to extract the invalid value from the error message (the handler stores it under "message") 134 | msg = error["message"] 135 | if "not a valid" in msg and "=" in msg: 136 | invalid_value = msg.split("=")[-1].strip().strip("'\"") 137
| 138 | suggestion = get_parameter_suggestion(param_name, invalid_value) 139 | suggestions.append(suggestion) 140 | 141 | return suggestions 142 | -------------------------------------------------------------------------------- /docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | jobspy-api: 5 | build: . 6 | image: jobspy-docker-api-dev 7 | container_name: jobspy-docker-api-dev 8 | ports: 9 | - "8000:8000" 10 | env_file: 11 | - .env 12 | - .env.local # Load .env.local to override .env values 13 | environment: 14 | # The following values intentionally override .env settings for development 15 | # These hardcoded values ensure consistent behavior in development environment 16 | - ENVIRONMENT=development 17 | - LOG_LEVEL=INFO 18 | - ENABLE_API_KEY_AUTH=true 19 | - API_KEYS=${API_KEYS:-dev-key-123} # Use from .env.local or default to dev-key 20 | - RATE_LIMIT_ENABLED=true 21 | - ENABLE_CACHE=true 22 | volumes: 23 | - .:/app 24 | command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload 25 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | jobspy-api: 5 | build: . 6 | image: jobspy-docker-api 7 | container_name: jobspy-docker-api 8 | ports: 9 | - "8000:8000" 10 | env_file: 11 | - .env 12 | environment: 13 | # The most important settings that need consistent values 14 | - LOG_LEVEL=${LOG_LEVEL:-INFO} # Default to INFO if not set elsewhere 15 | - ENABLE_API_KEY_AUTH=${ENABLE_API_KEY_AUTH:-false} 16 | - API_KEYS=${API_KEYS:-} 17 | 18 | # Rate Limiting (only enable if needed) 19 | - RATE_LIMIT_ENABLED=${RATE_LIMIT_ENABLED:-false} 20 | - RATE_LIMIT_REQUESTS=${RATE_LIMIT_REQUESTS:-100} 21 | - RATE_LIMIT_TIMEFRAME=${RATE_LIMIT_TIMEFRAME:-3600} 22 | 23 | # Proxy Configuration 24 | - DEFAULT_PROXIES=${DEFAULT_PROXIES} 25 | - CA_CERT_PATH=${CA_CERT_PATH} 26 | 27 | # JobSpy Default Settings 28 | - DEFAULT_SITE_NAMES=${DEFAULT_SITE_NAMES:-indeed,linkedin,zip_recruiter,glassdoor,google,bayt,naukri} 29 | - DEFAULT_RESULTS_WANTED=${DEFAULT_RESULTS_WANTED:-20} 30 | - DEFAULT_DISTANCE=${DEFAULT_DISTANCE:-50} 31 | - DEFAULT_DESCRIPTION_FORMAT=${DEFAULT_DESCRIPTION_FORMAT:-markdown} 32 | - DEFAULT_COUNTRY_INDEED=${DEFAULT_COUNTRY_INDEED:-USA} 33 | 34 | # Caching 35 | - ENABLE_CACHE=${ENABLE_CACHE:-false} 36 | - CACHE_EXPIRY=${CACHE_EXPIRY:-3600} 37 | 38 | # Logging 39 | - ENVIRONMENT=${ENVIRONMENT:-production} 40 | 41 | # CORS 42 | - CORS_ORIGINS=${CORS_ORIGINS:-*} 43 | 44 | # Health Endpoints 45 | - ENABLE_HEALTH_ENDPOINTS=${ENABLE_HEALTH_ENDPOINTS:-true} 46 | - ENABLE_DETAILED_HEALTH=${ENABLE_DETAILED_HEALTH:-true} 47 | 48 | # API Documentation 49 | - ENABLE_SWAGGER_UI=${ENABLE_SWAGGER_UI:-true} 50 | - ENABLE_REDOC=${ENABLE_REDOC:-true} 51 | - SWAGGER_UI_PATH=${SWAGGER_UI_PATH:-/docs} 52 | - REDOC_PATH=${REDOC_PATH:-/redoc} 53 | volumes: 54 | - ./logs:/app/logs 55 | - ./scripts:/app/scripts # Ensure scripts are mounted properly 56 | restart: unless-stopped 57 | # Use bash explicitly to execute scripts and fix permission issues 58 | command: > 59 | /bin/bash -c "bash /app/scripts/docker-entrypoint.sh" 60 | healthcheck: 61 | test: ["CMD", "curl", "-f", "http://localhost:8000/health"] 62 | interval: 120s 63 | timeout: 5s 64 | retries: 3 65 | start_period: 10s 66 | --------------------------------------------------------------------------------
/examples/api_usage.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import pandas as pd 4 | 5 | # Base URL for the API 6 | BASE_URL = "http://localhost:8000" 7 | 8 | def search_jobs_simple(): 9 | """ 10 | Simple job search using the consolidated GET endpoint 11 | """ 12 | params = { 13 | "site_name": ["indeed", "linkedin"], 14 | "search_term": "software engineer", 15 | "location": "San Francisco, CA", 16 | "results_wanted": 5 17 | } 18 | 19 | response = requests.get(f"{BASE_URL}/api/v1/search_jobs", params=params) 20 | 21 | if response.status_code == 200: 22 | data = response.json() 23 | print(f"Found {data['count']} jobs") 24 | 25 | # Convert to pandas DataFrame for easier viewing 26 | df = pd.DataFrame(data['jobs']) 27 | print(df.head()) 28 | 29 | # Save to CSV 30 | df.to_csv("jobs_simple.csv", index=False) 31 | else: 32 | print(f"Error: {response.status_code}") 33 | print(response.text) 34 | 35 | def search_jobs_advanced(): 36 | """ 37 | Advanced job search using GET endpoint with all parameters 38 | """ 39 | params = { 40 | "site_name": ["indeed", "linkedin", "zip_recruiter"], 41 | "search_term": "data scientist", 42 | "google_search_term": "data scientist jobs near New York, NY since yesterday", 43 | "location": "New York, NY", 44 | "distance": 25, 45 | "job_type": "fulltime", 46 | "is_remote": True, 47 | "results_wanted": 10, 48 | "hours_old": 48, 49 | "description_format": "markdown", 50 | "country_indeed": "USA", 51 | "enforce_annual_salary": True, 52 | "linkedin_fetch_description": True 53 | } 54 | 55 | response = requests.get( 56 | f"{BASE_URL}/api/v1/search_jobs", 57 | params=params 58 | ) 59 | 60 | if response.status_code == 200: 61 | data = response.json() 62 | print(f"Found {data['count']} jobs") 63 | 64 | # Convert to pandas DataFrame for easier viewing 65 | df = pd.DataFrame(data['jobs']) 66 | print(df.head()) 67 | 68 | # Save to CSV 69 | df.to_csv("jobs_advanced.csv", index=False) 70 | else: 71 | print(f"Error: {response.status_code}") 72 | print(response.text) 73 | 74 | def search_jobs_paginated(): 75 | """ 76 | Paginated job search using GET endpoint 77 | """ 78 | params = { 79 | "paginate": True, 80 | "page": 1, 81 | "page_size": 5, 82 | "site_name": ["indeed", "linkedin"], 83 | "search_term": "software engineer", 84 | "location": "San Francisco, CA", 85 | "results_wanted": 20 86 | } 87 | 88 | response = requests.get(f"{BASE_URL}/api/v1/search_jobs", params=params) 89 | 90 | if response.status_code == 200: 91 | data = response.json() 92 | print(f"Found {data['count']} total jobs, showing page {data['current_page']} of {data['total_pages']}") 93 | print(f"Page size: {data['page_size']}, showing {len(data['jobs'])} jobs") 94 | 95 | # Convert to pandas DataFrame for easier viewing 96 | df = pd.DataFrame(data['jobs']) 97 | print(df.head()) 98 | 99 | # Check if there's a next page 100 | if data['next_page']: 101 | print(f"Next page URL: {data['next_page']}") 102 | 103 | # Save to CSV 104 | df.to_csv("jobs_paginated.csv", index=False) 105 | else: 106 | print(f"Error: {response.status_code}") 107 | print(response.text) 108 | 109 | if __name__ == "__main__": 110 | print("Running simple job search...") 111 | search_jobs_simple() 112 | 113 | print("\nRunning advanced job search...") 114 | search_jobs_advanced() 115 | 116 | print("\nRunning paginated job search...") 117 | search_jobs_paginated() 118 | -------------------------------------------------------------------------------- /main.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional, Union, Dict, Any 3 | from fastapi import FastAPI, Query, HTTPException 4 | from pydantic import BaseModel, Field 5 | from jobspy import scrape_jobs 6 | import pandas as pd 7 | 8 | app = FastAPI( 9 | title="JobSpy Docker API", 10 | description="API for searching jobs across multiple platforms using JobSpy", 11 | version="1.0.0", 12 | ) 13 | 14 | SUPPORTED_SITES = ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"] 15 | 16 | def get_env_bool(var_name, default=True): 17 | val = os.getenv(var_name) 18 | if val is None: 19 | return default 20 | return str(val).lower() in ("1", "true", "yes", "on") 21 | 22 | class JobSearchParams(BaseModel): 23 | site_name: Union[List[str], str] = Field( 24 | default=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"], 25 | description="Job sites to search on", 26 | ) 27 | search_term: Optional[str] = Field(default=None, description="Job search term") 28 | google_search_term: Optional[str] = Field(default=None, description="Search term for Google jobs") 29 | location: Optional[str] = Field(default=None, description="Job location") 30 | distance: Optional[int] = Field(default=50, description="Distance in miles") 31 | job_type: Optional[str] = Field(default=None, description="Job type (fulltime, parttime, internship, contract)") 32 | proxies: Optional[List[str]] = Field(default=None, description="Proxies in format ['user:pass@host:port', 'localhost']") 33 | is_remote: Optional[bool] = Field(default=None, description="Remote job filter") 34 | results_wanted: Optional[int] = Field(default=20, description="Number of results per site") 35 | hours_old: Optional[int] = Field(default=None, description="Filter by hours since posting") 36 | easy_apply: Optional[bool] = Field(default=None, description="Filter for easy apply jobs") 37 | description_format: Optional[str] = Field(default="markdown", description="Format of job description") 38 | offset: Optional[int] = Field(default=0, description="Offset for pagination") 39 | verbose: Optional[int] = Field(default=2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)") 40 | linkedin_fetch_description: Optional[bool] = Field(default=False, description="Fetch full LinkedIn descriptions") 41 | linkedin_company_ids: Optional[List[int]] = Field(default=None, description="LinkedIn company IDs to filter by") 42 | country_indeed: Optional[str] = Field(default=None, description="Country filter for Indeed & Glassdoor") 43 | enforce_annual_salary: Optional[bool] = Field(default=False, description="Convert wages to annual salary") 44 | ca_cert: Optional[str] = Field(default=None, description="Path to CA Certificate file for proxies") 45 | 46 | class JobResponse(BaseModel): 47 | count: int 48 | jobs: List[Dict[str, Any]] 49 | 50 | @app.get("/", tags=["Info"]) 51 | def read_root(): 52 | return {"message": "Welcome to JobSpy Docker API! 
Go to /docs for the API documentation."} 53 | 54 | @app.post("/search_jobs", response_model=JobResponse, tags=["Jobs"]) 55 | def search_jobs(params: JobSearchParams): 56 | try: 57 | jobs_df = scrape_jobs( 58 | site_name=params.site_name, 59 | search_term=params.search_term, 60 | google_search_term=params.google_search_term, 61 | location=params.location, 62 | distance=params.distance, 63 | job_type=params.job_type, 64 | proxies=params.proxies, 65 | is_remote=params.is_remote, 66 | results_wanted=params.results_wanted, 67 | hours_old=params.hours_old, 68 | easy_apply=params.easy_apply, 69 | description_format=params.description_format, 70 | offset=params.offset, 71 | verbose=params.verbose, 72 | linkedin_fetch_description=params.linkedin_fetch_description, 73 | linkedin_company_ids=params.linkedin_company_ids, 74 | country_indeed=params.country_indeed, 75 | enforce_annual_salary=params.enforce_annual_salary, 76 | ca_cert=params.ca_cert, 77 | ) 78 | 79 | # Convert DataFrame to dictionary format 80 | jobs_list = jobs_df.to_dict('records') 81 | 82 | return { 83 | "count": len(jobs_list), 84 | "jobs": jobs_list 85 | } 86 | except Exception as e: 87 | raise HTTPException(status_code=500, detail=f"Error scraping jobs: {str(e)}") 88 | 89 | @app.get("/api/v1/search_jobs") 90 | async def search_jobs_get( 91 | site_name: Union[List[str], str] = Query("all", description="Job sites to search on"), 92 | search_term: str = Query(None, description="Job search term"), 93 | google_search_term: Optional[str] = Query(None, description="Search term for Google jobs"), 94 | location: str = Query(None, description="Job location"), 95 | distance: int = Query(50, description="Distance in miles"), 96 | job_type: Optional[str] = Query(None, description="Job type (fulltime, parttime, internship, contract)"), 97 | is_remote: Optional[bool] = Query(None, description="Remote job filter"), 98 | results_wanted: int = Query(10, description="Number of results per site"), 99 | hours_old: Optional[int] = Query(None, description="Filter by hours since posting"), 100 | easy_apply: Optional[bool] = Query(None, description="Filter for easy apply jobs"), 101 | description_format: str = Query("markdown", description="Format of job description"), 102 | offset: int = Query(0, description="Offset for pagination"), 103 | verbose: int = Query(2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)"), 104 | linkedin_fetch_description: bool = Query(False, description="Fetch full LinkedIn descriptions"), 105 | country_indeed: Optional[str] = Query(None, description="Country filter for Indeed & Glassdoor"), 106 | enforce_annual_salary: bool = Query(False, description="Convert wages to annual salary"), 107 | format: str = Query("json", description="Output format: json or csv"), 108 | ): 109 | # Handle site_name=all 110 | if isinstance(site_name, str): 111 | if site_name.lower() == "all": 112 | site_name = SUPPORTED_SITES 113 | else: 114 | site_name = [site_name] 115 | elif isinstance(site_name, list): 116 | if "all" in [s.lower() for s in site_name]: 117 | site_name = SUPPORTED_SITES 118 | 119 | # Use env default for country_indeed if not provided 120 | if not country_indeed: 121 | country_indeed = os.getenv("DEFAULT_COUNTRY_INDEED", "USA") 122 | 123 | try: 124 | jobs_df = scrape_jobs( 125 | site_name=site_name, 126 | search_term=search_term, 127 | google_search_term=google_search_term, 128 | location=location, 129 | distance=distance, 130 | job_type=job_type, 131 | is_remote=is_remote, 132 | 
results_wanted=results_wanted, 133 | hours_old=hours_old, 134 | easy_apply=easy_apply, 135 | description_format=description_format, 136 | offset=offset, 137 | verbose=verbose, 138 | linkedin_fetch_description=linkedin_fetch_description, 139 | country_indeed=country_indeed, 140 | enforce_annual_salary=enforce_annual_salary, 141 | ) 142 | 143 | # Convert DataFrame to dictionary format 144 | jobs_data = jobs_df.to_dict('records') 145 | 146 | if format.lower() == "csv": 147 | import io, csv 148 | from fastapi.responses import StreamingResponse 149 | if not jobs_data: 150 | output = io.StringIO() 151 | writer = csv.writer(output) 152 | writer.writerow(["No results"]) 153 | output.seek(0) 154 | return StreamingResponse(output, media_type="text/csv", headers={"Content-Disposition": "attachment; filename=jobs.csv"}) 155 | output = io.StringIO() 156 | writer = csv.DictWriter(output, fieldnames=jobs_data[0].keys()) 157 | writer.writeheader() 158 | writer.writerows(jobs_data) 159 | output.seek(0) 160 | return StreamingResponse(output, media_type="text/csv", headers={"Content-Disposition": "attachment; filename=jobs.csv"}) 161 | # Default: JSON 162 | from fastapi.responses import JSONResponse 163 | return JSONResponse(content={"count": len(jobs_data), "jobs": jobs_data}) 164 | except Exception as e: 165 | raise HTTPException(status_code=500, detail=f"Error scraping jobs: {str(e)}") 166 | 167 | # API key auth default logic (at app startup or dependency) 168 | ENABLE_API_KEY_AUTH = get_env_bool("ENABLE_API_KEY_AUTH", default=True) 169 | if not ENABLE_API_KEY_AUTH: 170 | import warnings 171 | warnings.warn("API key authentication is disabled. Set ENABLE_API_KEY_AUTH=true to enable.") 172 | 173 | if __name__ == "__main__": 174 | import uvicorn 175 | uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) 176 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "jobspy-docker-api" 3 | version = "1.0.0" 4 | description = "A Docker-containerized FastAPI application providing secure API access to the Python JobSpy library." 
5 | authors = [ 6 | { name = "Shannon Atkinson", email = "rainmanjam@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.8" 10 | license = { file = "LICENSE.md" } 11 | keywords = ["fastapi", "jobspy", "docker", "api", "job-search"] 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Framework :: FastAPI", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent" 22 | ] 23 | dependencies = [ 24 | "fastapi==0.115.12", 25 | "uvicorn[standard]==0.34.2", 26 | "python-jobspy==1.1.80", 27 | "pydantic==2.11.3", 28 | "pydantic-settings==2.9.1", 29 | "python-multipart==0.0.20", 30 | "psutil==7.0.0", 31 | "python-dotenv==1.1.0" 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "pytest>=8.2.2", 37 | "pytest-cov>=5.0.0", 38 | "pytest-asyncio>=0.23.6", 39 | "pylint>=3.1.0", 40 | "black>=24.4.2", 41 | "isort>=5.13.2", 42 | "pre-commit>=3.7.1", 43 | "safety>=3.2.0" 44 | ] 45 | 46 | [tool.setuptools] 47 | packages = ["app"] 48 | 49 | [tool.black] 50 | line-length = 88 51 | target-version = ['py38'] 52 | exclude = ''' 53 | /( 54 | \.git 55 | | \.venv 56 | | build 57 | | dist 58 | | logs 59 | | temp 60 | | __pycache__ 61 | )/ 62 | ''' 63 | 64 | [tool.isort] 65 | profile = "black" 66 | line_length = 88 67 | multi_line_output = 3 68 | include_trailing_comma = true 69 | 70 | [tool.pytest.ini_options] 71 | minversion = "7.0" 72 | addopts = "--cov=app --cov-report=term-missing" 73 | testpaths = [ 74 | "tests" 75 | ] 76 | 77 | [tool.pylint.'MESSAGES CONTROL'] 78 | disable = [ 79 | "C0114", # missing-module-docstring 80 | "C0115", # missing-class-docstring 81 | "C0116", # missing-function-docstring 82 | ] 83 | 84 | [tool.pylint.format] 85 | max-line-length = 88 86 | 87 | [tool.coverage.run] 88 | branch = true 89 | source = [ 90 | "app" 91 | ] 92 | 93 | [tool.coverage.report] 94 | show_missing = true 95 | skip_covered = true 96 | exclude_lines = [ 97 | "pragma: no cover", 98 | "if __name__ == .__main__.:" 99 | ] 100 | 101 | [build-system] 102 | requires = ["setuptools>=61.0", "wheel"] 103 | build-backend = "setuptools.build_meta" 104 | 105 | [tool.pre-commit] 106 | repos = [ 107 | { repo = "https://github.com/pre-commit/pre-commit-hooks", rev = "v4.4.0", hooks = [ 108 | { id = "trailing-whitespace" }, 109 | { id = "end-of-file-fixer" }, 110 | { id = "check-yaml" }, 111 | { id = "check-added-large-files" } 112 | ] }, 113 | { repo = "https://github.com/psf/black", rev = "23.3.0", hooks = [ 114 | { id = "black" } 115 | ] }, 116 | { repo = "https://github.com/pycqa/isort", rev = "5.12.0", hooks = [ 117 | { id = "isort" } 118 | ] }, 119 | { repo = "https://github.com/pycqa/flake8", rev = "6.0.0", hooks = [ 120 | { id = "flake8", additional_dependencies = ["flake8-docstrings"] } 121 | ] } 122 | ] 123 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest>=7.0.0 3 | pytest-cov>=4.0.0 4 | pytest-asyncio>=0.21.0 5 | pylint>=2.15.0 6 | black>=23.0.0 7 | isort>=5.12.0 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi
2 | uvicorn[standard] 3 | python-jobspy 4 | pydantic 5 | pydantic-settings 6 | python-multipart 7 | psutil 8 | python-dotenv 9 | python-json-logger  # provides pythonjsonlogger, required by the "json" formatter in app/utils/logging_config.py 10 | -------------------------------------------------------------------------------- /scripts/check_auth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to check API authentication configuration. 4 | Run this script to debug issues with API key authentication. 5 | """ 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | # Add parent directory to path so we can import app modules 11 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 12 | 13 | def check_auth_config(): 14 | """Print authentication configuration settings.""" 15 | print("=== API Authentication Configuration ===") 16 | 17 | # Check environment variables 18 | env_vars = { 19 | "API_KEY": os.getenv("API_KEY", ""), 20 | "ENABLE_API_KEY_AUTH": os.getenv("ENABLE_API_KEY_AUTH", ""), 21 | "API_KEY_HEADER_NAME": os.getenv("API_KEY_HEADER_NAME", ""), 22 | } 23 | 24 | print("\nEnvironment Variables:") 25 | for key, value in env_vars.items(): 26 | masked_value = "********" if key == "API_KEY" and value else value 27 | print(f"{key}={masked_value!r}") 28 | 29 | # Try to load app settings 30 | print("\nApp Settings:") 31 | try: 32 | from app.core.config import settings 33 | print(f"API_KEY configured: {bool(settings.API_KEY)}") 34 | print(f"API_KEY value is set: {bool(settings.API_KEY and settings.API_KEY != '')}") 35 | except Exception as e: 36 | print(f"Error loading settings: {e}") 37 | 38 | # Check .env file 39 | env_file = Path(".env") 40 | env_local_file = Path(".env.local") 41 | 42 | print("\nEnvironment Files:") 43 | print(f".env exists: {env_file.exists()}") 44 | print(f".env.local exists: {env_local_file.exists()}") 45 | 46 | # Provide troubleshooting tips 47 | print("\n=== Troubleshooting Tips ===") 48 | print("1. If you want to disable API key authentication:") 49 | print(" - Ensure API_KEY is not set in your environment or .env files") 50 | print(" - Or explicitly set API_KEY='' (empty string) in your .env file") 51 | print("\n2. If you want to enable API key authentication:") 52 | print(" - Set API_KEY='your-secret-key' in your .env.local file") 53 | print(" - Include the X-API-Key header in your requests") 54 | print("\n3. To see detailed authentication logs:") 55 | print(" - Set LOG_LEVEL=DEBUG in your environment or .env file") 56 | 57 | if __name__ == "__main__": 58 | check_auth_config() 59 | -------------------------------------------------------------------------------- /scripts/check_config_consistency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to check for configuration consistency across different settings files. 4 | This helps identify and resolve inconsistencies in environment variables.
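Run it from the project root (e.g. "python scripts/check_config_consistency.py") so the relative paths to .env, Dockerfile, and the compose files resolve.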
5 | """ 6 | import os 7 | import sys 8 | import yaml 9 | import dotenv 10 | from pathlib import Path 11 | import re 12 | from typing import Dict, Any, List, Set 13 | 14 | # Add parent directory to path so we can import app modules 15 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 16 | 17 | def color_text(text, color_code): 18 | """Add color to terminal output.""" 19 | return f"\033[{color_code}m{text}\033[0m" 20 | 21 | def red(text): 22 | return color_text(text, 31) 23 | 24 | def green(text): 25 | return color_text(text, 32) 26 | 27 | def yellow(text): 28 | return color_text(text, 33) 29 | 30 | def blue(text): 31 | return color_text(text, 34) 32 | 33 | def load_env_file(path: Path) -> Dict[str, str]: 34 | """Load environment variables from a .env file.""" 35 | if not path.exists(): 36 | print(f"Warning: {path} not found") 37 | return {} 38 | 39 | return dotenv.dotenv_values(path) 40 | 41 | def extract_dockerfile_env_vars(path: Path) -> Dict[str, str]: 42 | """Extract environment variables from a Dockerfile.""" 43 | if not path.exists(): 44 | print(f"Warning: {path} not found") 45 | return {} 46 | 47 | env_vars = {} 48 | try: 49 | with open(path, 'r') as f: 50 | content = f.read() 51 | 52 | # Look for ENV statements 53 | # This is a simple approach - a proper parser would be better 54 | env_pattern = r'ENV\s+([A-Za-z0-9_]+)=([^\s\\]+)' 55 | simple_envs = re.findall(env_pattern, content) 56 | for key, value in simple_envs: 57 | env_vars[key] = value.strip('"\'') 58 | 59 | # Look for multi-line ENV statements 60 | multi_pattern = r'ENV\s+([A-Za-z0-9_]+)=([^\s\\]+)(\s*\\\s*\n\s*([A-Za-z0-9_]+)=([^\s\\]+))*' 61 | multi_envs = re.findall(multi_pattern, content) 62 | for match in multi_envs: 63 | for i in range(0, len(match), 3): 64 | if i+1 < len(match) and match[i] and match[i+1]: 65 | env_vars[match[i]] = match[i+1].strip('"\'') 66 | except Exception as e: 67 | print(f"Error parsing Dockerfile: {e}") 68 | 69 | return env_vars 70 | 71 | def load_docker_compose_vars(path: Path) -> Dict[str, str]: 72 | """Load environment variables from a docker-compose file.""" 73 | if not path.exists(): 74 | print(f"Warning: {path} not found") 75 | return {} 76 | 77 | try: 78 | with open(path, 'r') as f: 79 | compose_data = yaml.safe_load(f) 80 | 81 | env_vars = {} 82 | for service_name, service_data in compose_data.get('services', {}).items(): 83 | # Check environment section 84 | environment = service_data.get('environment', []) 85 | if isinstance(environment, list): 86 | for env in environment: 87 | if isinstance(env, str) and '=' in env: 88 | key, value = env.split('=', 1) 89 | # Handle ${VAR:-default} format 90 | if '${' in value and ':-' in value and '}' in value: 91 | default_val = value.split(':-')[1].split('}')[0] 92 | env_vars[key] = default_val 93 | else: 94 | env_vars[key] = value 95 | elif isinstance(environment, dict): 96 | env_vars.update(environment) 97 | 98 | return env_vars 99 | except Exception as e: 100 | print(f"Error parsing docker-compose file: {e}") 101 | return {} 102 | 103 | def check_config_consistency(): 104 | """Check configuration consistency across different settings files.""" 105 | print(yellow("=== Configuration Consistency Checker ===\n")) 106 | 107 | # Define paths to all configuration files 108 | env_path = Path(".env") 109 | env_local_path = Path(".env.local") 110 | dockerfile_path = Path("Dockerfile") 111 | docker_compose_path = Path("docker-compose.yml") 112 | docker_compose_dev_path = Path("docker-compose.dev.yml") 113 | 114 | # Load 
environment variables from each file 115 | env_vars = load_env_file(env_path) 116 | env_local_vars = load_env_file(env_local_path) 117 | dockerfile_vars = extract_dockerfile_env_vars(dockerfile_path) 118 | docker_compose_vars = load_docker_compose_vars(docker_compose_path) 119 | docker_compose_dev_vars = load_docker_compose_vars(docker_compose_dev_path) 120 | 121 | # Collect all variable names across all files 122 | all_vars = set() 123 | all_vars.update(env_vars.keys()) 124 | all_vars.update(env_local_vars.keys()) 125 | all_vars.update(dockerfile_vars.keys()) 126 | all_vars.update(docker_compose_vars.keys()) 127 | all_vars.update(docker_compose_dev_vars.keys()) 128 | 129 | # Filter out non-app related environment variables 130 | excluded_vars = {'PYTHONDONTWRITEBYTECODE', 'PYTHONUNBUFFERED', 'PYTHONPATH'} 131 | app_vars = all_vars - excluded_vars 132 | 133 | # Check for presence of each variable in each file 134 | print(yellow("Checking variable presence in each configuration file:")) 135 | missing_vars = { 136 | ".env": [], 137 | ".env.local": [], 138 | "Dockerfile": [], 139 | "docker-compose.yml": [], 140 | "docker-compose.dev.yml": [] 141 | } 142 | 143 | for var in sorted(app_vars): 144 | print(f"\n{blue(var)}:") 145 | 146 | if var not in env_vars: 147 | missing_vars[".env"].append(var) 148 | print(f" .env: {red('MISSING')}") 149 | else: 150 | print(f" .env: {env_vars[var]}") 151 | 152 | if var not in env_local_vars: 153 | # Only mark as missing if uncommented in .env.local 154 | print(f" .env.local: {yellow('Not specified')}") 155 | else: 156 | print(f" .env.local: {env_local_vars[var]}") 157 | 158 | if var not in dockerfile_vars: 159 | missing_vars["Dockerfile"].append(var) 160 | print(f" Dockerfile: {red('MISSING')}") 161 | else: 162 | print(f" Dockerfile: {dockerfile_vars[var]}") 163 | 164 | if var not in docker_compose_vars: 165 | missing_vars["docker-compose.yml"].append(var) 166 | print(f" docker-compose.yml: {red('MISSING')}") 167 | else: 168 | print(f" docker-compose.yml: {docker_compose_vars[var]}") 169 | 170 | if var not in docker_compose_dev_vars: 171 | missing_vars["docker-compose.dev.yml"].append(var) 172 | print(f" docker-compose.dev.yml: {red('MISSING')}") 173 | else: 174 | print(f" docker-compose.dev.yml: {docker_compose_dev_vars[var]}") 175 | 176 | # Print summary of missing variables 177 | print("\n" + yellow("=== Missing Variables Summary ===")) 178 | for file_path, vars_list in missing_vars.items(): 179 | if vars_list: 180 | print(f"\n{file_path} is missing these variables:") 181 | for var in vars_list: 182 | print(f" - {var}") 183 | 184 | # Check for inconsistent default values 185 | print("\n" + yellow("=== Inconsistent Default Values ===")) 186 | inconsistent_defaults = [] 187 | for var in sorted(app_vars): 188 | values = {} 189 | if var in env_vars: 190 | values[".env"] = env_vars[var] 191 | if var in dockerfile_vars: 192 | values["Dockerfile"] = dockerfile_vars[var] 193 | 194 | # Skip if we don't have at least two sources to compare 195 | if len(values) < 2: 196 | continue 197 | 198 | # Check if values are inconsistent 199 | if len(set(values.values())) > 1: 200 | inconsistent_defaults.append((var, values)) 201 | 202 | if inconsistent_defaults: 203 | for var, values in inconsistent_defaults: 204 | print(f"\n{red(var)} has inconsistent default values:") 205 | for source, value in values.items(): 206 | print(f" {source}: {value}") 207 | else: 208 | print(green("No inconsistencies found in default values!")) 209 | 210 | # Provide recommendations 211 | print("\n" 
+ yellow("=== Recommendations ===")) 212 | 213 | if missing_vars[".env"]: 214 | print("\n1. Add these missing variables to .env:") 215 | for var in missing_vars[".env"]: 216 | # Try to find a default value from other files 217 | default_val = docker_compose_vars.get(var) or dockerfile_vars.get(var) or "" 218 | print(f" {var}={default_val}") 219 | 220 | if missing_vars["Dockerfile"]: 221 | print("\n2. Consider adding these variables to Dockerfile ENV section:") 222 | for var in missing_vars["Dockerfile"]: 223 | # Try to find a default value from other files 224 | default_val = env_vars.get(var) or docker_compose_vars.get(var) or "" 225 | print(f" {var}={default_val}") 226 | 227 | print("\n3. Ensure docker-compose.dev.yml loads from .env:") 228 | print(" Add this to the service configuration:") 229 | print(" env_file:") 230 | print(" - .env") 231 | 232 | if inconsistent_defaults: 233 | print("\n4. Fix inconsistent default values between files") 234 | 235 | if __name__ == "__main__": 236 | try: 237 | check_config_consistency() 238 | except Exception as e: 239 | print(red(f"Error: {e}")) 240 | import traceback 241 | traceback.print_exc() 242 | -------------------------------------------------------------------------------- /scripts/check_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to check environment variables and configuration settings. 4 | Run this script to debug issues with environment variables. 5 | """ 6 | import os 7 | import sys 8 | import json 9 | from pathlib import Path 10 | 11 | # Add parent directory to path so we can import app modules 12 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | 14 | def check_env(): 15 | """Print environment variables and settings.""" 16 | print("=== Environment Variables ===") 17 | env_vars = { 18 | "API_KEYS": os.getenv("API_KEYS", ""), 19 | "ENABLE_API_KEY_AUTH": os.getenv("ENABLE_API_KEY_AUTH", ""), 20 | "API_KEY_HEADER_NAME": os.getenv("API_KEY_HEADER_NAME", ""), 21 | "RATE_LIMIT_ENABLED": os.getenv("RATE_LIMIT_ENABLED", ""), 22 | "DEFAULT_PROXIES": os.getenv("DEFAULT_PROXIES", ""), 23 | "DEFAULT_SITE_NAMES": os.getenv("DEFAULT_SITE_NAMES", ""), 24 | "ENABLE_CACHE": os.getenv("ENABLE_CACHE", ""), 25 | "ENVIRONMENT": os.getenv("ENVIRONMENT", "") 26 | } 27 | 28 | for key, value in env_vars.items(): 29 | print(f"{key}={value!r}") 30 | 31 | print("\n=== Testing Settings Loading ===") 32 | try: 33 | from app.config import settings 34 | 35 | print(f"API_KEYS: {settings.API_KEYS}") 36 | print(f"ENABLE_API_KEY_AUTH: {settings.ENABLE_API_KEY_AUTH}") 37 | print(f"API_KEY_HEADER_NAME: {settings.API_KEY_HEADER_NAME}") 38 | print(f"RATE_LIMIT_ENABLED: {settings.RATE_LIMIT_ENABLED}") 39 | print(f"DEFAULT_PROXIES: {settings.DEFAULT_PROXIES}") 40 | print(f"DEFAULT_SITE_NAMES: {settings.DEFAULT_SITE_NAMES}") 41 | print(f"ENABLE_CACHE: {settings.ENABLE_CACHE}") 42 | print(f"ENVIRONMENT: {settings.ENVIRONMENT}") 43 | 44 | print("\nSettings were loaded successfully!") 45 | except Exception as e: 46 | print(f"Error loading settings: {e}") 47 | import traceback 48 | traceback.print_exc() 49 | 50 | if __name__ == "__main__": 51 | check_env() 52 | -------------------------------------------------------------------------------- /scripts/confirm_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to confirm environment variables are set correctly 3 | # Run this at container 
startup 4 | 5 | echo "=== Environment Variable Confirmation ===" 6 | echo "ENABLE_API_KEY_AUTH: $ENABLE_API_KEY_AUTH" 7 | echo "API_KEYS: ${API_KEYS:0:3}... (truncated for security)" 8 | echo "RATE_LIMIT_ENABLED: $RATE_LIMIT_ENABLED" 9 | echo "ENABLE_CACHE: $ENABLE_CACHE" 10 | echo "LOG_LEVEL: $LOG_LEVEL" 11 | echo "ENVIRONMENT: $ENVIRONMENT" 12 | echo "========================================" 13 | -------------------------------------------------------------------------------- /scripts/debug_env_conflicts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to detect environment variable conflicts between different sources. 4 | This helps diagnose issues where values in code, .env, or Docker might conflict. 5 | """ 6 | import json 7 | import os 8 | import sys 9 | from pathlib import Path 10 | 11 | # Add parent directory to path so we can import app modules 12 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | 14 | def color_text(text, color_code): 15 | """Add color to terminal output.""" 16 | return f"\033[{color_code}m{text}\033[0m" 17 | 18 | def red(text): 19 | return color_text(text, 31) 20 | 21 | def green(text): 22 | return color_text(text, 32) 23 | 24 | def yellow(text): 25 | return color_text(text, 33) 26 | 27 | def get_docker_env_vars(): 28 | """Get environment variables from docker-compose.yml.""" 29 | docker_compose_path = Path("docker-compose.yml") 30 | if not docker_compose_path.exists(): 31 | return {} 32 | 33 | try: 34 | import yaml 35 | with open(docker_compose_path, 'r') as f: 36 | docker_compose = yaml.safe_load(f) 37 | 38 | if not docker_compose or 'services' not in docker_compose: 39 | return {} 40 | 41 | for service in docker_compose['services'].values(): 42 | if 'environment' in service: 43 | env_vars = {} 44 | for env in service['environment']: 45 | if isinstance(env, str) and '=' in env: 46 | key, value = env.split('=', 1) 47 | env_vars[key] = value 48 | elif isinstance(env, dict): 49 | env_vars.update(env) 50 | return env_vars 51 | return {} 52 | except Exception as e: 53 | print(f"Error parsing docker-compose.yml: {e}") 54 | return {} 55 | 56 | def get_dotenv_vars(): 57 | """Get environment variables from .env and .env.local.""" 58 | env_vars = {} 59 | try: 60 | import dotenv 61 | # Load .env 62 | env_path = Path(".env") 63 | if env_path.exists(): 64 | env_vars.update(dotenv.dotenv_values(env_path)) 65 | 66 | # Load .env.local which overrides .env 67 | local_env_path = Path(".env.local") 68 | if local_env_path.exists(): 69 | env_vars.update(dotenv.dotenv_values(local_env_path)) 70 | except ImportError: 71 | print("python-dotenv not installed. 
Please install with: pip install python-dotenv") 72 | 73 | return env_vars 74 | 75 | def debug_env_conflicts(): 76 | """Find and report conflicts between environment variable sources.""" 77 | print(yellow("=== Environment Variable Conflict Detector ===\n")) 78 | 79 | # Get environment variables from different sources 80 | os_env_vars = {k: v for k, v in os.environ.items() if k.isupper()} 81 | dotenv_vars = get_dotenv_vars() 82 | docker_vars = get_docker_env_vars() 83 | 84 | # Check for key environment variables 85 | key_vars = [ 86 | "ENABLE_API_KEY_AUTH", "API_KEYS", "RATE_LIMIT_ENABLED", 87 | "ENABLE_CACHE", "ENVIRONMENT", "LOG_LEVEL" 88 | ] 89 | 90 | print(yellow("Checking key environment variables:")) 91 | for var in key_vars: 92 | values = {} 93 | if var in os_env_vars: 94 | values["OS"] = os_env_vars[var] 95 | if var in dotenv_vars: 96 | values["dotenv"] = dotenv_vars[var] 97 | if var in docker_vars: 98 | values["docker"] = docker_vars[var] 99 | 100 | if not values: 101 | print(f" {var}: {yellow('Not set in any source')}") 102 | continue 103 | 104 | if len(set(values.values())) > 1: 105 | print(f" {var}: {red('CONFLICT DETECTED')}") 106 | for source, value in values.items(): 107 | print(f" - {source}: {value}") 108 | else: 109 | value = next(iter(values.values())) 110 | sources = ", ".join(values.keys()) 111 | print(f" {var}: {green(value)} (from {sources})") 112 | 113 | # Check app config (after environment variables are resolved) 114 | print("\n" + yellow("Checking application config:")) 115 | try: 116 | from app.config import settings 117 | from app.utils.auth_health import check_auth_configuration 118 | 119 | # Check for auth configuration inconsistencies 120 | auth_status = check_auth_configuration() 121 | if auth_status["inconsistent_config"]: 122 | print(red(" Authentication configuration issue detected:")) 123 | for rec in auth_status["recommendations"]: 124 | print(f" - {rec}") 125 | else: 126 | print(green(" Authentication configuration is consistent")) 127 | 128 | # Check other important settings 129 | print("\n" + yellow("Final resolved configuration:")) 130 | print(f" ENABLE_API_KEY_AUTH: {settings.ENABLE_API_KEY_AUTH}") 131 | print(f" API_KEYS configured: {bool(settings.API_KEYS)}") 132 | print(f" API_KEYS count: {len(settings.API_KEYS)}") 133 | print(f" RATE_LIMIT_ENABLED: {settings.RATE_LIMIT_ENABLED}") 134 | print(f" ENABLE_CACHE: {settings.ENABLE_CACHE}") 135 | print(f" ENVIRONMENT: {settings.ENVIRONMENT}") 136 | print(f" LOG_LEVEL: {settings.LOG_LEVEL}") 137 | 138 | except ImportError: 139 | print(red(" Could not import app.config. Make sure you're running from the project root")) 140 | 141 | if __name__ == "__main__": 142 | try: 143 | debug_env_conflicts() 144 | except Exception as e: 145 | print(red(f"Error: {e}")) 146 | import traceback 147 | traceback.print_exc() 148 | -------------------------------------------------------------------------------- /scripts/debug_env_load_order.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to debug the order of environment variable loading 3 | # Run this to understand where each environment variable comes from 4 | 5 | echo "=== Environment Variable Load Order Debug ===" 6 | 7 | # Check different environment variable sources in order of precedence 8 | echo "Environment variables from different sources:" 9 | echo "1. 
Command line/docker-compose.yml environment section:" 10 | echo " LOG_LEVEL=$LOG_LEVEL" 11 | echo " ENABLE_API_KEY_AUTH=$ENABLE_API_KEY_AUTH" 12 | echo 13 | 14 | # Check Dockerfile ENV vs runtime environment 15 | echo "2. Default values from Dockerfile (these should be overridden at runtime):" 16 | echo " Dockerfile ARG LOG_LEVEL default=DEBUG" 17 | echo " Dockerfile ARG ENABLE_API_KEY_AUTH default=false" 18 | echo 19 | 20 | # Dump all environment variables for analysis 21 | echo "3. All current environment variables (alphabetical):" 22 | env | grep -E "LOG_LEVEL|ENABLE_|API_KEY|ENVIRONMENT" | sort 23 | echo 24 | 25 | echo "=== Environment Variable Override Chain ===" 26 | echo "Command line args > docker-compose environment > .env > Dockerfile ENV > Dockerfile ARG defaults" 27 | echo "===========================================" 28 | -------------------------------------------------------------------------------- /scripts/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Docker entrypoint script that handles script permissions and execution 3 | 4 | # Ensure scripts are executable (needed when mounted as volumes) 5 | find /app/scripts -type f -name "*.sh" -exec chmod +x {} \; 6 | find /app/scripts -type f -name "*.py" -exec chmod +x {} \; 7 | 8 | # Display environment variable debug info 9 | echo "=== Environment Variable Load Order Debug ===" 10 | echo "Environment variables from different sources:" 11 | echo "1. Command line/docker-compose.yml environment section:" 12 | echo " LOG_LEVEL=$LOG_LEVEL" 13 | echo " ENABLE_API_KEY_AUTH=$ENABLE_API_KEY_AUTH" 14 | echo 15 | 16 | # Check Dockerfile ENV vs runtime environment 17 | echo "2. Default values from Dockerfile (these should be overridden at runtime):" 18 | echo " Dockerfile ARG LOG_LEVEL default=DEBUG" 19 | echo " Dockerfile ARG ENABLE_API_KEY_AUTH default=false" 20 | echo 21 | 22 | # Dump all environment variables for analysis 23 | echo "3. All current environment variables (alphabetical):" 24 | env | grep -E "LOG_LEVEL|ENABLE_|API_KEY|ENVIRONMENT" | sort 25 | echo 26 | 27 | echo "=== Environment Variable Override Chain ===" 28 | echo "Command line args > docker-compose environment > .env > Dockerfile ENV > Dockerfile ARG defaults" 29 | echo "===========================================" 30 | 31 | # Run the confirmation script 32 | bash /app/scripts/confirm_env.sh 33 | 34 | # Start the FastAPI application 35 | exec uvicorn app.main:app --host 0.0.0.0 --port 8000 --proxy-headers 36 | -------------------------------------------------------------------------------- /scripts/increment_version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to increment the version number in the app's __init__.py file. 
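For example, "python increment_version.py minor" turns version 1.2.3 into 1.3.0 (the minor number is incremented and patch resets to 0).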
4 | Usage: python increment_version.py [major|minor|patch] 5 | """ 6 | import re 7 | import sys 8 | from pathlib import Path 9 | 10 | # Get the project root directory 11 | project_root = Path(__file__).parent.parent 12 | init_file = project_root / "app" / "__init__.py" 13 | 14 | def read_version(): 15 | """Read the current version from __init__.py""" 16 | content = init_file.read_text() 17 | version_match = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', content) 18 | if not version_match: 19 | raise ValueError("Could not find version string in __init__.py") 20 | return version_match.group(1) 21 | 22 | def write_version(new_version): 23 | """Write the new version to __init__.py""" 24 | content = init_file.read_text() 25 | new_content = re.sub( 26 | r'__version__\s*=\s*["\']([^"\']+)["\']', 27 | f'__version__ = "{new_version}"', 28 | content 29 | ) 30 | init_file.write_text(new_content) 31 | 32 | def increment_version(version_part): 33 | """ 34 | Increment the version number. 35 | version_part: 'major', 'minor', or 'patch' 36 | """ 37 | current = read_version() 38 | print(f"Current version: {current}") 39 | 40 | try: 41 | major, minor, patch = map(int, current.split('.')) 42 | except ValueError: 43 | print(f"Error: Version {current} is not in the format X.Y.Z") 44 | sys.exit(1) 45 | 46 | if version_part == "major": 47 | major += 1 48 | minor = 0 49 | patch = 0 50 | elif version_part == "minor": 51 | minor += 1 52 | patch = 0 53 | elif version_part == "patch": 54 | patch += 1 55 | else: 56 | print(f"Error: Unknown version part '{version_part}'. Use 'major', 'minor', or 'patch'") 57 | sys.exit(1) 58 | 59 | new_version = f"{major}.{minor}.{patch}" 60 | write_version(new_version) 61 | print(f"Version updated to: {new_version}") 62 | return new_version 63 | 64 | if __name__ == "__main__": 65 | if len(sys.argv) != 2 or sys.argv[1] not in ["major", "minor", "patch"]: 66 | print("Usage: python increment_version.py [major|minor|patch]") 67 | sys.exit(1) 68 | 69 | increment_version(sys.argv[1]) 70 | -------------------------------------------------------------------------------- /scripts/load_local_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to explicitly load .env.local environment variables. 4 | Run this script before starting the app if you want to use .env.local. 5 | """ 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | try: 11 | from dotenv import load_dotenv 12 | except ImportError: 13 | print("Error: python-dotenv is not installed. 
/scripts/load_local_env.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Script to explicitly load .env.local environment variables.
Run this script before starting the app if you want to use .env.local.
"""
import os
import sys
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    print("Error: python-dotenv is not installed. Please install it with:")
    print("pip install python-dotenv")
    sys.exit(1)

def load_local_env():
    """Explicitly load .env.local file if it exists."""
    env_local_path = Path(".env.local")

    if not env_local_path.exists():
        print(f"Warning: {env_local_path} not found")
        return False

    print(f"Loading environment variables from {env_local_path.absolute()}")
    load_dotenv(env_local_path, override=True)

    # Print a few non-sensitive variables to confirm loading
    print("Loaded variables (sample):")
    for var in ["LOG_LEVEL", "ENVIRONMENT", "ENABLE_CACHE"]:
        value = os.getenv(var, "[not set]")
        print(f"  {var}={value}")

    return True

if __name__ == "__main__":
    if load_local_env():
        print("\nSuccessfully loaded .env.local")
        print("Run your application now to use these variables")
    else:
        print("\nNo .env.local file found")
        print("Using default environment variables only")

--------------------------------------------------------------------------------
/scripts/load_test.py:
--------------------------------------------------------------------------------
"""Load testing script for the JobSpy Docker API."""
import argparse
import asyncio
import random
import statistics
import time
from typing import Any, Dict, List

import aiohttp

# Job titles and locations for random queries
JOB_TITLES = ["software engineer", "data scientist", "product manager", "devops engineer", "full stack developer"]
LOCATIONS = ["San Francisco, CA", "New York, NY", "Seattle, WA", "Austin, TX", "Boston, MA"]

async def make_request(session, url, api_key, params=None, json_data=None):
    """Make an HTTP request and measure response time."""
    headers = {"x-api-key": api_key, "accept": "application/json"}

    start_time = time.time()

    if json_data:
        async with session.post(url, headers=headers, json=json_data) as response:
            data = await response.json()
            status = response.status
    else:
        async with session.get(url, headers=headers, params=params) as response:
            data = await response.json()
            status = response.status

    end_time = time.time()
    response_time = end_time - start_time

    return {
        "status": status,
        "response_time": response_time,
        "data": data
    }

async def run_load_test(base_url, api_key, num_requests, concurrency):
    """Run a load test with the specified number of concurrent requests."""
    print(f"Starting load test with {num_requests} total requests, {concurrency} concurrent")

    # Create a connection pool
    connector = aiohttp.TCPConnector(limit=concurrency)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = []

        for _ in range(num_requests):
            # Generate random query parameters
            job_title = random.choice(JOB_TITLES)
            location = random.choice(LOCATIONS)

            # Randomly choose between GET and POST
            if random.choice([True, False]):
                # GET request: query-string values must be scalars, so pass a
                # single randomly chosen site as a plain string
                params = {
                    "site_name": random.choice(["indeed", "linkedin", "zip_recruiter"]),
                    "search_term": job_title,
                    "location": location,
                    "results_wanted": 5
                }
                tasks.append(make_request(session, f"{base_url}/api/v1/search_jobs", api_key, params=params))
            else:
                # POST request
                json_data = {
                    "site_name": random.sample(["indeed", "linkedin", "zip_recruiter"], 2),
                    "search_term": job_title,
                    "location": location,
                    "results_wanted": 5
                }
                tasks.append(make_request(session, f"{base_url}/api/v1/search_jobs", api_key, json_data=json_data))

        # Execute requests with limited concurrency
        results = []
        for i in range(0, len(tasks), concurrency):
            batch = tasks[i:i+concurrency]
            batch_results = await asyncio.gather(*batch)
            results.extend(batch_results)
            print(f"Completed {min(i+concurrency, len(tasks))}/{len(tasks)} requests")

    return results

def analyze_results(results):
    """Analyze load test results."""
    response_times = [r["response_time"] for r in results]
    statuses = [r["status"] for r in results]

    # Calculate statistics
    avg_time = statistics.mean(response_times)
    median_time = statistics.median(response_times)
    min_time = min(response_times)
    max_time = max(response_times)
    p95_time = sorted(response_times)[int(len(response_times) * 0.95)]

    success_count = statuses.count(200)
    error_count = len(statuses) - success_count

    # Print results
    print("\n=== Load Test Results ===")
    print(f"Total Requests: {len(results)}")
    print(f"Success Rate: {success_count/len(results)*100:.2f}% ({success_count}/{len(results)})")
    print(f"Average Response Time: {avg_time:.4f} seconds")
    print(f"Median Response Time: {median_time:.4f} seconds")
    print(f"Min Response Time: {min_time:.4f} seconds")
    print(f"Max Response Time: {max_time:.4f} seconds")
    print(f"95th Percentile Response Time: {p95_time:.4f} seconds")

    # Count status codes
    status_counts = {}
    for status in statuses:
        status_counts[status] = status_counts.get(status, 0) + 1

    print("\nStatus Code Distribution:")
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(results)*100:.2f}%)")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Load test the JobSpy Docker API")
    parser.add_argument("--url", default="http://localhost:8000", help="Base URL of the API")
    parser.add_argument("--api-key", required=True, help="API key for authentication")
    parser.add_argument("--requests", type=int, default=10, help="Total number of requests to make")
    parser.add_argument("--concurrency", type=int, default=2, help="Number of concurrent requests")
    args = parser.parse_args()

    results = asyncio.run(run_load_test(args.url, args.api_key, args.requests, args.concurrency))
    analyze_results(results)

--------------------------------------------------------------------------------
/scripts/make_scripts_executable.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Make all scripts executable
echo "Making scripts executable..."
chmod +x scripts/*.py
chmod +x scripts/*.sh
echo "Done."

--------------------------------------------------------------------------------
/scripts/set_log_level.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Script to set log level and restart the application

if [ -z "$1" ]; then
    echo "Usage: $0 <log_level>"
    echo "Available levels: DEBUG, INFO, WARNING, ERROR, CRITICAL"
    exit 1
fi

LOG_LEVEL=$(echo "$1" | tr '[:lower:]' '[:upper:]')
echo "Setting log level to: $LOG_LEVEL"

# Update .env file
sed -i.bak "s/^LOG_LEVEL=.*/LOG_LEVEL=$LOG_LEVEL/" .env

# Restart the service with new log level
echo "Restarting services..."
docker-compose down
LOG_LEVEL=$LOG_LEVEL docker-compose up -d

echo "Done! Services restarted with log level: $LOG_LEVEL"
echo "View logs with: docker-compose logs -f"

--------------------------------------------------------------------------------
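The level names accepted above are the standard Python logging levels, which the application's logging configuration presumably resolves to numeric thresholds. A small sketch of that mapping:

```python
# The accepted names map onto stdlib logging constants; getLevelName()
# resolves a registered level name to its numeric threshold.
import logging

for name in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
    print(f"{name} -> {logging.getLevelName(name)}")  # e.g. DEBUG -> 10
```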
/scripts/verify_env_loading.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Script to verify that environment variables are being properly loaded.
Run this script to compare .env values with actual loaded values.
"""
import os
import sys
from pathlib import Path

# Guard the import here so a helpful message is printed instead of a bare
# ImportError before any of the script runs
try:
    import dotenv
except ImportError:
    print("python-dotenv package is required to run this script.")
    print("Install it with: pip install python-dotenv")
    sys.exit(1)

# Add parent directory to path so we can import app modules
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

def verify_env_loading():
    """Verify environment variables are loaded correctly from .env files."""
    print("=== Environment Variable Loading Verification ===\n")

    # Load .env file content to compare with actual environment variables
    env_file = Path(".env")
    env_local_file = Path(".env.local")

    env_vars = {}
    if env_file.exists():
        print(f"Loading .env file from {env_file.absolute()}")
        env_vars.update(dotenv.dotenv_values(env_file))
    else:
        print(".env file not found")

    # Check if .env.local exists, but note that it's not loaded by default
    if env_local_file.exists():
        print(f"Found .env.local file at {env_local_file.absolute()}")
        print("NOTE: .env.local is not automatically loaded by the application.")
        print("To use .env.local, you must explicitly load it or use docker-compose.dev.yml")

        # Still load it for debugging purposes
        local_vars = dotenv.dotenv_values(env_local_file)
        print(f"  .env.local contains {len(local_vars)} variables")
    else:
        print(".env.local file not found")

    print("\n=== Expected vs Actual Values ===")
    for key, expected_value in env_vars.items():
        actual_value = os.getenv(key)
        match = expected_value == actual_value
        status = "✅" if match else "❌"

        # Mask API keys
        if "API_KEY" in key and expected_value:
            expected_value = "****[MASKED]****"
        if "API_KEY" in key and actual_value:
            actual_value = "****[MASKED]****"

        print(f"{status} {key}:")
        print(f"  Expected: {expected_value!r}")
        print(f"  Actual: {actual_value!r}")

    print("\n=== Docker Environment Note ===")
    print("If running in Docker, environment values in docker-compose.yml")
    print("will override values from .env files. To fix this:")
    print("1. Use ${VAR_NAME:-default} syntax in docker-compose.yml")
    print("2. Use the env_file directive to load .env files")
    print("3. Ensure .env files are mounted/copied to the container")

if __name__ == "__main__":
    verify_env_loading()

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
"""Test package for JobSpy Docker API."""

--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
"""Pytest configuration for JobSpy Docker API tests."""
import pytest
from fastapi.testclient import TestClient
from app.main import app

@pytest.fixture
def client():
    """Get a TestClient instance for the FastAPI app."""
    with TestClient(app) as test_client:
        yield test_client

--------------------------------------------------------------------------------
/tests/test_api.py:
--------------------------------------------------------------------------------
"""Tests for the JobSpy Docker API."""
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd

def test_health_endpoint(client):
    """Test the health endpoint."""
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json()["status"] == "ok"

@patch('app.services.job_service.scrape_jobs')
def test_search_jobs(mock_scrape_jobs, client):
    """Test the search_jobs endpoint."""
    # Setup mock
    mock_df = pd.DataFrame({
        'SITE': ['indeed', 'linkedin'],
        'TITLE': ['Software Engineer', 'Data Scientist'],
        'COMPANY': ['Test Corp', 'Test Inc'],
    })
    mock_scrape_jobs.return_value = mock_df

    # Disable auth for testing
    with patch('app.config.settings.ENABLE_API_KEY_AUTH', False):
        response = client.post(
            "/api/v1/search_jobs",
            json={
                "site_name": ["indeed", "linkedin"],
                "search_term": "software engineer",
                "location": "San Francisco",
                "country_indeed": "USA"
            }
        )

    # Check response
    assert response.status_code == 200
    assert response.json()["count"] == 2
    assert not response.json()["cached"]
    assert len(response.json()["jobs"]) == 2

--------------------------------------------------------------------------------
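The patching pattern in test_search_jobs above -- patch the name where it is looked up (app.services.job_service.scrape_jobs) and return a canned DataFrame -- generalizes to any service dependency. A minimal self-contained sketch of the same idea (module and function names hypothetical):

```python
# Patch-where-it-is-used, illustrated on hypothetical names: the fake
# DataFrame is returned instead of hitting the network.
import pandas as pd
from unittest.mock import patch

def fetch_jobs():
    raise RuntimeError("would hit the network")

def count_jobs():
    return len(fetch_jobs())

with patch(f"{__name__}.fetch_jobs", return_value=pd.DataFrame({"TITLE": ["a", "b"]})):
    assert count_jobs() == 2  # len() of a DataFrame counts rows
```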