├── .env ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── API_CHANGELOG.md ├── ARCHITECTURE_OVERVIEW.md ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── DEPLOYMENT.md ├── Dockerfile ├── FAQ.md ├── GLOSSARY.md ├── LICENSE.md ├── Makefile ├── PERFORMANCE_TUNING.md ├── README.md ├── ROADMAP.md ├── SECURITY.md ├── SECURITY_GUIDELINES.md ├── SUPPORT.md ├── UPGRADE_GUIDE.md ├── app ├── .env ├── __init__.py ├── api │ ├── deps.py │ ├── endpoints │ │ └── example.py │ └── routes │ │ └── jobs.py ├── cache.py ├── config.py ├── core │ ├── config.py │ ├── config_bridge.py │ ├── log_filters.py │ └── logging_config.py ├── db │ ├── crud.py │ └── database.py ├── exceptions.py ├── main.py ├── middleware │ ├── __init__.py │ ├── api_key_auth.py │ ├── rate_limiter.py │ └── request_logger.py ├── models.py ├── models │ ├── __init__.py │ ├── health_models.py │ └── job_models.py ├── routes │ ├── __init__.py │ ├── api.py │ └── health.py ├── services │ ├── __init__.py │ ├── background_service.py │ ├── external_service.py │ └── job_service.py └── utils │ ├── auth_health.py │ ├── env_debugger.py │ ├── error_handlers.py │ ├── logging_config.py │ ├── logging_docs.py │ └── validation_helpers.py ├── docker-compose.dev.yml ├── docker-compose.yml ├── examples └── api_usage.py ├── main.py ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── scripts ├── check_auth.py ├── check_config_consistency.py ├── check_env.py ├── confirm_env.sh ├── debug_env_conflicts.py ├── debug_env_load_order.sh ├── docker-entrypoint.sh ├── increment_version.py ├── load_local_env.py ├── load_test.py ├── make_scripts_executable.sh ├── set_log_level.sh └── verify_env_loading.py └── tests ├── __init__.py ├── conftest.py └── test_api.py /.env: -------------------------------------------------------------------------------- 1 | # Default configuration values for JobSpy Docker API 2 | # This file is committed to version control 3 | # For local overrides, use .env.local which is not committed 4 | 5 | # Application Settings 6 | # Set to true to enable debug logging (including health checks) 7 | DEBUG=false 8 | # Or set specific log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) 9 | LOG_LEVEL=INFO 10 | ENVIRONMENT=production 11 | 12 | # API Security (use placeholder values in committed .env) 13 | ENABLE_API_KEY_AUTH=false 14 | API_KEYS= 15 | API_KEY_HEADER_NAME=x-api-key 16 | 17 | # Rate Limiting 18 | RATE_LIMIT_ENABLED=false 19 | RATE_LIMIT_REQUESTS=100 20 | RATE_LIMIT_TIMEFRAME=3600 21 | 22 | # Proxy Configuration 23 | DEFAULT_PROXIES= 24 | CA_CERT_PATH= 25 | 26 | # JobSpy Default Settings 27 | DEFAULT_SITE_NAMES=indeed,linkedin,zip_recruiter,glassdoor,google,bayt,naukri 28 | DEFAULT_RESULTS_WANTED=20 29 | DEFAULT_DISTANCE=50 30 | DEFAULT_DESCRIPTION_FORMAT=markdown 31 | DEFAULT_COUNTRY_INDEED=USA 32 | 33 | # Caching 34 | ENABLE_CACHE=false 35 | CACHE_EXPIRY=3600 36 | 37 | # CORS 38 | CORS_ORIGINS=* 39 | 40 | # Health Endpoints 41 | ENABLE_HEALTH_ENDPOINTS=true 42 | ENABLE_DETAILED_HEALTH=true 43 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '[BUG] ' 5 | labels: bug 6 | assignees: '' 7 | --- 8 | 9 | ## Describe the bug 10 | A clear and concise description of what the bug is. 
11 | 12 | ## To Reproduce 13 | Steps to reproduce the behavior: 14 | 1. Start the application with '...' 15 | 2. Send request to '...' 16 | 3. Check response '...' 17 | 4. See error 18 | 19 | ## Expected behavior 20 | A clear and concise description of what you expected to happen. 21 | 22 | ## Screenshots 23 | If applicable, add screenshots to help explain your problem. 24 | 25 | ## Environment: 26 | - OS: [e.g. Ubuntu 20.04] 27 | - Deployment Method: [e.g. Docker, Docker Compose, direct] 28 | - Version: [e.g. 1.0.0] 29 | 30 | ## Configuration 31 | ``` 32 | # Redacted configuration or environment variables you're using 33 | ENABLE_API_KEY_AUTH=true 34 | # ... 35 | ``` 36 | 37 | ## Additional context 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI/CD Pipeline 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python 3.13 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.13' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install -r requirements.txt 22 | pip install pytest pytest-cov 23 | - name: Run tests 24 | run: | 25 | pytest --cov=app tests/ 26 | 27 | build: 28 | needs: test 29 | runs-on: ubuntu-latest 30 | if: github.event_name == 'push' && github.ref == 'refs/heads/main' 31 | steps: 32 | - uses: actions/checkout@v3 33 | - name: Remove existing test container 34 | run: | 35 | # if a container named jobspy-api-test exists, remove it 36 | if [ "$(docker ps -a -q -f name=jobspy-api-test)" ]; then 37 | docker rm -f jobspy-api-test 38 | fi 39 | - name: Build Docker image 40 | run: docker build -t jobspy-api . 41 | - name: Run Docker container 42 | run: docker run -d -p 8000:8000 --name jobspy-api-test jobspy-api 43 | - name: Test Docker container 44 | run: | 45 | sleep 5 # Wait for container to start 46 | curl -f http://localhost:8000/health || exit 1 47 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Pipeline 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | build_and_test: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 3.13 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.13' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install -r requirements.txt 21 | pip install pytest pytest-cov 22 | - name: Run tests 23 | run: pytest --cov=app tests/ 24 | 25 | build_and_publish_docker: 26 | needs: build_and_test 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v3 30 | 31 | - name: Set environment variables 32 | run: | 33 | echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV 34 | 35 | - name: Login to Docker Hub 36 | uses: docker/login-action@v2 37 | with: 38 | username: ${{ secrets.DOCKERHUB_USERNAME }} 39 | password: ${{ secrets.DOCKERHUB_TOKEN }} 40 | 41 | - name: Build and push Docker image 42 | uses: docker/build-push-action@v4 43 | with: 44 | context: . 
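          # NOTE: multi-platform builds with buildx normally assume that
          # docker/setup-qemu-action and docker/setup-buildx-action steps run
          # earlier in this job; add them before the login step if the
          # linux/arm64 build fails on a hosted runner.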
45 | push: true 46 | platforms: linux/amd64,linux/arm64 47 | tags: | 48 | ${{ secrets.DOCKERHUB_USERNAME }}/jobspy-api:latest 49 | ${{ secrets.DOCKERHUB_USERNAME }}/jobspy-api:${{ env.VERSION }} 50 | 51 | create_github_release: 52 | needs: build_and_publish_docker 53 | runs-on: ubuntu-latest 54 | permissions: 55 | contents: write 56 | steps: 57 | - uses: actions/checkout@v3 58 | with: 59 | fetch-depth: 0 60 | 61 | - name: Generate release notes 62 | run: | 63 | VERSION=${GITHUB_REF#refs/tags/v} 64 | PREVIOUS_TAG=$(git tag --sort=-version:refname | head -n 2 | tail -n 1) 65 | if [ -z "$PREVIOUS_TAG" ]; then 66 | echo "## Changes in $VERSION" > release_notes.md 67 | echo "* First official release" >> release_notes.md 68 | else 69 | echo "## Changes in $VERSION since $PREVIOUS_TAG" > release_notes.md 70 | git log --pretty=format:"* %s (%h)" $PREVIOUS_TAG..HEAD >> release_notes.md 71 | fi 72 | 73 | - name: Create GitHub Release 74 | uses: softprops/action-gh-release@v1 75 | with: 76 | body_path: release_notes.md 77 | draft: false 78 | prerelease: false 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Environment variables 2 | .env.local 3 | .env.*.local 4 | 5 | # Python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | *.so 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # Unit test / coverage reports 29 | htmlcov/ 30 | .tox/ 31 | .nox/ 32 | .coverage 33 | .coverage.* 34 | .cache 35 | nosetests.xml 36 | coverage.xml 37 | *.cover 38 | .hypothesis/ 39 | .pytest_cache/ 40 | 41 | # Jupyter Notebook 42 | .ipynb_checkpoints 43 | 44 | # Environments 45 | .env.local 46 | .venv 47 | env/ 48 | venv/ 49 | ENV/ 50 | env.bak/ 51 | venv.bak/ 52 | 53 | # IDE specific files 54 | .idea/ 55 | .vscode/ 56 | *.swp 57 | *.swo 58 | .DS_Store 59 | 60 | # Project specific 61 | logs/ 62 | *.log 63 | temp/ 64 | jobs.csv 65 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | 10 | - repo: https://github.com/pycqa/isort 11 | rev: 5.12.0 12 | hooks: 13 | - id: isort 14 | 15 | - repo: https://github.com/psf/black 16 | rev: 23.3.0 17 | hooks: 18 | - id: black 19 | 20 | - repo: https://github.com/pycqa/flake8 21 | rev: 6.0.0 22 | hooks: 23 | - id: flake8 24 | additional_dependencies: [flake8-docstrings] 25 | -------------------------------------------------------------------------------- /API_CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # API Changelog 2 | 3 | All notable changes to the JobSpy Docker API. 4 | 5 | ## [Unreleased] 6 | - (pending changes) 7 | 8 | ## [1.0.0] – 2025‑04‑28 9 | ### Added 10 | - `GET /api/v1/search_jobs` 11 | - `x-api-key` authentication 12 | - Rate limiting & caching 13 | ... 
14 | -------------------------------------------------------------------------------- /ARCHITECTURE_OVERVIEW.md: -------------------------------------------------------------------------------- 1 | # Architecture Overview 2 | 3 | ``` 4 | Client → API Gateway (FastAPI) → JobSpy Library → External Job APIs 5 | ↓ 6 | Cache (Redis/File) 7 | ↓ 8 | Logs/Monitoring 9 | ``` 10 | 11 | ## Components 12 | - FastAPI application (`app/main.py`) 13 | - Caching layer (in‑memory or Redis) 14 | - Rate limiter middleware 15 | - Docker / Kubernetes deployment 16 | 17 | ## Data Flow 18 | 1. Client request with API key. 19 | 2. Check rate limit & cache. 20 | 3. Scrape job sites via JobSpy. 21 | 4. Return JSON/CSV response. 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to the JobSpy Docker API will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | ### Added 11 | - Placeholder for upcoming changes 12 | 13 | ## [1.0.1] - 2023-11-30 14 | 15 | ### Fixed 16 | - Fixed CSV export functionality when using `format=csv` parameter 17 | - Fixed `DEFAULT_COUNTRY_INDEED` environment variable not being used as fallback when parameter is not provided 18 | - Fixed `site_name=all` being rejected as an invalid option 19 | - Fixed `ENABLE_API_KEY_AUTH` defaulting to disabled instead of enabled when not specified 20 | 21 | 22 | 23 | ## [1.0.0] - 2025-04-28 24 | 25 | ### Added 26 | - Initial release of JobSpy Docker API 27 | - Comprehensive job search across multiple platforms 28 | - API Key Authentication system 29 | - Rate limiting capabilities 30 | - Response caching 31 | - Proxy support 32 | - Customizable default search parameters 33 | - CORS support 34 | - Health check endpoints 35 | - Comprehensive logging 36 | 37 | ### Changed 38 | - N/A (initial release) 39 | 40 | ### Fixed 41 | - N/A (initial release) 42 | 43 | [Unreleased]: https://github.com/username/job-spy-fastapi/compare/v1.0.0...HEAD 44 | [1.0.1]: https://github.com/username/job-spy-fastapi/releases/tag/v1.0.1 45 | [1.0.0]: https://github.com/username/job-spy-fastapi/releases/tag/v1.0.0 46 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | All contributors and users of this project are expected to adhere to the following Code of Conduct. 4 | 5 | ## Our Pledge 6 | Be respectful, considerate, and collaborative. 7 | 8 | ## Our Standards 9 | - **Respectful Communication**: No harassment, discrimination, or abusive language. 10 | - **Inclusivity**: Welcome participants of all backgrounds and identities. 11 | - **Constructive Feedback**: Critique ideas, not people. 12 | 13 | ## Enforcement 14 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project maintainers. Violations may result in removal from the project. 15 | 16 | For full details, see our [Contributor Covenant](https://www.contributor-covenant.org/). 
17 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | Thank you for considering contributing to JobSpy API!
4 | 
5 | ## How to Contribute
6 | 1. Fork the repo and create a feature branch.
7 | 2. Write tests for new features or bug fixes.
8 | 3. Follow the existing code style: Black, isort, flake8.
9 | 4. Commit messages: Use the imperative mood and reference issue numbers.
10 | 5. Open a PR against `main` with a clear description.
11 | 
12 | ## Code Review
13 | - Ensure CI passes: tests, lint, formatting.
14 | - Address review feedback promptly.
15 | 
16 | ## Issue Reporting
17 | - Search existing issues before opening a new one.
18 | - Provide steps to reproduce, environment details, and logs.
19 | 
--------------------------------------------------------------------------------
/DEPLOYMENT.md:
--------------------------------------------------------------------------------
1 | # Deployment Guide
2 | 
3 | ## Production (Docker Compose)
4 | 1. Review the committed `.env` defaults and fill in your values (keep real secrets out of version control).
5 | 2. `docker-compose up -d`
6 | 3. Verify: `docker-compose logs jobspy-api`
7 | 
8 | ## Standalone Docker
9 | ```bash
10 | docker build -t jobspy-api:latest .
11 | docker run -d -p 8000:8000 \
12 | --env-file .env \
13 | jobspy-api:latest
14 | ```
15 | 
16 | ## Kubernetes (example)
17 | - Define Deployment, Service, ConfigMap for env vars.
18 | - Mount `ca_cert` as a Secret if needed.
19 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.13-slim
2 | 
3 | WORKDIR /app
4 | 
5 | LABEL maintainer="Shannon Atkinson"
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
33 | @echo " make version-patch - Increment patch version (1.0.0 -> 1.0.1)"
34 | @echo " make version-minor - Increment minor version (1.0.0 -> 1.1.0)"
35 | @echo " make version-major - Increment major version (1.0.0 -> 2.0.0)"
36 | 
37 | install:
38 | pip install -r requirements.txt
39 | pip install pytest pytest-cov pylint
40 | 
41 | run:
42 | uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
43 | 
44 | test:
45 | pytest --cov=app tests/
46 | 
47 | lint:
48 | pylint app/
49 | 
50 | docker-build:
51 | docker build -t jobspy-api .
52 | 
53 | docker-buildx:
54 | @echo "Building multi-arch Docker image (linux/amd64,linux/arm64)..."
55 | @python -c "from app import __version__; print(f'Current version: {__version__}')"
56 | @VERSION=$$(python -c "from app import __version__; print(__version__)") && \
57 | docker buildx build --platform linux/amd64,linux/arm64 \
58 | -t jobspy-api:$$VERSION -t jobspy-api:latest \
59 | --load . 
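# NOTE: at the time of writing, `docker buildx build --load` can only import a
# single-platform image into the local Docker engine; true multi-arch images
# generally have to be pushed to a registry instead (see docker-pushx below).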
60 | 61 | docker-run: 62 | docker run -p 8000:8000 jobspy-api 63 | 64 | docker-compose-up: 65 | docker-compose up -d 66 | 67 | docker-compose-down: 68 | docker-compose down 69 | 70 | docker-compose-dev: 71 | docker-compose -f docker-compose.dev.yml up 72 | 73 | # Combined commands 74 | dev: 75 | docker-compose -f docker-compose.dev.yml up --build 76 | 77 | prod: 78 | docker-compose build 79 | docker-compose up -d 80 | 81 | clean-start: 82 | docker-compose down -v 83 | docker-compose rm -f 84 | docker-compose build --no-cache 85 | docker-compose up -d 86 | 87 | update: 88 | pip install -U -r requirements.txt 89 | pip install -U -r requirements-dev.txt 90 | docker-compose build --no-cache 91 | docker-compose up -d 92 | 93 | test-and-build: 94 | pytest --cov=app tests/ && docker-compose build 95 | 96 | ci: 97 | pytest --cov=app tests/ 98 | docker-compose build 99 | docker-compose up -d 100 | 101 | logs: 102 | docker-compose logs -f 103 | 104 | restart: 105 | docker-compose restart 106 | 107 | rebuild: 108 | docker-compose down 109 | docker-compose build 110 | docker-compose up -d 111 | 112 | check-env: 113 | python scripts/check_env.py 114 | 115 | debug-docker: 116 | docker-compose run --rm jobspy-api python /app/scripts/check_env.py 117 | 118 | docker-push: 119 | @echo "Building and pushing Docker image to Docker Hub..." 120 | @python -c "from app import __version__; print(f'Current version: {__version__}')" 121 | @VERSION=$$(python -c "from app import __version__; print(__version__)") && \ 122 | echo "Building version $$VERSION" && \ 123 | docker build -t jobspy-api:$$VERSION -t jobspy-api:latest . && \ 124 | echo "Enter your Docker Hub username:" && \ 125 | read DOCKER_USER && \ 126 | docker tag jobspy-api:$$VERSION $$DOCKER_USER/jobspy-api:$$VERSION && \ 127 | docker tag jobspy-api:latest $$DOCKER_USER/jobspy-api:latest && \ 128 | docker push $$DOCKER_USER/jobspy-api:$$VERSION && \ 129 | docker push $$DOCKER_USER/jobspy-api:latest && \ 130 | echo "Successfully pushed version $$VERSION to Docker Hub" 131 | 132 | docker-pushx: 133 | @echo "Building and pushing multi-arch Docker image to Docker Hub..." 134 | @python -c "from app import __version__; print(f'Current version: {__version__}')" 135 | @VERSION=$$(python -c "from app import __version__; print(__version__)") && \ 136 | echo "Enter your Docker Hub username:" && \ 137 | read DOCKER_USER && \ 138 | docker buildx build --platform linux/amd64,linux/arm64 \ 139 | -t $$DOCKER_USER/jobspy-api:$$VERSION -t $$DOCKER_USER/jobspy-api:latest \ 140 | --push . 141 | 142 | version: 143 | @python -c "from app import __version__; print(f'Current version: {__version__}')" 144 | 145 | version-patch: 146 | @python scripts/increment_version.py patch 147 | @$(MAKE) version 148 | 149 | version-minor: 150 | @python scripts/increment_version.py minor 151 | @$(MAKE) version 152 | 153 | version-major: 154 | @python scripts/increment_version.py major 155 | @$(MAKE) version 156 | -------------------------------------------------------------------------------- /PERFORMANCE_TUNING.md: -------------------------------------------------------------------------------- 1 | # Performance Tuning 2 | 3 | ## Caching 4 | - Adjust `CACHE_EXPIRY` via env var. 5 | - Use Redis by mounting external cache. 6 | 7 | ## Concurrency 8 | - Increase FastAPI workers: `uvicorn --workers 4`. 9 | - Use Gunicorn with Uvicorn workers in production. 10 | 11 | ## Logging 12 | - Set `LOG_LEVEL=INFO` or `DEBUG` as needed. 13 | - Rotate logs by mounting a log driver. 
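
For the Gunicorn-with-Uvicorn-workers setup mentioned under Concurrency, a
minimal `gunicorn.conf.py` sketch (the file name and worker count are
illustrative assumptions; tune them per host):

```python
# gunicorn.conf.py -- hypothetical example config, not shipped with this repo
import multiprocessing

bind = "0.0.0.0:8000"
workers = multiprocessing.cpu_count() * 2 + 1   # common starting heuristic
worker_class = "uvicorn.workers.UvicornWorker"  # run FastAPI under Gunicorn
timeout = 60
```

Start it with `gunicorn app.main:app -c gunicorn.conf.py`.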
14 | 15 | ## Monitoring 16 | - Expose `/metrics` with Prometheus. 17 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Project Roadmap 2 | 3 | ## v1.x (Current) 4 | - Stable multi‑site job search 5 | - Docker image & Compose support 6 | - Caching, rate limiting, API key auth 7 | 8 | ## v2.0 9 | - GraphQL endpoint 10 | - OAuth2 integration for platform APIs 11 | - Plugin architecture for new job sources 12 | 13 | ## Future 14 | - Frontend dashboard 15 | - Analytics & metrics export 16 | - Enterprise features: SSO, team quotas 17 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | We currently support the following versions with security updates: 6 | 7 | | Version | Supported | 8 | | ------- | ------------------ | 9 | | 1.0.x | :white_check_mark: | 10 | | < 1.0 | :x: | 11 | 12 | ## Reporting a Vulnerability 13 | 14 | We take the security of JobSpy Docker API seriously. If you believe you've found a security vulnerability, please follow these steps: 15 | 16 | 1. **Do Not** disclose the vulnerability publicly 17 | 2. **Do Not** open a public GitHub issue 18 | 19 | Instead, please email us at [security@example.com](mailto:security@example.com) with: 20 | 21 | - Description of the vulnerability 22 | - Steps to reproduce 23 | - Potential impact 24 | - Any suggestions for remediation 25 | 26 | We will acknowledge receipt of your report within 48 hours and provide an estimated timeline for a fix. We'll keep you informed of our progress. 27 | 28 | ## Security Measures 29 | 30 | - API key authentication (when enabled) 31 | - Rate limiting capabilities 32 | - Regular dependency updates 33 | - Input validation 34 | - Safe error handling 35 | 36 | ## Security Best Practices for Users 37 | 38 | 1. **API Keys**: When using API key authentication, follow best practices: 39 | - Use unique keys for different use cases 40 | - Rotate keys regularly 41 | - Only share keys securely 42 | 43 | 2. **Environment Variables**: Never commit real API keys to version control 44 | - Use `.env.local` for local development 45 | - Use secure methods for production deployment 46 | 47 | 3. **Rate Limiting**: Enable rate limiting in production 48 | - Adjust limits according to your expected usage 49 | 50 | 4. **Regular Updates**: Update to the latest version regularly 51 | 52 | ## Disclosure Policy 53 | 54 | When we receive a security bug report, we will: 55 | 56 | 1. Confirm the vulnerability 57 | 2. Determine its impact and severity 58 | 3. Develop and test a fix 59 | 4. Release a patched version 60 | 5. Acknowledge your contribution (unless you prefer to remain anonymous) 61 | -------------------------------------------------------------------------------- /SECURITY_GUIDELINES.md: -------------------------------------------------------------------------------- 1 | # Security Guidelines 2 | 3 | ## API Authentication 4 | - Use `x-api-key` header; rotate keys regularly. 5 | - Enforce HTTPS/TLS. 6 | 7 | ## Secrets Management 8 | - Don’t commit `.env` with real keys. 9 | - Use Docker secrets or Kubernetes Secrets. 10 | 11 | ## Dependencies 12 | - Regularly run `pip install -U`. 13 | - Audit with `safety` or `dependabot`. 14 | 15 | ## Vulnerability Reporting 16 | - See `SECURITY.md` for reporting process. 
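
As a concrete illustration of the `x-api-key` flow above, a minimal client
sketch (the URL, key source, and query parameters are placeholders; load real
keys from the environment, never from source):

```python
import os

import requests  # any HTTP client works; requests is assumed here

API_URL = "http://localhost:8000/api/v1/search_jobs"  # placeholder host
API_KEY = os.environ["JOBSPY_API_KEY"]  # hypothetical env var name

resp = requests.get(
    API_URL,
    headers={"x-api-key": API_KEY},  # header name matches API_KEY_HEADER_NAME
    params={"search_term": "python developer", "location": "Remote"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["count"])
```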
17 | 
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # Support
2 | 
3 | ## Reporting Issues
4 | - Open an issue at [GitHub Issues](https://github.com/username/jobspy-api/issues).
5 | 
6 | ## Contact
7 | - For security: email security@example.com
8 | - For general queries: use the issue tracker.
9 | 
10 | ## Community
11 | - Join our Slack channel (link).
12 | - Follow updates on Twitter @jobspy_api.
13 | 
--------------------------------------------------------------------------------
/UPGRADE_GUIDE.md:
--------------------------------------------------------------------------------
1 | # Upgrade Guide
2 | 
3 | ## From v1.x to v2.0
4 | 1. Review breaking changes in `API_CHANGELOG.md`.
5 | 2. Update Docker image tag to `username/jobspy-api:2.0.0`.
6 | 3. Migrate environment vars: new OAuth2 settings.
7 | 4. Run integration tests against staging.
8 | 
9 | ## Patch Releases
10 | ```bash
11 | docker-compose pull && make docker-compose-down && make docker-compose-up
12 | ```
13 | 
--------------------------------------------------------------------------------
/app/.env:
--------------------------------------------------------------------------------
1 | # ...existing code...
2 | 
3 | # API Documentation
4 | ENABLE_SWAGGER_UI=true
5 | ENABLE_REDOC=true
6 | SWAGGER_UI_PATH=/docs
7 | REDOC_PATH=/redoc
8 | 
9 | # ...existing code...
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
1 | """JobSpy Docker API application package."""
2 | __version__ = "1.0.1"
3 | 
--------------------------------------------------------------------------------
/app/api/deps.py:
--------------------------------------------------------------------------------
1 | from fastapi import Depends, HTTPException, status, Request
2 | from fastapi.security import APIKeyHeader
3 | from typing import Optional
4 | import logging
5 | 
6 | from app.core.config import settings
7 | from app.config import settings as app_settings
8 | 
9 | logger = logging.getLogger(__name__)
10 | api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
11 | 
12 | async def get_api_key(request: Request, api_key: Optional[str] = Depends(api_key_header)):
13 | # Log detailed information about the authentication attempt
14 | logger.debug(f"API Key authentication check - core API_KEY configured: {bool(settings.API_KEY)}")
15 | logger.debug(f"Request path: {request.url.path}")
16 | logger.debug(f"API Key in request: {'Present' if api_key else 'Missing'}")
17 | 
18 | # Check both authentication systems for consistency
19 | # First check app.core.config settings
20 | if not settings.API_KEY:
21 | logger.debug("No API key configured in core settings, checking app settings")
22 | 
23 | # Then check app.config settings
24 | if not app_settings.ENABLE_API_KEY_AUTH or not app_settings.API_KEYS:
25 | logger.debug("Authentication disabled or no API keys configured in app settings")
26 | return None
27 | 
28 | # App settings require auth but no core setting, issue a warning
29 | logger.warning("Inconsistent config: API_KEY auth enabled in app settings but not in core settings")
30 | 
31 | # At this point, some form of authentication is required
32 | # Check if API key is missing
33 | if not api_key:
34 | logger.warning(f"API key is missing in request to {request.url.path}")
35 | raise HTTPException(
36 | 
status_code=status.HTTP_403_FORBIDDEN, 37 | detail="Missing API Key", 38 | ) 39 | 40 | # Check against core config API key if configured 41 | if settings.API_KEY and api_key != settings.API_KEY: 42 | # Fall back to checking against app config API keys 43 | if not (app_settings.API_KEYS and api_key in app_settings.API_KEYS): 44 | logger.warning(f"Invalid API key provided in request to {request.url.path}") 45 | raise HTTPException( 46 | status_code=status.HTTP_403_FORBIDDEN, 47 | detail="Invalid API Key", 48 | ) 49 | 50 | logger.debug("Valid API key provided, authentication successful") 51 | return api_key 52 | -------------------------------------------------------------------------------- /app/api/endpoints/example.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends 2 | from sqlalchemy.orm import Session 3 | 4 | from app import crud, schemas 5 | from app.api.dependencies import get_db 6 | from app.core.logging_config import get_logger 7 | 8 | logger = get_logger("api.endpoints.example") 9 | 10 | router = APIRouter() 11 | 12 | @router.get("/items/") 13 | async def get_items(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)): 14 | logger.debug(f"Fetching items with skip={skip}, limit={limit}") 15 | try: 16 | items = crud.get_items(db, skip=skip, limit=limit) 17 | logger.debug(f"Successfully retrieved {len(items)} items") 18 | return items 19 | except Exception as e: 20 | logger.exception(f"Error retrieving items: {str(e)}") 21 | raise 22 | 23 | @router.post("/items/") 24 | async def create_item(item: schemas.ItemCreate, db: Session = Depends(get_db)): 25 | logger.debug(f"Creating new item: {item.dict()}") 26 | try: 27 | db_item = crud.create_item(db=db, item=item) 28 | logger.info(f"Successfully created item with id: {db_item.id}") 29 | return db_item 30 | except Exception as e: 31 | logger.exception(f"Error creating item: {str(e)}") 32 | raise -------------------------------------------------------------------------------- /app/api/routes/jobs.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends, Query 2 | from app.api.deps import get_api_key 3 | 4 | router = APIRouter() 5 | 6 | @router.get("/search_jobs") 7 | async def search_jobs( 8 | api_key: str = Depends(get_api_key) 9 | ): 10 | pass -------------------------------------------------------------------------------- /app/cache.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Dict, Any, Optional, Tuple 3 | import hashlib 4 | import json 5 | import pandas as pd 6 | from app.config import settings 7 | 8 | class JobSearchCache: 9 | def __init__(self): 10 | self.cache: Dict[str, Tuple[float, pd.DataFrame]] = {} 11 | self.enabled = settings.ENABLE_CACHE 12 | self.expiry = settings.CACHE_EXPIRY 13 | 14 | def _generate_key(self, params: Dict[str, Any]) -> str: 15 | """Generate a cache key from the search parameters""" 16 | # Sort the dictionary to ensure consistent keys 17 | sorted_params = {k: params[k] for k in sorted(params.keys())} 18 | param_str = json.dumps(sorted_params, sort_keys=True) 19 | return hashlib.md5(param_str.encode()).hexdigest() 20 | 21 | def get(self, params: Dict[str, Any]) -> Optional[pd.DataFrame]: 22 | """Get cached results if they exist and are not expired""" 23 | if not self.enabled: 24 | return None 25 | 26 | key = self._generate_key(params) 27 | if key not in self.cache: 28 | return 
None 29 | 30 | timestamp, df = self.cache[key] 31 | if time.time() - timestamp > self.expiry: 32 | # Cache expired 33 | del self.cache[key] 34 | return None 35 | 36 | return df 37 | 38 | def set(self, params: Dict[str, Any], df: pd.DataFrame) -> None: 39 | """Cache search results""" 40 | if not self.enabled: 41 | return 42 | 43 | key = self._generate_key(params) 44 | self.cache[key] = (time.time(), df) 45 | 46 | def clear(self) -> None: 47 | """Clear all cached data""" 48 | self.cache.clear() 49 | 50 | def cleanup_expired(self) -> None: 51 | """Remove expired cache entries""" 52 | current_time = time.time() 53 | expired_keys = [ 54 | key for key, (timestamp, _) in self.cache.items() 55 | if current_time - timestamp > self.expiry 56 | ] 57 | for key in expired_keys: 58 | del self.cache[key] 59 | 60 | # Initialize global cache 61 | cache = JobSearchCache() 62 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | """Configuration settings for the JobSpy Docker API.""" 2 | import os 3 | from typing import List, Optional, Any, Dict, Tuple 4 | import logging 5 | 6 | # Try to load .env files - will be ignored if python-dotenv is not installed 7 | try: 8 | from dotenv import load_dotenv, find_dotenv 9 | # Load only .env by default 10 | dotenv_file = find_dotenv(".env") 11 | if dotenv_file: 12 | load_dotenv(dotenv_file) 13 | 14 | # .env.local is not loaded by default anymore 15 | # If you need to load it, do so explicitly in your code 16 | except ImportError: 17 | pass 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | class Settings: 22 | """Simple settings class that loads values from environment variables.""" 23 | 24 | def __init__(self): 25 | # Track the source of each setting 26 | self.setting_sources = {} 27 | 28 | # API Security 29 | self.API_KEYS, self.API_KEYS_SOURCE = self._get_setting_with_source( 30 | "API_KEYS", "", self._parse_list 31 | ) 32 | self.API_KEY_HEADER_NAME, self.API_KEY_HEADER_NAME_SOURCE = self._get_setting_with_source( 33 | "API_KEY_HEADER_NAME", "x-api-key" 34 | ) 35 | self.ENABLE_API_KEY_AUTH, self.ENABLE_API_KEY_AUTH_SOURCE = self._get_setting_with_source( 36 | "ENABLE_API_KEY_AUTH", "false", self._parse_bool 37 | ) 38 | 39 | # Rate Limiting 40 | self.RATE_LIMIT_ENABLED, self.RATE_LIMIT_ENABLED_SOURCE = self._get_setting_with_source( 41 | "RATE_LIMIT_ENABLED", "false", self._parse_bool 42 | ) 43 | self.RATE_LIMIT_REQUESTS, self.RATE_LIMIT_REQUESTS_SOURCE = self._get_setting_with_source( 44 | "RATE_LIMIT_REQUESTS", "100", int 45 | ) 46 | self.RATE_LIMIT_TIMEFRAME, self.RATE_LIMIT_TIMEFRAME_SOURCE = self._get_setting_with_source( 47 | "RATE_LIMIT_TIMEFRAME", "3600", int 48 | ) 49 | 50 | # Proxy Configuration 51 | self.DEFAULT_PROXIES, self.DEFAULT_PROXIES_SOURCE = self._get_setting_with_source( 52 | "DEFAULT_PROXIES", "", self._parse_list 53 | ) 54 | self.CA_CERT_PATH, self.CA_CERT_PATH_SOURCE = self._get_setting_with_source( 55 | "CA_CERT_PATH", None 56 | ) 57 | 58 | # JobSpy Default Settings 59 | default_sites = "indeed,linkedin,zip_recruiter,glassdoor,google,bayt,naukri" 60 | self.DEFAULT_SITE_NAMES, self.DEFAULT_SITE_NAMES_SOURCE = self._get_setting_with_source( 61 | "DEFAULT_SITE_NAMES", default_sites, self._parse_list 62 | ) 63 | self.DEFAULT_RESULTS_WANTED, self.DEFAULT_RESULTS_WANTED_SOURCE = self._get_setting_with_source( 64 | "DEFAULT_RESULTS_WANTED", "20", int 65 | ) 66 | self.DEFAULT_DISTANCE, self.DEFAULT_DISTANCE_SOURCE = 
self._get_setting_with_source( 67 | "DEFAULT_DISTANCE", "50", int 68 | ) 69 | self.DEFAULT_DESCRIPTION_FORMAT, self.DEFAULT_DESCRIPTION_FORMAT_SOURCE = self._get_setting_with_source( 70 | "DEFAULT_DESCRIPTION_FORMAT", "markdown" 71 | ) 72 | self.DEFAULT_COUNTRY_INDEED, self.DEFAULT_COUNTRY_INDEED_SOURCE = self._get_setting_with_source( 73 | "DEFAULT_COUNTRY_INDEED", None 74 | ) 75 | 76 | # Caching 77 | self.ENABLE_CACHE, self.ENABLE_CACHE_SOURCE = self._get_setting_with_source( 78 | "ENABLE_CACHE", "false", self._parse_bool 79 | ) 80 | self.CACHE_EXPIRY, self.CACHE_EXPIRY_SOURCE = self._get_setting_with_source( 81 | "CACHE_EXPIRY", "3600", int 82 | ) 83 | 84 | # Logging 85 | self.LOG_LEVEL, self.LOG_LEVEL_SOURCE = self._get_setting_with_source( 86 | "LOG_LEVEL", "INFO" 87 | ) 88 | self.ENVIRONMENT, self.ENVIRONMENT_SOURCE = self._get_setting_with_source( 89 | "ENVIRONMENT", "production" 90 | ) 91 | 92 | # CORS 93 | self.CORS_ORIGINS, self.CORS_ORIGINS_SOURCE = self._get_setting_with_source( 94 | "CORS_ORIGINS", "*", self._parse_list 95 | ) 96 | 97 | # Health Endpoints 98 | self.ENABLE_HEALTH_ENDPOINTS, self.ENABLE_HEALTH_ENDPOINTS_SOURCE = self._get_setting_with_source( 99 | "ENABLE_HEALTH_ENDPOINTS", "true", self._parse_bool 100 | ) 101 | self.ENABLE_DETAILED_HEALTH, self.ENABLE_DETAILED_HEALTH_SOURCE = self._get_setting_with_source( 102 | "ENABLE_DETAILED_HEALTH", "true", self._parse_bool 103 | ) 104 | 105 | # API Documentation 106 | self.ENABLE_SWAGGER_UI, self.ENABLE_SWAGGER_UI_SOURCE = self._get_setting_with_source( 107 | "ENABLE_SWAGGER_UI", "true", self._parse_bool 108 | ) 109 | self.ENABLE_REDOC, self.ENABLE_REDOC_SOURCE = self._get_setting_with_source( 110 | "ENABLE_REDOC", "true", self._parse_bool 111 | ) 112 | self.SWAGGER_UI_PATH, self.SWAGGER_UI_PATH_SOURCE = self._get_setting_with_source( 113 | "SWAGGER_UI_PATH", "/docs" 114 | ) 115 | self.REDOC_PATH, self.REDOC_PATH_SOURCE = self._get_setting_with_source( 116 | "REDOC_PATH", "/redoc" 117 | ) 118 | 119 | # Fix configuration inconsistencies 120 | self._fix_configuration_inconsistencies() 121 | 122 | def _get_setting_with_source(self, key: str, default_value: Any, 123 | parser_func=None) -> Tuple[Any, str]: 124 | """Get a setting value and its source.""" 125 | if key in os.environ: 126 | value = os.environ[key] 127 | source = f"environment variable ({value})" 128 | else: 129 | value = default_value 130 | source = f"default value ({value})" 131 | 132 | # Apply parser if provided 133 | if parser_func and value is not None: 134 | value = parser_func(value) 135 | 136 | # Log loading for critical settings 137 | critical_settings = ["ENABLE_API_KEY_AUTH", "API_KEYS", "RATE_LIMIT_ENABLED", "ENABLE_CACHE"] 138 | if key in critical_settings: 139 | logger.debug(f"Setting {key}={value} loaded from {source}") 140 | 141 | return value, source 142 | 143 | def _fix_configuration_inconsistencies(self): 144 | """Fix any inconsistencies in configuration.""" 145 | # If API keys are configured but auth is disabled, log a warning 146 | if self.API_KEYS and not self.ENABLE_API_KEY_AUTH: 147 | logger.warning("API keys are configured but authentication is disabled. 
This may lead to security issues.") 148 | 149 | def _parse_bool(self, value: Any) -> bool: 150 | """Parse a boolean from a string or any value.""" 151 | if isinstance(value, bool): 152 | return value 153 | if isinstance(value, str): 154 | return value.lower() in ("yes", "true", "t", "1", "on") 155 | return bool(value) 156 | 157 | def _parse_list(self, value: Any) -> List[str]: 158 | """Parse a comma-separated list from a string.""" 159 | if not value: 160 | return [] 161 | if isinstance(value, list): 162 | return [str(item) for item in value if item] 163 | if isinstance(value, str): 164 | return [item.strip() for item in value.split(",") if item.strip()] 165 | return [] 166 | 167 | def get_all_settings(self) -> Dict[str, Dict[str, Any]]: 168 | """Get all settings with their sources, useful for debugging.""" 169 | settings_with_sources = {} 170 | for key in dir(self): 171 | if key.isupper() and not key.endswith("_SOURCE"): 172 | source_attr = f"{key}_SOURCE" 173 | source = getattr(self, source_attr) if hasattr(self, source_attr) else "unknown" 174 | settings_with_sources[key] = { 175 | "value": getattr(self, key), 176 | "source": source 177 | } 178 | return settings_with_sources 179 | 180 | # Create a global settings instance 181 | settings = Settings() 182 | -------------------------------------------------------------------------------- /app/core/config.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | from typing import Optional 3 | 4 | class Settings(BaseSettings): 5 | PROJECT_NAME: str = "Job Spy FastAPI" 6 | API_V1_STR: str = "/api/v1" 7 | DATABASE_URL: Optional[str] = None # Made optional with default None 8 | API_KEY: Optional[str] = None 9 | 10 | # Logging settings 11 | LOG_LEVEL: str = "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL 12 | LOG_TO_FILE: bool = True 13 | LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 14 | 15 | @property 16 | def get_log_level(self): 17 | """Convert string log level to logging module level""" 18 | import logging 19 | return getattr(logging, self.LOG_LEVEL) 20 | 21 | class Config: 22 | case_sensitive = True 23 | 24 | settings = Settings() -------------------------------------------------------------------------------- /app/core/config_bridge.py: -------------------------------------------------------------------------------- 1 | """ 2 | Bridge between app.core.config and app.config to ensure consistent settings. 3 | This module synchronizes API key settings between the two config modules. 4 | """ 5 | import logging 6 | 7 | from app.config import settings as app_settings 8 | from app.core.config import settings as core_settings 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def sync_api_key_settings(): 13 | """ 14 | Synchronize API key settings between core.config and main config. 15 | This ensures that authentication works consistently. 
16 | """ 17 | # If core API_KEY is set but app API_KEYS is not, add the core key to app settings 18 | if core_settings.API_KEY and not app_settings.API_KEYS: 19 | logger.debug("Syncing core API_KEY to app API_KEYS") 20 | app_settings.API_KEYS = [core_settings.API_KEY] 21 | 22 | # If app has API_KEYS but core doesn't have API_KEY, set the first app key as core key 23 | if app_settings.API_KEYS and not core_settings.API_KEY: 24 | logger.debug("Setting core API_KEY from app API_KEYS") 25 | # We can't actually modify core_settings.API_KEY directly, but we can log a warning 26 | logger.warning("Cannot sync app API_KEYS to core API_KEY - core settings are immutable") 27 | 28 | # Log configuration status 29 | auth_enabled = bool(core_settings.API_KEY) or (app_settings.ENABLE_API_KEY_AUTH and bool(app_settings.API_KEYS)) 30 | logger.info(f"Authentication enabled: {auth_enabled}") 31 | logger.debug(f"Core API_KEY configured: {bool(core_settings.API_KEY)}") 32 | logger.debug(f"App API_KEYS configured: {bool(app_settings.API_KEYS)}") 33 | 34 | # Run synchronization when module is imported 35 | sync_api_key_settings() 36 | -------------------------------------------------------------------------------- /app/core/log_filters.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class HealthCheckFilter(logging.Filter): 4 | """Filter out health check requests from logs""" 5 | 6 | def __init__(self, path="/health"): 7 | super().__init__() 8 | self.path = path 9 | 10 | def filter(self, record): 11 | # Check if the record has a message attribute and contains the health check path 12 | if hasattr(record, 'message'): 13 | return self.path not in record.message 14 | # For records that haven't been formatted yet, check the raw message 15 | if hasattr(record, 'msg') and isinstance(record.msg, str): 16 | return self.path not in record.msg 17 | return True 18 | -------------------------------------------------------------------------------- /app/core/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from logging.handlers import RotatingFileHandler 5 | from pathlib import Path 6 | 7 | def setup_logging(log_level=None): 8 | """Configure logging for the application""" 9 | 10 | # Determine log level from environment or parameter 11 | if log_level is None: 12 | env_level = os.getenv("LOG_LEVEL", "INFO").upper() 13 | log_level = getattr(logging, env_level, logging.INFO) 14 | 15 | # Create logs directory if it doesn't exist 16 | log_dir = Path("logs") 17 | log_dir.mkdir(exist_ok=True) 18 | 19 | # Configure root logger 20 | logger = logging.getLogger() 21 | logger.setLevel(log_level) 22 | 23 | # Clear existing handlers to avoid duplicate logs 24 | if logger.handlers: 25 | logger.handlers.clear() 26 | 27 | # Console handler 28 | console_handler = logging.StreamHandler(sys.stdout) 29 | console_handler.setLevel(log_level) 30 | console_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 31 | console_handler.setFormatter(console_formatter) 32 | 33 | # File handler with rotation (10MB max, keep 5 backups) 34 | file_handler = RotatingFileHandler( 35 | "logs/app.log", 36 | maxBytes=10*1024*1024, 37 | backupCount=5 38 | ) 39 | file_handler.setLevel(log_level) 40 | file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 41 | file_handler.setFormatter(file_formatter) 42 | 43 | # Add handlers to logger 44 | 
logger.addHandler(console_handler)
45 | logger.addHandler(file_handler)
46 | 
47 | # Set Uvicorn's access logger to a higher level to reduce noise
48 | uvicorn_access = logging.getLogger("uvicorn.access")
49 | if log_level == logging.DEBUG:
50 | uvicorn_access.setLevel(logging.INFO)  # Show access logs in debug mode, but not health checks
51 | else:
52 | uvicorn_access.setLevel(logging.WARNING)  # Only show warnings and errors otherwise
53 | 
54 | # Return the configured logger
55 | return logger
56 | 
57 | def get_logger(name):
58 | """Get a named logger"""
59 | return logging.getLogger(name)
60 | 
--------------------------------------------------------------------------------
/app/db/crud.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy.orm import Session
2 | 
3 | from app import models, schemas
4 | from app.core.logging_config import get_logger
5 | 
6 | logger = get_logger("db.crud")
7 | 
8 | def get_items(db: Session, skip: int = 0, limit: int = 100):
9 | logger.debug(f"DB query: get_items(skip={skip}, limit={limit})")
10 | try:
11 | result = db.query(models.Item).offset(skip).limit(limit).all()
12 | logger.debug(f"DB query successful, returned {len(result)} records")
13 | return result
14 | except Exception as e:
15 | logger.exception(f"DB query failed: {str(e)}")
16 | raise
17 | 
18 | def create_item(db: Session, item: schemas.ItemCreate):
19 | logger.debug(f"DB operation: create_item with data: {item.dict()}")
20 | try:
21 | db_item = models.Item(**item.dict())
22 | db.add(db_item)
23 | db.commit()
24 | db.refresh(db_item)
25 | logger.debug(f"DB operation successful, created item with id: {db_item.id}")
26 | return db_item
27 | except Exception as e:
28 | db.rollback()
29 | logger.exception(f"DB operation failed, rolling back: {str(e)}")
30 | raise
--------------------------------------------------------------------------------
/app/db/database.py:
--------------------------------------------------------------------------------
1 | """Database configuration and session management."""
2 | from sqlalchemy import create_engine
3 | from sqlalchemy.ext.declarative import declarative_base
4 | from sqlalchemy.orm import sessionmaker
5 | import logging
6 | from app.core.config import settings  # DATABASE_URL is defined on the core Settings class
7 | 
8 | logger = logging.getLogger(__name__)
9 | 
10 | # Get database URL from settings or use SQLite in-memory if not configured
11 | SQLALCHEMY_DATABASE_URL = settings.DATABASE_URL or "sqlite:///:memory:"
12 | 
13 | # Only create engine if a database URL is provided
14 | if SQLALCHEMY_DATABASE_URL and SQLALCHEMY_DATABASE_URL != "sqlite:///:memory:":
15 | engine = create_engine(SQLALCHEMY_DATABASE_URL)
16 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
17 | else:
18 | logger.warning("No DATABASE_URL configured, database functionality will be limited")
19 | engine = None
20 | SessionLocal = None
21 | 
22 | Base = declarative_base()
23 | 
24 | def get_db():
25 | """Get a database session."""
26 | if SessionLocal is None:
27 | raise RuntimeError("Database not configured. 
Set DATABASE_URL in environment.") 28 | 29 | db = SessionLocal() 30 | try: 31 | yield db 32 | finally: 33 | db.close() 34 | -------------------------------------------------------------------------------- /app/exceptions.py: -------------------------------------------------------------------------------- 1 | """Custom exception classes for JobSpy Docker API.""" 2 | from fastapi import HTTPException 3 | from starlette.status import HTTP_429_TOO_MANY_REQUESTS, HTTP_400_BAD_REQUEST, HTTP_500_INTERNAL_SERVER_ERROR 4 | 5 | class JobSpyAPIException(HTTPException): 6 | """Base exception for JobSpy Docker API.""" 7 | def __init__(self, status_code: int, detail: str, headers: dict = None): 8 | super().__init__(status_code=status_code, detail=detail, headers=headers) 9 | 10 | class RateLimitExceeded(JobSpyAPIException): 11 | """Exception raised when rate limit is exceeded.""" 12 | def __init__(self, detail: str = "Rate limit exceeded"): 13 | super().__init__(status_code=HTTP_429_TOO_MANY_REQUESTS, detail=detail) 14 | 15 | class InvalidSearchParameters(JobSpyAPIException): 16 | """Exception raised when search parameters are invalid.""" 17 | def __init__(self, detail: str = "Invalid search parameters"): 18 | super().__init__(status_code=HTTP_400_BAD_REQUEST, detail=detail) 19 | 20 | class JobSearchError(JobSpyAPIException): 21 | """Exception raised when job search fails.""" 22 | def __init__(self, detail: str = "Job search failed"): 23 | super().__init__(status_code=HTTP_500_INTERNAL_SERVER_ERROR, detail=detail) 24 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import logging 4 | import os 5 | import time 6 | import uuid 7 | from typing import List, Optional, Union 8 | 9 | from contextlib import asynccontextmanager 10 | 11 | from fastapi import FastAPI, Request, Query 12 | from fastapi.exceptions import RequestValidationError 13 | from fastapi.middleware.cors import CORSMiddleware 14 | from fastapi.responses import JSONResponse, StreamingResponse 15 | from starlette.exceptions import HTTPException as StarletteHTTPException 16 | 17 | from app.cache import cache 18 | from app.config import settings 19 | from app.core import config_bridge 20 | from app.core.logging_config import get_logger, setup_logging 21 | from app.middleware.rate_limiter import RateLimitMiddleware 22 | from app.middleware.request_logger import RequestLoggerMiddleware, log_request_middleware 23 | from app.routes import api, health 24 | from app.utils.env_debugger import log_environment_settings 25 | from app.utils.error_handlers import ( 26 | general_exception_handler, 27 | http_exception_handler, 28 | validation_exception_handler, 29 | ) 30 | 31 | # Determine log level from environment - priority to "LOG_LEVEL" over "DEBUG" flag for consistency 32 | log_level_name = os.getenv("LOG_LEVEL", "INFO").upper() 33 | try: 34 | log_level = getattr(logging, log_level_name) 35 | except AttributeError: 36 | print(f"WARNING: Invalid LOG_LEVEL: {log_level_name}, using INFO") 37 | log_level = logging.INFO 38 | 39 | # Setup logging with determined level 40 | setup_logging(log_level) 41 | logger = get_logger("main") 42 | 43 | logger.info(f"Starting application with log level: {log_level_name}") 44 | 45 | # Set Uvicorn's access logger to WARNING to avoid logging health checks 46 | logging.getLogger("uvicorn.access").setLevel(logging.WARNING) 47 | 48 | SUPPORTED_SITES = ["indeed", 
"linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"] 49 | 50 | def get_env_bool(var_name, default=True): 51 | val = os.getenv(var_name) 52 | if val is None: 53 | return default 54 | return str(val).lower() in ("1", "true", "yes", "on") 55 | 56 | @asynccontextmanager 57 | async def lifespan(app: FastAPI): 58 | # Startup: Initialize services, connections, etc. 59 | logger.info("Starting up JobSpy Docker API") 60 | 61 | # Log environment variables to help debugging 62 | log_environment_settings() 63 | 64 | # Yield control to the application 65 | yield 66 | 67 | # Shutdown: Clean up resources 68 | logger.info("Shutting down JobSpy Docker API") 69 | cache.clear() 70 | 71 | # Create FastAPI app with enhanced documentation 72 | app = FastAPI( 73 | title="JobSpy Docker API", 74 | description=""" 75 | # JobSpy Docker API 76 | 77 | An API for searching jobs across multiple platforms including LinkedIn, Indeed, Glassdoor, Google, ZipRecruiter, Bayt, and Naukri. 78 | 79 | ## Authentication 80 | 81 | All API endpoints require an API key to be passed in the `x-api-key` header. 82 | 83 | ## Rate Limiting 84 | 85 | Requests are limited based on your API key. The default limit is 100 requests per hour. 86 | 87 | ## Caching 88 | 89 | Results are cached for 1 hour by default to improve performance and reduce load on job board sites. 90 | """, 91 | version="1.0.0", 92 | lifespan=lifespan, 93 | # Configure docs endpoints based on settings 94 | docs_url=settings.SWAGGER_UI_PATH if settings.ENABLE_SWAGGER_UI else None, 95 | redoc_url=settings.REDOC_PATH if settings.ENABLE_REDOC else None, 96 | openapi_tags=[ 97 | { 98 | "name": "Jobs", 99 | "description": "Operations related to job searching", 100 | }, 101 | { 102 | "name": "Health", 103 | "description": "API health check endpoints", 104 | }, 105 | { 106 | "name": "Info", 107 | "description": "General API information", 108 | }, 109 | ], 110 | swagger_ui_parameters={"defaultModelsExpandDepth": -1}, 111 | ) 112 | 113 | @app.on_event("startup") 114 | async def startup_event(): 115 | logger.info("Starting up Job Spy FastAPI application") 116 | 117 | # Set API key auth 118 | global ENABLE_API_KEY_AUTH 119 | ENABLE_API_KEY_AUTH = get_env_bool("ENABLE_API_KEY_AUTH", default=True) 120 | if ENABLE_API_KEY_AUTH: 121 | logger.info("API key authentication is enabled") 122 | else: 123 | logger.warning("API key authentication is disabled. 
Set ENABLE_API_KEY_AUTH=true to enable.") 124 | 125 | # Additional startup logic 126 | 127 | @app.on_event("shutdown") 128 | async def shutdown_event(): 129 | logger.info("Shutting down Job Spy FastAPI application") 130 | # Additional shutdown logic can be added here 131 | 132 | # Add CORS middleware 133 | app.add_middleware( 134 | CORSMiddleware, 135 | allow_origins=settings.CORS_ORIGINS, 136 | allow_credentials=True, 137 | allow_methods=["*"], 138 | allow_headers=["*"], 139 | ) 140 | 141 | # Add rate limiting middleware 142 | app.add_middleware(RateLimitMiddleware) 143 | 144 | # Add request logging middleware 145 | app.add_middleware(RequestLoggerMiddleware) 146 | 147 | # Add exception handlers 148 | app.add_exception_handler(RequestValidationError, validation_exception_handler) 149 | app.add_exception_handler(StarletteHTTPException, http_exception_handler) 150 | app.add_exception_handler(Exception, general_exception_handler) 151 | 152 | # Add request timing and logging middleware 153 | @app.middleware("http") 154 | async def log_requests(request: Request, call_next): 155 | start_time = time.time() 156 | 157 | # Generate request ID for tracking 158 | request_id = str(uuid.uuid4()) 159 | logger.debug(f"Request {request_id} started: {request.method} {request.url.path}") 160 | 161 | try: 162 | response = await call_next(request) 163 | process_time = time.time() - start_time 164 | logger.debug( 165 | f"Request {request_id} completed: {request.method} {request.url.path} " 166 | f"- Status: {response.status_code} - Time: {process_time:.3f}s" 167 | ) 168 | response.headers["X-Process-Time"] = str(process_time) 169 | return response 170 | except Exception as e: 171 | logger.exception(f"Request {request_id} failed: {str(e)}") 172 | raise 173 | 174 | # Include routers 175 | app.include_router(api.router, prefix="/api/v1", tags=["Jobs"]) 176 | app.include_router(health.router, tags=["Health"]) 177 | 178 | @app.get("/", tags=["Info"]) 179 | def read_root(): 180 | return { 181 | "message": "Welcome to JobSpy Docker API!", 182 | "docs_url": "/docs", 183 | "api_root": "/api/v1", 184 | "health_check": "/health" 185 | } 186 | 187 | # Add health check endpoint with minimal logging 188 | @app.get("/health") 189 | async def health_check(): 190 | """Health check endpoint for monitoring systems""" 191 | # Only log health checks in debug mode 192 | if logger.isEnabledFor(logging.DEBUG): 193 | logger.debug("Health check requested") 194 | return {"status": "healthy"} 195 | 196 | @app.get("/api/v1/search_jobs") 197 | async def search_jobs( 198 | site_name: Union[List[str], str] = Query(default=None, description="Job sites to search on"), 199 | search_term: Optional[str] = Query(None, description="Job search term"), 200 | google_search_term: Optional[str] = Query(None, description="Search term for Google jobs"), 201 | location: Optional[str] = Query(None, description="Job location"), 202 | distance: Optional[int] = Query(None, description="Distance in miles"), 203 | job_type: Optional[str] = Query(None, description="Job type (fulltime, parttime, internship, contract)"), 204 | is_remote: Optional[bool] = Query(None, description="Remote job filter"), 205 | results_wanted: Optional[int] = Query(None, description="Number of results per site"), 206 | hours_old: Optional[int] = Query(None, description="Filter by hours since posting"), 207 | easy_apply: Optional[bool] = Query(None, description="Filter for easy apply jobs"), 208 | description_format: Optional[str] = Query(None, description="Format of job description"), 
209 | offset: Optional[int] = Query(None, description="Offset for pagination"), 210 | verbose: Optional[int] = Query(None, description="Controls verbosity"), 211 | linkedin_fetch_description: Optional[bool] = Query(None, description="Fetch full LinkedIn descriptions"), 212 | country_indeed: Optional[str] = Query(None, description="Country filter for Indeed & Glassdoor"), 213 | enforce_annual_salary: Optional[bool] = Query(None, description="Convert wages to annual salary"), 214 | format: str = Query("json", description="Output format: json or csv"), 215 | paginate: bool = Query(False, description="Enable pagination"), 216 | page: int = Query(1, description="Page number when pagination is enabled"), 217 | page_size: int = Query(10, ge=1, le=100, description="Results per page when pagination is enabled"), 218 | ): 219 | try: 220 | # Handle site_name=all explicitly 221 | if site_name is None: 222 | site_name = SUPPORTED_SITES 223 | elif isinstance(site_name, str): 224 | if site_name.lower() == "all": 225 | site_name = SUPPORTED_SITES 226 | else: 227 | site_name = [site_name] 228 | elif isinstance(site_name, list): 229 | if any(s.lower() == "all" for s in site_name): 230 | site_name = SUPPORTED_SITES 231 | 232 | # Use env default for country_indeed if not provided 233 | if country_indeed is None: 234 | country_indeed = os.getenv("DEFAULT_COUNTRY_INDEED", "USA") 235 | logger.debug(f"Using default country_indeed from environment: {country_indeed}") 236 | 237 | # Call your existing job scraping code 238 | # ...existing job scraping code... 239 | 240 | # This is a placeholder - replace with your actual jobs data 241 | jobs_data = [] # Replace this with your actual jobs_data 242 | 243 | # Format conversion and response 244 | if format.lower() == "csv": 245 | logger.debug("Returning CSV format") 246 | if not jobs_data: 247 | output = io.StringIO() 248 | writer = csv.writer(output) 249 | writer.writerow(["No results"]) 250 | output.seek(0) 251 | return StreamingResponse( 252 | output, 253 | media_type="text/csv", 254 | headers={"Content-Disposition": "attachment; filename=jobs.csv"} 255 | ) 256 | 257 | output = io.StringIO() 258 | writer = csv.DictWriter(output, fieldnames=jobs_data[0].keys()) 259 | writer.writeheader() 260 | writer.writerows(jobs_data) 261 | output.seek(0) 262 | return StreamingResponse( 263 | output, 264 | media_type="text/csv", 265 | headers={"Content-Disposition": "attachment; filename=jobs.csv"} 266 | ) 267 | 268 | # Default: JSON response 269 | return { 270 | "count": len(jobs_data), 271 | "jobs": jobs_data 272 | } 273 | 274 | except Exception as e: 275 | logger.exception(f"Error in search_jobs: {str(e)}") 276 | raise 277 | 278 | # API key auth default logic (at app startup or dependency) 279 | ENABLE_API_KEY_AUTH = get_env_bool("ENABLE_API_KEY_AUTH", default=True) 280 | if not ENABLE_API_KEY_AUTH: 281 | import warnings 282 | warnings.warn("API key authentication is disabled. 
Set ENABLE_API_KEY_AUTH=true to enable.")
283 | 
284 | if __name__ == "__main__":
285 |     import uvicorn
286 |     uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
287 | 
-------------------------------------------------------------------------------- /app/middleware/__init__.py: --------------------------------------------------------------------------------
1 | """Middleware components for the JobSpy Docker API."""
2 | 
-------------------------------------------------------------------------------- /app/middleware/api_key_auth.py: --------------------------------------------------------------------------------
1 | from fastapi import HTTPException, Depends
2 | from fastapi.security import APIKeyHeader
3 | from starlette.status import HTTP_403_FORBIDDEN
4 | from typing import Optional
5 | import logging
6 | 
7 | from app.config import settings
8 | 
9 | logger = logging.getLogger(__name__)
10 | api_key_header = APIKeyHeader(name=settings.API_KEY_HEADER_NAME, auto_error=False)
11 | 
12 | async def get_api_key(api_key_header: Optional[str] = Depends(api_key_header)):
13 |     """
14 |     Dependency that checks if the API key is valid.
15 |     Allows requests without authentication if:
16 |     1. API key authentication is disabled, or
17 |     2. No API keys are configured, or
18 |     3. API keys list is empty
19 |     """
20 |     # Log detailed info about settings for debugging
21 |     logger.debug(f"API key auth enabled: {settings.ENABLE_API_KEY_AUTH}")
22 |     logger.debug(f"API keys configured: {bool(settings.API_KEYS)}")
23 | 
24 |     # Skip authentication if it's disabled or no keys are configured
25 |     if not settings.ENABLE_API_KEY_AUTH or not settings.API_KEYS:
26 |         logger.debug("Skipping API key validation - auth disabled or no keys configured")
27 |         return True
28 | 
29 |     # At this point, auth is enabled and keys are configured, so require a key
30 |     if not api_key_header:
31 |         logger.warning(f"Missing API key in request, auth enabled with {len(settings.API_KEYS)} configured keys")
32 |         raise HTTPException(
33 |             status_code=HTTP_403_FORBIDDEN,
34 |             detail="Missing API Key"
35 |         )
36 | 
37 |     if api_key_header not in settings.API_KEYS:
38 |         logger.warning("Invalid API key provided")
39 |         raise HTTPException(
40 |             status_code=HTTP_403_FORBIDDEN,
41 |             detail="Invalid API Key"
42 |         )
43 | 
44 |     logger.debug("Valid API key provided")
45 |     return True
46 | 
-------------------------------------------------------------------------------- /app/middleware/rate_limiter.py: --------------------------------------------------------------------------------
1 | import time
2 | from collections import defaultdict
3 | from typing import DefaultDict, List
4 | 
5 | from fastapi import Request, status
6 | from starlette.middleware.base import BaseHTTPMiddleware
7 | from starlette.responses import JSONResponse
8 | 
9 | from app.config import settings
10 | 
11 | class RateLimitMiddleware(BaseHTTPMiddleware):
12 |     def __init__(self, app):
13 |         super().__init__(app)
14 |         self.rate_limits: DefaultDict[str, List[float]] = defaultdict(list)
15 |         self.enabled = settings.RATE_LIMIT_ENABLED
16 |         self.max_requests = settings.RATE_LIMIT_REQUESTS
17 |         self.timeframe = settings.RATE_LIMIT_TIMEFRAME
18 | 
19 |     async def dispatch(self, request: Request, call_next):
20 |         if not self.enabled:
21 |             return await call_next(request)
22 | 
23 |         # Get client identifier (use API key if available, otherwise IP)
24 |         client_identifier = request.headers.get(settings.API_KEY_HEADER_NAME, request.client.host)
25 | 
26 |         # Check rate limit
27 |         current_time = time.time()
28 | 
29 |         # Clean up old request timestamps
30 |         self.rate_limits[client_identifier] = [
31 |             timestamp for timestamp in self.rate_limits[client_identifier]
32 |             if current_time - timestamp < self.timeframe
33 |         ]
34 | 
35 |         # Check if rate limit exceeded
36 |         if len(self.rate_limits[client_identifier]) >= self.max_requests:
37 |             reset_time = min(self.rate_limits[client_identifier]) + self.timeframe - current_time
38 |             headers = {"X-RateLimit-Reset": str(int(reset_time))}
39 |             # Return the 429 directly: an HTTPException raised inside this middleware would bypass the app's exception handlers and surface as a 500
40 |             return JSONResponse(
41 |                 status_code=status.HTTP_429_TOO_MANY_REQUESTS,
42 |                 content={"detail": f"Rate limit exceeded. Maximum {self.max_requests} requests per {self.timeframe} seconds."},
43 |                 headers=headers
44 |             )
45 | 
46 |         # Add current request timestamp
47 |         self.rate_limits[client_identifier].append(current_time)
48 | 
49 |         # Process the request
50 |         response = await call_next(request)
51 | 
52 |         # Add rate limit headers
53 |         remaining = self.max_requests - len(self.rate_limits[client_identifier])
54 |         response.headers["X-RateLimit-Limit"] = str(self.max_requests)
55 |         response.headers["X-RateLimit-Remaining"] = str(remaining)
56 | 
57 |         return response
58 | 
-------------------------------------------------------------------------------- /app/middleware/request_logger.py: --------------------------------------------------------------------------------
1 | """Middleware for logging requests and responses."""
2 | import json
3 | import logging
4 | import time
5 | from fastapi import Request, Response
6 | from starlette.middleware.base import BaseHTTPMiddleware
7 | from starlette.types import ASGIApp
8 | 
9 | logger = logging.getLogger("app.middleware.request_logger")
10 | 
11 | # Paths to exclude from detailed logging
12 | EXCLUDED_PATHS = ["/health", "/metrics"]
13 | 
14 | # Paths that should only be logged in debug mode
15 | DEBUG_ONLY_PATHS = ["/health", "/metrics"]
16 | 
17 | # Add the missing function
18 | async def log_request_middleware(request: Request, call_next):
19 |     """Function-based middleware for logging requests.
20 | Simpler alternative to the RequestLoggerMiddleware class.""" 21 | # Generate a unique request ID 22 | request_id = request.headers.get("X-Request-ID", f"req_{time.time()}") 23 | 24 | # Get path 25 | path = request.url.path 26 | 27 | # Only log health checks and monitoring endpoints in debug mode 28 | should_log = True 29 | if path in DEBUG_ONLY_PATHS: 30 | should_log = logger.isEnabledFor(logging.DEBUG) 31 | 32 | if should_log: 33 | # Log the request 34 | client_host = request.client.host if request.client else "unknown" 35 | logger.info(f"Request {request_id}: {request.method} {request.url} from {client_host}") 36 | 37 | # Process the request and measure timing 38 | start_time = time.time() 39 | response = await call_next(request) 40 | process_time = time.time() - start_time 41 | 42 | if should_log: 43 | # Log the response 44 | logger.info(f"Response {request_id}: {response.status_code} in {process_time:.4f} seconds") 45 | 46 | # Add custom headers 47 | response.headers["X-Request-ID"] = request_id 48 | response.headers["X-Process-Time"] = f"{process_time:.4f}" 49 | 50 | return response 51 | 52 | class RequestLoggerMiddleware(BaseHTTPMiddleware): 53 | def __init__(self, app: ASGIApp): 54 | super().__init__(app) 55 | 56 | async def dispatch(self, request: Request, call_next): 57 | # Generate a unique request ID 58 | request_id = request.headers.get("X-Request-ID", f"req_{time.time()}") 59 | 60 | # Get path and method 61 | path = request.url.path 62 | method = request.method # Move method extraction here, outside the condition 63 | 64 | # Only log health checks and monitoring endpoints in debug mode 65 | should_log = True 66 | if path in DEBUG_ONLY_PATHS: 67 | should_log = logger.isEnabledFor(logging.DEBUG) 68 | 69 | if should_log: 70 | # Log the request 71 | client_host = request.client.host if request.client else "unknown" 72 | url = str(request.url) 73 | 74 | logger.info(f"Request {request_id}: {method} {url} from {client_host}") 75 | 76 | # Get body if it's a POST/PUT 77 | if method in ["POST", "PUT"]: 78 | try: 79 | # Store the request body for logging 80 | body = await request.body() 81 | await self._log_request_body(request_id, body) 82 | 83 | # Need to create a new Request with the body because the original was consumed 84 | request = Request( 85 | scope=request.scope, 86 | receive=self._receive_with_body(body) 87 | ) 88 | except Exception as e: 89 | logger.warning(f"Failed to log request body: {str(e)}") 90 | 91 | # Process the request and measure timing 92 | start_time = time.time() 93 | response = await call_next(request) 94 | process_time = time.time() - start_time 95 | 96 | if should_log: 97 | # Log the response 98 | status_code = response.status_code 99 | logger.info(f"Response {request_id}: {status_code} in {process_time:.4f} seconds") 100 | 101 | # Add custom headers 102 | response.headers["X-Request-ID"] = request_id 103 | response.headers["X-Process-Time"] = f"{process_time:.4f}" 104 | 105 | return response 106 | 107 | async def _log_request_body(self, request_id: str, body: bytes): 108 | """Log the request body in a safe manner.""" 109 | try: 110 | # Only log if body is not too large 111 | if len(body) > 1000: 112 | logger.debug(f"Request {request_id} body: [too large to log]") 113 | return 114 | 115 | # Try to parse as JSON 116 | json_body = json.loads(body) 117 | # Mask sensitive fields 118 | self._mask_sensitive_fields(json_body) 119 | logger.debug(f"Request {request_id} body: {json.dumps(json_body)}") 120 | except: 121 | # Not JSON, log as string (truncated if 
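needed)
            # e.g. a 600-byte plain-text payload is logged as its first 200 characters followed by "..."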
122 |             body_str = body.decode('utf-8', errors='replace')
123 |             if len(body_str) > 200:
124 |                 body_str = body_str[:200] + "..."
125 |             logger.debug(f"Request {request_id} body: {body_str}")
126 | 
127 |     def _mask_sensitive_fields(self, data):
128 |         """Mask sensitive fields in the request data."""
129 |         if not isinstance(data, dict):
130 |             return
131 | 
132 |         # List of fields to mask
133 |         sensitive_fields = ["password", "token", "api_key", "secret", "credit_card"]
134 | 
135 |         for key in data:
136 |             if isinstance(data[key], dict):
137 |                 self._mask_sensitive_fields(data[key])
138 |             elif isinstance(data[key], list):
139 |                 for item in data[key]:
140 |                     if isinstance(item, dict):
141 |                         self._mask_sensitive_fields(item)
142 |             elif any(sensitive in key.lower() for sensitive in sensitive_fields):
143 |                 data[key] = "********"
144 | 
145 |     def _receive_with_body(self, body: bytes):
146 |         """Create a new receive callable that replays the stored body. Kept as a plain (non-async) method: its return value is passed directly as the request's receive callable, so it must not be a coroutine."""
147 |         async def receive():
148 |             return {"type": "http.request", "body": body}
149 |         return receive
150 | 
-------------------------------------------------------------------------------- /app/models.py: --------------------------------------------------------------------------------
1 | from typing import Any, Dict, List, Optional, Union
2 | 
3 | from pydantic import BaseModel, Field
4 | 
5 | class JobSearchParams(BaseModel):
6 |     site_name: Union[List[str], str] = Field(
7 |         default_factory=lambda: ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"],
8 |         description="Job sites to search on",
9 |     )
10 |     search_term: Optional[str] = Field(default=None, description="Job search term")
11 |     google_search_term: Optional[str] = Field(default=None, description="Search term for Google jobs")
12 |     location: Optional[str] = Field(default=None, description="Job location")
13 |     distance: Optional[int] = Field(default=50, description="Distance in miles")
14 |     job_type: Optional[str] = Field(default=None, description="Job type (fulltime, parttime, internship, contract)")
15 |     proxies: Optional[List[str]] = Field(default=None, description="Proxies in format ['user:pass@host:port', 'localhost']")
16 |     is_remote: Optional[bool] = Field(default=None, description="Remote job filter")
17 |     results_wanted: Optional[int] = Field(default=20, description="Number of results per site")
18 |     hours_old: Optional[int] = Field(default=None, description="Filter by hours since posting")
19 |     easy_apply: Optional[bool] = Field(default=None, description="Filter for easy apply jobs")
20 |     description_format: Optional[str] = Field(default="markdown", description="Format of job description")
21 |     offset: Optional[int] = Field(default=0, description="Offset for pagination")
22 |     verbose: Optional[int] = Field(default=2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)")
23 |     linkedin_fetch_description: Optional[bool] = Field(default=False, description="Fetch full LinkedIn descriptions")
24 |     linkedin_company_ids: Optional[List[int]] = Field(default=None, description="LinkedIn company IDs to filter by")
25 |     country_indeed: Optional[str] = Field(default=None, description="Country filter for Indeed & Glassdoor")
26 |     enforce_annual_salary: Optional[bool] = Field(default=False, description="Convert wages to annual salary")
27 |     ca_cert: Optional[str] = Field(default=None, description="Path to CA Certificate file for proxies")
28 | 
29 | class JobResponse(BaseModel):
30 |     count: int
31 |     jobs: List[Dict[str, Any]]
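    # True when the results were served from the in-memory cache rather than a fresh scrape (see app/cache.py)
32 |     cached: 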
bool = False 33 | 34 | class PaginatedJobResponse(BaseModel): 35 | count: int 36 | total_pages: int 37 | current_page: int 38 | page_size: int 39 | jobs: List[Dict[str, Any]] 40 | cached: bool = False 41 | next_page: Optional[str] = None 42 | previous_page: Optional[str] = None 43 | 44 | class HealthCheck(BaseModel): 45 | status: str = "ok" 46 | version: str = "1.0.0" 47 | -------------------------------------------------------------------------------- /app/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Models for the JobSpy Docker API.""" 2 | from .health_models import HealthCheck, DetailedHealthCheck 3 | from .job_models import JobSearchParams, JobResponse, PaginatedJobResponse 4 | 5 | # Re-export all models 6 | __all__ = [ 7 | "HealthCheck", 8 | "DetailedHealthCheck", 9 | "JobSearchParams", 10 | "JobResponse", 11 | "PaginatedJobResponse" 12 | ] 13 | -------------------------------------------------------------------------------- /app/models/health_models.py: -------------------------------------------------------------------------------- 1 | """Models for health check endpoints.""" 2 | from pydantic import BaseModel, Field 3 | from typing import Dict, List, Any, Optional 4 | 5 | class HealthCheck(BaseModel): 6 | """Health check response model with detailed information.""" 7 | status: str = "ok" 8 | version: str = "1.0.0" 9 | environment: str = "production" 10 | log_level: str = "INFO" 11 | auth: Optional[Dict[str, Any]] = None 12 | rate_limiting: Optional[Dict[str, Any]] = None 13 | cache: Optional[Dict[str, Any]] = None 14 | config: Optional[Dict[str, Any]] = None 15 | health_endpoints: Optional[Dict[str, bool]] = None 16 | timestamp: Optional[float] = None 17 | 18 | class DetailedHealthCheck(BaseModel): 19 | """Placeholder for detailed health check model.""" 20 | status: str = "ok" 21 | version: str = "1.0.0" 22 | -------------------------------------------------------------------------------- /app/models/job_models.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional, Union 2 | 3 | from pydantic import BaseModel, Field 4 | 5 | class JobSearchParams(BaseModel): 6 | site_name: Union[List[str], str] = Field( 7 | default_factory=lambda: ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"], 8 | description="Job sites to search on", 9 | ) 10 | search_term: Optional[str] = Field(default=None, description="Job search term") 11 | google_search_term: Optional[str] = Field(default=None, description="Search term for Google jobs") 12 | location: Optional[str] = Field(default=None, description="Job location") 13 | distance: Optional[int] = Field(default=50, description="Distance in miles") 14 | job_type: Optional[str] = Field(default=None, description="Job type (fulltime, parttime, internship, contract)") 15 | proxies: Optional[List[str]] = Field(default=None, description="Proxies in format ['user:pass@host:port', 'localhost']") 16 | is_remote: Optional[bool] = Field(default=None, description="Remote job filter") 17 | results_wanted: Optional[int] = Field(default=20, description="Number of results per site") 18 | hours_old: Optional[int] = Field(default=None, description="Filter by hours since posting") 19 | easy_apply: Optional[bool] = Field(default=None, description="Filter for easy apply jobs") 20 | description_format: Optional[str] = Field(default="markdown", description="Format of job description") 21 | offset: Optional[int] 
= Field(default=0, description="Offset for pagination") 22 | verbose: Optional[int] = Field(default=2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)") 23 | linkedin_fetch_description: Optional[bool] = Field(default=False, description="Fetch full LinkedIn descriptions") 24 | linkedin_company_ids: Optional[List[int]] = Field(default=None, description="LinkedIn company IDs to filter by") 25 | country_indeed: Optional[str] = Field(default=None, description="Country filter for Indeed & Glassdoor") 26 | enforce_annual_salary: Optional[bool] = Field(default=False, description="Convert wages to annual salary") 27 | ca_cert: Optional[str] = Field(default=None, description="Path to CA Certificate file for proxies") 28 | 29 | class JobResponse(BaseModel): 30 | count: int 31 | jobs: List[Dict[str, Any]] 32 | cached: bool = False 33 | 34 | class PaginatedJobResponse(BaseModel): 35 | count: int 36 | total_pages: int 37 | current_page: int 38 | page_size: int 39 | jobs: List[Dict[str, Any]] 40 | cached: bool = False 41 | next_page: Optional[str] = None 42 | previous_page: Optional[str] = None 43 | -------------------------------------------------------------------------------- /app/routes/__init__.py: -------------------------------------------------------------------------------- 1 | """API route handlers.""" 2 | -------------------------------------------------------------------------------- /app/routes/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends, Query, HTTPException, Request 2 | from typing import List, Optional, Union 3 | import logging 4 | import time 5 | import uuid 6 | import traceback 7 | 8 | from app.models import JobSearchParams, JobResponse, PaginatedJobResponse 9 | from app.config import settings 10 | from app.middleware.api_key_auth import get_api_key 11 | from app.services.job_service import JobService 12 | from app.utils.validation_helpers import VALID_PARAMETERS, get_parameter_suggestion, generate_error_suggestions 13 | 14 | router = APIRouter() 15 | logger = logging.getLogger(__name__) 16 | 17 | SUPPORTED_COUNTRIES_INDEED = { 18 | "Argentina", "Australia", "Austria", "Bahrain", "Belgium", "Brazil", "Canada", "Chile", "China", "Colombia", 19 | "Costa Rica", "Czech Republic", "Denmark", "Ecuador", "Egypt", "Finland", "France", "Germany", "Greece", 20 | "Hong Kong", "Hungary", "India", "Indonesia", "Ireland", "Israel", "Italy", "Japan", "Kuwait", "Luxembourg", 21 | "Malaysia", "Mexico", "Morocco", "Netherlands", "New Zealand", "Nigeria", "Norway", "Oman", "Pakistan", 22 | "Panama", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Saudi Arabia", "Singapore", 23 | "South Africa", "South Korea", "Spain", "Sweden", "Switzerland", "Taiwan", "Thailand", "Turkey", "Ukraine", 24 | "United Arab Emirates", "UK", "USA", "Uruguay", "Venezuela", "Vietnam" 25 | } 26 | 27 | def validate_job_search_params( 28 | site_name, 29 | country_indeed, 30 | hours_old, 31 | job_type, 32 | is_remote, 33 | easy_apply, 34 | description_format=None, 35 | verbose=None, 36 | page=None, 37 | page_size=None, 38 | paginate=None, 39 | endpoint="search_jobs" 40 | ): 41 | # Normalize site names 42 | snames = [s.lower() for s in site_name] if site_name else [] 43 | # Supported country validation for Indeed/Glassdoor 44 | if ("indeed" in snames or "glassdoor" in snames): 45 | if not country_indeed: 46 | raise HTTPException( 47 | status_code=400, 48 | detail={ 49 | "error": "Missing required 
parameter", 50 | "parameter": "country_indeed", 51 | "message": "country_indeed is required when searching Indeed or Glassdoor.", 52 | "suggestion": "Specify a supported country using the country_indeed parameter. See documentation for valid values." 53 | } 54 | ) 55 | if country_indeed not in SUPPORTED_COUNTRIES_INDEED: 56 | raise HTTPException( 57 | status_code=400, 58 | detail={ 59 | "error": "Invalid country_indeed value", 60 | "invalid_value": country_indeed, 61 | "valid_countries": sorted(SUPPORTED_COUNTRIES_INDEED), 62 | "suggestion": "Use one of the supported country names exactly as listed in the documentation." 63 | } 64 | ) 65 | # Parameter conflict logic for Indeed 66 | if "indeed" in snames: 67 | conflict_params = [] 68 | if hours_old is not None: 69 | if (job_type is not None or is_remote is not None) or (easy_apply is not None): 70 | conflict_params = ["hours_old", "job_type/is_remote", "easy_apply"] 71 | elif (job_type is not None or is_remote is not None) and easy_apply is not None: 72 | conflict_params = ["job_type/is_remote", "easy_apply"] 73 | if conflict_params: 74 | raise HTTPException( 75 | status_code=400, 76 | detail={ 77 | "error": "Parameter conflict for Indeed", 78 | "conflicting_parameters": conflict_params, 79 | "message": ( 80 | "Indeed searches only support one of the following at a time: " 81 | "hours_old, (job_type & is_remote), or easy_apply." 82 | ), 83 | "suggestion": ( 84 | "Remove one or more of these parameters so that only one group is used per search. " 85 | "See documentation for details." 86 | ) 87 | } 88 | ) 89 | # Parameter conflict logic for LinkedIn 90 | if "linkedin" in snames: 91 | if hours_old is not None and easy_apply is not None: 92 | raise HTTPException( 93 | status_code=400, 94 | detail={ 95 | "error": "Parameter conflict for LinkedIn", 96 | "conflicting_parameters": ["hours_old", "easy_apply"], 97 | "message": ( 98 | "LinkedIn searches only support one of the following at a time: hours_old or easy_apply." 99 | ), 100 | "suggestion": ( 101 | "Remove either hours_old or easy_apply from your search parameters." 
102 | ) 103 | } 104 | ) 105 | 106 | # --- General parameter validation --- 107 | errors = [] 108 | # site_name 109 | if site_name: 110 | for s in site_name: 111 | if s not in VALID_PARAMETERS["site_name"]: 112 | errors.append(get_parameter_suggestion("site_name", s)) 113 | # job_type 114 | if job_type and job_type not in VALID_PARAMETERS["job_type"]: 115 | errors.append(get_parameter_suggestion("job_type", job_type)) 116 | # description_format 117 | if description_format and description_format not in VALID_PARAMETERS["description_format"]: 118 | errors.append(get_parameter_suggestion("description_format", description_format)) 119 | # verbose 120 | if verbose is not None and verbose not in VALID_PARAMETERS["verbose"]: 121 | errors.append(get_parameter_suggestion("verbose", verbose)) 122 | # page_size 123 | if page_size is not None and (page_size < 1 or page_size > 100): 124 | errors.append(get_parameter_suggestion("page_size", page_size)) 125 | # paginate 126 | if paginate is not None and paginate not in [True, False, 0, 1]: 127 | errors.append(get_parameter_suggestion("paginate", paginate)) 128 | # page 129 | if page is not None and page < 1: 130 | errors.append(get_parameter_suggestion("page", page)) 131 | # If any errors, raise with all suggestions 132 | if errors: 133 | raise HTTPException( 134 | status_code=400, 135 | detail={ 136 | "error": "Invalid parameter(s)", 137 | "suggestions": errors, 138 | } 139 | ) 140 | 141 | @router.get("/search_jobs", response_model=Union[JobResponse, PaginatedJobResponse], dependencies=[Depends(get_api_key)]) 142 | async def search_jobs( 143 | request: Request, 144 | # Pagination parameters 145 | paginate: bool = Query(False, description="Enable pagination"), 146 | page: int = Query(1, ge=1, description="Page number (if pagination enabled)"), 147 | page_size: int = Query(10, ge=1, le=100, description="Items per page (if pagination enabled)"), 148 | 149 | # Basic search parameters 150 | site_name: List[str] = Query(default=None, description="Job sites to search on"), 151 | search_term: str = Query(None, description="Job search term"), 152 | google_search_term: Optional[str] = Query(None, description="Search term for Google jobs"), 153 | location: str = Query(None, description="Job location"), 154 | distance: int = Query(None, description="Distance in miles"), 155 | 156 | # Job filters 157 | job_type: Optional[str] = Query(None, description="Job type (fulltime, parttime, internship, contract)"), 158 | is_remote: Optional[bool] = Query(None, description="Remote job filter"), 159 | hours_old: Optional[int] = Query(None, description="Filter by hours since posting"), 160 | easy_apply: Optional[bool] = Query(None, description="Filter for easy apply jobs"), 161 | 162 | # Advanced parameters 163 | results_wanted: int = Query(None, description="Number of results per site"), 164 | description_format: str = Query(None, description="Format of job description"), 165 | offset: int = Query(None, description="Offset for pagination"), 166 | verbose: int = Query(None, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)"), 167 | linkedin_fetch_description: bool = Query(None, description="Fetch full LinkedIn descriptions"), 168 | linkedin_company_ids: Optional[List[int]] = Query(None, description="LinkedIn company IDs to filter by"), 169 | country_indeed: Optional[str] = Query(None, description="Country filter for Indeed & Glassdoor"), 170 | enforce_annual_salary: bool = Query(None, description="Convert wages to annual salary"), 171 | ): 172 | 
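    # A minimal usage sketch (illustrative host and values; the x-api-key header is only needed when ENABLE_API_KEY_AUTH is on and keys are configured):
    #   curl -H "x-api-key: YOUR_KEY" \
    #     "http://localhost:8000/api/v1/search_jobs?site_name=indeed&search_term=python&location=Boston&country_indeed=USA&paginate=true&page=1&page_size=10"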
""" 173 | Search for jobs across multiple platforms with optional pagination. 174 | 175 | If paginate=True, returns paginated results with next/previous page links. 176 | Otherwise, returns all results in a single response. 177 | """ 178 | request_id = str(uuid.uuid4()) 179 | start_time = time.time() 180 | 181 | validate_job_search_params( 182 | site_name=site_name, 183 | country_indeed=country_indeed, 184 | hours_old=hours_old, 185 | job_type=job_type, 186 | is_remote=is_remote, 187 | easy_apply=easy_apply, 188 | description_format=description_format, 189 | verbose=verbose, 190 | page=page, 191 | page_size=page_size, 192 | paginate=paginate, 193 | ) 194 | 195 | # Validate site_name values 196 | if site_name: 197 | invalid_sites = [site for site in site_name if site not in VALID_PARAMETERS["site_name"]] 198 | if invalid_sites: 199 | suggestions = [get_parameter_suggestion("site_name", site) for site in invalid_sites] 200 | raise HTTPException( 201 | status_code=400, 202 | detail={ 203 | "error": "Invalid job site name(s)", 204 | "invalid_values": invalid_sites, 205 | "valid_sites": VALID_PARAMETERS["site_name"], 206 | "suggestions": suggestions 207 | } 208 | ) 209 | 210 | # Validate job_type 211 | if job_type and job_type not in VALID_PARAMETERS["job_type"]: 212 | suggestion = get_parameter_suggestion("job_type", job_type) 213 | raise HTTPException( 214 | status_code=400, 215 | detail={ 216 | "error": "Invalid job type", 217 | "invalid_value": job_type, 218 | "valid_types": VALID_PARAMETERS["job_type"], 219 | "suggestion": suggestion 220 | } 221 | ) 222 | 223 | # Validate description_format 224 | if description_format and description_format not in VALID_PARAMETERS["description_format"]: 225 | suggestion = get_parameter_suggestion("description_format", description_format) 226 | raise HTTPException( 227 | status_code=400, 228 | detail={ 229 | "error": "Invalid description format", 230 | "invalid_value": description_format, 231 | "valid_formats": VALID_PARAMETERS["description_format"], 232 | "suggestion": suggestion 233 | } 234 | ) 235 | 236 | # Create parameters object with all search parameters 237 | params = JobSearchParams( 238 | site_name=site_name if site_name else settings.DEFAULT_SITE_NAMES, 239 | search_term=search_term, 240 | google_search_term=google_search_term, 241 | location=location, 242 | distance=distance if distance is not None else settings.DEFAULT_DISTANCE, 243 | job_type=job_type, 244 | proxies=settings.DEFAULT_PROXIES if settings.DEFAULT_PROXIES else None, 245 | is_remote=is_remote, 246 | results_wanted=results_wanted if results_wanted is not None else settings.DEFAULT_RESULTS_WANTED, 247 | hours_old=hours_old, 248 | easy_apply=easy_apply, 249 | description_format=description_format if description_format else settings.DEFAULT_DESCRIPTION_FORMAT, 250 | offset=offset if offset is not None else 0, 251 | verbose=verbose if verbose is not None else 2, 252 | linkedin_fetch_description=linkedin_fetch_description if linkedin_fetch_description is not None else False, 253 | linkedin_company_ids=linkedin_company_ids, 254 | country_indeed=country_indeed if country_indeed else settings.DEFAULT_COUNTRY_INDEED, 255 | enforce_annual_salary=enforce_annual_salary if enforce_annual_salary is not None else False, 256 | ca_cert=settings.CA_CERT_PATH, 257 | ) 258 | 259 | logger.info(f"Request {request_id}: Starting job search with parameters: {params.dict(exclude_none=True)}") 260 | 261 | try: 262 | # Execute the search 263 | jobs_df, is_cached = 
JobService.search_jobs(params.dict(exclude_none=True)) 264 | 265 | # Return results - either paginated or all at once 266 | if paginate: 267 | # Calculate pagination 268 | total_items = len(jobs_df) 269 | total_pages = (total_items + page_size - 1) // page_size if total_items > 0 else 1 270 | 271 | # Validate page number 272 | if page > total_pages and total_pages > 0: 273 | raise HTTPException( 274 | status_code=404, 275 | detail={ 276 | "error": f"Page {page} not found", 277 | "total_pages": total_pages, 278 | "suggestion": f"Use a page number between 1 and {total_pages}" 279 | } 280 | ) 281 | 282 | # Apply pagination 283 | start_idx = (page - 1) * page_size 284 | end_idx = min(start_idx + page_size, total_items) 285 | paginated_df = jobs_df.iloc[start_idx:end_idx] if total_items > 0 else jobs_df 286 | 287 | # Generate next/previous page URLs 288 | base_url = str(request.url).split("?")[0] 289 | query_params = dict(request.query_params) 290 | 291 | next_page = None 292 | if page < total_pages: 293 | query_params["page"] = str(page + 1) 294 | next_page = f"{base_url}?{'&'.join([f'{k}={v}' for k, v in query_params.items()])}" 295 | 296 | previous_page = None 297 | if page > 1: 298 | query_params["page"] = str(page - 1) 299 | previous_page = f"{base_url}?{'&'.join([f'{k}={v}' for k, v in query_params.items()])}" 300 | 301 | # Convert DataFrame to dictionary format 302 | jobs_list = paginated_df.to_dict('records') if not paginated_df.empty else [] 303 | 304 | end_time = time.time() 305 | logger.info(f"Request {request_id}: Completed in {end_time - start_time:.2f} seconds. Found {total_items} jobs, returning page {page}/{total_pages}") 306 | 307 | return { 308 | "count": total_items, 309 | "total_pages": total_pages, 310 | "current_page": page, 311 | "page_size": page_size, 312 | "jobs": jobs_list, 313 | "cached": is_cached, 314 | "next_page": next_page, 315 | "previous_page": previous_page 316 | } 317 | else: 318 | # Return all results without pagination 319 | jobs_list = jobs_df.to_dict('records') if not jobs_df.empty else [] 320 | 321 | end_time = time.time() 322 | logger.info(f"Request {request_id}: Completed in {end_time - start_time:.2f} seconds. Found {len(jobs_list)} jobs") 323 | 324 | return { 325 | "count": len(jobs_list), 326 | "jobs": jobs_list, 327 | "cached": is_cached 328 | } 329 | except Exception as e: 330 | if isinstance(e, HTTPException): 331 | raise e 332 | 333 | logger.error(f"Request {request_id}: Error scraping jobs: {str(e)}") 334 | logger.debug(traceback.format_exc()) 335 | 336 | # Provide more helpful error details 337 | error_message = str(e) 338 | suggestion = "Try simplifying your search or using fewer job sites" 339 | 340 | if "proxy" in error_message.lower(): 341 | suggestion = "Check your proxy configuration or try without a proxy" 342 | elif "timeout" in error_message.lower(): 343 | suggestion = "The request timed out. Try reducing the number of job sites or results_wanted" 344 | elif "captcha" in error_message.lower(): 345 | suggestion = "A CAPTCHA was encountered. 
Try using a different proxy or reduce request frequency" 346 | 347 | raise HTTPException( 348 | status_code=500, 349 | detail={ 350 | "error": "Error scraping jobs", 351 | "message": error_message, 352 | "suggestion": suggestion 353 | } 354 | ) 355 | 356 | @router.post("/search_jobs", response_model=Union[JobResponse, PaginatedJobResponse], dependencies=[Depends(get_api_key)]) 357 | async def search_jobs_post( 358 | params: JobSearchParams, 359 | request: Request, 360 | ): 361 | """ 362 | Search for jobs across multiple platforms using POST method. 363 | """ 364 | request_id = str(uuid.uuid4()) 365 | start_time = time.time() 366 | 367 | validate_job_search_params( 368 | site_name=params.site_name if isinstance(params.site_name, list) else [params.site_name], 369 | country_indeed=params.country_indeed, 370 | hours_old=params.hours_old, 371 | job_type=params.job_type, 372 | is_remote=params.is_remote, 373 | easy_apply=params.easy_apply, 374 | description_format=params.description_format, 375 | verbose=params.verbose, 376 | page=getattr(params, "page", None), 377 | page_size=getattr(params, "page_size", None), 378 | paginate=getattr(params, "paginate", None), 379 | ) 380 | 381 | logger.info(f"Request {request_id}: Starting job search with parameters: {params.dict(exclude_none=True)}") 382 | 383 | try: 384 | # Execute the search 385 | jobs_df, is_cached = JobService.search_jobs(params.dict(exclude_none=True)) 386 | 387 | # Return all results without pagination 388 | jobs_list = jobs_df.to_dict('records') if not jobs_df.empty else [] 389 | 390 | end_time = time.time() 391 | logger.info(f"Request {request_id}: Completed in {end_time - start_time:.2f} seconds. Found {len(jobs_list)} jobs") 392 | 393 | return { 394 | "count": len(jobs_list), 395 | "jobs": jobs_list, 396 | "cached": is_cached 397 | } 398 | except Exception as e: 399 | if isinstance(e, HTTPException): 400 | raise e 401 | 402 | logger.error(f"Request {request_id}: Error scraping jobs: {str(e)}") 403 | logger.debug(traceback.format_exc()) 404 | 405 | # Provide more helpful error details 406 | error_message = str(e) 407 | suggestion = "Try simplifying your search or using fewer job sites" 408 | 409 | if "proxy" in error_message.lower(): 410 | suggestion = "Check your proxy configuration or try without a proxy" 411 | elif "timeout" in error_message.lower(): 412 | suggestion = "The request timed out. Try reducing the number of job sites or results_wanted" 413 | elif "captcha" in error_message.lower(): 414 | suggestion = "A CAPTCHA was encountered. 
Try using a different proxy or reduce request frequency" 415 | 416 | raise HTTPException( 417 | status_code=500, 418 | detail={ 419 | "error": "Error scraping jobs", 420 | "message": error_message, 421 | "suggestion": suggestion 422 | } 423 | ) 424 | -------------------------------------------------------------------------------- /app/routes/health.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Request, Depends, HTTPException, status 2 | from app.models import HealthCheck 3 | from app.core.config import settings as core_settings 4 | from app.config import settings as app_settings 5 | import logging 6 | import os 7 | import platform 8 | import time 9 | from app.utils.auth_health import check_auth_configuration 10 | 11 | router = APIRouter() 12 | logger = logging.getLogger(__name__) 13 | 14 | # Create a dependency to check if health endpoints are enabled 15 | async def verify_health_enabled(): 16 | """Verify that health endpoints are enabled via configuration.""" 17 | if not app_settings.ENABLE_HEALTH_ENDPOINTS: 18 | raise HTTPException( 19 | status_code=status.HTTP_404_NOT_FOUND, 20 | detail="Health endpoints are disabled" 21 | ) 22 | return True 23 | 24 | @router.get("/health", response_model=HealthCheck, tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 25 | async def health_check(): 26 | """ 27 | Health check endpoint to verify the API is running correctly and return system status 28 | """ 29 | # Get authentication status 30 | auth_status = check_auth_configuration() 31 | 32 | # Build response with all the requested information 33 | return HealthCheck( 34 | status="ok", 35 | version="1.0.0", 36 | environment=app_settings.ENVIRONMENT, 37 | log_level=app_settings.LOG_LEVEL, 38 | auth={ 39 | "enabled": app_settings.ENABLE_API_KEY_AUTH, 40 | "api_keys_configured": bool(app_settings.API_KEYS), 41 | "api_keys_count": len(app_settings.API_KEYS) if app_settings.API_KEYS else 0, 42 | "inconsistent": auth_status["inconsistent_config"], 43 | }, 44 | rate_limiting={ 45 | "enabled": app_settings.RATE_LIMIT_ENABLED, 46 | "requests_limit": app_settings.RATE_LIMIT_REQUESTS, 47 | "timeframe_seconds": app_settings.RATE_LIMIT_TIMEFRAME, 48 | }, 49 | cache={ 50 | "enabled": app_settings.ENABLE_CACHE, 51 | "expiry_seconds": app_settings.CACHE_EXPIRY, 52 | }, 53 | health_endpoints={ 54 | "enabled": app_settings.ENABLE_HEALTH_ENDPOINTS, 55 | "detailed_health": app_settings.ENABLE_DETAILED_HEALTH, 56 | }, 57 | config={ 58 | "default_site_names": app_settings.DEFAULT_SITE_NAMES, 59 | "default_results_wanted": app_settings.DEFAULT_RESULTS_WANTED, 60 | "default_distance": app_settings.DEFAULT_DISTANCE, 61 | "default_description_format": app_settings.DEFAULT_DESCRIPTION_FORMAT, 62 | "default_country_indeed": app_settings.DEFAULT_COUNTRY_INDEED, 63 | }, 64 | timestamp=time.time() 65 | ) 66 | 67 | @router.get("/ping", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 68 | async def ping(): 69 | """ 70 | Simple ping endpoint for load balancers and monitoring 71 | """ 72 | return {"status": "ok"} 73 | 74 | @router.get("/auth-status", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 75 | async def auth_status(request: Request): 76 | """ 77 | Diagnostic endpoint to check authentication settings 78 | """ 79 | logger.info("Auth status endpoint called") 80 | 81 | # Check if the request has the API key header 82 | api_key_header_name = "X-API-Key" 83 | api_key_in_request = 
request.headers.get(api_key_header_name) 84 | 85 | return { 86 | "api_key_configured": bool(core_settings.API_KEY), 87 | "api_key_header_name": api_key_header_name, 88 | "api_key_in_request": bool(api_key_in_request), 89 | "authentication_enabled": bool(core_settings.API_KEY), 90 | "environment": core_settings.ENVIRONMENT if hasattr(core_settings, "ENVIRONMENT") else app_settings.ENVIRONMENT 91 | } 92 | 93 | @router.get("/api-config", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 94 | async def api_config(): 95 | """ 96 | Diagnostic endpoint to check API configuration settings 97 | """ 98 | logger.info("API configuration endpoint called") 99 | 100 | # Only provide detailed info if it's enabled 101 | if not app_settings.ENABLE_DETAILED_HEALTH: 102 | return { 103 | "status": "ok", 104 | "message": "Detailed health information is disabled. Enable with ENABLE_DETAILED_HEALTH=true" 105 | } 106 | 107 | # Build comprehensive config information 108 | system_info = { 109 | "platform": platform.platform(), 110 | "python_version": platform.python_version(), 111 | } 112 | 113 | # Configuration information 114 | config = { 115 | "environment": app_settings.ENVIRONMENT, 116 | "log_level": app_settings.LOG_LEVEL, 117 | "authentication": { 118 | "enabled": app_settings.ENABLE_API_KEY_AUTH, 119 | "api_keys_configured": bool(app_settings.API_KEYS), 120 | "api_keys_count": len(app_settings.API_KEYS) if app_settings.API_KEYS else 0, 121 | "header_name": app_settings.API_KEY_HEADER_NAME, 122 | }, 123 | "rate_limiting": { 124 | "enabled": app_settings.RATE_LIMIT_ENABLED, 125 | "requests_limit": app_settings.RATE_LIMIT_REQUESTS, 126 | "timeframe_seconds": app_settings.RATE_LIMIT_TIMEFRAME, 127 | }, 128 | "caching": { 129 | "enabled": app_settings.ENABLE_CACHE, 130 | "expiry_seconds": app_settings.CACHE_EXPIRY, 131 | }, 132 | "health_endpoints": { 133 | "enabled": app_settings.ENABLE_HEALTH_ENDPOINTS, 134 | "detailed_health": app_settings.ENABLE_DETAILED_HEALTH, 135 | }, 136 | } 137 | 138 | return { 139 | "status": "ok", 140 | "system": system_info, 141 | "config": config, 142 | "timestamp": time.time() 143 | } 144 | 145 | @router.get("/config-sources", tags=["Health"], dependencies=[Depends(verify_health_enabled)]) 146 | async def config_sources(): 147 | """ 148 | Diagnostic endpoint to view the source of each configuration setting 149 | """ 150 | logger.info("Configuration sources endpoint called") 151 | 152 | # Only provide detailed info if it's enabled 153 | if not app_settings.ENABLE_DETAILED_HEALTH: 154 | return { 155 | "status": "ok", 156 | "message": "Detailed health information is disabled. 
Enable with ENABLE_DETAILED_HEALTH=true" 157 | } 158 | 159 | # Get all settings with their sources 160 | settings_with_sources = app_settings.get_all_settings() 161 | 162 | # Format for output, focusing on key settings 163 | important_settings = [ 164 | "ENABLE_API_KEY_AUTH", "API_KEYS", "RATE_LIMIT_ENABLED", 165 | "ENABLE_CACHE", "ENVIRONMENT", "LOG_LEVEL" 166 | ] 167 | 168 | focused_settings = {k: settings_with_sources[k] for k in important_settings if k in settings_with_sources} 169 | 170 | # Check for configuration inconsistencies 171 | auth_status = check_auth_configuration() 172 | inconsistencies = [] 173 | 174 | if auth_status["inconsistent_config"]: 175 | inconsistencies.extend(auth_status["recommendations"]) 176 | 177 | return { 178 | "status": "ok", 179 | "key_settings": focused_settings, 180 | "all_settings": settings_with_sources, 181 | "inconsistencies": inconsistencies, 182 | "timestamp": time.time() 183 | } 184 | -------------------------------------------------------------------------------- /app/services/__init__.py: -------------------------------------------------------------------------------- 1 | """Business logic services.""" 2 | -------------------------------------------------------------------------------- /app/services/background_service.py: -------------------------------------------------------------------------------- 1 | """Background job processing for JobSpy Docker API.""" 2 | import asyncio 3 | from typing import Dict, Any, Optional 4 | import uuid 5 | import logging 6 | from datetime import datetime 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | # Simple in-memory job storage (would use a database in production) 11 | jobs = {} 12 | 13 | async def process_job_async(job_id: str, search_function, params: Dict[str, Any]): 14 | """Process a job asynchronously.""" 15 | try: 16 | logger.info(f"Starting background job {job_id}") 17 | jobs[job_id]["status"] = "running" 18 | 19 | # Execute the search 20 | result, is_cached = await asyncio.to_thread(search_function, params) 21 | 22 | # Store result 23 | jobs[job_id]["status"] = "completed" 24 | jobs[job_id]["result"] = result 25 | jobs[job_id]["is_cached"] = is_cached 26 | jobs[job_id]["completed_at"] = datetime.now().isoformat() 27 | 28 | logger.info(f"Completed background job {job_id}") 29 | except Exception as e: 30 | logger.error(f"Error processing job {job_id}: {str(e)}") 31 | jobs[job_id]["status"] = "failed" 32 | jobs[job_id]["error"] = str(e) 33 | 34 | def create_background_job(search_function, params: Dict[str, Any]) -> str: 35 | """Create a new background job.""" 36 | job_id = str(uuid.uuid4()) 37 | jobs[job_id] = { 38 | "id": job_id, 39 | "status": "pending", 40 | "created_at": datetime.now().isoformat(), 41 | "params": params, 42 | } 43 | 44 | # Start the background task 45 | asyncio.create_task(process_job_async(job_id, search_function, params)) 46 | 47 | return job_id 48 | 49 | def get_job_status(job_id: str) -> Optional[Dict[str, Any]]: 50 | """Get the status of a job.""" 51 | return jobs.get(job_id) 52 | -------------------------------------------------------------------------------- /app/services/external_service.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import time 3 | import httpx 4 | from app.core.logging_config import get_logger 5 | 6 | logger = get_logger("services.external_service") 7 | 8 | 9 | async def fetch_data_from_external_api(url: str, params: dict = None): 10 | start_time = time.time() 11 | request_id = 
str(uuid.uuid4())
12 | 
13 |     logger.debug(f"External API request {request_id} started: GET {url} - Params: {params}")
14 | 
15 |     try:
16 |         async with httpx.AsyncClient() as client:
17 |             response = await client.get(url, params=params)
18 | 
19 |         elapsed_time = time.time() - start_time
20 |         logger.debug(
21 |             f"External API request {request_id} completed: GET {url} - "
22 |             f"Status: {response.status_code} - Time: {elapsed_time:.3f}s"
23 |         )
24 | 
25 |         response.raise_for_status()
26 |         return response.json()
27 |     except httpx.HTTPStatusError as e:
28 |         elapsed_time = time.time() - start_time
29 |         logger.error(
30 |             f"External API request {request_id} failed with status {e.response.status_code}: "
31 |             f"GET {url} - Time: {elapsed_time:.3f}s - Response: {e.response.text}"
32 |         )
33 |         raise
34 |     except Exception as e:
35 |         elapsed_time = time.time() - start_time
36 |         logger.exception(
37 |             f"External API request {request_id} failed: GET {url} - "
38 |             f"Time: {elapsed_time:.3f}s - Error: {str(e)}"
39 |         )
40 |         raise
-------------------------------------------------------------------------------- /app/services/job_service.py: --------------------------------------------------------------------------------
1 | """Job search service layer."""
2 | from typing import Any, Dict, Tuple
3 | import pandas as pd
4 | from jobspy import scrape_jobs
5 | import logging
6 | 
7 | from app.config import settings
8 | from app.cache import cache
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | class JobService:
13 |     """Service for interacting with JobSpy library."""
14 | 
15 |     @staticmethod
16 |     def search_jobs(params: Dict[str, Any]) -> Tuple[pd.DataFrame, bool]:
17 |         """
18 |         Execute a job search using the JobSpy library.
19 | 
20 |         Args:
21 |             params: Dictionary of search parameters
22 | 
23 |         Returns:
24 |             Tuple of (DataFrame containing job results, True if the results came from the cache)
25 |         """
26 |         # Apply default proxies from env if none provided
27 |         if params.get('proxies') is None and settings.DEFAULT_PROXIES:
28 |             params['proxies'] = settings.DEFAULT_PROXIES
29 | 
30 |         # Apply default CA cert path if none provided
31 |         if params.get('ca_cert') is None and settings.CA_CERT_PATH:
32 |             params['ca_cert'] = settings.CA_CERT_PATH
33 | 
34 |         # Apply default country_indeed if none provided
35 |         if params.get('country_indeed') is None and settings.DEFAULT_COUNTRY_INDEED:
36 |             params['country_indeed'] = settings.DEFAULT_COUNTRY_INDEED
37 | 
38 |         # Check cache first
39 |         cached_results = cache.get(params)
40 |         if cached_results is not None:
41 |             logger.info(f"Returning cached results with {len(cached_results)} jobs")
42 |             return cached_results, True
43 | 
44 |         # Execute search
45 |         jobs_df = scrape_jobs(**params)
46 | 
47 |         # Cache the results
48 |         cache.set(params, jobs_df)
49 | 
50 |         return jobs_df, False
51 | 
52 |     @staticmethod
53 |     def filter_jobs(jobs_df: pd.DataFrame, filters: Dict[str, Any]) -> pd.DataFrame:
54 |         """Filter job results based on criteria."""
55 |         filtered_df = jobs_df.copy()
56 | 
57 |         # Filter by salary range
58 |         if 'min_salary' in filters and filters['min_salary'] is not None:
59 |             # Convert to numeric first to handle comparison properly
60 |             filtered_df = filtered_df[filtered_df['MIN_AMOUNT'].astype(float) >= float(filters['min_salary'])]
61 | 
62 |         if 'max_salary' in filters and filters['max_salary'] is not None:
63 |             filtered_df = filtered_df[filtered_df['MAX_AMOUNT'].astype(float) <= float(filters['max_salary'])]
64 | 
65 |         # Filter by company
66 |         if 'company' in filters and filters['company']:
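            # Case-insensitive substring match; na=False treats rows with a missing company name as non-matches
67 |             filtered_df = 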
filtered_df[filtered_df['COMPANY'].str.contains(filters['company'], case=False, na=False)] 68 | 69 | # Filter by job type 70 | if 'job_type' in filters and filters['job_type']: 71 | filtered_df = filtered_df[filtered_df['JOB_TYPE'] == filters['job_type']] 72 | 73 | # Filter by location 74 | if 'city' in filters and filters['city']: 75 | filtered_df = filtered_df[filtered_df['CITY'].str.contains(filters['city'], case=False, na=False)] 76 | 77 | if 'state' in filters and filters['state']: 78 | filtered_df = filtered_df[filtered_df['STATE'].str.contains(filters['state'], case=False, na=False)] 79 | 80 | # Filter by keyword in title 81 | if 'title_keywords' in filters and filters['title_keywords']: 82 | filtered_df = filtered_df[filtered_df['TITLE'].str.contains(filters['title_keywords'], case=False, na=False)] 83 | 84 | return filtered_df 85 | 86 | @staticmethod 87 | def sort_jobs(jobs_df: pd.DataFrame, sort_by: str, sort_order: str = 'desc') -> pd.DataFrame: 88 | """Sort job results by specified field.""" 89 | if not sort_by or sort_by not in jobs_df.columns: 90 | return jobs_df 91 | 92 | ascending = sort_order.lower() != 'desc' 93 | return jobs_df.sort_values(by=sort_by, ascending=ascending) 94 | -------------------------------------------------------------------------------- /app/utils/auth_health.py: -------------------------------------------------------------------------------- 1 | """Utility functions for checking authentication health.""" 2 | import logging 3 | from typing import Dict, Any 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def check_auth_configuration() -> Dict[str, Any]: 8 | """ 9 | Check the authentication configuration and return status details. 10 | This helps diagnose authentication issues by checking all relevant settings. 11 | """ 12 | # Import here to avoid circular imports 13 | from app.core.config import settings as core_settings 14 | from app.config import settings as app_settings 15 | 16 | # Check core settings 17 | core_api_key_set = bool(core_settings.API_KEY) 18 | 19 | # Check app settings 20 | app_auth_enabled = app_settings.ENABLE_API_KEY_AUTH 21 | app_keys_configured = bool(app_settings.API_KEYS) 22 | app_keys_count = len(app_settings.API_KEYS) 23 | 24 | # Check for configuration inconsistencies 25 | inconsistent_config = (app_keys_configured and not app_auth_enabled) 26 | 27 | # Generate recommendations 28 | recommendations = [] 29 | if inconsistent_config: 30 | recommendations.append( 31 | "API keys are configured but authentication is disabled. Consider enabling ENABLE_API_KEY_AUTH." 32 | ) 33 | logger.warning("API keys are configured but authentication is disabled. 
This may lead to unexpected behavior.") 34 | 35 | # Determine if authentication is needed based on both configs 36 | auth_required = core_api_key_set or (app_auth_enabled and app_keys_configured) 37 | 38 | # Log configuration sources 39 | logger.debug(f"API keys loaded from: {app_settings.API_KEYS_SOURCE}") 40 | logger.debug(f"Auth enabled setting loaded from: {app_settings.ENABLE_API_KEY_AUTH_SOURCE}") 41 | 42 | return { 43 | "auth_required": auth_required, 44 | "core_settings": { 45 | "api_key_configured": core_api_key_set, 46 | }, 47 | "app_settings": { 48 | "auth_enabled": app_auth_enabled, 49 | "api_keys_configured": app_keys_configured, 50 | "api_keys_count": app_keys_count, 51 | "header_name": app_settings.API_KEY_HEADER_NAME, 52 | "api_keys_source": app_settings.API_KEYS_SOURCE, 53 | "auth_enabled_source": app_settings.ENABLE_API_KEY_AUTH_SOURCE, 54 | }, 55 | "inconsistent_config": inconsistent_config, 56 | "recommendations": recommendations 57 | } 58 | -------------------------------------------------------------------------------- /app/utils/env_debugger.py: -------------------------------------------------------------------------------- 1 | """Utility to debug environment variable loading.""" 2 | import logging 3 | import os 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def log_environment_settings(): 8 | """ 9 | Log all environment variables relevant to application configuration. 10 | This helps diagnose when environment variables aren't being loaded correctly. 11 | """ 12 | env_vars = { 13 | "API_KEYS": os.getenv("API_KEYS", "[not set]"), 14 | "ENABLE_API_KEY_AUTH": os.getenv("ENABLE_API_KEY_AUTH", "[not set]"), 15 | "API_KEY_HEADER_NAME": os.getenv("API_KEY_HEADER_NAME", "[not set]"), 16 | "RATE_LIMIT_ENABLED": os.getenv("RATE_LIMIT_ENABLED", "[not set]"), 17 | "RATE_LIMIT_REQUESTS": os.getenv("RATE_LIMIT_REQUESTS", "[not set]"), 18 | "RATE_LIMIT_TIMEFRAME": os.getenv("RATE_LIMIT_TIMEFRAME", "[not set]"), 19 | "DEFAULT_PROXIES": os.getenv("DEFAULT_PROXIES", "[not set]"), 20 | "DEFAULT_SITE_NAMES": os.getenv("DEFAULT_SITE_NAMES", "[not set]"), 21 | "ENABLE_CACHE": os.getenv("ENABLE_CACHE", "[not set]"), 22 | "CACHE_EXPIRY": os.getenv("CACHE_EXPIRY", "[not set]"), 23 | "ENVIRONMENT": os.getenv("ENVIRONMENT", "[not set]"), 24 | "LOG_LEVEL": os.getenv("LOG_LEVEL", "[not set]"), 25 | } 26 | 27 | # Mask sensitive values 28 | if env_vars["API_KEYS"] != "[not set]": 29 | env_vars["API_KEYS"] = "****[MASKED]****" 30 | 31 | # Log all relevant environment variables 32 | logger.info("Environment variables loaded:") 33 | for key, value in env_vars.items(): 34 | logger.info(f" {key}={value}") 35 | 36 | return env_vars 37 | -------------------------------------------------------------------------------- /app/utils/error_handlers.py: -------------------------------------------------------------------------------- 1 | """Error handling utilities for the API.""" 2 | from fastapi import Request, status 3 | from fastapi.responses import JSONResponse 4 | from fastapi.exceptions import RequestValidationError 5 | from starlette.exceptions import HTTPException as StarletteHTTPException 6 | import logging 7 | 8 | from app.utils.validation_helpers import generate_error_suggestions, get_parameter_suggestion 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | async def validation_exception_handler(request: Request, exc: RequestValidationError): 13 | """Handle validation errors in a consistent way.""" 14 | error_details = [] 15 | for error in exc.errors(): 16 | error_details.append({ 17 | 
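            # Each entry flattens one FastAPI/Pydantic error dict; e.g. a page_size above its le=100 bound yields loc ("query", "page_size") and msg "ensure this value is less than or equal to 100"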
"location": error["loc"], 18 | "message": error["msg"], 19 | "type": error["type"] 20 | }) 21 | 22 | logger.warning(f"Validation error: {error_details}") 23 | 24 | # Generate helpful suggestions 25 | suggestions = generate_error_suggestions(error_details) 26 | 27 | return JSONResponse( 28 | status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, 29 | content={ 30 | "error": "Validation Error", 31 | "details": error_details, 32 | "path": request.url.path, 33 | "suggestions": suggestions, 34 | "documentation_url": "/docs" 35 | } 36 | ) 37 | 38 | async def http_exception_handler(request: Request, exc: StarletteHTTPException): 39 | """Handle HTTP exceptions with consistent response format.""" 40 | logger.warning(f"HTTP exception: {exc.status_code} - {exc.detail}") 41 | 42 | # Create a content object with standard fields 43 | content = { 44 | "error": "Request Error", 45 | "status_code": exc.status_code, 46 | "message": exc.detail, 47 | "path": request.url.path 48 | } 49 | 50 | # Add suggestions for common errors 51 | if exc.status_code == status.HTTP_403_FORBIDDEN: 52 | if "API Key" in exc.detail: 53 | content["suggestions"] = [{ 54 | "parameter": "x-api-key", 55 | "message": "Missing or invalid API key", 56 | "suggestion": "Include a valid API key in the x-api-key header", 57 | "documentation_url": "/docs#section/Authentication" 58 | }] 59 | elif exc.status_code == status.HTTP_404_NOT_FOUND: 60 | if "Page" in exc.detail and "not found" in exc.detail: 61 | content["suggestions"] = [{ 62 | "parameter": "page", 63 | "message": "Page number out of range", 64 | "suggestion": "Use a page number within the available range", 65 | }] 66 | 67 | return JSONResponse( 68 | status_code=exc.status_code, 69 | content=content, 70 | headers=exc.headers 71 | ) 72 | 73 | async def general_exception_handler(request: Request, exc: Exception): 74 | """Handle all other exceptions with consistent response format.""" 75 | logger.error(f"Unhandled exception: {str(exc)}", exc_info=True) 76 | 77 | # Create basic error response 78 | content = { 79 | "error": "Server Error", 80 | "message": str(exc), 81 | "path": request.url.path 82 | } 83 | 84 | # Add suggestions based on exception type or message 85 | if "scrape_jobs" in str(exc): 86 | content["suggestions"] = [{ 87 | "message": "Error occurred during job scraping", 88 | "suggestion": "Check your search parameters and try again with fewer job boards or results", 89 | "troubleshooting": "Try using only one job site at a time (e.g., site_name=linkedin)" 90 | }] 91 | 92 | return JSONResponse( 93 | status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, 94 | content=content 95 | ) 96 | -------------------------------------------------------------------------------- /app/utils/logging_config.py: -------------------------------------------------------------------------------- 1 | """Logging configuration for JobSpy Docker API.""" 2 | import logging 3 | import logging.config 4 | import os 5 | from pathlib import Path 6 | 7 | def setup_logging(log_level: str = "INFO"): 8 | """Configure logging for the application.""" 9 | log_config = { 10 | "version": 1, 11 | "disable_existing_loggers": False, 12 | "formatters": { 13 | "default": { 14 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 15 | "datefmt": "%Y-%m-%d %H:%M:%S", 16 | }, 17 | "json": { 18 | "()": "pythonjsonlogger.jsonlogger.JsonFormatter", 19 | "format": "%(asctime)s %(name)s %(levelname)s %(message)s", 20 | }, 21 | }, 22 | "handlers": { 23 | "console": { 24 | "class": "logging.StreamHandler", 25 | "formatter": 
"default", 26 | "level": log_level, 27 | }, 28 | }, 29 | "loggers": { 30 | "": {"level": log_level, "handlers": ["console"], "propagate": True}, 31 | "app": {"level": log_level, "handlers": ["console"], "propagate": False}, 32 | "uvicorn": {"level": log_level, "handlers": ["console"], "propagate": False}, 33 | }, 34 | } 35 | 36 | # Create logs directory if it doesn't exist 37 | logs_dir = Path("logs") 38 | logs_dir.mkdir(exist_ok=True) 39 | 40 | # Add file handler if not in development mode 41 | if os.environ.get("ENVIRONMENT", "development") != "development": 42 | log_config["handlers"]["file"] = { 43 | "class": "logging.handlers.RotatingFileHandler", 44 | "formatter": "json", 45 | "filename": "logs/app.log", 46 | "maxBytes": 10485760, # 10MB 47 | "backupCount": 5, 48 | "level": log_level, 49 | } 50 | log_config["loggers"][""]["handlers"].append("file") 51 | log_config["loggers"]["app"]["handlers"].append("file") 52 | 53 | logging.config.dictConfig(log_config) 54 | -------------------------------------------------------------------------------- /app/utils/logging_docs.py: -------------------------------------------------------------------------------- 1 | """Documentation for logging levels and troubleshooting.""" 2 | 3 | LOGGING_LEVELS = { 4 | "DEBUG": { 5 | "level": 10, 6 | "description": "Detailed information, typically of interest only when diagnosing problems", 7 | "use_case": "Shows detailed flow of the application, including variable values and decision points", 8 | "shows_auth_errors": True, 9 | "environment": "Development" 10 | }, 11 | "INFO": { 12 | "level": 20, 13 | "description": "Confirmation that things are working as expected", 14 | "use_case": "Normal operation events like startup, shutdown, or successful requests", 15 | "shows_auth_errors": False, 16 | "environment": "Development/Production" 17 | }, 18 | "WARNING": { 19 | "level": 30, 20 | "description": "Indication that something unexpected happened, or may happen in the near future", 21 | "use_case": "Non-critical issues like deprecation notices or improper usage", 22 | "shows_auth_errors": True, 23 | "environment": "Development/Production" 24 | }, 25 | "ERROR": { 26 | "level": 40, 27 | "description": "Due to a more serious problem, the software has not been able to perform some function", 28 | "use_case": "Exception handling and error conditions that should be investigated", 29 | "shows_auth_errors": True, 30 | "environment": "Development/Production" 31 | }, 32 | "CRITICAL": { 33 | "level": 50, 34 | "description": "A very serious error, indicating that the program itself may be unable to continue running", 35 | "use_case": "Application crashes and severe system issues", 36 | "shows_auth_errors": True, 37 | "environment": "Development/Production" 38 | } 39 | } 40 | 41 | def get_appropriate_level_for_issue(issue_type): 42 | """Get the appropriate logging level for different issue types.""" 43 | issue_levels = { 44 | "auth": ["DEBUG", "WARNING"], 45 | "api_key": ["DEBUG", "WARNING"], 46 | "request_validation": ["DEBUG", "WARNING"], 47 | "server_error": ["ERROR", "CRITICAL"], 48 | "rate_limit": ["WARNING"], 49 | "performance": ["DEBUG", "INFO"] 50 | } 51 | return issue_levels.get(issue_type, ["DEBUG"]) 52 | 53 | def get_troubleshooting_tips(): 54 | """Get troubleshooting tips for common issues.""" 55 | return { 56 | "authentication_issues": [ 57 | "Check if API_KEY is set in your environment or .env file", 58 | "Verify your requests include the X-API-Key header with the correct value", 59 | "Try the /auth-status endpoint 
to check current authentication settings", 60 | "Set LOG_LEVEL=DEBUG to see detailed authentication logging" 61 | ], 62 | "missing_api_key_error": [ 63 | "This error occurs when API_KEY is configured but not included in your request", 64 | "Either add the X-API-Key header to your request or remove the API_KEY from your settings" 65 | ], 66 | "invalid_api_key_error": [ 67 | "This error occurs when the API key in your request doesn't match the configured value", 68 | "Check the API_KEY value in your environment or .env file" 69 | ], 70 | "server_errors": [ 71 | "Check the application logs for details about the error", 72 | "Ensure all required environment variables are set", 73 | "Verify the application has appropriate permissions" 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /app/utils/validation_helpers.py: -------------------------------------------------------------------------------- 1 | """Utility functions for parameter validation and providing helpful error messages.""" 2 | from typing import Any, Dict, List, Tuple 3 | 4 | # Define valid values for different parameters 5 | VALID_PARAMETERS = { 6 | "site_name": ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"], 7 | "job_type": ["fulltime", "parttime", "internship", "contract"], 8 | "description_format": ["markdown", "html"], 9 | "verbose": [0, 1, 2], 10 | "page_size": list(range(1, 101)), 11 | "paginate": [True, False], 12 | } 13 | 14 | # Define parameter type information to improve error messages 15 | PARAMETER_TYPES = { 16 | "site_name": "string or list", 17 | "search_term": "string", 18 | "location": "string", 19 | "distance": "integer", 20 | "job_type": "string", 21 | "is_remote": "boolean", 22 | "results_wanted": "integer", 23 | "hours_old": "integer", 24 | "linkedin_fetch_description": "boolean", 25 | "linkedin_company_ids": "list of integers", 26 | "country_indeed": "string", 27 | "enforce_annual_salary": "boolean", 28 | "description_format": "string", 29 | "offset": "integer", 30 | "easy_apply": "boolean", 31 | "page": "integer", 32 | "page_size": "integer", 33 | "paginate": "boolean", 34 | } 35 | 36 | # Parameter descriptions for helpful error messages 37 | PARAMETER_DESCRIPTIONS = { 38 | "site_name": "Job sites to search on (e.g., indeed, linkedin)", 39 | "search_term": "Job search term (e.g., 'software engineer')", 40 | "location": "Job location (e.g., 'San Francisco, CA')", 41 | "distance": "Distance in miles (default: 50)", 42 | "job_type": "Type of job (e.g., fulltime, parttime)", 43 | "is_remote": "Whether to include remote jobs (true or false)", 44 | "results_wanted": "Number of job results per site", 45 | "hours_old": "Filter jobs by hours since posting", 46 | "linkedin_fetch_description": "Fetch full LinkedIn descriptions", 47 | "linkedin_company_ids": "LinkedIn company IDs to filter by", 48 | "country_indeed": "Country filter for Indeed & Glassdoor", 49 | "enforce_annual_salary": "Convert wages to annual salary", 50 | "description_format": "Format of job description (markdown, html)", 51 | "offset": "Offset for pagination", 52 | "easy_apply": "Filter for easy apply jobs", 53 | "page": "Page number for paginated results", 54 | "page_size": "Number of results per page", 55 | "paginate": "Enable pagination", 56 | } 57 | 58 | # Parameter limitations and notes 59 | PARAMETER_LIMITATIONS = { 60 | "hours_old": "Cannot be used with job_type, is_remote, or easy_apply for Indeed searches", 61 | "easy_apply": "Cannot be used with hours_old 
for LinkedIn and Indeed searches", 62 | "job_type": "Cannot be used with hours_old for Indeed searches when combined with is_remote", 63 | "page_size": "Must be between 1 and 100", 64 | } 65 | 66 | def get_parameter_suggestion(param_name: str, invalid_value: Any = None) -> Dict[str, Any]: 67 | """Generate helpful suggestions for invalid parameters.""" 68 | suggestion = { 69 | "parameter": param_name, 70 | "message": f"Invalid value for {param_name}", 71 | } 72 | 73 | # Add information about the parameter type 74 | if param_name in PARAMETER_TYPES: 75 | suggestion["expected_type"] = PARAMETER_TYPES[param_name] 76 | 77 | # Add description if available 78 | if param_name in PARAMETER_DESCRIPTIONS: 79 | suggestion["description"] = PARAMETER_DESCRIPTIONS[param_name] 80 | 81 | # Add valid values if available 82 | if param_name in VALID_PARAMETERS: 83 | suggestion["valid_values"] = VALID_PARAMETERS[param_name] 84 | 85 | # Add limitations if applicable 86 | if param_name in PARAMETER_LIMITATIONS: 87 | suggestion["limitation"] = PARAMETER_LIMITATIONS[param_name] 88 | 89 | # Add specific suggestions based on the parameter 90 | if param_name == "site_name" and invalid_value: 91 | suggestion["message"] = f"'{invalid_value}' is not a valid job site" 92 | suggestion["suggestion"] = f"Use one or more of the valid job sites: {', '.join(VALID_PARAMETERS['site_name'])}" 93 | elif param_name == "job_type" and invalid_value: 94 | suggestion["message"] = f"'{invalid_value}' is not a valid job type" 95 | suggestion["suggestion"] = f"Use one of: {', '.join(VALID_PARAMETERS['job_type'])}" 96 | elif param_name == "description_format" and invalid_value: 97 | suggestion["message"] = f"'{invalid_value}' is not a valid description format" 98 | suggestion["suggestion"] = f"Use one of: {', '.join(VALID_PARAMETERS['description_format'])}" 99 | elif param_name == "verbose" and invalid_value is not None: 100 | suggestion["message"] = f"'{invalid_value}' is not a valid verbosity level" 101 | suggestion["suggestion"] = f"Use one of: {', '.join(map(str, VALID_PARAMETERS['verbose']))}" 102 | elif param_name == "page_size" and invalid_value is not None: 103 | suggestion["message"] = f"'{invalid_value}' is not a valid page size" 104 | suggestion["suggestion"] = "Page size must be between 1 and 100" 105 | elif param_name == "paginate" and invalid_value is not None: 106 | suggestion["message"] = f"'{invalid_value}' is not a valid value for paginate" 107 | suggestion["suggestion"] = "Use true or false" 108 | 109 | return suggestion 110 | 111 | def extract_validation_location(error_location: Tuple) -> str: 112 | """Extract the parameter name from the error location tuple.""" 113 | if len(error_location) > 1: 114 | return error_location[1] 115 | return str(error_location[0]) 116 | 117 | def generate_error_suggestions(validation_errors: List[Dict]) -> List[Dict]: 118 | """Generate helpful suggestions for validation errors.""" 119 | suggestions = [] 120 | 121 | for error in validation_errors: 122 | error_type = error.get("type", "") 123 | error_loc = error.get("location", []) 124 | 125 | if not error_loc: 126 | continue 127 | 128 | param_name = extract_validation_location(error_loc) 129 | invalid_value = None 130 | 131 | # For value errors, extract the invalid value if possible 132 | if "value_error" in error_type and "message" in error: 133 | # Try to extract the invalid value from the error message (the handler stores it under "message") 134 | msg = error["message"] 135 | if "not a valid" in msg and "=" in msg: 136 | invalid_value = msg.split("=")[-1].strip().strip("'\"") 137
| 138 | suggestion = get_parameter_suggestion(param_name, invalid_value) 139 | suggestions.append(suggestion) 140 | 141 | return suggestions 142 | -------------------------------------------------------------------------------- /docker-compose.dev.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | jobspy-api: 5 | build: . 6 | image: jobspy-docker-api-dev 7 | container_name: jobspy-docker-api-dev 8 | ports: 9 | - "8000:8000" 10 | env_file: 11 | - .env 12 | - .env.local # Load .env.local to override .env values 13 | environment: 14 | # The following values intentionally override .env settings for development 15 | # These hardcoded values ensure consistent behavior in development environment 16 | - ENVIRONMENT=development 17 | - LOG_LEVEL=INFO 18 | - ENABLE_API_KEY_AUTH=true 19 | - API_KEYS=${API_KEYS:-dev-key-123} # Use from .env.local or default to dev-key 20 | - RATE_LIMIT_ENABLED=true 21 | - ENABLE_CACHE=true 22 | volumes: 23 | - .:/app 24 | command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload 25 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | jobspy-api: 5 | build: . 6 | image: jobspy-docker-api 7 | container_name: jobspy-docker-api 8 | ports: 9 | - "8000:8000" 10 | env_file: 11 | - .env 12 | environment: 13 | # The most important settings that need consistent values 14 | - LOG_LEVEL=${LOG_LEVEL:-INFO} # Default to INFO if not set elsewhere 15 | - ENABLE_API_KEY_AUTH=${ENABLE_API_KEY_AUTH:-false} 16 | - API_KEYS=${API_KEYS:-} 17 | 18 | # Rate Limiting (only enable if needed) 19 | - RATE_LIMIT_ENABLED=${RATE_LIMIT_ENABLED:-false} 20 | - RATE_LIMIT_REQUESTS=${RATE_LIMIT_REQUESTS:-100} 21 | - RATE_LIMIT_TIMEFRAME=${RATE_LIMIT_TIMEFRAME:-3600} 22 | 23 | # Proxy Configuration 24 | - DEFAULT_PROXIES=${DEFAULT_PROXIES} 25 | - CA_CERT_PATH=${CA_CERT_PATH} 26 | 27 | # JobSpy Default Settings 28 | - DEFAULT_SITE_NAMES=${DEFAULT_SITE_NAMES:-indeed,linkedin,zip_recruiter,glassdoor,google,bayt,naukri} 29 | - DEFAULT_RESULTS_WANTED=${DEFAULT_RESULTS_WANTED:-20} 30 | - DEFAULT_DISTANCE=${DEFAULT_DISTANCE:-50} 31 | - DEFAULT_DESCRIPTION_FORMAT=${DEFAULT_DESCRIPTION_FORMAT:-markdown} 32 | - DEFAULT_COUNTRY_INDEED=${DEFAULT_COUNTRY_INDEED:-USA} 33 | 34 | # Caching 35 | - ENABLE_CACHE=${ENABLE_CACHE:-false} 36 | - CACHE_EXPIRY=${CACHE_EXPIRY:-3600} 37 | 38 | # Logging 39 | - ENVIRONMENT=${ENVIRONMENT:-production} 40 | 41 | # CORS 42 | - CORS_ORIGINS=${CORS_ORIGINS:-*} 43 | 44 | # Health Endpoints 45 | - ENABLE_HEALTH_ENDPOINTS=${ENABLE_HEALTH_ENDPOINTS:-true} 46 | - ENABLE_DETAILED_HEALTH=${ENABLE_DETAILED_HEALTH:-true} 47 | 48 | # API Documentation 49 | - ENABLE_SWAGGER_UI=${ENABLE_SWAGGER_UI:-true} 50 | - ENABLE_REDOC=${ENABLE_REDOC:-true} 51 | - SWAGGER_UI_PATH=${SWAGGER_UI_PATH:-/docs} 52 | - REDOC_PATH=${REDOC_PATH:-/redoc} 53 | volumes: 54 | - ./logs:/app/logs 55 | - ./scripts:/app/scripts # Ensure scripts are mounted properly 56 | restart: unless-stopped 57 | # Use bash explicitly to execute scripts and fix permission issues 58 | command: > 59 | /bin/bash -c "bash /app/scripts/docker-entrypoint.sh" 60 | healthcheck: 61 | test: ["CMD", "curl", "-f", "http://localhost:8000/health"] 62 | interval: 120s 63 | timeout: 5s 64 | retries: 3 65 | start_period: 10s 66 | --------------------------------------------------------------------------------
/examples/api_usage.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import pandas as pd 4 | 5 | # Base URL for the API 6 | BASE_URL = "http://localhost:8000" 7 | 8 | def search_jobs_simple(): 9 | """ 10 | Simple job search using the consolidated GET endpoint 11 | """ 12 | params = { 13 | "site_name": ["indeed", "linkedin"], 14 | "search_term": "software engineer", 15 | "location": "San Francisco, CA", 16 | "results_wanted": 5 17 | } 18 | 19 | response = requests.get(f"{BASE_URL}/api/v1/search_jobs", params=params) 20 | 21 | if response.status_code == 200: 22 | data = response.json() 23 | print(f"Found {data['count']} jobs") 24 | 25 | # Convert to pandas DataFrame for easier viewing 26 | df = pd.DataFrame(data['jobs']) 27 | print(df.head()) 28 | 29 | # Save to CSV 30 | df.to_csv("jobs_simple.csv", index=False) 31 | else: 32 | print(f"Error: {response.status_code}") 33 | print(response.text) 34 | 35 | def search_jobs_advanced(): 36 | """ 37 | Advanced job search using GET endpoint with all parameters 38 | """ 39 | params = { 40 | "site_name": ["indeed", "linkedin", "zip_recruiter"], 41 | "search_term": "data scientist", 42 | "google_search_term": "data scientist jobs near New York, NY since yesterday", 43 | "location": "New York, NY", 44 | "distance": 25, 45 | "job_type": "fulltime", 46 | "is_remote": True, 47 | "results_wanted": 10, 48 | "hours_old": 48, 49 | "description_format": "markdown", 50 | "country_indeed": "USA", 51 | "enforce_annual_salary": True, 52 | "linkedin_fetch_description": True 53 | } 54 | 55 | response = requests.get( 56 | f"{BASE_URL}/api/v1/search_jobs", 57 | params=params 58 | ) 59 | 60 | if response.status_code == 200: 61 | data = response.json() 62 | print(f"Found {data['count']} jobs") 63 | 64 | # Convert to pandas DataFrame for easier viewing 65 | df = pd.DataFrame(data['jobs']) 66 | print(df.head()) 67 | 68 | # Save to CSV 69 | df.to_csv("jobs_advanced.csv", index=False) 70 | else: 71 | print(f"Error: {response.status_code}") 72 | print(response.text) 73 | 74 | def search_jobs_paginated(): 75 | """ 76 | Paginated job search using GET endpoint 77 | """ 78 | params = { 79 | "paginate": True, 80 | "page": 1, 81 | "page_size": 5, 82 | "site_name": ["indeed", "linkedin"], 83 | "search_term": "software engineer", 84 | "location": "San Francisco, CA", 85 | "results_wanted": 20 86 | } 87 | 88 | response = requests.get(f"{BASE_URL}/api/v1/search_jobs", params=params) 89 | 90 | if response.status_code == 200: 91 | data = response.json() 92 | print(f"Found {data['count']} total jobs, showing page {data['current_page']} of {data['total_pages']}") 93 | print(f"Page size: {data['page_size']}, showing {len(data['jobs'])} jobs") 94 | 95 | # Convert to pandas DataFrame for easier viewing 96 | df = pd.DataFrame(data['jobs']) 97 | print(df.head()) 98 | 99 | # Check if there's a next page 100 | if data['next_page']: 101 | print(f"Next page URL: {data['next_page']}") 102 | 103 | # Save to CSV 104 | df.to_csv("jobs_paginated.csv", index=False) 105 | else: 106 | print(f"Error: {response.status_code}") 107 | print(response.text) 108 | 109 | if __name__ == "__main__": 110 | print("Running simple job search...") 111 | search_jobs_simple() 112 | 113 | print("\nRunning advanced job search...") 114 | search_jobs_advanced() 115 | 116 | print("\nRunning paginated job search...") 117 | search_jobs_paginated() 118 | -------------------------------------------------------------------------------- /main.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional, Union, Dict, Any 3 | from fastapi import FastAPI, Query, HTTPException 4 | from pydantic import BaseModel, Field 5 | from jobspy import scrape_jobs 6 | import pandas as pd 7 | 8 | app = FastAPI( 9 | title="JobSpy Docker API", 10 | description="API for searching jobs across multiple platforms using JobSpy", 11 | version="1.0.0", 12 | ) 13 | 14 | SUPPORTED_SITES = ["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"] 15 | 16 | def get_env_bool(var_name, default=True): 17 | val = os.getenv(var_name) 18 | if val is None: 19 | return default 20 | return str(val).lower() in ("1", "true", "yes", "on") 21 | 22 | class JobSearchParams(BaseModel): 23 | site_name: Union[List[str], str] = Field( 24 | default=["indeed", "linkedin", "zip_recruiter", "glassdoor", "google", "bayt", "naukri"], 25 | description="Job sites to search on", 26 | ) 27 | search_term: Optional[str] = Field(default=None, description="Job search term") 28 | google_search_term: Optional[str] = Field(default=None, description="Search term for Google jobs") 29 | location: Optional[str] = Field(default=None, description="Job location") 30 | distance: Optional[int] = Field(default=50, description="Distance in miles") 31 | job_type: Optional[str] = Field(default=None, description="Job type (fulltime, parttime, internship, contract)") 32 | proxies: Optional[List[str]] = Field(default=None, description="Proxies in format ['user:pass@host:port', 'localhost']") 33 | is_remote: Optional[bool] = Field(default=None, description="Remote job filter") 34 | results_wanted: Optional[int] = Field(default=20, description="Number of results per site") 35 | hours_old: Optional[int] = Field(default=None, description="Filter by hours since posting") 36 | easy_apply: Optional[bool] = Field(default=None, description="Filter for easy apply jobs") 37 | description_format: Optional[str] = Field(default="markdown", description="Format of job description") 38 | offset: Optional[int] = Field(default=0, description="Offset for pagination") 39 | verbose: Optional[int] = Field(default=2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)") 40 | linkedin_fetch_description: Optional[bool] = Field(default=False, description="Fetch full LinkedIn descriptions") 41 | linkedin_company_ids: Optional[List[int]] = Field(default=None, description="LinkedIn company IDs to filter by") 42 | country_indeed: Optional[str] = Field(default=None, description="Country filter for Indeed & Glassdoor") 43 | enforce_annual_salary: Optional[bool] = Field(default=False, description="Convert wages to annual salary") 44 | ca_cert: Optional[str] = Field(default=None, description="Path to CA Certificate file for proxies") 45 | 46 | class JobResponse(BaseModel): 47 | count: int 48 | jobs: List[Dict[str, Any]] 49 | 50 | @app.get("/", tags=["Info"]) 51 | def read_root(): 52 | return {"message": "Welcome to JobSpy Docker API! 
Go to /docs for the API documentation."} 53 | 54 | @app.post("/search_jobs", response_model=JobResponse, tags=["Jobs"]) 55 | def search_jobs(params: JobSearchParams): 56 | try: 57 | jobs_df = scrape_jobs( 58 | site_name=params.site_name, 59 | search_term=params.search_term, 60 | google_search_term=params.google_search_term, 61 | location=params.location, 62 | distance=params.distance, 63 | job_type=params.job_type, 64 | proxies=params.proxies, 65 | is_remote=params.is_remote, 66 | results_wanted=params.results_wanted, 67 | hours_old=params.hours_old, 68 | easy_apply=params.easy_apply, 69 | description_format=params.description_format, 70 | offset=params.offset, 71 | verbose=params.verbose, 72 | linkedin_fetch_description=params.linkedin_fetch_description, 73 | linkedin_company_ids=params.linkedin_company_ids, 74 | country_indeed=params.country_indeed, 75 | enforce_annual_salary=params.enforce_annual_salary, 76 | ca_cert=params.ca_cert, 77 | ) 78 | 79 | # Convert DataFrame to dictionary format 80 | jobs_list = jobs_df.to_dict('records') 81 | 82 | return { 83 | "count": len(jobs_list), 84 | "jobs": jobs_list 85 | } 86 | except Exception as e: 87 | raise HTTPException(status_code=500, detail=f"Error scraping jobs: {str(e)}") 88 | 89 | @app.get("/api/v1/search_jobs") 90 | async def search_jobs_get( 91 | site_name: Union[List[str], str] = Query("all", description="Job sites to search on"), 92 | search_term: str = Query(None, description="Job search term"), 93 | google_search_term: Optional[str] = Query(None, description="Search term for Google jobs"), 94 | location: str = Query(None, description="Job location"), 95 | distance: int = Query(50, description="Distance in miles"), 96 | job_type: Optional[str] = Query(None, description="Job type (fulltime, parttime, internship, contract)"), 97 | is_remote: Optional[bool] = Query(None, description="Remote job filter"), 98 | results_wanted: int = Query(10, description="Number of results per site"), 99 | hours_old: Optional[int] = Query(None, description="Filter by hours since posting"), 100 | easy_apply: Optional[bool] = Query(None, description="Filter for easy apply jobs"), 101 | description_format: str = Query("markdown", description="Format of job description"), 102 | offset: int = Query(0, description="Offset for pagination"), 103 | verbose: int = Query(2, description="Controls verbosity (0: errors only, 1: errors+warnings, 2: all logs)"), 104 | linkedin_fetch_description: bool = Query(False, description="Fetch full LinkedIn descriptions"), 105 | country_indeed: Optional[str] = Query(None, description="Country filter for Indeed & Glassdoor"), 106 | enforce_annual_salary: bool = Query(False, description="Convert wages to annual salary"), 107 | format: str = Query("json", description="Output format: json or csv"), 108 | ): 109 | # Handle site_name=all 110 | if isinstance(site_name, str): 111 | if site_name.lower() == "all": 112 | site_name = SUPPORTED_SITES 113 | else: 114 | site_name = [site_name] 115 | elif isinstance(site_name, list): 116 | if "all" in [s.lower() for s in site_name]: 117 | site_name = SUPPORTED_SITES 118 | 119 | # Use env default for country_indeed if not provided 120 | if not country_indeed: 121 | country_indeed = os.getenv("DEFAULT_COUNTRY_INDEED", "USA") 122 | 123 | try: 124 | jobs_df = scrape_jobs( 125 | site_name=site_name, 126 | search_term=search_term, 127 | google_search_term=google_search_term, 128 | location=location, 129 | distance=distance, 130 | job_type=job_type, 131 | is_remote=is_remote, 132 | 
results_wanted=results_wanted, 133 | hours_old=hours_old, 134 | easy_apply=easy_apply, 135 | description_format=description_format, 136 | offset=offset, 137 | verbose=verbose, 138 | linkedin_fetch_description=linkedin_fetch_description, 139 | country_indeed=country_indeed, 140 | enforce_annual_salary=enforce_annual_salary, 141 | ) 142 | 143 | # Convert DataFrame to dictionary format 144 | jobs_data = jobs_df.to_dict('records') 145 | 146 | if format.lower() == "csv": 147 | import io, csv 148 | from fastapi.responses import StreamingResponse 149 | if not jobs_data: 150 | output = io.StringIO() 151 | writer = csv.writer(output) 152 | writer.writerow(["No results"]) 153 | output.seek(0) 154 | return StreamingResponse(output, media_type="text/csv", headers={"Content-Disposition": "attachment; filename=jobs.csv"}) 155 | output = io.StringIO() 156 | writer = csv.DictWriter(output, fieldnames=jobs_data[0].keys()) 157 | writer.writeheader() 158 | writer.writerows(jobs_data) 159 | output.seek(0) 160 | return StreamingResponse(output, media_type="text/csv", headers={"Content-Disposition": "attachment; filename=jobs.csv"}) 161 | # Default: JSON 162 | from fastapi.responses import JSONResponse 163 | return JSONResponse(content={"count": len(jobs_data), "jobs": jobs_data}) 164 | except Exception as e: 165 | raise HTTPException(status_code=500, detail=f"Error scraping jobs: {str(e)}") 166 | 167 | # API key auth default logic (at app startup or dependency) 168 | ENABLE_API_KEY_AUTH = get_env_bool("ENABLE_API_KEY_AUTH", default=True) 169 | if not ENABLE_API_KEY_AUTH: 170 | import warnings 171 | warnings.warn("API key authentication is disabled. Set ENABLE_API_KEY_AUTH=true to enable.") 172 | 173 | if __name__ == "__main__": 174 | import uvicorn 175 | uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) 176 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "jobspy-docker-api" 3 | version = "1.0.0" 4 | description = "A Docker-containerized FastAPI application providing secure API access to the Python JobSpy library." 
5 | authors = [ 6 | { name = "Shannon Atkinson", email = "rainmanjam@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.8" 10 | license = { file = "LICENSE.md" } 11 | keywords = ["fastapi", "jobspy", "docker", "api", "job-search"] 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Framework :: FastAPI", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent" 22 | ] 23 | dependencies = [ 24 | "fastapi==0.115.12", 25 | "uvicorn[standard]==0.34.2", 26 | "python-jobspy==1.1.80", 27 | "pydantic==2.11.3", 28 | "pydantic-settings==2.9.1", 29 | "python-multipart==0.0.20", 30 | "psutil==7.0.0", 31 | "python-dotenv==1.1.0" 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "pytest>=8.2.2", 37 | "pytest-cov>=5.0.0", 38 | "pytest-asyncio>=0.23.6", 39 | "pylint>=3.1.0", 40 | "black>=24.4.2", 41 | "isort>=5.13.2", 42 | "pre-commit>=3.7.1", 43 | "safety>=3.2.0" 44 | ] 45 | 46 | [tool.setuptools] 47 | packages = ["app"] 48 | 49 | [tool.black] 50 | line-length = 88 51 | target-version = ['py38'] 52 | exclude = ''' 53 | /( 54 | \.git 55 | | \.venv 56 | | build 57 | | dist 58 | | logs 59 | | temp 60 | | __pycache__ 61 | )/ 62 | ''' 63 | 64 | [tool.isort] 65 | profile = "black" 66 | line_length = 88 67 | multi_line_output = 3 68 | include_trailing_comma = true 69 | 70 | [tool.pytest.ini_options] 71 | minversion = "7.0" 72 | addopts = "--cov=app --cov-report=term-missing" 73 | testpaths = [ 74 | "tests" 75 | ] 76 | 77 | [tool.pylint.'MESSAGES CONTROL'] 78 | disable = [ 79 | "C0114", # missing-module-docstring 80 | "C0115", # missing-class-docstring 81 | "C0116", # missing-function-docstring 82 | ] 83 | 84 | [tool.pylint.format] 85 | max-line-length = 88 86 | 87 | [tool.coverage.run] 88 | branch = true 89 | source = [ 90 | "app" 91 | ] 92 | 93 | [tool.coverage.report] 94 | show_missing = true 95 | skip_covered = true 96 | exclude_lines = [ 97 | "pragma: no cover", 98 | "if __name__ == .__main__.:" 99 | ] 100 | 101 | [build-system] 102 | requires = ["setuptools>=61.0", "wheel"] 103 | build-backend = "setuptools.build_meta" 104 | 105 | [tool.pre-commit] 106 | repos = [ 107 | { repo = "https://github.com/pre-commit/pre-commit-hooks", rev = "v4.4.0", hooks = [ 108 | { id = "trailing-whitespace" }, 109 | { id = "end-of-file-fixer" }, 110 | { id = "check-yaml" }, 111 | { id = "check-added-large-files" } 112 | ] }, 113 | { repo = "https://github.com/psf/black", rev = "23.3.0", hooks = [ 114 | { id = "black" } 115 | ] }, 116 | { repo = "https://github.com/pycqa/isort", rev = "5.12.0", hooks = [ 117 | { id = "isort" } 118 | ] }, 119 | { repo = "https://github.com/pycqa/flake8", rev = "6.0.0", hooks = [ 120 | { id = "flake8", additional_dependencies = ["flake8-docstrings"] } 121 | ] } 122 | ] 123 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest>=7.0.0 3 | pytest-cov>=4.0.0 4 | pytest-asyncio>=0.21.0 5 | pylint>=2.15.0 6 | black>=23.0.0 7 | isort>=5.12.0 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi
2 | uvicorn[standard] 3 | python-jobspy 4 | pydantic 5 | pydantic-settings 6 | python-multipart 7 | psutil 8 | python-dotenv 9 | python-json-logger  # provides pythonjsonlogger, required by the "json" formatter in app/utils/logging_config.py 10 | -------------------------------------------------------------------------------- /scripts/check_auth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to check API authentication configuration. 4 | Run this script to debug issues with API key authentication. 5 | """ 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | # Add parent directory to path so we can import app modules 11 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 12 | 13 | def check_auth_config(): 14 | """Print authentication configuration settings.""" 15 | print("=== API Authentication Configuration ===") 16 | 17 | # Check environment variables 18 | env_vars = { 19 | "API_KEY": os.getenv("API_KEY", ""), 20 | "ENABLE_API_KEY_AUTH": os.getenv("ENABLE_API_KEY_AUTH", ""), 21 | "API_KEY_HEADER_NAME": os.getenv("API_KEY_HEADER_NAME", ""), 22 | } 23 | 24 | print("\nEnvironment Variables:") 25 | for key, value in env_vars.items(): 26 | masked_value = "********" if key == "API_KEY" and value else value 27 | print(f"{key}={masked_value!r}") 28 | 29 | # Try to load app settings 30 | print("\nApp Settings:") 31 | try: 32 | from app.core.config import settings 33 | print(f"API_KEY configured: {bool(settings.API_KEY)}") 34 | print(f"API_KEY value is set: {bool(settings.API_KEY and settings.API_KEY != '')}") 35 | except Exception as e: 36 | print(f"Error loading settings: {e}") 37 | 38 | # Check .env file 39 | env_file = Path(".env") 40 | env_local_file = Path(".env.local") 41 | 42 | print("\nEnvironment Files:") 43 | print(f".env exists: {env_file.exists()}") 44 | print(f".env.local exists: {env_local_file.exists()}") 45 | 46 | # Provide troubleshooting tips 47 | print("\n=== Troubleshooting Tips ===") 48 | print("1. If you want to disable API key authentication:") 49 | print(" - Ensure API_KEY is not set in your environment or .env files") 50 | print(" - Or explicitly set API_KEY='' (empty string) in your .env file") 51 | print("\n2. If you want to enable API key authentication:") 52 | print(" - Set API_KEY='your-secret-key' in your .env.local file") 53 | print(" - Include the X-API-Key header in your requests") 54 | print("\n3. To see detailed authentication logs:") 55 | print(" - Set LOG_LEVEL=DEBUG in your environment or .env file") 56 | 57 | if __name__ == "__main__": 58 | check_auth_config() 59 | -------------------------------------------------------------------------------- /scripts/check_config_consistency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to check for configuration consistency across different settings files. 4 | This helps identify and resolve inconsistencies in environment variables.
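Run it from the project root (e.g. "python scripts/check_config_consistency.py") so the relative paths to .env, Dockerfile, and the compose files resolve.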
5 | """ 6 | import os 7 | import sys 8 | import yaml 9 | import dotenv 10 | from pathlib import Path 11 | import re 12 | from typing import Dict, Any, List, Set 13 | 14 | # Add parent directory to path so we can import app modules 15 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 16 | 17 | def color_text(text, color_code): 18 | """Add color to terminal output.""" 19 | return f"\033[{color_code}m{text}\033[0m" 20 | 21 | def red(text): 22 | return color_text(text, 31) 23 | 24 | def green(text): 25 | return color_text(text, 32) 26 | 27 | def yellow(text): 28 | return color_text(text, 33) 29 | 30 | def blue(text): 31 | return color_text(text, 34) 32 | 33 | def load_env_file(path: Path) -> Dict[str, str]: 34 | """Load environment variables from a .env file.""" 35 | if not path.exists(): 36 | print(f"Warning: {path} not found") 37 | return {} 38 | 39 | return dotenv.dotenv_values(path) 40 | 41 | def extract_dockerfile_env_vars(path: Path) -> Dict[str, str]: 42 | """Extract environment variables from a Dockerfile.""" 43 | if not path.exists(): 44 | print(f"Warning: {path} not found") 45 | return {} 46 | 47 | env_vars = {} 48 | try: 49 | with open(path, 'r') as f: 50 | content = f.read() 51 | 52 | # Look for ENV statements 53 | # This is a simple approach - a proper parser would be better 54 | env_pattern = r'ENV\s+([A-Za-z0-9_]+)=([^\s\\]+)' 55 | simple_envs = re.findall(env_pattern, content) 56 | for key, value in simple_envs: 57 | env_vars[key] = value.strip('"\'') 58 | 59 | # Look for multi-line ENV statements 60 | multi_pattern = r'ENV\s+([A-Za-z0-9_]+)=([^\s\\]+)(\s*\\\s*\n\s*([A-Za-z0-9_]+)=([^\s\\]+))*' 61 | multi_envs = re.findall(multi_pattern, content) 62 | for match in multi_envs: 63 | for i in range(0, len(match), 3): 64 | if i+1 < len(match) and match[i] and match[i+1]: 65 | env_vars[match[i]] = match[i+1].strip('"\'') 66 | except Exception as e: 67 | print(f"Error parsing Dockerfile: {e}") 68 | 69 | return env_vars 70 | 71 | def load_docker_compose_vars(path: Path) -> Dict[str, str]: 72 | """Load environment variables from a docker-compose file.""" 73 | if not path.exists(): 74 | print(f"Warning: {path} not found") 75 | return {} 76 | 77 | try: 78 | with open(path, 'r') as f: 79 | compose_data = yaml.safe_load(f) 80 | 81 | env_vars = {} 82 | for service_name, service_data in compose_data.get('services', {}).items(): 83 | # Check environment section 84 | environment = service_data.get('environment', []) 85 | if isinstance(environment, list): 86 | for env in environment: 87 | if isinstance(env, str) and '=' in env: 88 | key, value = env.split('=', 1) 89 | # Handle ${VAR:-default} format 90 | if '${' in value and ':-' in value and '}' in value: 91 | default_val = value.split(':-')[1].split('}')[0] 92 | env_vars[key] = default_val 93 | else: 94 | env_vars[key] = value 95 | elif isinstance(environment, dict): 96 | env_vars.update(environment) 97 | 98 | return env_vars 99 | except Exception as e: 100 | print(f"Error parsing docker-compose file: {e}") 101 | return {} 102 | 103 | def check_config_consistency(): 104 | """Check configuration consistency across different settings files.""" 105 | print(yellow("=== Configuration Consistency Checker ===\n")) 106 | 107 | # Define paths to all configuration files 108 | env_path = Path(".env") 109 | env_local_path = Path(".env.local") 110 | dockerfile_path = Path("Dockerfile") 111 | docker_compose_path = Path("docker-compose.yml") 112 | docker_compose_dev_path = Path("docker-compose.dev.yml") 113 | 114 | # Load 
environment variables from each file 115 | env_vars = load_env_file(env_path) 116 | env_local_vars = load_env_file(env_local_path) 117 | dockerfile_vars = extract_dockerfile_env_vars(dockerfile_path) 118 | docker_compose_vars = load_docker_compose_vars(docker_compose_path) 119 | docker_compose_dev_vars = load_docker_compose_vars(docker_compose_dev_path) 120 | 121 | # Collect all variable names across all files 122 | all_vars = set() 123 | all_vars.update(env_vars.keys()) 124 | all_vars.update(env_local_vars.keys()) 125 | all_vars.update(dockerfile_vars.keys()) 126 | all_vars.update(docker_compose_vars.keys()) 127 | all_vars.update(docker_compose_dev_vars.keys()) 128 | 129 | # Filter out non-app related environment variables 130 | excluded_vars = {'PYTHONDONTWRITEBYTECODE', 'PYTHONUNBUFFERED', 'PYTHONPATH'} 131 | app_vars = all_vars - excluded_vars 132 | 133 | # Check for presence of each variable in each file 134 | print(yellow("Checking variable presence in each configuration file:")) 135 | missing_vars = { 136 | ".env": [], 137 | ".env.local": [], 138 | "Dockerfile": [], 139 | "docker-compose.yml": [], 140 | "docker-compose.dev.yml": [] 141 | } 142 | 143 | for var in sorted(app_vars): 144 | print(f"\n{blue(var)}:") 145 | 146 | if var not in env_vars: 147 | missing_vars[".env"].append(var) 148 | print(f" .env: {red('MISSING')}") 149 | else: 150 | print(f" .env: {env_vars[var]}") 151 | 152 | if var not in env_local_vars: 153 | # Only mark as missing if uncommented in .env.local 154 | print(f" .env.local: {yellow('Not specified')}") 155 | else: 156 | print(f" .env.local: {env_local_vars[var]}") 157 | 158 | if var not in dockerfile_vars: 159 | missing_vars["Dockerfile"].append(var) 160 | print(f" Dockerfile: {red('MISSING')}") 161 | else: 162 | print(f" Dockerfile: {dockerfile_vars[var]}") 163 | 164 | if var not in docker_compose_vars: 165 | missing_vars["docker-compose.yml"].append(var) 166 | print(f" docker-compose.yml: {red('MISSING')}") 167 | else: 168 | print(f" docker-compose.yml: {docker_compose_vars[var]}") 169 | 170 | if var not in docker_compose_dev_vars: 171 | missing_vars["docker-compose.dev.yml"].append(var) 172 | print(f" docker-compose.dev.yml: {red('MISSING')}") 173 | else: 174 | print(f" docker-compose.dev.yml: {docker_compose_dev_vars[var]}") 175 | 176 | # Print summary of missing variables 177 | print("\n" + yellow("=== Missing Variables Summary ===")) 178 | for file_path, vars_list in missing_vars.items(): 179 | if vars_list: 180 | print(f"\n{file_path} is missing these variables:") 181 | for var in vars_list: 182 | print(f" - {var}") 183 | 184 | # Check for inconsistent default values 185 | print("\n" + yellow("=== Inconsistent Default Values ===")) 186 | inconsistent_defaults = [] 187 | for var in sorted(app_vars): 188 | values = {} 189 | if var in env_vars: 190 | values[".env"] = env_vars[var] 191 | if var in dockerfile_vars: 192 | values["Dockerfile"] = dockerfile_vars[var] 193 | 194 | # Skip if we don't have at least two sources to compare 195 | if len(values) < 2: 196 | continue 197 | 198 | # Check if values are inconsistent 199 | if len(set(values.values())) > 1: 200 | inconsistent_defaults.append((var, values)) 201 | 202 | if inconsistent_defaults: 203 | for var, values in inconsistent_defaults: 204 | print(f"\n{red(var)} has inconsistent default values:") 205 | for source, value in values.items(): 206 | print(f" {source}: {value}") 207 | else: 208 | print(green("No inconsistencies found in default values!")) 209 | 210 | # Provide recommendations 211 | print("\n" 
+ yellow("=== Recommendations ===")) 212 | 213 | if missing_vars[".env"]: 214 | print("\n1. Add these missing variables to .env:") 215 | for var in missing_vars[".env"]: 216 | # Try to find a default value from other files 217 | default_val = docker_compose_vars.get(var) or dockerfile_vars.get(var) or "" 218 | print(f" {var}={default_val}") 219 | 220 | if missing_vars["Dockerfile"]: 221 | print("\n2. Consider adding these variables to Dockerfile ENV section:") 222 | for var in missing_vars["Dockerfile"]: 223 | # Try to find a default value from other files 224 | default_val = env_vars.get(var) or docker_compose_vars.get(var) or "" 225 | print(f" {var}={default_val}") 226 | 227 | print("\n3. Ensure docker-compose.dev.yml loads from .env:") 228 | print(" Add this to the service configuration:") 229 | print(" env_file:") 230 | print(" - .env") 231 | 232 | if inconsistent_defaults: 233 | print("\n4. Fix inconsistent default values between files") 234 | 235 | if __name__ == "__main__": 236 | try: 237 | check_config_consistency() 238 | except Exception as e: 239 | print(red(f"Error: {e}")) 240 | import traceback 241 | traceback.print_exc() 242 | -------------------------------------------------------------------------------- /scripts/check_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to check environment variables and configuration settings. 4 | Run this script to debug issues with environment variables. 5 | """ 6 | import os 7 | import sys 8 | import json 9 | from pathlib import Path 10 | 11 | # Add parent directory to path so we can import app modules 12 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | 14 | def check_env(): 15 | """Print environment variables and settings.""" 16 | print("=== Environment Variables ===") 17 | env_vars = { 18 | "API_KEYS": os.getenv("API_KEYS", ""), 19 | "ENABLE_API_KEY_AUTH": os.getenv("ENABLE_API_KEY_AUTH", ""), 20 | "API_KEY_HEADER_NAME": os.getenv("API_KEY_HEADER_NAME", ""), 21 | "RATE_LIMIT_ENABLED": os.getenv("RATE_LIMIT_ENABLED", ""), 22 | "DEFAULT_PROXIES": os.getenv("DEFAULT_PROXIES", ""), 23 | "DEFAULT_SITE_NAMES": os.getenv("DEFAULT_SITE_NAMES", ""), 24 | "ENABLE_CACHE": os.getenv("ENABLE_CACHE", ""), 25 | "ENVIRONMENT": os.getenv("ENVIRONMENT", "") 26 | } 27 | 28 | for key, value in env_vars.items(): 29 | print(f"{key}={value!r}") 30 | 31 | print("\n=== Testing Settings Loading ===") 32 | try: 33 | from app.config import settings 34 | 35 | print(f"API_KEYS: {settings.API_KEYS}") 36 | print(f"ENABLE_API_KEY_AUTH: {settings.ENABLE_API_KEY_AUTH}") 37 | print(f"API_KEY_HEADER_NAME: {settings.API_KEY_HEADER_NAME}") 38 | print(f"RATE_LIMIT_ENABLED: {settings.RATE_LIMIT_ENABLED}") 39 | print(f"DEFAULT_PROXIES: {settings.DEFAULT_PROXIES}") 40 | print(f"DEFAULT_SITE_NAMES: {settings.DEFAULT_SITE_NAMES}") 41 | print(f"ENABLE_CACHE: {settings.ENABLE_CACHE}") 42 | print(f"ENVIRONMENT: {settings.ENVIRONMENT}") 43 | 44 | print("\nSettings were loaded successfully!") 45 | except Exception as e: 46 | print(f"Error loading settings: {e}") 47 | import traceback 48 | traceback.print_exc() 49 | 50 | if __name__ == "__main__": 51 | check_env() 52 | -------------------------------------------------------------------------------- /scripts/confirm_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to confirm environment variables are set correctly 3 | # Run this at container 
startup 4 | 5 | echo "=== Environment Variable Confirmation ===" 6 | echo "ENABLE_API_KEY_AUTH: $ENABLE_API_KEY_AUTH" 7 | echo "API_KEYS: ${API_KEYS:0:3}... (truncated for security)" 8 | echo "RATE_LIMIT_ENABLED: $RATE_LIMIT_ENABLED" 9 | echo "ENABLE_CACHE: $ENABLE_CACHE" 10 | echo "LOG_LEVEL: $LOG_LEVEL" 11 | echo "ENVIRONMENT: $ENVIRONMENT" 12 | echo "========================================" 13 | -------------------------------------------------------------------------------- /scripts/debug_env_conflicts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to detect environment variable conflicts between different sources. 4 | This helps diagnose issues where values in code, .env, or Docker might conflict. 5 | """ 6 | import json 7 | import os 8 | import sys 9 | from pathlib import Path 10 | 11 | # Add parent directory to path so we can import app modules 12 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 13 | 14 | def color_text(text, color_code): 15 | """Add color to terminal output.""" 16 | return f"\033[{color_code}m{text}\033[0m" 17 | 18 | def red(text): 19 | return color_text(text, 31) 20 | 21 | def green(text): 22 | return color_text(text, 32) 23 | 24 | def yellow(text): 25 | return color_text(text, 33) 26 | 27 | def get_docker_env_vars(): 28 | """Get environment variables from docker-compose.yml.""" 29 | docker_compose_path = Path("docker-compose.yml") 30 | if not docker_compose_path.exists(): 31 | return {} 32 | 33 | try: 34 | import yaml 35 | with open(docker_compose_path, 'r') as f: 36 | docker_compose = yaml.safe_load(f) 37 | 38 | if not docker_compose or 'services' not in docker_compose: 39 | return {} 40 | 41 | for service in docker_compose['services'].values(): 42 | if 'environment' in service: 43 | env_vars = {} 44 | for env in service['environment']: 45 | if isinstance(env, str) and '=' in env: 46 | key, value = env.split('=', 1) 47 | env_vars[key] = value 48 | elif isinstance(env, dict): 49 | env_vars.update(env) 50 | return env_vars 51 | return {} 52 | except Exception as e: 53 | print(f"Error parsing docker-compose.yml: {e}") 54 | return {} 55 | 56 | def get_dotenv_vars(): 57 | """Get environment variables from .env and .env.local.""" 58 | env_vars = {} 59 | try: 60 | import dotenv 61 | # Load .env 62 | env_path = Path(".env") 63 | if env_path.exists(): 64 | env_vars.update(dotenv.dotenv_values(env_path)) 65 | 66 | # Load .env.local which overrides .env 67 | local_env_path = Path(".env.local") 68 | if local_env_path.exists(): 69 | env_vars.update(dotenv.dotenv_values(local_env_path)) 70 | except ImportError: 71 | print("python-dotenv not installed. 
Please install with: pip install python-dotenv") 72 | 73 | return env_vars 74 | 75 | def debug_env_conflicts(): 76 | """Find and report conflicts between environment variable sources.""" 77 | print(yellow("=== Environment Variable Conflict Detector ===\n")) 78 | 79 | # Get environment variables from different sources 80 | os_env_vars = {k: v for k, v in os.environ.items() if k.isupper()} 81 | dotenv_vars = get_dotenv_vars() 82 | docker_vars = get_docker_env_vars() 83 | 84 | # Check for key environment variables 85 | key_vars = [ 86 | "ENABLE_API_KEY_AUTH", "API_KEYS", "RATE_LIMIT_ENABLED", 87 | "ENABLE_CACHE", "ENVIRONMENT", "LOG_LEVEL" 88 | ] 89 | 90 | print(yellow("Checking key environment variables:")) 91 | for var in key_vars: 92 | values = {} 93 | if var in os_env_vars: 94 | values["OS"] = os_env_vars[var] 95 | if var in dotenv_vars: 96 | values["dotenv"] = dotenv_vars[var] 97 | if var in docker_vars: 98 | values["docker"] = docker_vars[var] 99 | 100 | if not values: 101 | print(f" {var}: {yellow('Not set in any source')}") 102 | continue 103 | 104 | if len(set(values.values())) > 1: 105 | print(f" {var}: {red('CONFLICT DETECTED')}") 106 | for source, value in values.items(): 107 | print(f" - {source}: {value}") 108 | else: 109 | value = next(iter(values.values())) 110 | sources = ", ".join(values.keys()) 111 | print(f" {var}: {green(value)} (from {sources})") 112 | 113 | # Check app config (after environment variables are resolved) 114 | print("\n" + yellow("Checking application config:")) 115 | try: 116 | from app.config import settings 117 | from app.utils.auth_health import check_auth_configuration 118 | 119 | # Check for auth configuration inconsistencies 120 | auth_status = check_auth_configuration() 121 | if auth_status["inconsistent_config"]: 122 | print(red(" Authentication configuration issue detected:")) 123 | for rec in auth_status["recommendations"]: 124 | print(f" - {rec}") 125 | else: 126 | print(green(" Authentication configuration is consistent")) 127 | 128 | # Check other important settings 129 | print("\n" + yellow("Final resolved configuration:")) 130 | print(f" ENABLE_API_KEY_AUTH: {settings.ENABLE_API_KEY_AUTH}") 131 | print(f" API_KEYS configured: {bool(settings.API_KEYS)}") 132 | print(f" API_KEYS count: {len(settings.API_KEYS)}") 133 | print(f" RATE_LIMIT_ENABLED: {settings.RATE_LIMIT_ENABLED}") 134 | print(f" ENABLE_CACHE: {settings.ENABLE_CACHE}") 135 | print(f" ENVIRONMENT: {settings.ENVIRONMENT}") 136 | print(f" LOG_LEVEL: {settings.LOG_LEVEL}") 137 | 138 | except ImportError: 139 | print(red(" Could not import app.config. Make sure you're running from the project root")) 140 | 141 | if __name__ == "__main__": 142 | try: 143 | debug_env_conflicts() 144 | except Exception as e: 145 | print(red(f"Error: {e}")) 146 | import traceback 147 | traceback.print_exc() 148 | -------------------------------------------------------------------------------- /scripts/debug_env_load_order.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to debug the order of environment variable loading 3 | # Run this to understand where each environment variable comes from 4 | 5 | echo "=== Environment Variable Load Order Debug ===" 6 | 7 | # Check different environment variable sources in order of precedence 8 | echo "Environment variables from different sources:" 9 | echo "1. 
Command line/docker-compose.yml environment section:" 10 | echo " LOG_LEVEL=$LOG_LEVEL" 11 | echo " ENABLE_API_KEY_AUTH=$ENABLE_API_KEY_AUTH" 12 | echo 13 | 14 | # Check Dockerfile ENV vs runtime environment 15 | echo "2. Default values from Dockerfile (these should be overridden at runtime):" 16 | echo " Dockerfile ARG LOG_LEVEL default=DEBUG" 17 | echo " Dockerfile ARG ENABLE_API_KEY_AUTH default=false" 18 | echo 19 | 20 | # Dump all environment variables for analysis 21 | echo "3. All current environment variables (alphabetical):" 22 | env | grep -E "LOG_LEVEL|ENABLE_|API_KEY|ENVIRONMENT" | sort 23 | echo 24 | 25 | echo "=== Environment Variable Override Chain ===" 26 | echo "Command line args > docker-compose environment > .env > Dockerfile ENV > Dockerfile ARG defaults" 27 | echo "===========================================" 28 | -------------------------------------------------------------------------------- /scripts/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Docker entrypoint script that handles script permissions and execution 3 | 4 | # Ensure scripts are executable (needed when mounted as volumes) 5 | find /app/scripts -type f -name "*.sh" -exec chmod +x {} \; 6 | find /app/scripts -type f -name "*.py" -exec chmod +x {} \; 7 | 8 | # Display environment variable debug info 9 | echo "=== Environment Variable Load Order Debug ===" 10 | echo "Environment variables from different sources:" 11 | echo "1. Command line/docker-compose.yml environment section:" 12 | echo " LOG_LEVEL=$LOG_LEVEL" 13 | echo " ENABLE_API_KEY_AUTH=$ENABLE_API_KEY_AUTH" 14 | echo 15 | 16 | # Check Dockerfile ENV vs runtime environment 17 | echo "2. Default values from Dockerfile (these should be overridden at runtime):" 18 | echo " Dockerfile ARG LOG_LEVEL default=DEBUG" 19 | echo " Dockerfile ARG ENABLE_API_KEY_AUTH default=false" 20 | echo 21 | 22 | # Dump all environment variables for analysis 23 | echo "3. All current environment variables (alphabetical):" 24 | env | grep -E "LOG_LEVEL|ENABLE_|API_KEY|ENVIRONMENT" | sort 25 | echo 26 | 27 | echo "=== Environment Variable Override Chain ===" 28 | echo "Command line args > docker-compose environment > .env > Dockerfile ENV > Dockerfile ARG defaults" 29 | echo "===========================================" 30 | 31 | # Run the confirmation script 32 | bash /app/scripts/confirm_env.sh 33 | 34 | # Start the FastAPI application 35 | exec uvicorn app.main:app --host 0.0.0.0 --port 8000 --proxy-headers 36 | -------------------------------------------------------------------------------- /scripts/increment_version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to increment the version number in the app's __init__.py file. 
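For example, "python increment_version.py minor" turns version 1.2.3 into 1.3.0 (the minor number is incremented and patch resets to 0).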
4 | Usage: python increment_version.py [major|minor|patch] 5 | """ 6 | import re 7 | import sys 8 | from pathlib import Path 9 | 10 | # Get the project root directory 11 | project_root = Path(__file__).parent.parent 12 | init_file = project_root / "app" / "__init__.py" 13 | 14 | def read_version(): 15 | """Read the current version from __init__.py""" 16 | content = init_file.read_text() 17 | version_match = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', content) 18 | if not version_match: 19 | raise ValueError("Could not find version string in __init__.py") 20 | return version_match.group(1) 21 | 22 | def write_version(new_version): 23 | """Write the new version to __init__.py""" 24 | content = init_file.read_text() 25 | new_content = re.sub( 26 | r'__version__\s*=\s*["\']([^"\']+)["\']', 27 | f'__version__ = "{new_version}"', 28 | content 29 | ) 30 | init_file.write_text(new_content) 31 | 32 | def increment_version(version_part): 33 | """ 34 | Increment the version number. 35 | version_part: 'major', 'minor', or 'patch' 36 | """ 37 | current = read_version() 38 | print(f"Current version: {current}") 39 | 40 | try: 41 | major, minor, patch = map(int, current.split('.')) 42 | except ValueError: 43 | print(f"Error: Version {current} is not in the format X.Y.Z") 44 | sys.exit(1) 45 | 46 | if version_part == "major": 47 | major += 1 48 | minor = 0 49 | patch = 0 50 | elif version_part == "minor": 51 | minor += 1 52 | patch = 0 53 | elif version_part == "patch": 54 | patch += 1 55 | else: 56 | print(f"Error: Unknown version part '{version_part}'. Use 'major', 'minor', or 'patch'") 57 | sys.exit(1) 58 | 59 | new_version = f"{major}.{minor}.{patch}" 60 | write_version(new_version) 61 | print(f"Version updated to: {new_version}") 62 | return new_version 63 | 64 | if __name__ == "__main__": 65 | if len(sys.argv) != 2 or sys.argv[1] not in ["major", "minor", "patch"]: 66 | print("Usage: python increment_version.py [major|minor|patch]") 67 | sys.exit(1) 68 | 69 | increment_version(sys.argv[1]) 70 | -------------------------------------------------------------------------------- /scripts/load_local_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to explicitly load .env.local environment variables. 4 | Run this script before starting the app if you want to use .env.local. 5 | """ 6 | import os 7 | import sys 8 | from pathlib import Path 9 | 10 | try: 11 | from dotenv import load_dotenv 12 | except ImportError: 13 | print("Error: python-dotenv is not installed. 
/scripts/load_local_env.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Script to explicitly load .env.local environment variables.
Run this script before starting the app if you want to use .env.local.
"""
import os
import sys
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    print("Error: python-dotenv is not installed. Please install it with:")
    print("pip install python-dotenv")
    sys.exit(1)

def load_local_env():
    """Explicitly load .env.local file if it exists."""
    env_local_path = Path(".env.local")

    if not env_local_path.exists():
        print(f"Warning: {env_local_path} not found")
        return False

    print(f"Loading environment variables from {env_local_path.absolute()}")
    load_dotenv(env_local_path, override=True)

    # Print a few non-sensitive variables to confirm loading
    print("Loaded variables (sample):")
    for var in ["LOG_LEVEL", "ENVIRONMENT", "ENABLE_CACHE"]:
        value = os.getenv(var, "[not set]")
        print(f"  {var}={value}")

    return True

if __name__ == "__main__":
    if load_local_env():
        print("\nSuccessfully loaded .env.local")
        print("Run your application now to use these variables")
    else:
        print("\nNo .env.local file found")
        print("Using default environment variables only")

--------------------------------------------------------------------------------
/scripts/load_test.py:
--------------------------------------------------------------------------------
"""Load testing script for the JobSpy Docker API."""
import argparse
import asyncio
import random
import statistics
import time
from typing import Any, Dict, List

import aiohttp

# Job titles and locations for random queries
JOB_TITLES = ["software engineer", "data scientist", "product manager", "devops engineer", "full stack developer"]
LOCATIONS = ["San Francisco, CA", "New York, NY", "Seattle, WA", "Austin, TX", "Boston, MA"]

async def make_request(session, url, api_key, params=None, json_data=None):
    """Make an HTTP request and measure response time."""
    headers = {"x-api-key": api_key, "accept": "application/json"}

    start_time = time.time()

    if json_data:
        async with session.post(url, headers=headers, json=json_data) as response:
            data = await response.json()
            status = response.status
    else:
        async with session.get(url, headers=headers, params=params) as response:
            data = await response.json()
            status = response.status

    end_time = time.time()
    response_time = end_time - start_time

    return {
        "status": status,
        "response_time": response_time,
        "data": data
    }

async def run_load_test(base_url, api_key, num_requests, concurrency):
    """Run a load test with the specified number of concurrent requests."""
    print(f"Starting load test with {num_requests} total requests, {concurrency} concurrent")

    # Create a connection pool
    connector = aiohttp.TCPConnector(limit=concurrency)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = []

        for _ in range(num_requests):
            # Generate random query parameters
            job_title = random.choice(JOB_TITLES)
            location = random.choice(LOCATIONS)

            # Randomly choose between GET and POST
            if random.choice([True, False]):
                # GET request: query-string values must be scalars, so pass a
                # single randomly chosen site as a plain string
                params = {
                    "site_name": random.choice(["indeed", "linkedin", "zip_recruiter"]),
                    "search_term": job_title,
                    "location": location,
                    "results_wanted": 5
                }
                tasks.append(make_request(session, f"{base_url}/api/v1/search_jobs", api_key, params=params))
            else:
                # POST request
                json_data = {
                    "site_name": random.sample(["indeed", "linkedin", "zip_recruiter"], 2),
                    "search_term": job_title,
                    "location": location,
                    "results_wanted": 5
                }
                tasks.append(make_request(session, f"{base_url}/api/v1/search_jobs", api_key, json_data=json_data))

        # Execute requests with limited concurrency
        results = []
        for i in range(0, len(tasks), concurrency):
            batch = tasks[i:i+concurrency]
            batch_results = await asyncio.gather(*batch)
            results.extend(batch_results)
            print(f"Completed {min(i+concurrency, len(tasks))}/{len(tasks)} requests")

    return results

def analyze_results(results):
    """Analyze load test results."""
    response_times = [r["response_time"] for r in results]
    statuses = [r["status"] for r in results]

    # Calculate statistics
    avg_time = statistics.mean(response_times)
    median_time = statistics.median(response_times)
    min_time = min(response_times)
    max_time = max(response_times)
    p95_time = sorted(response_times)[int(len(response_times) * 0.95)]

    success_count = statuses.count(200)
    error_count = len(statuses) - success_count

    # Print results
    print("\n=== Load Test Results ===")
    print(f"Total Requests: {len(results)}")
    print(f"Success Rate: {success_count/len(results)*100:.2f}% ({success_count}/{len(results)})")
    print(f"Average Response Time: {avg_time:.4f} seconds")
    print(f"Median Response Time: {median_time:.4f} seconds")
    print(f"Min Response Time: {min_time:.4f} seconds")
    print(f"Max Response Time: {max_time:.4f} seconds")
    print(f"95th Percentile Response Time: {p95_time:.4f} seconds")

    # Count status codes
    status_counts = {}
    for status in statuses:
        status_counts[status] = status_counts.get(status, 0) + 1

    print("\nStatus Code Distribution:")
    for status, count in status_counts.items():
        print(f"  {status}: {count} ({count/len(results)*100:.2f}%)")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Load test the JobSpy Docker API")
    parser.add_argument("--url", default="http://localhost:8000", help="Base URL of the API")
    parser.add_argument("--api-key", required=True, help="API key for authentication")
    parser.add_argument("--requests", type=int, default=10, help="Total number of requests to make")
    parser.add_argument("--concurrency", type=int, default=2, help="Number of concurrent requests")
    args = parser.parse_args()

    results = asyncio.run(run_load_test(args.url, args.api_key, args.requests, args.concurrency))
    analyze_results(results)

--------------------------------------------------------------------------------
/scripts/make_scripts_executable.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Make all scripts executable
echo "Making scripts executable..."
chmod +x scripts/*.py
chmod +x scripts/*.sh
echo "Done."

--------------------------------------------------------------------------------
/scripts/set_log_level.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Script to set log level and restart the application

if [ -z "$1" ]; then
    echo "Usage: $0 <log_level>"
    echo "Available levels: DEBUG, INFO, WARNING, ERROR, CRITICAL"
    exit 1
fi

LOG_LEVEL=$(echo "$1" | tr '[:lower:]' '[:upper:]')
echo "Setting log level to: $LOG_LEVEL"

# Update .env file
sed -i.bak "s/^LOG_LEVEL=.*/LOG_LEVEL=$LOG_LEVEL/" .env

# Restart the service with new log level
echo "Restarting services..."
docker-compose down
LOG_LEVEL=$LOG_LEVEL docker-compose up -d

echo "Done! Services restarted with log level: $LOG_LEVEL"
echo "View logs with: docker-compose logs -f"

--------------------------------------------------------------------------------
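The level names accepted above are the standard Python logging levels, which the application's logging configuration presumably resolves to numeric thresholds. A small sketch of that mapping:

```python
# The accepted names map onto stdlib logging constants; getLevelName()
# resolves a registered level name to its numeric threshold.
import logging

for name in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
    print(f"{name} -> {logging.getLevelName(name)}")  # e.g. DEBUG -> 10
```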
/scripts/verify_env_loading.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""
Script to verify that environment variables are being properly loaded.
Run this script to compare .env values with actual loaded values.
"""
import os
import sys
from pathlib import Path

# Guard the import here so a helpful message is printed instead of a bare
# ImportError before any of the script runs
try:
    import dotenv
except ImportError:
    print("python-dotenv package is required to run this script.")
    print("Install it with: pip install python-dotenv")
    sys.exit(1)

# Add parent directory to path so we can import app modules
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

def verify_env_loading():
    """Verify environment variables are loaded correctly from .env files."""
    print("=== Environment Variable Loading Verification ===\n")

    # Load .env file content to compare with actual environment variables
    env_file = Path(".env")
    env_local_file = Path(".env.local")

    env_vars = {}
    if env_file.exists():
        print(f"Loading .env file from {env_file.absolute()}")
        env_vars.update(dotenv.dotenv_values(env_file))
    else:
        print(".env file not found")

    # Check if .env.local exists, but note that it's not loaded by default
    if env_local_file.exists():
        print(f"Found .env.local file at {env_local_file.absolute()}")
        print("NOTE: .env.local is not automatically loaded by the application.")
        print("To use .env.local, you must explicitly load it or use docker-compose.dev.yml")

        # Still load it for debugging purposes
        local_vars = dotenv.dotenv_values(env_local_file)
        print(f"  .env.local contains {len(local_vars)} variables")
    else:
        print(".env.local file not found")

    print("\n=== Expected vs Actual Values ===")
    for key, expected_value in env_vars.items():
        actual_value = os.getenv(key)
        match = expected_value == actual_value
        status = "✅" if match else "❌"

        # Mask API keys
        if "API_KEY" in key and expected_value:
            expected_value = "****[MASKED]****"
        if "API_KEY" in key and actual_value:
            actual_value = "****[MASKED]****"

        print(f"{status} {key}:")
        print(f"  Expected: {expected_value!r}")
        print(f"  Actual: {actual_value!r}")

    print("\n=== Docker Environment Note ===")
    print("If running in Docker, environment values in docker-compose.yml")
    print("will override values from .env files. To fix this:")
    print("1. Use ${VAR_NAME:-default} syntax in docker-compose.yml")
    print("2. Use the env_file directive to load .env files")
    print("3. Ensure .env files are mounted/copied to the container")

if __name__ == "__main__":
    verify_env_loading()

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
"""Test package for JobSpy Docker API."""

--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
"""Pytest configuration for JobSpy Docker API tests."""
import pytest
from fastapi.testclient import TestClient
from app.main import app

@pytest.fixture
def client():
    """Get a TestClient instance for the FastAPI app."""
    with TestClient(app) as test_client:
        yield test_client

--------------------------------------------------------------------------------
/tests/test_api.py:
--------------------------------------------------------------------------------
"""Tests for the JobSpy Docker API."""
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd

def test_health_endpoint(client):
    """Test the health endpoint."""
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json()["status"] == "ok"

@patch('app.services.job_service.scrape_jobs')
def test_search_jobs(mock_scrape_jobs, client):
    """Test the search_jobs endpoint."""
    # Setup mock
    mock_df = pd.DataFrame({
        'SITE': ['indeed', 'linkedin'],
        'TITLE': ['Software Engineer', 'Data Scientist'],
        'COMPANY': ['Test Corp', 'Test Inc'],
    })
    mock_scrape_jobs.return_value = mock_df

    # Disable auth for testing
    with patch('app.config.settings.ENABLE_API_KEY_AUTH', False):
        response = client.post(
            "/api/v1/search_jobs",
            json={
                "site_name": ["indeed", "linkedin"],
                "search_term": "software engineer",
                "location": "San Francisco",
                "country_indeed": "USA"
            }
        )

    # Check response
    assert response.status_code == 200
    assert response.json()["count"] == 2
    assert not response.json()["cached"]
    assert len(response.json()["jobs"]) == 2

--------------------------------------------------------------------------------
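The patching pattern in test_search_jobs above -- patch the name where it is looked up (app.services.job_service.scrape_jobs) and return a canned DataFrame -- generalizes to any service dependency. A minimal self-contained sketch of the same idea (module and function names hypothetical):

```python
# Patch-where-it-is-used, illustrated on hypothetical names: the fake
# DataFrame is returned instead of hitting the network.
import pandas as pd
from unittest.mock import patch

def fetch_jobs():
    raise RuntimeError("would hit the network")

def count_jobs():
    return len(fetch_jobs())

with patch(f"{__name__}.fetch_jobs", return_value=pd.DataFrame({"TITLE": ["a", "b"]})):
    assert count_jobs() == 2  # len() of a DataFrame counts rows
```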