├── apps
│   ├── api
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── api
│   │   │   │   ├── __init__.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── schemas.py
│   │   │   │   ├── routes
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── crawl.py
│   │   │   │   └── middleware
│   │   │   │       └── __init__.py
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── crawler.py
│   │   │   ├── config.py
│   │   │   ├── requirements.txt
│   │   │   └── main.py
│   │   └── Dockerfile
│   └── sdks
│       ├── python
│       │   ├── scrapester
│       │   │   ├── __init__.py
│       │   │   └── crawler.py
│       │   ├── README.md
│       │   ├── test.py
│       │   └── setup.py
│       └── javascript
│           ├── src
│           │   ├── index.ts
│           │   ├── types.ts
│           │   └── client.ts
│           ├── tsup.config.ts
│           ├── tsconfig.json
│           ├── README.md
│           ├── .npmignore
│           └── package.json
├── assets
│   └── image.png
├── LICENSE
├── .gitignore
└── README.md

/apps/api/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/routes/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/middleware/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/sdks/python/scrapester/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/sdks/python/README.md:
--------------------------------------------------------------------------------
1 | # scrapester
2 | Turn any website into LLM structured data.
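A minimal usage sketch to complement the tagline above, mirroring `test.py` and the `ScrapesterApp` client defined in `scrapester/crawler.py`; the API key and URLs are placeholders, and `max_pages`/`max_depth` are the crawl options documented in the SDK:

```python
from scrapester.crawler import ScrapesterApp, APIError

# Placeholder key; point base_url at your own deployment if it is not localhost:8000.
client = ScrapesterApp(api_key="your-api-key")

try:
    # Single page -> CrawlerResponse with .url, .markdown, .metadata, .timestamp
    page = client.scrape("https://example.com")
    print(page.markdown)

    # Whole site -> list of CrawlerResponse objects
    pages = client.crawl("https://example.com", options={"max_pages": 10, "max_depth": 2})
    print(f"Crawled {len(pages)} pages")
except APIError as exc:
    print(f"API error ({exc.status_code}): {exc}")
```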
3 | -------------------------------------------------------------------------------- /assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bugsterapp/scrapester/HEAD/assets/image.png -------------------------------------------------------------------------------- /apps/sdks/javascript/src/index.ts: -------------------------------------------------------------------------------- 1 | export { ScrapesterApp } from './client'; 2 | export { 3 | APIError, CrawlerError, CrawlerResponse, type CrawlerResponseData 4 | } from './types'; 5 | -------------------------------------------------------------------------------- /apps/sdks/javascript/tsup.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'tsup'; 2 | 3 | export default defineConfig({ 4 | entry: ['src/index.ts'], 5 | format: ['cjs', 'esm'], 6 | dts: true, 7 | splitting: false, 8 | sourcemap: true, 9 | clean: true, 10 | }); -------------------------------------------------------------------------------- /apps/api/src/config.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | from typing import Optional 3 | 4 | class Settings(BaseSettings): 5 | REDIS_URL: str = "redis://localhost:6379" 6 | CACHE_TTL: int = 3600 7 | MAX_CONCURRENT_CRAWLS: int = 5 8 | RATE_LIMIT_REQUESTS: int = 100 9 | RATE_LIMIT_PERIOD: int = 60 10 | API_KEY_HEADER: str = "X-API-Key" 11 | 12 | class Config: 13 | env_file = ".env" 14 | 15 | settings = Settings() 16 | -------------------------------------------------------------------------------- /apps/sdks/javascript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2020", 4 | "module": "commonjs", 5 | "declaration": true, 6 | "outDir": "./dist", 7 | "strict": true, 8 | "esModuleInterop": true, 9 | "skipLibCheck": true, 10 | "forceConsistentCasingInFileNames": true, 11 | "moduleResolution": "node", 12 | "types": ["yargs"] 13 | }, 14 | "include": ["src"], 15 | "exclude": ["node_modules", "dist", "test"] 16 | } 17 | -------------------------------------------------------------------------------- /apps/sdks/javascript/README.md: -------------------------------------------------------------------------------- 1 | # scrapester 2 | Turn any website into LLM structured data. 
3 | 4 | ## Usage Example 5 | 6 | 7 | import { ScrapesterApp } from 'scrapester'; 8 | 9 | const client = new ScrapesterApp('your-api-key'); 10 | 11 | // Scrape a single URL 12 | const result = await client.scrape('https://example.com'); 13 | console.log(result.markdown); 14 | 15 | // Crawl a website 16 | const results = await client.crawl('https://example.com', { 17 | maxPages: 10, 18 | maxDepth: 2 19 | }); 20 | 21 | 22 | -------------------------------------------------------------------------------- /apps/sdks/python/test.py: -------------------------------------------------------------------------------- 1 | from scrapester.crawler import ScrapesterApp, APIError 2 | 3 | 4 | client = ScrapesterApp(api_key="your-api-key") 5 | 6 | # # Scrape a single URL 7 | # try: 8 | # url = "https://platan.us/" 9 | # result = client.crawl(url) 10 | # print(f"Title: {result.metadata.get('title')}") 11 | # print(f"Content:\n{result.markdown}") 12 | # except APIError as e: 13 | # print(f"Error: {e}") 14 | 15 | 16 | try: 17 | url = "https://platan.us/" 18 | result = client.crawl(url) 19 | print(f"{result}") 20 | except APIError as e: 21 | print(f"Error: {e}") 22 | -------------------------------------------------------------------------------- /apps/sdks/javascript/.npmignore: -------------------------------------------------------------------------------- 1 | # Source 2 | src/ 3 | tests/ 4 | examples/ 5 | docs/ 6 | 7 | # Configuration files 8 | .eslintrc 9 | .prettierrc 10 | .editorconfig 11 | .gitignore 12 | tsconfig.json 13 | jest.config.js 14 | .travis.yml 15 | .github/ 16 | .vscode/ 17 | 18 | # Development files 19 | *.test.ts 20 | *.spec.ts 21 | coverage/ 22 | .nyc_output/ 23 | .circleci/ 24 | 25 | # Documentation 26 | docs/ 27 | CONTRIBUTING.md 28 | CHANGELOG.md 29 | 30 | # Build tools 31 | webpack.config.js 32 | rollup.config.js 33 | babel.config.js 34 | tsup.config.ts 35 | 36 | # Git 37 | .git/ 38 | .gitignore 39 | 40 | # Development environment 41 | .env* 42 | .idea/ 43 | .vscode/ 44 | 45 | # Logs 46 | *.log 47 | npm-debug.log* 48 | yarn-debug.log* 49 | yarn-error.log* 50 | 51 | # Cache 52 | .eslintcache 53 | .npm 54 | .yarn 55 | 56 | # Test files 57 | __tests__/ 58 | __mocks__/ 59 | *.test.ts 60 | *.spec.ts 61 | test/ -------------------------------------------------------------------------------- /apps/api/src/api/models/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, HttpUrl 2 | from typing import List, Dict, Optional, Union 3 | from enum import Enum 4 | 5 | 6 | class ActionType(str, Enum): 7 | CLICK = "click" 8 | TYPE = "type" 9 | WAIT = "wait" 10 | SCROLL = "scroll" 11 | 12 | 13 | class Action(BaseModel): 14 | type: ActionType 15 | selector: Optional[str] = None 16 | text: Optional[str] = None 17 | duration: Optional[int] = None 18 | 19 | 20 | class CrawlOptions(BaseModel): 21 | max_pages: Optional[int] = 10 22 | max_depth: Optional[int] = 3 23 | timeout: Optional[int] = 30000 24 | scroll: Optional[bool] = True 25 | screenshot: Optional[bool] = False 26 | wait_for_selector: Optional[str] = None 27 | actions: Optional[List[Action]] = None 28 | # auth: Optional[AuthConfig] = None 29 | 30 | 31 | class ScrapeRequest(BaseModel): 32 | url: HttpUrl 33 | 34 | 35 | class CrawlRequest(BaseModel): 36 | url: HttpUrl 37 | options: Optional[CrawlOptions] = None 38 | -------------------------------------------------------------------------------- /apps/api/src/requirements.txt: 
-------------------------------------------------------------------------------- 1 | annotated-types==0.7.0 2 | anyio==4.6.2.post1 3 | beautifulsoup4==4.12.3 4 | build==1.2.2.post1 5 | certifi==2024.8.30 6 | charset-normalizer==3.4.0 7 | click==8.1.7 8 | docutils==0.21.2 9 | fastapi==0.115.5 10 | greenlet==3.1.1 11 | gunicorn==23.0.0 12 | h11==0.14.0 13 | idna==3.10 14 | importlib_metadata==8.5.0 15 | jaraco.classes==3.4.0 16 | jaraco.context==6.0.1 17 | jaraco.functools==4.1.0 18 | keyring==25.5.0 19 | Markdown==3.7 20 | markdown-it-py==3.0.0 21 | mdurl==0.1.2 22 | more-itertools==10.5.0 23 | nh3==0.2.18 24 | packaging==24.2 25 | pkginfo==1.10.0 26 | playwright==1.49.0 27 | pydantic==2.10.1 28 | pydantic-settings==2.6.1 29 | pydantic_core==2.27.1 30 | pyee==12.0.0 31 | Pygments==2.18.0 32 | pyproject_hooks==1.2.0 33 | python-dotenv==1.0.1 34 | readme_renderer==44.0 35 | requests==2.32.3 36 | requests-toolbelt==1.0.0 37 | rfc3986==2.0.0 38 | rich==13.9.4 39 | setuptools==75.6.0 40 | sniffio==1.3.1 41 | soupsieve==2.6 42 | starlette==0.41.3 43 | twine==5.1.1 44 | typing_extensions==4.12.2 45 | urllib3==2.2.3 46 | uvicorn==0.32.1 47 | zipp==3.21.0 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Bugster 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /apps/api/Dockerfile: -------------------------------------------------------------------------------- 1 | # Define function directory 2 | ARG FUNCTION_DIR="/src" 3 | 4 | FROM mcr.microsoft.com/playwright/python:v1.49.0-jammy as build-image 5 | 6 | # Install aws-lambda-cpp build dependencies 7 | RUN apt-get update && \ 8 | apt-get install -y \ 9 | g++ \ 10 | make \ 11 | cmake \ 12 | unzip \ 13 | libcurl4-openssl-dev 14 | # Include global arg in this stage of the build 15 | ARG FUNCTION_DIR 16 | # Create function directory 17 | RUN mkdir -p ${FUNCTION_DIR} 18 | 19 | # Copy function code 20 | COPY ./src ${FUNCTION_DIR} 21 | 22 | # Install Playwright 23 | RUN pip3 install playwright 24 | 25 | # Run Playwright install to ensure all necessary browser binaries are installed 26 | RUN playwright install 27 | 28 | # Multi-stage build: grab a fresh copy of the base image 29 | FROM mcr.microsoft.com/playwright/python:v1.49.0-jammy 30 | # Include global arg in this stage of the build 31 | ARG FUNCTION_DIR 32 | # Set working directory to function root directory 33 | WORKDIR ${FUNCTION_DIR} 34 | 35 | # Copy in the build image dependencies 36 | COPY --from=build-image ${FUNCTION_DIR} ${FUNCTION_DIR} 37 | # Install the dependencies 38 | RUN pip install --no-cache-dir -r requirements.txt 39 | 40 | CMD [ "uvicorn","main:app", "--host","0.0.0.0", "--port", "80" ] 41 | -------------------------------------------------------------------------------- /apps/sdks/python/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup, find_packages 3 | 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | setup( 8 | name="scrapester", 9 | version="0.1.2", 10 | author="Naquiao", 11 | author_email="ignacio@bugster.app", 12 | description="Python SDK for Scrapester API", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/Bugsterapp/scrapester", 16 | packages=find_packages(), 17 | classifiers=[ 18 | "Development Status :: 3 - Alpha", 19 | "Intended Audience :: Developers", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python :: 3.7", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Topic :: Internet :: WWW/HTTP", 29 | "Topic :: Software Development :: Libraries :: Python Modules", 30 | ], 31 | python_requires=">=3.7", 32 | install_requires=[ 33 | "requests>=2.25.0", 34 | ], 35 | ) 36 | -------------------------------------------------------------------------------- /apps/sdks/javascript/src/types.ts: -------------------------------------------------------------------------------- 1 | export interface CrawlerResponseData { 2 | url: string; 3 | markdown: string; 4 | metadata: Record; 5 | timestamp: string; 6 | } 7 | 8 | export class CrawlerResponse implements CrawlerResponseData { 9 | url: string; 10 | markdown: string; 11 | metadata: Record; 12 | timestamp: string; 13 | 14 | constructor(data: Partial) { 15 | this.url = data.url || ''; 16 | this.markdown = data.markdown || ''; 17 | this.metadata = data.metadata || {}; 18 | this.timestamp = data.timestamp || new Date().toISOString(); 19 | } 20 
| 21 | static fromDict(data: Record): CrawlerResponse { 22 | return new CrawlerResponse({ 23 | url: data.url, 24 | markdown: data.markdown, 25 | metadata: data.metadata, 26 | timestamp: data.timestamp, 27 | }); 28 | } 29 | } 30 | 31 | export class CrawlerError extends Error { 32 | constructor(message: string) { 33 | super(message); 34 | this.name = 'CrawlerError'; 35 | } 36 | } 37 | 38 | export class APIError extends CrawlerError { 39 | constructor( 40 | message: string, 41 | public statusCode?: number, 42 | public response?: Record 43 | ) { 44 | super(message); 45 | this.name = 'APIError'; 46 | } 47 | } -------------------------------------------------------------------------------- /apps/sdks/javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapester", 3 | "version": "0.1.0", 4 | "description": "JavaScript SDK for Scrapester API", 5 | "main": "dist/index.js", 6 | "module": "dist/index.mjs", 7 | "types": "dist/index.d.ts", 8 | "files": [ 9 | "dist" 10 | ], 11 | "scripts": { 12 | "build": "tsup", 13 | "test": "jest", 14 | "lint": "eslint src --ext .ts", 15 | "format": "prettier --write \"src/**/*.ts\"", 16 | "prepublishOnly": "npm run build" 17 | }, 18 | "author": "Naquiao ", 19 | "license": "MIT", 20 | "dependencies": { 21 | "axios": "^1.6.0" 22 | }, 23 | "devDependencies": { 24 | "@types/jest": "^29.5.0", 25 | "@types/node": "^20.0.0", 26 | "@types/yargs": "^17.0.33", 27 | "@typescript-eslint/eslint-plugin": "^6.0.0", 28 | "@typescript-eslint/parser": "^6.0.0", 29 | "eslint": "^8.0.0", 30 | "jest": "^29.0.0", 31 | "prettier": "^3.0.0", 32 | "ts-jest": "^29.0.0", 33 | "tsup": "^8.3.5", 34 | "typescript": "^5.0.0" 35 | }, 36 | "keywords": [ 37 | "scraper", 38 | "crawler", 39 | "web-scraping", 40 | "api-client" 41 | ], 42 | "repository": { 43 | "type": "git", 44 | "url": "git+https://github.com/Bugsterapp/scrapester.git" 45 | }, 46 | "bugs": { 47 | "url": "https://github.com/Bugsterapp/scrapester/issues" 48 | }, 49 | "homepage": "https://github.com/Bugsterapp/scrapester#readme" 50 | } 51 | -------------------------------------------------------------------------------- /apps/api/src/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Request 2 | from fastapi.middleware.cors import CORSMiddleware 3 | from fastapi.responses import JSONResponse, RedirectResponse 4 | from api.routes import crawl 5 | from config import settings 6 | import logging 7 | from contextlib import asynccontextmanager 8 | 9 | # Configure logging 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @asynccontextmanager 15 | async def lifespan(app: FastAPI): 16 | yield 17 | # Cleanup on shutdown 18 | crawler = await crawl.get_crawler() 19 | await crawler.close() 20 | 21 | 22 | app = FastAPI( 23 | title="Scrapester API", 24 | description="High-performance web crawler", 25 | version="1.0.0", 26 | lifespan=lifespan, 27 | ) 28 | app.add_middleware( 29 | CORSMiddleware, 30 | allow_origins=["*"], 31 | allow_credentials=True, 32 | allow_methods=["*"], 33 | allow_headers=["*"], 34 | ) 35 | 36 | 37 | # Error handler 38 | @app.exception_handler(Exception) 39 | async def global_exception_handler(request: Request, exc: Exception): 40 | logger.error(f"Global error handler caught: {exc}") 41 | return JSONResponse(status_code=500, content={"detail": str(exc)}) 42 | 43 | 44 | # Include routers 45 | app.include_router(crawl.router, prefix="/v1", 
tags=["crawler"])
46 | 
47 | 
48 | @app.get("/docs", tags=["API Documentation"])
49 | async def docs_redirect():
50 |     return RedirectResponse(url="/api/v1/docs")
51 | 
52 | 
53 | @app.get("/health", tags=["Healthcheck"])
54 | async def health_check():
55 |     return {"status": "ok"}
56 | 
57 | 
58 | @app.get("/openapi.json", include_in_schema=False)
59 | async def get_openapi_json():
60 |     return app.openapi()
61 | 
--------------------------------------------------------------------------------
/apps/api/src/api/routes/crawl.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, HTTPException, BackgroundTasks, Depends
2 | from ..models.schemas import ScrapeRequest, CrawlRequest
3 | from core.crawler import ScrapesterCrawler
4 | from typing import Dict, List
5 | import asyncio
6 | 
7 | router = APIRouter()
8 | crawler = None
9 | 
10 | 
11 | async def get_crawler():
12 |     global crawler
13 |     if crawler is None:
14 |         crawler = ScrapesterCrawler()
15 |         await crawler.initialize()
16 |     return crawler
17 | 
18 | 
19 | @router.post("/scrape")
20 | async def scrape_url(
21 |     request: ScrapeRequest, crawler: ScrapesterCrawler = Depends(get_crawler)
22 | ) -> Dict:
23 |     """
24 |     Scrape a single URL with advanced options
25 |     """
26 |     try:
27 |         result = await crawler.scrape_url(str(request.url), None)
28 |         return {"success": True, "data": result}
29 |     except Exception as e:
30 |         raise HTTPException(status_code=500, detail=str(e))
31 | 
32 | 
33 | @router.post("/crawl")
34 | async def crawl_website(
35 |     request: CrawlRequest,
36 |     background_tasks: BackgroundTasks,
37 |     crawler: ScrapesterCrawler = Depends(get_crawler),
38 | ) -> Dict:
39 |     """
40 |     Crawl an entire website with advanced options
41 |     """
42 |     try:
43 |         # Run the crawl and collect the results
44 |         results = await crawler.crawl_website(
45 |             str(request.url), request.options.dict() if request.options else None
46 |         )
47 |         return {
48 |             "success": True,
49 |             "data": results,
50 |             "stats": {
51 |                 "pages_crawled": len(results),
52 |                 "successful_crawls": sum(1 for r in results if r.get("success", False)),
53 |             },
54 |         }
55 |     except Exception as e:
56 |         raise HTTPException(status_code=500, detail=str(e))
57 | 
--------------------------------------------------------------------------------
/apps/sdks/javascript/src/client.ts:
--------------------------------------------------------------------------------
1 | import axios, { AxiosError, AxiosInstance } from 'axios';
2 | import { APIError, CrawlerResponse } from './types';
3 | export class ScrapesterApp {
4 |   private session: AxiosInstance;
5 | 
6 |   constructor(
7 |     private apiKey: string,
8 |     private baseUrl: string = 'http://localhost:8000',
9 |     private timeout: number = 600000 // 600 seconds in milliseconds
10 |   ) {
11 |     this.baseUrl = baseUrl.replace(/\/$/, '');
12 |     this.session = this.createSession();
13 |   }
14 | 
15 |   private createSession(): AxiosInstance {
16 |     return axios.create({
17 |       baseURL: this.baseUrl,
18 |       timeout: this.timeout,
19 |       headers: {
20 |         'Authorization': `Bearer ${this.apiKey}`,
21 |         'Content-Type': 'application/json',
22 |         'User-Agent': 'Scrapester-JS-SDK/1.0'
23 |       }
24 |     });
25 |   }
26 | 
27 |   private async request(
28 |     method: string,
29 |     endpoint: string,
30 |     params?: Record<string, any>,
31 |     data?: Record<string, any>
32 |   ): Promise<any> {
33 |     const url = `${this.baseUrl}${endpoint}`;
34 | 
35 |     try {
36 |       const response = await this.session.request({
37 |         method,
38 |         url,
39 |         params,
40 |         data,
41 |       });
42 | 
43 |       return response.data;
44 |     } catch (error) {
45 |       if (error instanceof AxiosError) {
46 |         if (error.response?.status === 429) {
47 |           throw new APIError('Rate limit exceeded', 429);
48 |         }
49 | 
50 |         if (error.response) {
51 |           const errorData = error.response.data;
52 |           // Use the API's "detail" field when present, otherwise fall back to the HTTP status text
53 |           if (errorData && errorData.detail) {
54 |             throw new APIError(
55 |               String(errorData.detail),
56 |               error.response.status,
57 |               errorData
58 |             );
59 |           }
60 |           throw new APIError(
61 |             error.response.statusText,
62 |             error.response.status,
63 |             { detail: error.response.statusText }
64 |           );
65 |         }
66 |       }
67 | 
68 |       throw new APIError(String(error));
69 |     }
70 |   }
71 | 
72 |   async scrape(url: string): Promise<CrawlerResponse> {
73 |     const data = { url };
74 |     const response = await this.request('POST', '/v1/scrape', undefined, data);
75 |     return CrawlerResponse.fromDict(response.data || {});
76 |   }
77 | 
78 |   async crawl(
79 |     url: string,
80 |     options?: {
81 |       maxPages?: number;
82 |       maxDepth?: number;
83 |       includePatterns?: string[];
84 |       excludePatterns?: string[];
85 |     }
86 |   ): Promise<CrawlerResponse[]> {
87 |     const data = { url, ...options };
88 |     const response = await this.request('POST', '/v1/crawl', undefined, data);
89 |     return (response.data || []).map((item: any) => CrawlerResponse.fromDict(item));
90 |   }
91 | }
92 | 
--------------------------------------------------------------------------------
/apps/sdks/python/scrapester/crawler.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional, List, Union
2 | import requests
3 | from dataclasses import dataclass
4 | from datetime import datetime
5 | 
6 | 
7 | @dataclass
8 | class CrawlerResponse:
9 |     url: str
10 |     markdown: str
11 |     metadata: Dict
12 |     timestamp: str
13 | 
14 |     @classmethod
15 |     def from_dict(cls, data: Dict) -> "CrawlerResponse":
16 |         return cls(
17 |             url=data.get("url", ""),
18 |             markdown=data.get("markdown", ""),
19 |             metadata=data.get("metadata", {}),
20 |             timestamp=data.get("timestamp", datetime.utcnow().isoformat()),
21 |         )
22 | 
23 | 
24 | class CrawlerError(Exception):
25 |     """Base exception for crawler errors"""
26 | 
27 |     pass
28 | 
29 | 
30 | class APIError(CrawlerError):
31 |     """Raised when the API returns an error"""
32 | 
33 |     def __init__(self, message: str, status_code: int = None, response: Dict = None):
34 |         self.status_code = status_code
35 |         self.response = response
36 |         super().__init__(message)
37 | 
38 | 
39 | class ScrapesterApp:
40 |     def __init__(
41 |         self, api_key: str, base_url: str = "http://localhost:8000", timeout: int = 600
42 |     ):
43 |         """Initialize the Crawler client
44 | 
45 |         Args:
46 |             api_key: Your API key
47 |             base_url: Base URL for the API (default: http://localhost:8000)
48 |             timeout: Request timeout in seconds (default: 600)
49 |         """
50 |         self.api_key = api_key
51 |         self.base_url = base_url.rstrip("/")
52 |         self.timeout = timeout
53 |         self.session = self._create_session()
54 | 
55 |     def _create_session(self) -> requests.Session:
56 |         """Create a requests session with default headers"""
57 |         session = requests.Session()
58 |         session.headers.update(
59 |             {
60 |                 "Authorization": f"Bearer {self.api_key}",
61 |                 "Content-Type": "application/json",
62 |                 "User-Agent": "Scrapester-Python-SDK/1.0",
63 |             }
64 |         )
65 |         return session
66 | 
67 |     def _request(
68 |         self, method: str, endpoint: str, params: Dict = None, data: Dict = None
69 |     ) -> Dict:
70 |         """Make an HTTP request to the API"""
71 |         url = f"{self.base_url}{endpoint}"
72 | 
73 |         try:
74 |             response = self.session.request(
75 |                 method=method, url=url, params=params, json=data,
timeout=self.timeout 76 | ) 77 | 78 | if response.status_code == 429: 79 | raise APIError("Rate limit exceeded", status_code=429) 80 | 81 | response.raise_for_status() 82 | return response.json() 83 | 84 | except requests.exceptions.RequestException as e: 85 | if hasattr(e, "response") and e.response is not None: 86 | try: 87 | error_data = e.response.json() 88 | except ValueError: 89 | error_data = {"detail": e.response.text} 90 | raise APIError( 91 | str(error_data.get("detail", "Unknown error")), 92 | status_code=e.response.status_code, 93 | response=error_data, 94 | ) 95 | raise APIError(str(e)) 96 | 97 | def scrape(self, url: str) -> CrawlerResponse: 98 | """Scrape a single URL 99 | 100 | Args: 101 | url: The URL to scrape 102 | options: Optional scraping configurations 103 | - wait_for_selector: CSS selector to wait for 104 | - screenshot: Take a screenshot (bool) 105 | - scroll: Enable smart scrolling (bool) 106 | - timeout: Custom timeout for this request 107 | 108 | Returns: 109 | CrawlerResponse object containing the scraped data 110 | """ 111 | data = {"url": url} 112 | 113 | response = self._request("POST", "/v1/scrape", data=data) 114 | return CrawlerResponse.from_dict(response.get("data", {})) 115 | 116 | def crawl(self, url: str, options: Optional[Dict] = None) -> List[CrawlerResponse]: 117 | """Crawl a website starting from a URL 118 | 119 | Args: 120 | url: The starting URL to crawl 121 | options: Optional crawling configurations 122 | - max_pages: Maximum number of pages to crawl 123 | - max_depth: Maximum crawling depth 124 | - include_patterns: List of URL patterns to include 125 | - exclude_patterns: List of URL patterns to exclude 126 | 127 | Returns: 128 | List of CrawlerResponse objects 129 | """ 130 | data = {"url": url, **(options or {})} 131 | 132 | response = self._request("POST", "/v1/crawl", data=data) 133 | return [CrawlerResponse.from_dict(item) for item in response.get("data", [])] 134 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | 165 | # Dependencies 166 | node_modules/ 167 | package-lock.json 168 | yarn.lock 169 | pnpm-lock.yaml 170 | 171 | # Build outputs 172 | dist/ 173 | build/ 174 | lib/ 175 | .next/ 176 | out/ 177 | 178 | # TypeScript 179 | *.tsbuildinfo 180 | .tsbuildinfo 181 | 182 | # Testing 183 | coverage/ 184 | .nyc_output/ 185 | junit.xml 186 | 187 | # Environment variables 188 | .env 189 | .env.local 190 | .env.*.local 191 | .env.development 192 | .env.test 193 | .env.production 194 | 195 | # IDE - VSCode 196 | .vscode/* 197 | !.vscode/settings.json 198 | !.vscode/tasks.json 199 | !.vscode/launch.json 200 | !.vscode/extensions.json 201 | 202 | # IDE - IntelliJ 203 | .idea/ 204 | *.iml 205 | *.ipr 206 | *.iws 207 | 208 | # IDE - WebStorm 209 | .idea/ 210 | *.swp 211 | *.swo 212 | 213 | # OS generated files 214 | .DS_Store 215 | .DS_Store? 216 | ._* 217 | .Spotlight-V100 218 | .Trashes 219 | ehthumbs.db 220 | Thumbs.db 221 | 222 | # Logs 223 | logs 224 | *.log 225 | npm-debug.log* 226 | yarn-debug.log* 227 | yarn-error.log* 228 | lerna-debug.log* 229 | 230 | # Cache directories 231 | .npm 232 | .eslintcache 233 | .stylelintcache 234 | .prettiercache 235 | 236 | # Optional npm cache directory 237 | .npm 238 | 239 | # Optional REPL history 240 | .node_repl_history 241 | 242 | # Output of 'npm pack' 243 | *.tgz 244 | 245 | # Yarn v2 246 | .yarn/* 247 | !.yarn/cache 248 | !.yarn/patches 249 | !.yarn/plugins 250 | !.yarn/releases 251 | !.yarn/sdks 252 | !.yarn/versions 253 | 254 | # Local development 255 | *.local 256 | 257 | # Temporary files 258 | *.swp 259 | *.swo 260 | *~ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapester 2 |

3 | Scrapester Logo 4 |

5 | 6 | [![PyPI version](https://badge.fury.io/py/scrapester.svg)](https://badge.fury.io/py/scrapester) 7 | [![npm version](https://badge.fury.io/js/scrapester.svg)](https://badge.fury.io/js/scrapester) 8 | 9 | [![npm downloads](https://img.shields.io/npm/dm/scrapester)](https://www.npmjs.com/package/scrapester) 10 | [![PyPI downloads](https://img.shields.io/pypi/dm/scrapester)](https://pypi.org/project/scrapester/) 11 | 12 | 13 | [![GitHub stars](https://img.shields.io/github/stars/Bugsterapp/scrapester)](https://github.com/Bugsterapp/scrapester/stargazers) 14 | [![Last Commit](https://img.shields.io/github/last-commit/Bugsterapp/scrapester)](https://github.com/Bugsterapp/scrapester/commits/main) 15 | 16 | 17 | [Documentation](https://docs.scrapester.lol) | 18 | [Python SDK](https://pypi.org/project/scrapester/) | 19 | [JavaScript SDK](https://www.npmjs.com/package/scrapester) | 20 | [Playground](https://scrapester.lol) 21 | 22 | Turn any website into LLM-ready clean data. 23 | 24 | ## Overview 25 | Scrapester is a powerful web scraping tool that converts website content into clean, markdown-formatted data perfect for LLM processing. With support for both single-page scraping and full website crawling, Scrapester makes it easy to gather web content in a structured, consistent format. 26 | 27 | ## Features 28 | - 🔍 **Smart Content Extraction**: Automatically removes noise and extracts meaningful content 29 | - 📝 **Markdown Output**: Clean, structured content perfect for LLMs 30 | - 🕷️ **Website Crawling**: Scrape entire websites with configurable depth and limits 31 | - 🚀 **Multiple SDKs**: Official Python and JavaScript support 32 | - ⚡ **High Performance**: Built for speed and reliability 33 | - 🛡️ **Error Handling**: Robust error handling and rate limiting protection 34 | 35 | ## Installation 36 | 37 | ### Python 38 | ```bash 39 | pip install scrapester 40 | ``` 41 | 42 | ### JavaScript/TypeScript 43 | ```bash 44 | npm install scrapester 45 | # or 46 | yarn add scrapester 47 | ``` 48 | 49 | ## Quick Start 50 | 51 | ### Python 52 | ```python 53 | from scrapester import ScrapesterApp 54 | 55 | # Initialize the client 56 | app = ScrapesterApp(api_key="your-api-key") 57 | 58 | # Scrape a single page 59 | result = app.scrape("https://example.com") 60 | print(result.markdown) 61 | 62 | # Crawl an entire website 63 | results = app.crawl( 64 | "https://example.com", 65 | options={ 66 | "max_pages": 10, 67 | "max_depth": 2 68 | } 69 | ) 70 | ``` 71 | 72 | ### JavaScript/TypeScript 73 | ```typescript 74 | import { ScrapesterApp } from 'scrapester'; 75 | 76 | // Initialize the client 77 | const app = new ScrapesterApp('your-api-key'); 78 | 79 | // Scrape a single page 80 | const result = await app.scrape('https://example.com'); 81 | console.log(result.markdown); 82 | 83 | // Crawl an entire website 84 | const results = await app.crawl('https://example.com', { 85 | maxPages: 10, 86 | maxDepth: 2 87 | }); 88 | ``` 89 | 90 | ## Response Format 91 | 92 | Scrapester returns clean, structured data in the following format: 93 | 94 | ```typescript 95 | interface CrawlerResponse { 96 | url: string; // The scraped URL 97 | markdown: string; // Clean, markdown-formatted content 98 | metadata: { // Page metadata 99 | title: string, 100 | description: string, 101 | // ... 
other meta tags 102 | }; 103 | timestamp: string; // ISO timestamp of when the page was scraped 104 | } 105 | ``` 106 | 107 | ## API Reference 108 | 109 | ### ScrapesterApp 110 | 111 | #### Constructor 112 | ```typescript 113 | new ScrapesterApp( 114 | apiKey: string, 115 | baseUrl?: string, // default: "http://localhost:8000" 116 | timeout?: number // default: 600 seconds 117 | ) 118 | ``` 119 | 120 | #### Methods 121 | 122 | ##### scrape(url: string) 123 | Scrapes a single URL and returns clean, markdown-formatted content. 124 | 125 | ##### crawl(url: string, options?) 126 | Crawls a website starting from the given URL. Options include: 127 | - `maxPages`: Maximum number of pages to crawl 128 | - `maxDepth`: Maximum crawl depth 129 | - `includePatterns`: URL patterns to include 130 | - `excludePatterns`: URL patterns to exclude 131 | 132 | ## Error Handling 133 | 134 | Scrapester provides detailed error information through the `APIError` class: 135 | 136 | ```typescript 137 | class APIError extends Error { 138 | statusCode?: number; 139 | response?: object; 140 | } 141 | ``` 142 | 143 | Common error scenarios: 144 | - `429`: Rate limit exceeded 145 | - `400`: Invalid request 146 | - `401`: Invalid API key 147 | - `500`: Server error 148 | 149 | ## Development 150 | 151 | ### Running Tests 152 | ```bash 153 | # Python 154 | pytest tests/ 155 | 156 | # JavaScript 157 | npm test 158 | ``` 159 | 160 | ### Building from Source 161 | ```bash 162 | # Python 163 | pip install -e ".[dev]" 164 | 165 | # JavaScript 166 | npm install 167 | npm run build 168 | ``` 169 | 170 | ## Contributing 171 | We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details. 172 | 173 | ## Support 174 | 175 | - 📖 [Documentation](https://docs.scrapester.dev) 176 | - 💬 [Discord Community](https://discord.gg/scrapester) 177 | - 📧 [Email Support](mailto:support@scrapester.dev) 178 | 179 | ## License 180 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
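The Error Handling section above lists the `APIError` fields and common status codes but no handling pattern; a short hedged sketch of acting on them with the Python SDK (`scrapester/crawler.py`) follows — the retry delay and the choice to retry only on 429 are arbitrary:

```python
import time

from scrapester.crawler import ScrapesterApp, APIError

client = ScrapesterApp(api_key="your-api-key")

try:
    result = client.scrape("https://example.com")
    print(result.metadata.get("title"))
except APIError as exc:
    if exc.status_code == 429:
        # Rate limited: back off briefly and retry once (delay is arbitrary).
        time.sleep(5)
        result = client.scrape("https://example.com")
    elif exc.status_code == 401:
        raise SystemExit("Invalid API key")
    else:
        print(f"Request failed ({exc.status_code}): {exc.response or exc}")
```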
181 | -------------------------------------------------------------------------------- /apps/api/src/core/crawler.py: -------------------------------------------------------------------------------- 1 | from playwright.async_api import async_playwright, Page, TimeoutError 2 | import asyncio 3 | from typing import Dict, List, Optional, Union 4 | import json 5 | from datetime import datetime 6 | import hashlib 7 | from bs4 import BeautifulSoup 8 | import markdown 9 | import logging 10 | from config import settings 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ScrapesterCrawler: 16 | def __init__(self): 17 | self.browser = None 18 | self.context = None 19 | 20 | async def initialize(self): 21 | """Initialize the browser instance""" 22 | self.playwright = await async_playwright().start() 23 | self.browser = await self.playwright.chromium.launch( 24 | headless=False, args=["--no-sandbox", "--disable-setuid-sandbox"] 25 | ) 26 | self.context = await self.browser.new_context( 27 | viewport={"width": 1920, "height": 1080}, 28 | user_agent="Chrome/69.0.3497.100 Safari/537.36", 29 | ) 30 | 31 | async def close(self): 32 | """Clean up resources""" 33 | if self.context: 34 | await self.context.close() 35 | if self.browser: 36 | await self.browser.close() 37 | if hasattr(self, "playwright"): 38 | await self.playwright.stop() 39 | 40 | async def _smart_scroll(self, page: Page): 41 | """Smart scroll to handle dynamic loading""" 42 | try: 43 | last_height = await page.evaluate("document.body.scrollHeight") 44 | while True: 45 | await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") 46 | await page.wait_for_timeout(1000) 47 | new_height = await page.evaluate("document.body.scrollHeight") 48 | if new_height == last_height: 49 | break 50 | last_height = new_height 51 | except Exception as e: 52 | logger.warning(f"Smart scroll encountered an error: {e}") 53 | 54 | async def _extract_links(self, page: Page, base_url: str) -> List[str]: 55 | """Extract all valid links from the page""" 56 | links = await page.evaluate( 57 | """() => { 58 | const links = Array.from(document.links) 59 | .map(link => link.href) 60 | .filter(href => href.startsWith('http')); 61 | return [...new Set(links)]; 62 | }""" 63 | ) 64 | return [link for link in links if link.startswith(base_url)] 65 | 66 | async def _extract_content(self, page: Page) -> Dict: 67 | """Extract various content formats from the page""" 68 | html = await page.content() 69 | 70 | # Use BeautifulSoup for better content extraction 71 | soup = BeautifulSoup(html, "html.parser") 72 | 73 | # Remove unwanted elements 74 | for element in soup.select("script, style, noscript, iframe, img"): 75 | element.decompose() 76 | 77 | # Initialize markdown content 78 | md_parts = [] 79 | 80 | # Extract and format headers 81 | for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): 82 | level = int(h.name[1]) # get header level 83 | md_parts.append(f"{'#' * level} {h.get_text(strip=True)}\n") 84 | 85 | # Extract and format paragraphs 86 | for p in soup.find_all("p"): 87 | text = p.get_text(strip=True) 88 | if text: 89 | # Check if it's a link paragraph 90 | links = p.find_all("a") 91 | if links: 92 | for link in links: 93 | href = link.get("href", "") 94 | text = link.get_text(strip=True) 95 | if href and text: 96 | md_parts.append(f"[**{text}**]({href})\n") 97 | else: 98 | md_parts.append(f"{text}\n") 99 | # Join all parts with proper spacing 100 | markdown_content = "\n".join(md_parts) 101 | # Get clean text 102 | text = 
soup.get_text(separator="\n", strip=True) 103 | 104 | # Convert to markdown 105 | # md = markdown.markdown(text) 106 | 107 | # Extract metadata 108 | metadata = await page.evaluate( 109 | """() => { 110 | const metadata = {}; 111 | 112 | // Basic metadata 113 | metadata.title = document.title; 114 | 115 | // Meta tags 116 | const metaTags = document.getElementsByTagName('meta'); 117 | for (let tag of metaTags) { 118 | const name = tag.getAttribute('name') || tag.getAttribute('property'); 119 | const content = tag.getAttribute('content'); 120 | if (name && content) { 121 | metadata[name] = content; 122 | } 123 | } 124 | 125 | // OpenGraph 126 | metadata.og = {}; 127 | document.querySelectorAll('meta[property^="og:"]').forEach(tag => { 128 | const property = tag.getAttribute('property').substring(3); 129 | metadata.og[property] = tag.getAttribute('content'); 130 | }); 131 | 132 | return metadata; 133 | }""" 134 | ) 135 | metadata = { 136 | "title": metadata.get("title", ""), 137 | "description": metadata.get("description", ""), 138 | "robots": metadata.get("robots", ""), 139 | } 140 | return { 141 | "url": page.url, 142 | "markdown": markdown_content, 143 | "metadata": metadata, 144 | "timestamp": datetime.utcnow().isoformat(), 145 | } 146 | 147 | async def scrape_url(self, url: str, options: Dict = None) -> Dict: 148 | """Scrape a single URL with advanced options""" 149 | if not self.context: 150 | await self.initialize() 151 | 152 | options = options or {} 153 | page = await self.context.new_page() 154 | 155 | try: 156 | # Configure page 157 | if options.get("timeout"): 158 | page.set_default_timeout(options["timeout"]) 159 | # Navigate to page 160 | try: 161 | response = await page.goto(url, wait_until="networkidle") 162 | except TimeoutError: 163 | pass 164 | if not response.ok: 165 | raise Exception(f"Failed to load page: {response.status}") 166 | 167 | # Handle dynamic content 168 | if options.get("wait_for_selector"): 169 | await page.wait_for_selector(options["wait_for_selector"]) 170 | 171 | if options.get("scroll", True): 172 | await self._smart_scroll(page) 173 | 174 | # Execute custom actions 175 | if options.get("actions"): 176 | await self.execute_actions(page, options["actions"]) 177 | 178 | # Take screenshot if requested 179 | screenshot = None 180 | if options.get("screenshot"): 181 | screenshot = await page.screenshot( 182 | full_page=True, type="jpeg", quality=80 183 | ) 184 | 185 | # Extract content 186 | content = await self._extract_content(page) 187 | 188 | if screenshot: 189 | content["screenshot"] = screenshot 190 | 191 | return content 192 | 193 | except Exception as e: 194 | logger.error(f"Error scraping {url}: {e}") 195 | raise 196 | finally: 197 | await page.close() 198 | 199 | async def execute_actions(self, page: Page, actions: List[Dict]): 200 | """Execute custom actions on the page""" 201 | for action in actions: 202 | action_type = action.get("type") 203 | try: 204 | if action_type == "click": 205 | await page.click(action["selector"]) 206 | elif action_type == "type": 207 | await page.type(action["selector"], action["text"]) 208 | elif action_type == "wait": 209 | await page.wait_for_timeout(action["duration"]) 210 | elif action_type == "scroll": 211 | await self._smart_scroll(page) 212 | # Add more action types as needed 213 | except Exception as e: 214 | logger.error(f"Error executing action {action_type}: {e}") 215 | raise 216 | 217 | async def crawl_website(self, start_url: str, options: Dict = None) -> List[Dict]: 218 | """Crawl an entire website with 
advanced options""" 219 | if not self.context: 220 | await self.initialize() 221 | 222 | options = options or {} 223 | max_pages = options.get("max_pages", 10) 224 | max_depth = options.get("max_depth", 3) 225 | 226 | visited = set() 227 | to_visit = {(start_url, 0)} # (url, depth) 228 | results = [] 229 | 230 | while to_visit and len(visited) < max_pages: 231 | url, depth = to_visit.pop() 232 | if url in visited or depth > max_depth: 233 | continue 234 | 235 | try: 236 | result = await self.scrape_url(url, options) 237 | results.append(result) 238 | visited.add(url) 239 | 240 | if depth < max_depth: 241 | page = await self.context.new_page() 242 | await page.goto(url) 243 | new_urls = await self._extract_links(page, start_url) 244 | await page.close() 245 | 246 | for new_url in new_urls: 247 | if new_url not in visited: 248 | to_visit.add((new_url, depth + 1)) 249 | 250 | except Exception as e: 251 | logger.error(f"Error crawling {url}: {e}") 252 | 253 | return results 254 | --------------------------------------------------------------------------------
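For completeness, the FastAPI service defined in `main.py` and `api/routes/crawl.py` above can also be called without either SDK. A minimal sketch using `requests`, assuming a local `uvicorn main:app` on port 8000 (the Docker image instead listens on port 80); URLs and option values are placeholders:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumption: local dev server; adjust to your deployment

# Single-page scrape: POST /v1/scrape with a JSON body matching ScrapeRequest
resp = requests.post(f"{BASE_URL}/v1/scrape", json={"url": "https://example.com"}, timeout=600)
resp.raise_for_status()
payload = resp.json()  # {"success": true, "data": {...}}
print(payload["data"]["markdown"])

# Site crawl: POST /v1/crawl with CrawlRequest; options mirror CrawlOptions in schemas.py
resp = requests.post(
    f"{BASE_URL}/v1/crawl",
    json={"url": "https://example.com", "options": {"max_pages": 5, "max_depth": 1}},
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["stats"])  # {"pages_crawled": ..., "successful_crawls": ...}
```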