├── apps
│   ├── api
│   │   ├── src
│   │   │   ├── __init__.py
│   │   │   ├── api
│   │   │   │   ├── __init__.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── schemas.py
│   │   │   │   ├── routes
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── crawl.py
│   │   │   │   └── middleware
│   │   │   │       └── __init__.py
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── crawler.py
│   │   │   ├── config.py
│   │   │   ├── requirements.txt
│   │   │   └── main.py
│   │   └── Dockerfile
│   └── sdks
│       ├── python
│       │   ├── scrapester
│       │   │   ├── __init__.py
│       │   │   └── crawler.py
│       │   ├── README.md
│       │   ├── test.py
│       │   └── setup.py
│       └── javascript
│           ├── src
│           │   ├── index.ts
│           │   ├── types.ts
│           │   └── client.ts
│           ├── tsup.config.ts
│           ├── tsconfig.json
│           ├── README.md
│           ├── .npmignore
│           └── package.json
├── assets
│   └── image.png
├── LICENSE
├── .gitignore
└── README.md

/apps/api/src/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/routes/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/api/src/api/middleware/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/sdks/python/scrapester/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/apps/sdks/python/README.md:
--------------------------------------------------------------------------------
1 | # scrapester
2 | Turn any website into LLM structured data.
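A minimal usage sketch to complement the tagline above, mirroring `test.py` and the `ScrapesterApp` client defined in `scrapester/crawler.py`; the API key and URLs are placeholders, and `max_pages`/`max_depth` are the crawl options documented in the SDK:

```python
from scrapester.crawler import ScrapesterApp, APIError

# Placeholder key; point base_url at your own deployment if it is not localhost:8000.
client = ScrapesterApp(api_key="your-api-key")

try:
    # Single page -> CrawlerResponse with .url, .markdown, .metadata, .timestamp
    page = client.scrape("https://example.com")
    print(page.markdown)

    # Whole site -> list of CrawlerResponse objects
    pages = client.crawl("https://example.com", options={"max_pages": 10, "max_depth": 2})
    print(f"Crawled {len(pages)} pages")
except APIError as exc:
    print(f"API error ({exc.status_code}): {exc}")
```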
3 | -------------------------------------------------------------------------------- /assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bugsterapp/scrapester/HEAD/assets/image.png -------------------------------------------------------------------------------- /apps/sdks/javascript/src/index.ts: -------------------------------------------------------------------------------- 1 | export { ScrapesterApp } from './client'; 2 | export { 3 | APIError, CrawlerError, CrawlerResponse, type CrawlerResponseData 4 | } from './types'; 5 | -------------------------------------------------------------------------------- /apps/sdks/javascript/tsup.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'tsup'; 2 | 3 | export default defineConfig({ 4 | entry: ['src/index.ts'], 5 | format: ['cjs', 'esm'], 6 | dts: true, 7 | splitting: false, 8 | sourcemap: true, 9 | clean: true, 10 | }); -------------------------------------------------------------------------------- /apps/api/src/config.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | from typing import Optional 3 | 4 | class Settings(BaseSettings): 5 | REDIS_URL: str = "redis://localhost:6379" 6 | CACHE_TTL: int = 3600 7 | MAX_CONCURRENT_CRAWLS: int = 5 8 | RATE_LIMIT_REQUESTS: int = 100 9 | RATE_LIMIT_PERIOD: int = 60 10 | API_KEY_HEADER: str = "X-API-Key" 11 | 12 | class Config: 13 | env_file = ".env" 14 | 15 | settings = Settings() 16 | -------------------------------------------------------------------------------- /apps/sdks/javascript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2020", 4 | "module": "commonjs", 5 | "declaration": true, 6 | "outDir": "./dist", 7 | "strict": true, 8 | "esModuleInterop": true, 9 | "skipLibCheck": true, 10 | "forceConsistentCasingInFileNames": true, 11 | "moduleResolution": "node", 12 | "types": ["yargs"] 13 | }, 14 | "include": ["src"], 15 | "exclude": ["node_modules", "dist", "test"] 16 | } 17 | -------------------------------------------------------------------------------- /apps/sdks/javascript/README.md: -------------------------------------------------------------------------------- 1 | # scrapester 2 | Turn any website into LLM structured data. 
3 | 4 | ## Usage Example 5 | 6 | 7 | import { ScrapesterApp } from 'scrapester'; 8 | 9 | const client = new ScrapesterApp('your-api-key'); 10 | 11 | // Scrape a single URL 12 | const result = await client.scrape('https://example.com'); 13 | console.log(result.markdown); 14 | 15 | // Crawl a website 16 | const results = await client.crawl('https://example.com', { 17 | maxPages: 10, 18 | maxDepth: 2 19 | }); 20 | 21 | 22 | -------------------------------------------------------------------------------- /apps/sdks/python/test.py: -------------------------------------------------------------------------------- 1 | from scrapester.crawler import ScrapesterApp, APIError 2 | 3 | 4 | client = ScrapesterApp(api_key="your-api-key") 5 | 6 | # # Scrape a single URL 7 | # try: 8 | # url = "https://platan.us/" 9 | # result = client.crawl(url) 10 | # print(f"Title: {result.metadata.get('title')}") 11 | # print(f"Content:\n{result.markdown}") 12 | # except APIError as e: 13 | # print(f"Error: {e}") 14 | 15 | 16 | try: 17 | url = "https://platan.us/" 18 | result = client.crawl(url) 19 | print(f"{result}") 20 | except APIError as e: 21 | print(f"Error: {e}") 22 | -------------------------------------------------------------------------------- /apps/sdks/javascript/.npmignore: -------------------------------------------------------------------------------- 1 | # Source 2 | src/ 3 | tests/ 4 | examples/ 5 | docs/ 6 | 7 | # Configuration files 8 | .eslintrc 9 | .prettierrc 10 | .editorconfig 11 | .gitignore 12 | tsconfig.json 13 | jest.config.js 14 | .travis.yml 15 | .github/ 16 | .vscode/ 17 | 18 | # Development files 19 | *.test.ts 20 | *.spec.ts 21 | coverage/ 22 | .nyc_output/ 23 | .circleci/ 24 | 25 | # Documentation 26 | docs/ 27 | CONTRIBUTING.md 28 | CHANGELOG.md 29 | 30 | # Build tools 31 | webpack.config.js 32 | rollup.config.js 33 | babel.config.js 34 | tsup.config.ts 35 | 36 | # Git 37 | .git/ 38 | .gitignore 39 | 40 | # Development environment 41 | .env* 42 | .idea/ 43 | .vscode/ 44 | 45 | # Logs 46 | *.log 47 | npm-debug.log* 48 | yarn-debug.log* 49 | yarn-error.log* 50 | 51 | # Cache 52 | .eslintcache 53 | .npm 54 | .yarn 55 | 56 | # Test files 57 | __tests__/ 58 | __mocks__/ 59 | *.test.ts 60 | *.spec.ts 61 | test/ -------------------------------------------------------------------------------- /apps/api/src/api/models/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, HttpUrl 2 | from typing import List, Dict, Optional, Union 3 | from enum import Enum 4 | 5 | 6 | class ActionType(str, Enum): 7 | CLICK = "click" 8 | TYPE = "type" 9 | WAIT = "wait" 10 | SCROLL = "scroll" 11 | 12 | 13 | class Action(BaseModel): 14 | type: ActionType 15 | selector: Optional[str] = None 16 | text: Optional[str] = None 17 | duration: Optional[int] = None 18 | 19 | 20 | class CrawlOptions(BaseModel): 21 | max_pages: Optional[int] = 10 22 | max_depth: Optional[int] = 3 23 | timeout: Optional[int] = 30000 24 | scroll: Optional[bool] = True 25 | screenshot: Optional[bool] = False 26 | wait_for_selector: Optional[str] = None 27 | actions: Optional[List[Action]] = None 28 | # auth: Optional[AuthConfig] = None 29 | 30 | 31 | class ScrapeRequest(BaseModel): 32 | url: HttpUrl 33 | 34 | 35 | class CrawlRequest(BaseModel): 36 | url: HttpUrl 37 | options: Optional[CrawlOptions] = None 38 | -------------------------------------------------------------------------------- /apps/api/src/requirements.txt: 
-------------------------------------------------------------------------------- 1 | annotated-types==0.7.0 2 | anyio==4.6.2.post1 3 | beautifulsoup4==4.12.3 4 | build==1.2.2.post1 5 | certifi==2024.8.30 6 | charset-normalizer==3.4.0 7 | click==8.1.7 8 | docutils==0.21.2 9 | fastapi==0.115.5 10 | greenlet==3.1.1 11 | gunicorn==23.0.0 12 | h11==0.14.0 13 | idna==3.10 14 | importlib_metadata==8.5.0 15 | jaraco.classes==3.4.0 16 | jaraco.context==6.0.1 17 | jaraco.functools==4.1.0 18 | keyring==25.5.0 19 | Markdown==3.7 20 | markdown-it-py==3.0.0 21 | mdurl==0.1.2 22 | more-itertools==10.5.0 23 | nh3==0.2.18 24 | packaging==24.2 25 | pkginfo==1.10.0 26 | playwright==1.49.0 27 | pydantic==2.10.1 28 | pydantic-settings==2.6.1 29 | pydantic_core==2.27.1 30 | pyee==12.0.0 31 | Pygments==2.18.0 32 | pyproject_hooks==1.2.0 33 | python-dotenv==1.0.1 34 | readme_renderer==44.0 35 | requests==2.32.3 36 | requests-toolbelt==1.0.0 37 | rfc3986==2.0.0 38 | rich==13.9.4 39 | setuptools==75.6.0 40 | sniffio==1.3.1 41 | soupsieve==2.6 42 | starlette==0.41.3 43 | twine==5.1.1 44 | typing_extensions==4.12.2 45 | urllib3==2.2.3 46 | uvicorn==0.32.1 47 | zipp==3.21.0 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Bugster 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /apps/api/Dockerfile: -------------------------------------------------------------------------------- 1 | # Define function directory 2 | ARG FUNCTION_DIR="/src" 3 | 4 | FROM mcr.microsoft.com/playwright/python:v1.49.0-jammy as build-image 5 | 6 | # Install aws-lambda-cpp build dependencies 7 | RUN apt-get update && \ 8 | apt-get install -y \ 9 | g++ \ 10 | make \ 11 | cmake \ 12 | unzip \ 13 | libcurl4-openssl-dev 14 | # Include global arg in this stage of the build 15 | ARG FUNCTION_DIR 16 | # Create function directory 17 | RUN mkdir -p ${FUNCTION_DIR} 18 | 19 | # Copy function code 20 | COPY ./src ${FUNCTION_DIR} 21 | 22 | # Install Playwright 23 | RUN pip3 install playwright 24 | 25 | # Run Playwright install to ensure all necessary browser binaries are installed 26 | RUN playwright install 27 | 28 | # Multi-stage build: grab a fresh copy of the base image 29 | FROM mcr.microsoft.com/playwright/python:v1.49.0-jammy 30 | # Include global arg in this stage of the build 31 | ARG FUNCTION_DIR 32 | # Set working directory to function root directory 33 | WORKDIR ${FUNCTION_DIR} 34 | 35 | # Copy in the build image dependencies 36 | COPY --from=build-image ${FUNCTION_DIR} ${FUNCTION_DIR} 37 | # Install the dependencies 38 | RUN pip install --no-cache-dir -r requirements.txt 39 | 40 | CMD [ "uvicorn","main:app", "--host","0.0.0.0", "--port", "80" ] 41 | -------------------------------------------------------------------------------- /apps/sdks/python/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup, find_packages 3 | 4 | with open("README.md", "r", encoding="utf-8") as fh: 5 | long_description = fh.read() 6 | 7 | setup( 8 | name="scrapester", 9 | version="0.1.2", 10 | author="Naquiao", 11 | author_email="ignacio@bugster.app", 12 | description="Python SDK for Scrapester API", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/Bugsterapp/scrapester", 16 | packages=find_packages(), 17 | classifiers=[ 18 | "Development Status :: 3 - Alpha", 19 | "Intended Audience :: Developers", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python :: 3.7", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | "Topic :: Internet :: WWW/HTTP", 29 | "Topic :: Software Development :: Libraries :: Python Modules", 30 | ], 31 | python_requires=">=3.7", 32 | install_requires=[ 33 | "requests>=2.25.0", 34 | ], 35 | ) 36 | -------------------------------------------------------------------------------- /apps/sdks/javascript/src/types.ts: -------------------------------------------------------------------------------- 1 | export interface CrawlerResponseData { 2 | url: string; 3 | markdown: string; 4 | metadata: Record; 5 | timestamp: string; 6 | } 7 | 8 | export class CrawlerResponse implements CrawlerResponseData { 9 | url: string; 10 | markdown: string; 11 | metadata: Record; 12 | timestamp: string; 13 | 14 | constructor(data: Partial) { 15 | this.url = data.url || ''; 16 | this.markdown = data.markdown || ''; 17 | this.metadata = data.metadata || {}; 18 | this.timestamp = data.timestamp || new Date().toISOString(); 19 | } 20 
| 21 | static fromDict(data: Record): CrawlerResponse { 22 | return new CrawlerResponse({ 23 | url: data.url, 24 | markdown: data.markdown, 25 | metadata: data.metadata, 26 | timestamp: data.timestamp, 27 | }); 28 | } 29 | } 30 | 31 | export class CrawlerError extends Error { 32 | constructor(message: string) { 33 | super(message); 34 | this.name = 'CrawlerError'; 35 | } 36 | } 37 | 38 | export class APIError extends CrawlerError { 39 | constructor( 40 | message: string, 41 | public statusCode?: number, 42 | public response?: Record 43 | ) { 44 | super(message); 45 | this.name = 'APIError'; 46 | } 47 | } -------------------------------------------------------------------------------- /apps/sdks/javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapester", 3 | "version": "0.1.0", 4 | "description": "JavaScript SDK for Scrapester API", 5 | "main": "dist/index.js", 6 | "module": "dist/index.mjs", 7 | "types": "dist/index.d.ts", 8 | "files": [ 9 | "dist" 10 | ], 11 | "scripts": { 12 | "build": "tsup", 13 | "test": "jest", 14 | "lint": "eslint src --ext .ts", 15 | "format": "prettier --write \"src/**/*.ts\"", 16 | "prepublishOnly": "npm run build" 17 | }, 18 | "author": "Naquiao ", 19 | "license": "MIT", 20 | "dependencies": { 21 | "axios": "^1.6.0" 22 | }, 23 | "devDependencies": { 24 | "@types/jest": "^29.5.0", 25 | "@types/node": "^20.0.0", 26 | "@types/yargs": "^17.0.33", 27 | "@typescript-eslint/eslint-plugin": "^6.0.0", 28 | "@typescript-eslint/parser": "^6.0.0", 29 | "eslint": "^8.0.0", 30 | "jest": "^29.0.0", 31 | "prettier": "^3.0.0", 32 | "ts-jest": "^29.0.0", 33 | "tsup": "^8.3.5", 34 | "typescript": "^5.0.0" 35 | }, 36 | "keywords": [ 37 | "scraper", 38 | "crawler", 39 | "web-scraping", 40 | "api-client" 41 | ], 42 | "repository": { 43 | "type": "git", 44 | "url": "git+https://github.com/Bugsterapp/scrapester.git" 45 | }, 46 | "bugs": { 47 | "url": "https://github.com/Bugsterapp/scrapester/issues" 48 | }, 49 | "homepage": "https://github.com/Bugsterapp/scrapester#readme" 50 | } 51 | -------------------------------------------------------------------------------- /apps/api/src/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Request 2 | from fastapi.middleware.cors import CORSMiddleware 3 | from fastapi.responses import JSONResponse, RedirectResponse 4 | from api.routes import crawl 5 | from config import settings 6 | import logging 7 | from contextlib import asynccontextmanager 8 | 9 | # Configure logging 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @asynccontextmanager 15 | async def lifespan(app: FastAPI): 16 | yield 17 | # Cleanup on shutdown 18 | crawler = await crawl.get_crawler() 19 | await crawler.close() 20 | 21 | 22 | app = FastAPI( 23 | title="Scrapester API", 24 | description="High-performance web crawler", 25 | version="1.0.0", 26 | lifespan=lifespan, 27 | ) 28 | app.add_middleware( 29 | CORSMiddleware, 30 | allow_origins=["*"], 31 | allow_credentials=True, 32 | allow_methods=["*"], 33 | allow_headers=["*"], 34 | ) 35 | 36 | 37 | # Error handler 38 | @app.exception_handler(Exception) 39 | async def global_exception_handler(request: Request, exc: Exception): 40 | logger.error(f"Global error handler caught: {exc}") 41 | return JSONResponse(status_code=500, content={"detail": str(exc)}) 42 | 43 | 44 | # Include routers 45 | app.include_router(crawl.router, prefix="/v1", 
tags=["crawler"])
46 | 
47 | 
48 | @app.get("/docs", tags=["API Documentation"])
49 | async def docs_redirect():
50 |     return RedirectResponse(url="/api/v1/docs")
51 | 
52 | 
53 | @app.get("/health", tags=["Healthcheck"])
54 | async def health_check():
55 |     return {"status": "ok"}
56 | 
57 | 
58 | @app.get("/openapi.json", include_in_schema=False)
59 | async def get_openapi_json():
60 |     return app.openapi()
61 | 
--------------------------------------------------------------------------------
/apps/api/src/api/routes/crawl.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, HTTPException, BackgroundTasks, Depends
2 | from ..models.schemas import ScrapeRequest, CrawlRequest
3 | from core.crawler import ScrapesterCrawler
4 | from typing import Dict, List
5 | import asyncio
6 | 
7 | router = APIRouter()
8 | crawler = None
9 | 
10 | 
11 | async def get_crawler():
12 |     global crawler
13 |     if crawler is None:
14 |         crawler = ScrapesterCrawler()
15 |         await crawler.initialize()
16 |     return crawler
17 | 
18 | 
19 | @router.post("/scrape")
20 | async def scrape_url(
21 |     request: ScrapeRequest, crawler: ScrapesterCrawler = Depends(get_crawler)
22 | ) -> Dict:
23 |     """
24 |     Scrape a single URL with advanced options
25 |     """
26 |     try:
27 |         result = await crawler.scrape_url(str(request.url), None)
28 |         return {"success": True, "data": result}
29 |     except Exception as e:
30 |         raise HTTPException(status_code=500, detail=str(e))
31 | 
32 | 
33 | @router.post("/crawl")
34 | async def crawl_website(
35 |     request: CrawlRequest,
36 |     background_tasks: BackgroundTasks,
37 |     crawler: ScrapesterCrawler = Depends(get_crawler),
38 | ) -> Dict:
39 |     """
40 |     Crawl an entire website with advanced options
41 |     """
42 |     try:
43 |         # Run the crawl and collect the results
44 |         results = await crawler.crawl_website(
45 |             str(request.url), request.options.dict() if request.options else None
46 |         )
47 |         return {
48 |             "success": True,
49 |             "data": results,
50 |             "stats": {
51 |                 "pages_crawled": len(results),
52 |                 "successful_crawls": sum(1 for r in results if r.get("success", False)),
53 |             },
54 |         }
55 |     except Exception as e:
56 |         raise HTTPException(status_code=500, detail=str(e))
57 | 
--------------------------------------------------------------------------------
/apps/sdks/javascript/src/client.ts:
--------------------------------------------------------------------------------
1 | import axios, { AxiosError, AxiosInstance } from 'axios';
2 | import { APIError, CrawlerResponse } from './types';
3 | export class ScrapesterApp {
4 |   private session: AxiosInstance;
5 | 
6 |   constructor(
7 |     private apiKey: string,
8 |     private baseUrl: string = 'http://localhost:8000',
9 |     private timeout: number = 600000 // 600 seconds in milliseconds
10 |   ) {
11 |     this.baseUrl = baseUrl.replace(/\/$/, '');
12 |     this.session = this.createSession();
13 |   }
14 | 
15 |   private createSession(): AxiosInstance {
16 |     return axios.create({
17 |       baseURL: this.baseUrl,
18 |       timeout: this.timeout,
19 |       headers: {
20 |         'Authorization': `Bearer ${this.apiKey}`,
21 |         'Content-Type': 'application/json',
22 |         'User-Agent': 'Scrapester-JS-SDK/1.0'
23 |       }
24 |     });
25 |   }
26 | 
27 |   private async request(
28 |     method: string,
29 |     endpoint: string,
30 |     params?: Record<string, any>,
31 |     data?: Record<string, any>
32 |   ): Promise<any> {
33 |     const url = `${this.baseUrl}${endpoint}`;
34 | 
35 |     try {
36 |       const response = await this.session.request({
37 |         method,
38 |         url,
39 |         params,
40 |         data,
41 |       });
42 | 
43 |       return response.data;
44 |     } catch (error) {
45 |       if (error instanceof AxiosError) {
46 |         if (error.response?.status === 429) {
47 |           throw new APIError('Rate limit exceeded', 429);
48 |         }
49 | 
50 |         if (error.response) {
51 |           const errorData = error.response.data;
52 |           // Use the API's "detail" field when present, otherwise fall back to the HTTP status text
53 |           if (errorData && errorData.detail) {
54 |             throw new APIError(
55 |               String(errorData.detail),
56 |               error.response.status,
57 |               errorData
58 |             );
59 |           }
60 |           throw new APIError(
61 |             error.response.statusText,
62 |             error.response.status,
63 |             { detail: error.response.statusText }
64 |           );
65 |         }
66 |       }
67 | 
68 |       throw new APIError(String(error));
69 |     }
70 |   }
71 | 
72 |   async scrape(url: string): Promise<CrawlerResponse> {
73 |     const data = { url };
74 |     const response = await this.request('POST', '/v1/scrape', undefined, data);
75 |     return CrawlerResponse.fromDict(response.data || {});
76 |   }
77 | 
78 |   async crawl(
79 |     url: string,
80 |     options?: {
81 |       maxPages?: number;
82 |       maxDepth?: number;
83 |       includePatterns?: string[];
84 |       excludePatterns?: string[];
85 |     }
86 |   ): Promise<CrawlerResponse[]> {
87 |     const data = { url, ...options };
88 |     const response = await this.request('POST', '/v1/crawl', undefined, data);
89 |     return (response.data || []).map((item: any) => CrawlerResponse.fromDict(item));
90 |   }
91 | }
92 | 
--------------------------------------------------------------------------------
/apps/sdks/python/scrapester/crawler.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional, List, Union
2 | import requests
3 | from dataclasses import dataclass
4 | from datetime import datetime
5 | 
6 | 
7 | @dataclass
8 | class CrawlerResponse:
9 |     url: str
10 |     markdown: str
11 |     metadata: Dict
12 |     timestamp: str
13 | 
14 |     @classmethod
15 |     def from_dict(cls, data: Dict) -> "CrawlerResponse":
16 |         return cls(
17 |             url=data.get("url", ""),
18 |             markdown=data.get("markdown", ""),
19 |             metadata=data.get("metadata", {}),
20 |             timestamp=data.get("timestamp", datetime.utcnow().isoformat()),
21 |         )
22 | 
23 | 
24 | class CrawlerError(Exception):
25 |     """Base exception for crawler errors"""
26 | 
27 |     pass
28 | 
29 | 
30 | class APIError(CrawlerError):
31 |     """Raised when the API returns an error"""
32 | 
33 |     def __init__(self, message: str, status_code: int = None, response: Dict = None):
34 |         self.status_code = status_code
35 |         self.response = response
36 |         super().__init__(message)
37 | 
38 | 
39 | class ScrapesterApp:
40 |     def __init__(
41 |         self, api_key: str, base_url: str = "http://localhost:8000", timeout: int = 600
42 |     ):
43 |         """Initialize the Crawler client
44 | 
45 |         Args:
46 |             api_key: Your API key
47 |             base_url: Base URL for the API (default: http://localhost:8000)
48 |             timeout: Request timeout in seconds (default: 600)
49 |         """
50 |         self.api_key = api_key
51 |         self.base_url = base_url.rstrip("/")
52 |         self.timeout = timeout
53 |         self.session = self._create_session()
54 | 
55 |     def _create_session(self) -> requests.Session:
56 |         """Create a requests session with default headers"""
57 |         session = requests.Session()
58 |         session.headers.update(
59 |             {
60 |                 "Authorization": f"Bearer {self.api_key}",
61 |                 "Content-Type": "application/json",
62 |                 "User-Agent": "Scrapester-Python-SDK/1.0",
63 |             }
64 |         )
65 |         return session
66 | 
67 |     def _request(
68 |         self, method: str, endpoint: str, params: Dict = None, data: Dict = None
69 |     ) -> Dict:
70 |         """Make an HTTP request to the API"""
71 |         url = f"{self.base_url}{endpoint}"
72 | 
73 |         try:
74 |             response = self.session.request(
75 |                 method=method, url=url, params=params, json=data,
timeout=self.timeout 76 | ) 77 | 78 | if response.status_code == 429: 79 | raise APIError("Rate limit exceeded", status_code=429) 80 | 81 | response.raise_for_status() 82 | return response.json() 83 | 84 | except requests.exceptions.RequestException as e: 85 | if hasattr(e, "response") and e.response is not None: 86 | try: 87 | error_data = e.response.json() 88 | except ValueError: 89 | error_data = {"detail": e.response.text} 90 | raise APIError( 91 | str(error_data.get("detail", "Unknown error")), 92 | status_code=e.response.status_code, 93 | response=error_data, 94 | ) 95 | raise APIError(str(e)) 96 | 97 | def scrape(self, url: str) -> CrawlerResponse: 98 | """Scrape a single URL 99 | 100 | Args: 101 | url: The URL to scrape 102 | options: Optional scraping configurations 103 | - wait_for_selector: CSS selector to wait for 104 | - screenshot: Take a screenshot (bool) 105 | - scroll: Enable smart scrolling (bool) 106 | - timeout: Custom timeout for this request 107 | 108 | Returns: 109 | CrawlerResponse object containing the scraped data 110 | """ 111 | data = {"url": url} 112 | 113 | response = self._request("POST", "/v1/scrape", data=data) 114 | return CrawlerResponse.from_dict(response.get("data", {})) 115 | 116 | def crawl(self, url: str, options: Optional[Dict] = None) -> List[CrawlerResponse]: 117 | """Crawl a website starting from a URL 118 | 119 | Args: 120 | url: The starting URL to crawl 121 | options: Optional crawling configurations 122 | - max_pages: Maximum number of pages to crawl 123 | - max_depth: Maximum crawling depth 124 | - include_patterns: List of URL patterns to include 125 | - exclude_patterns: List of URL patterns to exclude 126 | 127 | Returns: 128 | List of CrawlerResponse objects 129 | """ 130 | data = {"url": url, **(options or {})} 131 | 132 | response = self._request("POST", "/v1/crawl", data=data) 133 | return [CrawlerResponse.from_dict(item) for item in response.get("data", [])] 134 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | 165 | # Dependencies 166 | node_modules/ 167 | package-lock.json 168 | yarn.lock 169 | pnpm-lock.yaml 170 | 171 | # Build outputs 172 | dist/ 173 | build/ 174 | lib/ 175 | .next/ 176 | out/ 177 | 178 | # TypeScript 179 | *.tsbuildinfo 180 | .tsbuildinfo 181 | 182 | # Testing 183 | coverage/ 184 | .nyc_output/ 185 | junit.xml 186 | 187 | # Environment variables 188 | .env 189 | .env.local 190 | .env.*.local 191 | .env.development 192 | .env.test 193 | .env.production 194 | 195 | # IDE - VSCode 196 | .vscode/* 197 | !.vscode/settings.json 198 | !.vscode/tasks.json 199 | !.vscode/launch.json 200 | !.vscode/extensions.json 201 | 202 | # IDE - IntelliJ 203 | .idea/ 204 | *.iml 205 | *.ipr 206 | *.iws 207 | 208 | # IDE - WebStorm 209 | .idea/ 210 | *.swp 211 | *.swo 212 | 213 | # OS generated files 214 | .DS_Store 215 | .DS_Store? 216 | ._* 217 | .Spotlight-V100 218 | .Trashes 219 | ehthumbs.db 220 | Thumbs.db 221 | 222 | # Logs 223 | logs 224 | *.log 225 | npm-debug.log* 226 | yarn-debug.log* 227 | yarn-error.log* 228 | lerna-debug.log* 229 | 230 | # Cache directories 231 | .npm 232 | .eslintcache 233 | .stylelintcache 234 | .prettiercache 235 | 236 | # Optional npm cache directory 237 | .npm 238 | 239 | # Optional REPL history 240 | .node_repl_history 241 | 242 | # Output of 'npm pack' 243 | *.tgz 244 | 245 | # Yarn v2 246 | .yarn/* 247 | !.yarn/cache 248 | !.yarn/patches 249 | !.yarn/plugins 250 | !.yarn/releases 251 | !.yarn/sdks 252 | !.yarn/versions 253 | 254 | # Local development 255 | *.local 256 | 257 | # Temporary files 258 | *.swp 259 | *.swo 260 | *~ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapester 2 |

3 | Scrapester Logo 4 |

5 | 6 | [![PyPI version](https://badge.fury.io/py/scrapester.svg)](https://badge.fury.io/py/scrapester) 7 | [![npm version](https://badge.fury.io/js/scrapester.svg)](https://badge.fury.io/js/scrapester) 8 | 9 | [![npm downloads](https://img.shields.io/npm/dm/scrapester)](https://www.npmjs.com/package/scrapester) 10 | [![PyPI downloads](https://img.shields.io/pypi/dm/scrapester)](https://pypi.org/project/scrapester/) 11 | 12 | 13 | [![GitHub stars](https://img.shields.io/github/stars/Bugsterapp/scrapester)](https://github.com/Bugsterapp/scrapester/stargazers) 14 | [![Last Commit](https://img.shields.io/github/last-commit/Bugsterapp/scrapester)](https://github.com/Bugsterapp/scrapester/commits/main) 15 | 16 | 17 | [Documentation](https://docs.scrapester.lol) | 18 | [Python SDK](https://pypi.org/project/scrapester/) | 19 | [JavaScript SDK](https://www.npmjs.com/package/scrapester) | 20 | [Playground](https://scrapester.lol) 21 | 22 | Turn any website into LLM-ready clean data. 23 | 24 | ## Overview 25 | Scrapester is a powerful web scraping tool that converts website content into clean, markdown-formatted data perfect for LLM processing. With support for both single-page scraping and full website crawling, Scrapester makes it easy to gather web content in a structured, consistent format. 26 | 27 | ## Features 28 | - 🔍 **Smart Content Extraction**: Automatically removes noise and extracts meaningful content 29 | - 📝 **Markdown Output**: Clean, structured content perfect for LLMs 30 | - 🕷️ **Website Crawling**: Scrape entire websites with configurable depth and limits 31 | - 🚀 **Multiple SDKs**: Official Python and JavaScript support 32 | - ⚡ **High Performance**: Built for speed and reliability 33 | - 🛡️ **Error Handling**: Robust error handling and rate limiting protection 34 | 35 | ## Installation 36 | 37 | ### Python 38 | ```bash 39 | pip install scrapester 40 | ``` 41 | 42 | ### JavaScript/TypeScript 43 | ```bash 44 | npm install scrapester 45 | # or 46 | yarn add scrapester 47 | ``` 48 | 49 | ## Quick Start 50 | 51 | ### Python 52 | ```python 53 | from scrapester import ScrapesterApp 54 | 55 | # Initialize the client 56 | app = ScrapesterApp(api_key="your-api-key") 57 | 58 | # Scrape a single page 59 | result = app.scrape("https://example.com") 60 | print(result.markdown) 61 | 62 | # Crawl an entire website 63 | results = app.crawl( 64 | "https://example.com", 65 | options={ 66 | "max_pages": 10, 67 | "max_depth": 2 68 | } 69 | ) 70 | ``` 71 | 72 | ### JavaScript/TypeScript 73 | ```typescript 74 | import { ScrapesterApp } from 'scrapester'; 75 | 76 | // Initialize the client 77 | const app = new ScrapesterApp('your-api-key'); 78 | 79 | // Scrape a single page 80 | const result = await app.scrape('https://example.com'); 81 | console.log(result.markdown); 82 | 83 | // Crawl an entire website 84 | const results = await app.crawl('https://example.com', { 85 | maxPages: 10, 86 | maxDepth: 2 87 | }); 88 | ``` 89 | 90 | ## Response Format 91 | 92 | Scrapester returns clean, structured data in the following format: 93 | 94 | ```typescript 95 | interface CrawlerResponse { 96 | url: string; // The scraped URL 97 | markdown: string; // Clean, markdown-formatted content 98 | metadata: { // Page metadata 99 | title: string, 100 | description: string, 101 | // ... 
other meta tags 102 | }; 103 | timestamp: string; // ISO timestamp of when the page was scraped 104 | } 105 | ``` 106 | 107 | ## API Reference 108 | 109 | ### ScrapesterApp 110 | 111 | #### Constructor 112 | ```typescript 113 | new ScrapesterApp( 114 | apiKey: string, 115 | baseUrl?: string, // default: "http://localhost:8000" 116 | timeout?: number // default: 600 seconds 117 | ) 118 | ``` 119 | 120 | #### Methods 121 | 122 | ##### scrape(url: string) 123 | Scrapes a single URL and returns clean, markdown-formatted content. 124 | 125 | ##### crawl(url: string, options?) 126 | Crawls a website starting from the given URL. Options include: 127 | - `maxPages`: Maximum number of pages to crawl 128 | - `maxDepth`: Maximum crawl depth 129 | - `includePatterns`: URL patterns to include 130 | - `excludePatterns`: URL patterns to exclude 131 | 132 | ## Error Handling 133 | 134 | Scrapester provides detailed error information through the `APIError` class: 135 | 136 | ```typescript 137 | class APIError extends Error { 138 | statusCode?: number; 139 | response?: object; 140 | } 141 | ``` 142 | 143 | Common error scenarios: 144 | - `429`: Rate limit exceeded 145 | - `400`: Invalid request 146 | - `401`: Invalid API key 147 | - `500`: Server error 148 | 149 | ## Development 150 | 151 | ### Running Tests 152 | ```bash 153 | # Python 154 | pytest tests/ 155 | 156 | # JavaScript 157 | npm test 158 | ``` 159 | 160 | ### Building from Source 161 | ```bash 162 | # Python 163 | pip install -e ".[dev]" 164 | 165 | # JavaScript 166 | npm install 167 | npm run build 168 | ``` 169 | 170 | ## Contributing 171 | We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details. 172 | 173 | ## Support 174 | 175 | - 📖 [Documentation](https://docs.scrapester.dev) 176 | - 💬 [Discord Community](https://discord.gg/scrapester) 177 | - 📧 [Email Support](mailto:support@scrapester.dev) 178 | 179 | ## License 180 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
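The Error Handling section above lists the `APIError` fields and common status codes but no handling pattern; a short hedged sketch of acting on them with the Python SDK (`scrapester/crawler.py`) follows — the retry delay and the choice to retry only on 429 are arbitrary:

```python
import time

from scrapester.crawler import ScrapesterApp, APIError

client = ScrapesterApp(api_key="your-api-key")

try:
    result = client.scrape("https://example.com")
    print(result.metadata.get("title"))
except APIError as exc:
    if exc.status_code == 429:
        # Rate limited: back off briefly and retry once (delay is arbitrary).
        time.sleep(5)
        result = client.scrape("https://example.com")
    elif exc.status_code == 401:
        raise SystemExit("Invalid API key")
    else:
        print(f"Request failed ({exc.status_code}): {exc.response or exc}")
```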
181 | -------------------------------------------------------------------------------- /apps/api/src/core/crawler.py: -------------------------------------------------------------------------------- 1 | from playwright.async_api import async_playwright, Page, TimeoutError 2 | import asyncio 3 | from typing import Dict, List, Optional, Union 4 | import json 5 | from datetime import datetime 6 | import hashlib 7 | from bs4 import BeautifulSoup 8 | import markdown 9 | import logging 10 | from config import settings 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class ScrapesterCrawler: 16 | def __init__(self): 17 | self.browser = None 18 | self.context = None 19 | 20 | async def initialize(self): 21 | """Initialize the browser instance""" 22 | self.playwright = await async_playwright().start() 23 | self.browser = await self.playwright.chromium.launch( 24 | headless=False, args=["--no-sandbox", "--disable-setuid-sandbox"] 25 | ) 26 | self.context = await self.browser.new_context( 27 | viewport={"width": 1920, "height": 1080}, 28 | user_agent="Chrome/69.0.3497.100 Safari/537.36", 29 | ) 30 | 31 | async def close(self): 32 | """Clean up resources""" 33 | if self.context: 34 | await self.context.close() 35 | if self.browser: 36 | await self.browser.close() 37 | if hasattr(self, "playwright"): 38 | await self.playwright.stop() 39 | 40 | async def _smart_scroll(self, page: Page): 41 | """Smart scroll to handle dynamic loading""" 42 | try: 43 | last_height = await page.evaluate("document.body.scrollHeight") 44 | while True: 45 | await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") 46 | await page.wait_for_timeout(1000) 47 | new_height = await page.evaluate("document.body.scrollHeight") 48 | if new_height == last_height: 49 | break 50 | last_height = new_height 51 | except Exception as e: 52 | logger.warning(f"Smart scroll encountered an error: {e}") 53 | 54 | async def _extract_links(self, page: Page, base_url: str) -> List[str]: 55 | """Extract all valid links from the page""" 56 | links = await page.evaluate( 57 | """() => { 58 | const links = Array.from(document.links) 59 | .map(link => link.href) 60 | .filter(href => href.startsWith('http')); 61 | return [...new Set(links)]; 62 | }""" 63 | ) 64 | return [link for link in links if link.startswith(base_url)] 65 | 66 | async def _extract_content(self, page: Page) -> Dict: 67 | """Extract various content formats from the page""" 68 | html = await page.content() 69 | 70 | # Use BeautifulSoup for better content extraction 71 | soup = BeautifulSoup(html, "html.parser") 72 | 73 | # Remove unwanted elements 74 | for element in soup.select("script, style, noscript, iframe, img"): 75 | element.decompose() 76 | 77 | # Initialize markdown content 78 | md_parts = [] 79 | 80 | # Extract and format headers 81 | for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]): 82 | level = int(h.name[1]) # get header level 83 | md_parts.append(f"{'#' * level} {h.get_text(strip=True)}\n") 84 | 85 | # Extract and format paragraphs 86 | for p in soup.find_all("p"): 87 | text = p.get_text(strip=True) 88 | if text: 89 | # Check if it's a link paragraph 90 | links = p.find_all("a") 91 | if links: 92 | for link in links: 93 | href = link.get("href", "") 94 | text = link.get_text(strip=True) 95 | if href and text: 96 | md_parts.append(f"[**{text}**]({href})\n") 97 | else: 98 | md_parts.append(f"{text}\n") 99 | # Join all parts with proper spacing 100 | markdown_content = "\n".join(md_parts) 101 | # Get clean text 102 | text = 
soup.get_text(separator="\n", strip=True) 103 | 104 | # Convert to markdown 105 | # md = markdown.markdown(text) 106 | 107 | # Extract metadata 108 | metadata = await page.evaluate( 109 | """() => { 110 | const metadata = {}; 111 | 112 | // Basic metadata 113 | metadata.title = document.title; 114 | 115 | // Meta tags 116 | const metaTags = document.getElementsByTagName('meta'); 117 | for (let tag of metaTags) { 118 | const name = tag.getAttribute('name') || tag.getAttribute('property'); 119 | const content = tag.getAttribute('content'); 120 | if (name && content) { 121 | metadata[name] = content; 122 | } 123 | } 124 | 125 | // OpenGraph 126 | metadata.og = {}; 127 | document.querySelectorAll('meta[property^="og:"]').forEach(tag => { 128 | const property = tag.getAttribute('property').substring(3); 129 | metadata.og[property] = tag.getAttribute('content'); 130 | }); 131 | 132 | return metadata; 133 | }""" 134 | ) 135 | metadata = { 136 | "title": metadata.get("title", ""), 137 | "description": metadata.get("description", ""), 138 | "robots": metadata.get("robots", ""), 139 | } 140 | return { 141 | "url": page.url, 142 | "markdown": markdown_content, 143 | "metadata": metadata, 144 | "timestamp": datetime.utcnow().isoformat(), 145 | } 146 | 147 | async def scrape_url(self, url: str, options: Dict = None) -> Dict: 148 | """Scrape a single URL with advanced options""" 149 | if not self.context: 150 | await self.initialize() 151 | 152 | options = options or {} 153 | page = await self.context.new_page() 154 | 155 | try: 156 | # Configure page 157 | if options.get("timeout"): 158 | page.set_default_timeout(options["timeout"]) 159 | # Navigate to page 160 | try: 161 | response = await page.goto(url, wait_until="networkidle") 162 | except TimeoutError: 163 | pass 164 | if not response.ok: 165 | raise Exception(f"Failed to load page: {response.status}") 166 | 167 | # Handle dynamic content 168 | if options.get("wait_for_selector"): 169 | await page.wait_for_selector(options["wait_for_selector"]) 170 | 171 | if options.get("scroll", True): 172 | await self._smart_scroll(page) 173 | 174 | # Execute custom actions 175 | if options.get("actions"): 176 | await self.execute_actions(page, options["actions"]) 177 | 178 | # Take screenshot if requested 179 | screenshot = None 180 | if options.get("screenshot"): 181 | screenshot = await page.screenshot( 182 | full_page=True, type="jpeg", quality=80 183 | ) 184 | 185 | # Extract content 186 | content = await self._extract_content(page) 187 | 188 | if screenshot: 189 | content["screenshot"] = screenshot 190 | 191 | return content 192 | 193 | except Exception as e: 194 | logger.error(f"Error scraping {url}: {e}") 195 | raise 196 | finally: 197 | await page.close() 198 | 199 | async def execute_actions(self, page: Page, actions: List[Dict]): 200 | """Execute custom actions on the page""" 201 | for action in actions: 202 | action_type = action.get("type") 203 | try: 204 | if action_type == "click": 205 | await page.click(action["selector"]) 206 | elif action_type == "type": 207 | await page.type(action["selector"], action["text"]) 208 | elif action_type == "wait": 209 | await page.wait_for_timeout(action["duration"]) 210 | elif action_type == "scroll": 211 | await self._smart_scroll(page) 212 | # Add more action types as needed 213 | except Exception as e: 214 | logger.error(f"Error executing action {action_type}: {e}") 215 | raise 216 | 217 | async def crawl_website(self, start_url: str, options: Dict = None) -> List[Dict]: 218 | """Crawl an entire website with 
advanced options""" 219 | if not self.context: 220 | await self.initialize() 221 | 222 | options = options or {} 223 | max_pages = options.get("max_pages", 10) 224 | max_depth = options.get("max_depth", 3) 225 | 226 | visited = set() 227 | to_visit = {(start_url, 0)} # (url, depth) 228 | results = [] 229 | 230 | while to_visit and len(visited) < max_pages: 231 | url, depth = to_visit.pop() 232 | if url in visited or depth > max_depth: 233 | continue 234 | 235 | try: 236 | result = await self.scrape_url(url, options) 237 | results.append(result) 238 | visited.add(url) 239 | 240 | if depth < max_depth: 241 | page = await self.context.new_page() 242 | await page.goto(url) 243 | new_urls = await self._extract_links(page, start_url) 244 | await page.close() 245 | 246 | for new_url in new_urls: 247 | if new_url not in visited: 248 | to_visit.add((new_url, depth + 1)) 249 | 250 | except Exception as e: 251 | logger.error(f"Error crawling {url}: {e}") 252 | 253 | return results 254 | --------------------------------------------------------------------------------
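For completeness, the FastAPI service defined in `main.py` and `api/routes/crawl.py` above can also be called without either SDK. A minimal sketch using `requests`, assuming a local `uvicorn main:app` on port 8000 (the Docker image instead listens on port 80); URLs and option values are placeholders:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumption: local dev server; adjust to your deployment

# Single-page scrape: POST /v1/scrape with a JSON body matching ScrapeRequest
resp = requests.post(f"{BASE_URL}/v1/scrape", json={"url": "https://example.com"}, timeout=600)
resp.raise_for_status()
payload = resp.json()  # {"success": true, "data": {...}}
print(payload["data"]["markdown"])

# Site crawl: POST /v1/crawl with CrawlRequest; options mirror CrawlOptions in schemas.py
resp = requests.post(
    f"{BASE_URL}/v1/crawl",
    json={"url": "https://example.com", "options": {"max_pages": 5, "max_depth": 1}},
    timeout=600,
)
resp.raise_for_status()
print(resp.json()["stats"])  # {"pages_crawled": ..., "successful_crawls": ...}
```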