├── .github └── workflows │ ├── book.yml │ ├── nodejs.yml │ ├── python.yml │ └── rust.yml ├── .gitignore ├── LICENSE ├── README.md ├── book ├── .gitignore ├── book.toml └── src │ ├── README.md │ ├── SUMMARY.md │ ├── cli │ └── getting-started.md │ ├── env.md │ ├── javascript │ ├── crawl.md │ ├── getting-started.md │ └── scrape.md │ ├── python │ ├── async-crawl.md │ ├── crawl.md │ ├── getting-started.md │ └── scrape.md │ ├── rust │ └── getting-started.md │ ├── simple-example.md │ └── website.md ├── cli ├── Cargo.lock ├── Cargo.toml ├── README.md └── src │ ├── args.rs │ ├── main.rs │ └── mod.rs ├── javascript ├── .npmignore ├── LICENSE ├── README.md ├── __tests__ │ └── spiderwebai.test.ts ├── package-lock.json ├── package.json ├── sample.env ├── src │ ├── client.ts │ ├── config.ts │ ├── index.ts │ └── utils │ │ ├── process-chunk.ts │ │ └── stream-reader.ts └── tsconfig.json ├── python ├── LICENSE ├── README.md ├── example.py ├── example_async.py ├── example_streaming.py ├── requirements.txt ├── setup.py ├── spider │ ├── __init__.py │ ├── async_spider.py │ ├── spider.py │ ├── spider.pyi │ └── spider_types.py └── tests │ ├── test_async_spider.py │ ├── test_async_spider_integration.py │ ├── test_spider.py │ └── test_spider_integration.py └── rust ├── Cargo.lock ├── Cargo.toml ├── README.md └── src └── lib.rs /.github/workflows/book.yml: -------------------------------------------------------------------------------- 1 | name: github pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-latest 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }} 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Setup mdBook 18 | uses: peaceiris/actions-mdbook@v2 19 | with: 20 | mdbook-version: 'latest' 21 | 22 | - run: cd book && mdbook build 23 | 24 | - name: Deploy 25 | uses: peaceiris/actions-gh-pages@v4 26 | if: ${{ github.ref == 'refs/heads/main' }} 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | publish_dir: ./book/book -------------------------------------------------------------------------------- /.github/workflows/nodejs.yml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | node-version: [18.x, 20.x] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Use Node.js ${{ matrix.node-version }} 21 | uses: actions/setup-node@v3 22 | with: 23 | node-version: ${{ matrix.node-version }} 24 | cache: "npm" 25 | cache-dependency-path: ./javascript/package-lock.json 26 | 27 | - run: npm ci 28 | working-directory: ./javascript 29 | 30 | - run: npm run build --if-present 31 | working-directory: ./javascript 32 | 33 | - run: npm test 34 | working-directory: ./javascript 35 | env: 36 | SPIDER_API_KEY: ${{ secrets.SPIDER_API_KEY }} 37 | SPIDER_EMAIL: ${{ secrets.SPIDER_EMAIL }} 38 | SPIDER_PASSWORD: ${{ secrets.SPIDER_PASSWORD }} 39 | SUPABASE_AUTO_REFRESH_TOKEN: "false" -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | python-version: 
[3.11, 3.12] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install . 29 | pip install -r requirements.txt 30 | working-directory: ./python 31 | 32 | - name: Run tests 33 | run: | 34 | pytest 35 | working-directory: ./python/tests 36 | env: 37 | SPIDER_API_KEY: ${{ secrets.SPIDER_API_KEY }} 38 | SPIDER_EMAIL: ${{ secrets.SPIDER_EMAIL }} 39 | SPIDER_PASSWORD: ${{ secrets.SPIDER_PASSWORD }} 40 | SUPABASE_AUTO_REFRESH_TOKEN: "false" -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust CI 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | rust-version: [stable, beta] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Rust ${{ matrix.rust-version }} 21 | uses: dtolnay/rust-toolchain@stable 22 | with: 23 | toolchain: ${{ matrix.rust-version }} 24 | 25 | - name: Cache cargo registry 26 | uses: actions/cache@v3 27 | with: 28 | path: ~/.cargo/registry 29 | key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} 30 | restore-keys: | 31 | ${{ runner.os }}-cargo-registry- 32 | 33 | - name: Cache cargo index 34 | uses: actions/cache@v3 35 | with: 36 | path: ~/.cargo/git 37 | key: ${{ runner.os }}-cargo-git-${{ hashFiles('**/Cargo.lock') }} 38 | restore-keys: | 39 | ${{ runner.os }}-cargo-git- 40 | 41 | - name: Check toolchain 42 | run: rustc --version 43 | 44 | - name: Build 45 | run: cargo build --verbose 46 | working-directory: ./rust 47 | 48 | - name: Run tests 49 | run: cargo test --verbose --lib --release 50 | working-directory: ./rust 51 | env: 52 | SPIDER_API_KEY: ${{ secrets.SPIDER_API_KEY }} 53 | SPIDER_EMAIL: ${{ secrets.SPIDER_EMAIL }} 54 | SPIDER_PASSWORD: ${{ secrets.SPIDER_PASSWORD }} 55 | SUPABASE_AUTO_REFRESH_TOKEN: "false" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | dist 4 | target 5 | python/build 6 | python/spiderwebai_py.egg-info 7 | javascript/coverage 8 | # Compiled Python files 9 | *.pyc 10 | *.pyo 11 | __pycache__/ 12 | 13 | # Distribution / packaging 14 | dist/ 15 | build/ 16 | *.egg-info/ 17 | *.egg 18 | 19 | # Virtual environment 20 | venv/ 21 | .venv/ 22 | env/ 23 | .env/ 24 | .env 25 | 26 | # IDE and editor files 27 | .vscode/ 28 | .idea/ 29 | *.sublime-project 30 | *.sublime-workspace 31 | 32 | # Testing and coverage 33 | .coverage 34 | .pytest_cache/ 35 | htmlcov/ 36 | 37 | # Documentation 38 | docs/_build/ 39 | docs/api/ 40 | 41 | # Miscellaneous 42 | *.log 43 | .DS_Store 44 | Thumbs.db 45 | deploy.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Spider Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, 
including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spider Clients 2 | 3 | Discover the ultimate toolkit for integrating the fastest and most efficient web crawler **Spider** into your projects. This repository provides client libraries designed to streamline your use of [Spider Cloud](https://spider.cloud) services from various programming environments. Whether you're tackling web crawling or data indexing, our high-performance solutions have you covered. 4 | 5 | ## Python 6 | 7 | Leverage the power of Spider in your Python applications. Navigate to our [Python client library directory](./python/) for installation instructions, usage guides, and examples. Get ready to supercharge your data extraction tasks with the efficiency and speed of Spider within your Python environment. 8 | 9 | ## JavaScript 10 | 11 | Integrate Spider effortlessly into your JavaScript projects. Visit our [JavaScript client library directory](./javascript/) to explore how you can utilize Spider in Node.js or browser environments. Enhance your web scraping capabilities and improve data collection strategies with our cutting-edge technology. 12 | 13 | ## Rust 14 | 15 | Incorporate Spider smoothly into your Rust projects. Visit our [Rust client library directory](./rust/) to learn how to use Spider in your applications. Enhance your web scraping capabilities and unlock new possibilities with our advanced technology. 16 | 17 | ## CLI 18 | 19 | Integrate Spider into your CLI with ease. Visit our [CLI client library directory](./cli/) to explore how you can utilize Spider in your command-line applications. 20 | 21 | --- 22 | 23 | ### Features 24 | 25 | - **Concurrent Crawling:** Maximize your data extraction efficiency with Spider's advanced concurrency models. 26 | - **Streaming:** Stream crawled data in real-time to ensure timely processing and analysis. 27 | - **Headless Chrome Rendering:** Capture JavaScript-rendered page contents with ease. 28 | - **HTTP Proxies Support:** Navigate anonymously and bypass content restrictions. 29 | - **Cron Jobs:** Schedule your crawling tasks to run automatically, saving time and resources. 30 | - **Smart Mode:** Automate crawling tasks with AI-driven strategies for smarter data collection. 31 | - **Blacklisting, Whitelisting, and Budgeting Depth:** Fine-tune your crawls to focus on relevant data and manage resource utilization. 32 | - **Dynamic AI Prompt Scripting Headless:** Use AI to script dynamic interactions with web pages, simulating real user behavior. 
33 | 34 | ### Getting Started 35 | 36 | Dive into the world of high-speed web crawling with Spider. Whether you're looking to deploy Spider locally or utilize our hosted services, we've got you covered. Start by exploring our client libraries above, or visit the main [Spider repository](https://github.com/spider-rs/spider) for comprehensive documentation, installation guides, and more. 37 | 38 | ### Support & Contribution 39 | 40 | Your feedback and contributions are highly valued. Should you encounter any issues or have suggestions for improvements, please feel free to open an issue or submit a pull request. Visit our [Contributing Guidelines](https://github.com/spider-rs/spider/blob/master/CONTRIBUTING.md) for more information on how you can contribute to the Spider project. 41 | 42 | We're on a mission to make web crawling faster, smarter, and more accessible than ever before. Join us in redefining the boundaries of data extraction and indexing with **Spider**. 43 | -------------------------------------------------------------------------------- /book/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Jeff Mendez "] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "spider-client" 7 | 8 | [output.html] 9 | git-repository-url = "https://github.com/spider-rs/spider-clients/tree/main/book" 10 | edit-url-template = "https://github.com/spider-rs/spider-clients/edit/main/book/{path}" 11 | -------------------------------------------------------------------------------- /book/src/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `spider-client` is a client library to use with the [Spider Cloud](https://spider.cloud) web crawler and scraper. 4 | 5 | - Concurrent 6 | - Streaming 7 | - Headless Chrome 8 | - HTTP Proxies 9 | - Cron Jobs 10 | - Subscriptions 11 | - AI Scraping and Event Driven Actions 12 | - Blacklisting and Budgeting Depth 13 | - Exponential Backoff -------------------------------------------------------------------------------- /book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [Introduction](./README.md) 4 | 5 | # User Guide 6 | 7 | - [A Simple Example](./simple-example.md) 8 | 9 | # Python 10 | 11 | - [Getting Started](./python/getting-started.md) 12 | - [Crawl](./python/crawl.md) 13 | - [Scrape](./python/scrape.md) 14 | - [Async Crawl](./python/async-crawl.md) 15 | 16 | # Javascript 17 | 18 | - [Getting Started](./javascript/getting-started.md) 19 | - [Crawl](./javascript/crawl.md) 20 | - [Scrape](./javascript/scrape.md) 21 | 22 | # Rust 23 | 24 | - [Getting Started](./rust/getting-started.md) 25 | 26 | # CLI 27 | 28 | - [Getting Started](./cli/getting-started.md) 29 | -------------------------------------------------------------------------------- /book/src/cli/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Spider Cloud CLI is a command-line interface to interact with the [Spider Cloud](https://spider.cloud) web crawler. It allows you to scrape, crawl, search, and perform various other web-related tasks through simple commands. 
4 | 5 | ## Installation 6 | 7 | Install the CLI using [`homebrew`](https://brew.sh/) or [`cargo`](https://doc.rust-lang.org/cargo/) from [crates.io](https://crates.io): 8 | 9 | ### Homebrew 10 | 11 | ```sh 12 | brew tap spider-rs/spider-cloud-cli 13 | brew install spider-cloud-cli 14 | ``` 15 | 16 | ### Cargo 17 | 18 | ```sh 19 | cargo install spider-cloud-cli 20 | ``` 21 | 22 | ## Usage 23 | 24 | After installing, you can use the CLI by typing `spider-cloud-cli` followed by a command and its respective arguments. 25 | 26 | ### Authentication 27 | 28 | Before using most of the commands, you need to authenticate by providing an API key: 29 | 30 | ```sh 31 | spider-cloud-cli auth --api_key YOUR_API_KEY 32 | ``` 33 | 34 | ### Commands 35 | 36 | #### Scrape 37 | 38 | Scrape data from a specified URL. 39 | 40 | ```sh 41 | spider-cloud-cli scrape --url http://example.com 42 | ``` 43 | 44 | #### Crawl 45 | 46 | Crawl a specified URL with an optional limit on the number of pages. 47 | 48 | ```sh 49 | spider-cloud-cli crawl --url http://example.com --limit 10 50 | ``` 51 | 52 | #### Links 53 | 54 | Fetch links from a specified URL. 55 | 56 | ```sh 57 | spider-cloud-cli links --url http://example.com 58 | ``` 59 | 60 | #### Screenshot 61 | 62 | Take a screenshot of a specified URL. 63 | 64 | ```sh 65 | spider-cloud-cli screenshot --url http://example.com 66 | ``` 67 | 68 | #### Search 69 | 70 | Search for a query. 71 | 72 | ```sh 73 | spider-cloud-cli search --query "example query" 74 | ``` 75 | 76 | #### Transform 77 | 78 | Transform specified data. 79 | 80 | ```sh 81 | spider-cloud-cli transform --data "sample data" 82 | ``` 83 | 84 | #### Extract Contacts 85 | 86 | Extract contact information from a specified URL. 87 | 88 | ```sh 89 | spider-cloud-cli extract_contacts --url http://example.com 90 | ``` 91 | 92 | #### Label 93 | 94 | Label data from a specified URL. 95 | 96 | ```sh 97 | spider-cloud-cli label --url http://example.com 98 | ``` 99 | 100 | #### Get Crawl State 101 | 102 | Get the crawl state of a specified URL. 103 | 104 | ```sh 105 | spider-cloud-cli get_crawl_state --url http://example.com 106 | ``` 107 | 108 | #### Query 109 | 110 | Query records of a specified domain. 111 | 112 | ```sh 113 | spider-cloud-cli query --domain example.com 114 | ``` 115 | 116 | #### Get Credits 117 | 118 | Fetch the account credits left. 119 | 120 | ```sh 121 | spider-cloud-cli get_credits 122 | ``` -------------------------------------------------------------------------------- /book/src/env.md: -------------------------------------------------------------------------------- 1 | # Environment 2 | 3 | Env variables to adjust the project. 4 | 5 | ## SPIDER_API_KEY 6 | 7 | Set this value to the API key you create at [Spider Cloud API Keys](https://spider.cloud/api-keys) after create an account and adding credits. 8 | 9 | ```sh 10 | SPIDER_API_KEY=sk-myspiderkey 11 | ``` 12 | -------------------------------------------------------------------------------- /book/src/javascript/crawl.md: -------------------------------------------------------------------------------- 1 | # Crawl 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Crawl a website and return the content. 
6 | 7 | ```javascript 8 | import { Spider } from "@spider-cloud/spider-client"; 9 | 10 | const app = new Spider(); 11 | const url = "https://spider.cloud"; 12 | const scrapedData = await app.crawlUrl(url, { limit: 10 }); 13 | console.log(scrapedData); 14 | ``` 15 | 16 | The `crawlUrl` method returns the content of the website in markdown format as default. We set the `limit` parameter to 10 to limit the number of pages to crawl. The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 17 | 18 | Next we will see how to crawl with with different parameters. 19 | 20 | ## Crawl with different parameters 21 | 22 | The `crawlUrl` method has the following parameters: 23 | 24 | - `url` (str): The URL of the website to crawl. 25 | 26 | the following are recommended parameters and can be set in the `params` dictionary: 27 | 28 | - `limit` (int): The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 29 | - `request_timeout` (int): The maximum amount of time to wait for a response from the website. 30 | - `stealth` (bool): Whether to use stealth mode. Default is `False` on chrome. 31 | - visit the [documentation](https://spider.cloud/docs/api?ref=javascript-sdk-book) for more parameters. 32 | 33 | ```javascript 34 | import { Spider } from "@spider-cloud/spider-client"; 35 | 36 | const app = new Spider(); 37 | const url = "https://spider.cloud"; 38 | const scrapedData = await app.crawlUrl(url, { 39 | limit: 10, 40 | anti_bot: true, 41 | return_format: "raw", 42 | }); 43 | console.log(scrapedData); 44 | ``` 45 | 46 | If you have a lot of params, setting them inside the `crawlUrl` method can be cumbersome. You can set them in a seperate `params` variable that has the `SpiderParams` type which is also available in the `spider` package. You will have to use Typescript if you want type annotations. 47 | 48 | ```ts 49 | import { Spider } from "@spider-cloud/spider-client"; 50 | import type { SpiderParams } from "@spider-cloud/spider-client/dist/config"; 51 | 52 | const app = new Spider(); 53 | const url = "https://spider.cloud"; 54 | const params: SpiderParams = { 55 | return_format: ["raw", "markdown"], 56 | anti_bot: true, 57 | }; 58 | const scrapedData = await app.crawlUrl(url, params); 59 | console.log(scrapedData); 60 | ``` 61 | -------------------------------------------------------------------------------- /book/src/javascript/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | To be able to use the javascript SDK you will (of course) have to install it. You can do so with your package manager of choice. 4 | 5 | ```bash 6 | npm install @spider-cloud/spider-client 7 | ``` 8 | 9 | ```bash 10 | yarn add @spider-cloud/spider-client 11 | ``` 12 | 13 | [Here](https://www.npmjs.com/package/@spider-cloud/spider-client) is the link to the package on npm. 14 | 15 | ## Setting & Getting Api Key 16 | 17 | To use the SDK you will need an API key. You can get one by signing up on [spider.cloud](https://spider.cloud?ref=javascript-sdk-book). 18 | 19 | Then you need to set the API key in your environment variables. 20 | 21 | ```bash 22 | export SPIDER_API_KEY=your_api_key 23 | ``` 24 | 25 | if you don't want to set the API key in your environment variables you can pass it as an argument to the `Spider` class. 
26 | 27 | ```javascript 28 | import { Spider } from "@spider-cloud/spider-client"; 29 | ``` 30 | 31 | We recommend setting the API key in your environment variables. 32 | -------------------------------------------------------------------------------- /book/src/javascript/scrape.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Scrape a website and return the content. 6 | 7 | ```javascript 8 | import { Spider } from "@spider-cloud/spider-client"; 9 | 10 | const app = new Spider(); 11 | const url = "https://spider.cloud"; 12 | const scrapedData = await app.scrapeUrl(url); 13 | console.log(scrapedData); 14 | ``` 15 | 16 | The `scrapeUrl` method returns the content of the website in markdown format as default. Next we will see how to scrape with with different parameters. 17 | 18 | ## Scrape with different parameters 19 | 20 | The `scrapeUrl` method has the following parameters: 21 | 22 | - `url` (str): The URL of the website to scrape. 23 | 24 | the following are optional parameters and can be set in the `params` dictionary: 25 | 26 | - `request` ("http", "chrome", "smart") : The type of request to make. Default is "http". 27 | - `return_format` ("raw", "markdown", "commonmark", "html2text", "text", "bytes") : The format in which to return the scraped data. Default is "markdown". 28 | - `stealth`, `anti_bot` and a ton of other parameters that you can find in the [documentation](https://spider.cloud/docs/api?ref=javascript-sdk-book). 29 | 30 | ```javascript 31 | import { Spider } from "@spider-cloud/spider-client"; 32 | 33 | const app = new Spider(); 34 | const url = "https://spider.cloud"; 35 | const scrapedData = await app.scrapeUrl(url, { 36 | return_format: "raw", 37 | anti_bot: true, 38 | }); 39 | console.log(scrapedData); 40 | ``` 41 | 42 | If you have a lot of params, setting them inside the `scrapeUrl` method can be cumbersome. You can set them in a seperate `params` variable that has the `SpiderParams` type which is also available in the `spider` package. You will have to use Typescript if you want type annotations. 43 | 44 | ```ts 45 | import { Spider } from "@spider-cloud/spider-client"; 46 | import type { SpiderParams } from "@spider-cloud/spider-client/dist/config"; 47 | 48 | const app = new Spider(); 49 | const url = "https://spider.cloud"; 50 | const params: SpiderParams = { 51 | return_format: "raw", 52 | anti_bot: true, 53 | }; 54 | const scrapedData = await app.scrapeUrl(url, params); 55 | console.log(scrapedData); 56 | ``` 57 | -------------------------------------------------------------------------------- /book/src/python/async-crawl.md: -------------------------------------------------------------------------------- 1 | # Async Crawl 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Crawl a website asynchronously and return the content. 
6 | 7 | ```python 8 | import asyncio 9 | 10 | from spider import AsyncSpider 11 | 12 | url = "https://spider.cloud" 13 | 14 | 15 | async def async_crawl_url(url, params): 16 | async with AsyncSpider() as app: 17 | crawled_data = [] 18 | async for data in app.crawl_url(url, params=params): 19 | crawled_data.append(data) 20 | return crawled_data 21 | 22 | 23 | result = asyncio.run(async_crawl_url(url, params={"limit": 10})) 24 | print(result) 25 | ``` 26 | 27 | We use the `AsyncSpider` class to create an asynchronous instance of the Spider class. We then use the `async for` loop to iterate over the results of the `crawl_url` method. The `crawl_url` method returns a generator that yields the crawled data. We append the data to a list and return it. Simsalabim, we have crawled a website asynchronously. 28 | 29 | Next we will see how to crawl asynchronously with different parameters. 30 | 31 | ## Async Crawl with different parameters 32 | 33 | The `crawl_url` method has the following parameters: 34 | 35 | - `url` (str): The URL of the website to crawl. 36 | 37 | the following are recommended parameters and can be set in the `params` dictionary: 38 | 39 | - `limit` (int): The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 40 | - `request_timeout` (int): The maximum amount of time to wait for a response from the website. 41 | - `stealth` (bool): Whether to use stealth mode. Default is `False` on chrome. 42 | - a ton more, visit the [documentation](https://spider.cloud/docs/api?ref=python-sdk-book) for more parameters. 43 | 44 | ```python 45 | import asyncio 46 | 47 | from spider import AsyncSpider 48 | 49 | url = "https://spider.cloud" 50 | 51 | 52 | async def async_crawl_url(url, params): 53 | async with AsyncSpider() as app: 54 | crawled_data = [] 55 | async for data in app.crawl_url(url, params=params): 56 | crawled_data.append(data) 57 | return crawled_data 58 | 59 | 60 | result = asyncio.run( 61 | async_crawl_url( 62 | url, 63 | params={ 64 | "limit": 10, 65 | "request_timeout": 10, 66 | "stealth": True, 67 | "return_format": "html", 68 | }, 69 | ) 70 | ) 71 | print(result) 72 | ``` 73 | 74 | If you have a lot of params, setting them inside the `crawl_url` method can be cumbersome. You can set them in a seperate `params` variable that has the `RequestParams` type which is also available in the `spider` package. 75 | 76 | ```python 77 | import asyncio 78 | 79 | from spider import AsyncSpider, spider_types 80 | 81 | url = "https://spider.cloud" 82 | 83 | 84 | async def async_crawl_url(url, params): 85 | async with AsyncSpider() as app: 86 | crawled_data = [] 87 | async for data in app.crawl_url(url, params=params): 88 | crawled_data.append(data) 89 | return crawled_data 90 | 91 | 92 | params: spider_types.RequestParamsDict = { 93 | "limit": 10, 94 | "request_timeout": 10, 95 | "stealth": True, 96 | # Easier to read and intellisense will help you with the available options 97 | } 98 | 99 | result = asyncio.run(async_crawl_url(url, params=params)) 100 | print(result) 101 | ``` 102 | -------------------------------------------------------------------------------- /book/src/python/crawl.md: -------------------------------------------------------------------------------- 1 | # Crawl 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Crawl a website and return the content. 
6 | 7 | ```python 8 | from spider import Spider 9 | 10 | app = Spider() 11 | url = "https://spider.cloud" 12 | crawled_data = app.crawl_url(url, params={"limit": 10}) 13 | print(crawled_data) 14 | ``` 15 | 16 | The `crawl_url` method returns the content of the website in markdown format as default. We set the `limit` parameter to 10 to limit the number of pages to crawl. The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 17 | 18 | Next we will see how to crawl with with different parameters. 19 | 20 | ## Crawl with different parameters 21 | 22 | The `crawl_url` method has the following parameters: 23 | 24 | - `url` (str): The URL of the website to crawl. 25 | 26 | the following are recommended parameters and can be set in the `params` dictionary: 27 | 28 | - `limit` (int): The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 29 | - `request_timeout` (int): The maximum amount of time to wait for a response from the website. 30 | - `stealth` (bool): Whether to use stealth mode. Default is `False` on chrome. 31 | - visit the [documentation](https://spider.cloud/docs/api?ref=python-sdk-book) for more parameters. 32 | 33 | ```python 34 | from spider import Spider 35 | 36 | app = Spider() 37 | url = "https://spider.cloud" 38 | crawled_data = app.crawl_url( 39 | url, params={"limit": 10, "request_timeout": 10, "stealth": True} 40 | ) 41 | 42 | print(crawled_data) 43 | ``` 44 | 45 | If you have a lot of params, setting them inside the `crawl_url` method can be cumbersome. You can set them in a seperate `params` variable that has the `RequestParams` type which is also available in the `spider` package. 46 | 47 | ```python 48 | from spider import Spider, spider_types 49 | 50 | params: spider_types.RequestParamsDict = { 51 | "limit": 10, 52 | "request_timeout": 10, 53 | "stealth": True, 54 | "return_format": [ "raw", "markdown" ], 55 | # Easier to read and intellisense will help you with the available options 56 | } 57 | 58 | app = Spider() 59 | url = "https://spider.cloud" 60 | crawled_data = app.crawl_url(url, params) 61 | 62 | print(crawled_data) 63 | ``` 64 | -------------------------------------------------------------------------------- /book/src/python/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | To use the python SDK you will (of course) have to install it :) 4 | 5 | ```bash 6 | pip install spider-client 7 | ``` 8 | 9 | [Here](https://pypi.org/project/spider-client/) is the link to the package on PyPi. 10 | 11 | ## Setting & Getting Api Key 12 | 13 | To use the SDK you will need an API key. You can get one by signing up on [spider.cloud](https://spider.cloud?ref=python-sdk-book). 14 | 15 | Then you need to set the API key in your environment variables. 16 | 17 | ```bash 18 | export SPIDER_API_KEY=your_api_key 19 | ``` 20 | 21 | if you don't want to set the API key in your environment variables you can pass it as an argument to the `Spider` class. 22 | 23 | ```python 24 | from spider import Spider 25 | app = Spider(api_key='your_api_key') 26 | ``` 27 | 28 | We recommend setting the API key in your environment variables. 
29 | -------------------------------------------------------------------------------- /book/src/python/scrape.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Scrape a website and return the content. 6 | 7 | ```python 8 | from spider import Spider 9 | 10 | app = Spider() 11 | url = 'https://spider.cloud' 12 | scraped_data = app.scrape_url(url) 13 | 14 | print(scraped_data) 15 | ``` 16 | 17 | The `scrape_url` method returns the content of the website in markdown format as default. Next we will see how to scrape with with different parameters. 18 | 19 | ## Scrape with different parameters 20 | 21 | The `scrape_url` method has the following parameters: 22 | 23 | - `url` (str): The URL of the website to scrape. 24 | 25 | the following are optional parameters and can be set in the `params` dictionary: 26 | 27 | - `request` ("http", "chrome", "smart") : The type of request to make. Default is "http". 28 | - `return_format` ("raw", "markdown", "commonmark", "html2text", "text", "bytes") : The format in which to return the scraped data. Default is "markdown". 29 | - `stealth`, `anti_bot` and a ton of other parameters that you can find in the [documentation](https://spider.cloud/docs/api?ref=python-sdk-book). 30 | 31 | ```python 32 | from spider import Spider 33 | 34 | app = Spider() 35 | url = "https://spider.cloud" 36 | scraped_data = app.scrape_url(url, params={"request_timeout": 10, "stealth": True}) 37 | 38 | print(scraped_data) 39 | ``` 40 | 41 | If you have a lot of params, setting them inside the `scrape_url` method can be cumbersome. You can set them in a seperate `params` variable that has the `RequestParams` type which is also available in the `spider` package. 42 | 43 | ```python 44 | from spider import Spider, spider_types 45 | 46 | params: spider_types.RequestParamsDict = { 47 | "request_timeout": 10, 48 | "stealth": True, 49 | # Easier to read and intellisense will help you with the available options 50 | } 51 | 52 | app = Spider() 53 | url = "https://spider.cloud" 54 | scraped_data = app.scrape_url(url, params) 55 | 56 | print(scraped_data) 57 | ``` 58 | -------------------------------------------------------------------------------- /book/src/rust/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | The Spider Cloud Rust SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API. 4 | 5 | ## Installation 6 | 7 | To use the Spider Cloud Rust SDK, include the following in your `Cargo.toml`: 8 | 9 | ```toml 10 | [dependencies] 11 | spider-client = "0.1" 12 | ``` 13 | 14 | ## Usage 15 | 16 | 1. Get an API key from [spider.cloud](https://spider.cloud) 17 | 2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as an argument when creating an instance of the `Spider` struct. 
18 | 19 | Here's an example of how to use the SDK: 20 | 21 | ```rust 22 | use serde_json::json; 23 | use std::env; 24 | 25 | #[tokio::main] 26 | async fn main() { 27 | // Set the API key as an environment variable 28 | env::set_var("SPIDER_API_KEY", "your_api_key"); 29 | 30 | // Initialize the Spider with your API key 31 | let spider = Spider::new(None).expect("API key must be provided"); 32 | 33 | let url = "https://spider.cloud"; 34 | 35 | // Scrape a single URL 36 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 37 | 38 | println!("Scraped Data: {:?}", scraped_data); 39 | 40 | // Crawl a website 41 | let crawler_params = RequestParams { 42 | limit: Some(1), 43 | proxy_enabled: Some(true), 44 | store_data: Some(false), 45 | metadata: Some(false), 46 | request: Some(RequestType::Http), 47 | ..Default::default() 48 | }; 49 | 50 | let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 51 | 52 | println!("Crawl Result: {:?}", crawl_result); 53 | } 54 | ``` 55 | 56 | ### Scraping a URL 57 | 58 | To scrape data from a single URL: 59 | 60 | ```rust 61 | let url = "https://example.com"; 62 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 63 | ``` 64 | 65 | ### Crawling a Website 66 | 67 | To automate crawling a website: 68 | 69 | ```rust 70 | let url = "https://example.com"; 71 | let crawl_params = RequestParams { 72 | limit: Some(200), 73 | request: Some(RequestType::Smart), 74 | ..Default::default() 75 | }; 76 | let crawl_result = spider.crawl_url(url, Some(crawl_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 77 | ``` 78 | 79 | #### Crawl Streaming 80 | 81 | Stream crawl the website in chunks to scale with a callback: 82 | 83 | ```rust 84 | fn handle_json(json_obj: serde_json::Value) { 85 | println!("Received chunk: {:?}", json_obj); 86 | } 87 | 88 | let url = "https://example.com"; 89 | let crawl_params = RequestParams { 90 | limit: Some(200), 91 | store_data: Some(false), 92 | ..Default::default() 93 | }; 94 | 95 | spider.crawl_url( 96 | url, 97 | Some(crawl_params), 98 | true, 99 | "application/json", 100 | Some(handle_json) 101 | ).await.expect("Failed to crawl the URL"); 102 | ``` 103 | 104 | ### Search 105 | 106 | Perform a search for websites to crawl or gather search results: 107 | 108 | ```rust 109 | let query = "a sports website"; 110 | let crawl_params = RequestParams { 111 | request: Some(RequestType::Smart), 112 | search_limit: Some(5), 113 | limit: Some(5), 114 | fetch_page_content: Some(true), 115 | ..Default::default() 116 | }; 117 | let crawl_result = spider.search(query, Some(crawl_params), false, "application/json").await.expect("Failed to perform search"); 118 | ``` 119 | 120 | ### Retrieving Links from a URL(s) 121 | 122 | Extract all links from a specified URL: 123 | 124 | ```rust 125 | let url = "https://example.com"; 126 | let links = spider.links(url, None, false, "application/json").await.expect("Failed to retrieve links from URL"); 127 | ``` 128 | 129 | ### Transform 130 | 131 | Transform HTML to markdown or text lightning fast: 132 | 133 | ```rust 134 | let data = vec![json!({"html": "

<html><body><h1>Hello world</h1></body></html>
"})]; 135 | let params = RequestParams { 136 | readability: Some(false), 137 | return_format: Some(ReturnFormat::Markdown), 138 | ..Default::default() 139 | }; 140 | let result = spider.transform(data, Some(params), false, "application/json").await.expect("Failed to transform HTML to markdown"); 141 | println!("Transformed Data: {:?}", result); 142 | ``` 143 | 144 | ### Taking Screenshots of a URL(s) 145 | 146 | Capture a screenshot of a given URL: 147 | 148 | ```rust 149 | let url = "https://example.com"; 150 | let screenshot = spider.screenshot(url, None, false, "application/json").await.expect("Failed to take screenshot of URL"); 151 | ``` 152 | 153 | ### Extracting Contact Information 154 | 155 | Extract contact details from a specified URL: 156 | 157 | ```rust 158 | let url = "https://example.com"; 159 | let contacts = spider.extract_contacts(url, None, false, "application/json").await.expect("Failed to extract contacts from URL"); 160 | println!("Extracted Contacts: {:?}", contacts); 161 | ``` 162 | 163 | ### Labeling Data from a URL(s) 164 | 165 | Label the data extracted from a particular URL: 166 | 167 | ```rust 168 | let url = "https://example.com"; 169 | let labeled_data = spider.label(url, None, false, "application/json").await.expect("Failed to label data from URL"); 170 | println!("Labeled Data: {:?}", labeled_data); 171 | ``` 172 | 173 | ### Checking Crawl State 174 | 175 | You can check the crawl state of a specific URL: 176 | 177 | ```rust 178 | let url = "https://example.com"; 179 | let state = spider.get_crawl_state(url, None, false, "application/json").await.expect("Failed to get crawl state for URL"); 180 | println!("Crawl State: {:?}", state); 181 | ``` 182 | 183 | ### Downloading Files 184 | 185 | You can download the results of the website: 186 | 187 | ```rust 188 | let url = "https://example.com"; 189 | let options = hashmap!{ 190 | "page" => 0, 191 | "limit" => 100, 192 | "expiresIn" => 3600 // Optional, add if needed 193 | }; 194 | let response = spider.create_signed_url(Some(url), Some(options)).await.expect("Failed to create signed URL"); 195 | println!("Download URL: {:?}", response); 196 | ``` 197 | 198 | ### Checking Available Credits 199 | 200 | You can check the remaining credits on your account: 201 | 202 | ```rust 203 | let credits = spider.get_credits().await.expect("Failed to get credits"); 204 | println!("Remaining Credits: {:?}", credits); 205 | ``` 206 | 207 | ### Data Operations 208 | 209 | The Spider client can now interact with specific data tables to create, retrieve, and delete data. 
210 | 211 | #### Retrieve Data from a Table 212 | 213 | To fetch data from a specified table by applying query parameters: 214 | 215 | ```rust 216 | let table_name = "pages"; 217 | let query_params = RequestParams { 218 | limit: Some(20), 219 | ..Default::default() 220 | }; 221 | let response = spider.data_get(table_name, Some(query_params)).await.expect("Failed to retrieve data from table"); 222 | println!("Data from table: {:?}", response); 223 | ``` 224 | 225 | #### Delete Data from a Table 226 | 227 | To delete data from a specified table based on certain conditions: 228 | 229 | ```rust 230 | let table_name = "websites"; 231 | let delete_params = RequestParams { 232 | domain: Some("www.example.com".to_string()), 233 | ..Default::default() 234 | }; 235 | let response = spider.data_delete(table_name, Some(delete_params)).await.expect("Failed to delete data from table"); 236 | println!("Delete Response: {:?}", response); 237 | ``` 238 | 239 | ## Streaming 240 | 241 | If you need to use streaming, set the `stream` parameter to `true` and provide a callback function: 242 | 243 | ```rust 244 | fn handle_json(json_obj: serde_json::Value) { 245 | println!("Received chunk: {:?}", json_obj); 246 | } 247 | 248 | let url = "https://example.com"; 249 | let crawler_params = RequestParams { 250 | limit: Some(1), 251 | proxy_enabled: Some(true), 252 | store_data: Some(false), 253 | metadata: Some(false), 254 | request: Some(RequestType::Http), 255 | ..Default::default() 256 | }; 257 | 258 | spider.links(url, Some(crawler_params), true, "application/json").await.expect("Failed to retrieve links from URL"); 259 | ``` 260 | 261 | ## Content-Type 262 | 263 | The following Content-type headers are supported using the `content_type` parameter: 264 | 265 | - `application/json` 266 | - `text/csv` 267 | - `application/xml` 268 | - `application/jsonl` 269 | 270 | ```rust 271 | let url = "https://example.com"; 272 | 273 | let crawler_params = RequestParams { 274 | limit: Some(1), 275 | proxy_enabled: Some(true), 276 | store_data: Some(false), 277 | metadata: Some(false), 278 | request: Some(RequestType::Http), 279 | ..Default::default() 280 | }; 281 | 282 | // Stream JSON lines back to the client 283 | spider.crawl_url(url, Some(crawler_params), true, "application/jsonl", None::).await.expect("Failed to crawl the URL"); 284 | ``` 285 | 286 | ## Error Handling 287 | 288 | The SDK handles errors returned by the Spider Cloud API and raises appropriate exceptions. If an error occurs during a request, it will be propagated to the caller with a descriptive error message. By default request use a Exponential Backoff to retry as needed. -------------------------------------------------------------------------------- /book/src/simple-example.md: -------------------------------------------------------------------------------- 1 | # Simple Example 2 | 3 | This is a simple example of what you can do with the `spider-client` library. 4 | 5 | ## Installation 6 | 7 | To install the library, you can use `pip` for Python or `npm` (make sure to have [node](https://nodejs.org/en) installed) for JavaScript.: 8 | 9 | ```bash 10 | # for python 11 | pip install spider-client 12 | ``` 13 | 14 | ```bash 15 | # for javascript 16 | npm install @spider-cloud/spider-client 17 | ``` 18 | 19 | ## Usage 20 | 21 | Here is an example of how you can use the library, make sure to replace `your_api_key` with your actual API key which you can get from the [spider.cloud](https://spider.cloud) website. 
22 | 23 | ```python 24 | from spider import Spider 25 | 26 | app = Spider(api_key='your_api_key') 27 | url = 'https://spider.cloud' 28 | scraped_data = app.scrape_url(url) 29 | ``` 30 | 31 | ```javascript 32 | import { Spider } from "@spider-cloud/spider-client"; 33 | 34 | const app = new Spider({ apiKey: "your-api-key" }); 35 | const url = "https://spider.cloud"; 36 | const scrapedData = await app.scrapeUrl(url); 37 | console.log(scrapedData); 38 | ``` 39 | -------------------------------------------------------------------------------- /book/src/website.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | The Website class is the foundations to the spider. 4 | 5 | ## Builder pattern 6 | 7 | We use the builder pattern to configure the website for crawling. 8 | 9 | \*note: Replace `https://choosealicense.com` from the examples below with your website target URL. 10 | 11 | ```py 12 | import asyncio 13 | from spider_rs import Website 14 | 15 | async def main(): 16 | website = Website("https://choosealicense.com") 17 | website.crawl() 18 | print(website.get_links()) 19 | 20 | asyncio.run(main()) 21 | ``` 22 | 23 | ### Custom Headers 24 | 25 | Add custom HTTP headers to use when crawling/scraping. 26 | 27 | ```py 28 | import asyncio 29 | from spider_rs import Website 30 | 31 | async def main(): 32 | website = Website("https://choosealicense.com").with_headers({ "authorization": "mytoken"}) 33 | 34 | asyncio.run(main()) 35 | ``` 36 | 37 | ### Blacklist 38 | 39 | Prevent crawling a set path, url, or pattern with Regex. 40 | 41 | ```py 42 | import asyncio 43 | from spider_rs import Website 44 | 45 | async def main(): 46 | website = Website("https://choosealicense.com").with_blacklist_url(["/blog", "/resume"]) 47 | 48 | asyncio.run(main()) 49 | ``` 50 | 51 | ### Whitelist 52 | 53 | Only crawl set paths, url, or pattern with Regex. 54 | 55 | ```py 56 | import asyncio 57 | from spider_rs import Website 58 | 59 | async def main(): 60 | website = Website("https://choosealicense.com").with_whitelist_url(["/licenses"]) 61 | 62 | asyncio.run(main()) 63 | ``` 64 | 65 | ### Crons 66 | 67 | Setup a cron job that can run at any time in the background using cron-syntax. 68 | 69 | ```py 70 | import asyncio 71 | from spider_rs import Website 72 | 73 | async def main(): 74 | website = Website("https://choosealicense.com").with_cron("1/5 * * * * *") 75 | 76 | asyncio.run(main()) 77 | ``` 78 | 79 | View the [cron](./cron-job.md) section for details how to use the cron. 80 | 81 | ### Budget 82 | 83 | Add a crawl budget that prevents crawling `x` amount of pages. 84 | 85 | ```py 86 | import asyncio 87 | from spider_rs import Website 88 | 89 | async def main(): 90 | website = Website("https://choosealicense.com").with_budget({ 91 | "*": 1, 92 | }) 93 | 94 | asyncio.run(main()) 95 | ``` 96 | 97 | ### Subdomains 98 | 99 | Include subdomains in request. 100 | 101 | ```py 102 | import asyncio 103 | from spider_rs import Website 104 | 105 | async def main(): 106 | website = Website("https://choosealicense.com").with_subdomains(True) 107 | 108 | asyncio.run(main()) 109 | ``` 110 | 111 | ### TLD 112 | 113 | Include TLDs in request. 114 | 115 | ```py 116 | import asyncio 117 | from spider_rs import Website 118 | 119 | async def main(): 120 | website = Website("https://choosealicense.com").with_tld(True) 121 | 122 | asyncio.run(main()) 123 | ``` 124 | 125 | ### External Domains 126 | 127 | Add external domains to include with the website. 
128 | 129 | ```py 130 | import asyncio 131 | from spider_rs import Website 132 | 133 | async def main(): 134 | website = Website("https://choosealicense.com").with_external_domains(["https://www.myotherdomain.com"]) 135 | 136 | asyncio.run(main()) 137 | ``` 138 | 139 | ### Proxy 140 | 141 | Use a proxy to crawl a website. 142 | 143 | ```py 144 | import asyncio 145 | from spider_rs import Website 146 | 147 | async def main(): 148 | website = Website("https://choosealicense.com").with_proxies(["https://www.myproxy.com"]) 149 | 150 | asyncio.run(main()) 151 | ``` 152 | 153 | ### Depth Limit 154 | 155 | Set the depth limit for the amount of forward pages. 156 | 157 | ```ts 158 | import asyncio 159 | from spider_rs import Website 160 | 161 | async def main(): 162 | website = Website("https://choosealicense.com").with_depth(3) 163 | 164 | asyncio.run(main()) 165 | ``` 166 | 167 | ### Cache 168 | 169 | Enable HTTP caching, this useful when using the spider on a server. 170 | 171 | ```py 172 | import asyncio 173 | from spider_rs import Website 174 | 175 | async def main(): 176 | website = Website("https://choosealicense.com").with_caching(True) 177 | 178 | asyncio.run(main()) 179 | ``` 180 | 181 | ### Delays 182 | 183 | Add delays between pages. Defaults to none. 184 | 185 | ```py 186 | import asyncio 187 | from spider_rs import Website 188 | 189 | async def main(): 190 | website = Website("https://choosealicense.com").with_delays(200) 191 | 192 | asyncio.run(main()) 193 | ``` 194 | 195 | ### User-Agent 196 | 197 | Use a custom User-Agent. 198 | 199 | ```py 200 | import asyncio 201 | from spider_rs import Website 202 | 203 | async def main(): 204 | website = Website("https://choosealicense.com").with_user_agent("mybot/v1") 205 | 206 | asyncio.run(main()) 207 | ``` 208 | 209 | ### Request Timeout 210 | 211 | Add a request timeout per page in miliseconds. Example shows 30 seconds. 212 | 213 | ```py 214 | import asyncio 215 | from spider_rs import Website 216 | 217 | async def main(): 218 | website = Website("https://choosealicense.com").with_request_timeout(30000) 219 | 220 | asyncio.run(main()) 221 | ``` 222 | 223 | ### Wait For Idle Network 224 | 225 | You can wait for the Network to become idle when using chrome. This helps load all the data from client side scripts. 226 | The first param is whether to enable or not and the second is the duration max timeout in milliseconds. 227 | 228 | ```py 229 | import asyncio 230 | from spider_rs import Website 231 | 232 | async def main(): 233 | website = Website("https://choosealicense.com").with_wait_for_idle_network(True, 12000) 234 | 235 | asyncio.run(main()) 236 | ``` 237 | 238 | ### Respect Robots 239 | 240 | Respect the robots.txt file. 241 | 242 | ```py 243 | import asyncio 244 | from spider_rs import Website 245 | 246 | async def main(): 247 | website = Website("https://choosealicense.com").with_respect_robots_txt(True) 248 | 249 | asyncio.run(main()) 250 | ``` 251 | 252 | ### Collect Full Resources 253 | 254 | Collect all resources found not just valid web pages. 255 | 256 | ```py 257 | import asyncio 258 | from spider_rs import Website 259 | 260 | async def main(): 261 | website = Website("https://choosealicense.com").with_full_resources(True) 262 | 263 | asyncio.run(main()) 264 | ``` 265 | 266 | ### OpenAI 267 | 268 | Use OpenAI to generate dynamic scripts to use with headless. Make sure to set the `OPENAI_API_KEY` env variable. 
269 | 270 | ```py 271 | import asyncio 272 | from spider_rs import Website 273 | 274 | async def main(): 275 | website = ( 276 | Website("https://google.com") 277 | .with_openai({ 278 | "model": "gpt-3.5-turbo", 279 | "prompt": "Search for movies", 280 | "maxTokens": 300 281 | }) 282 | ) 283 | 284 | asyncio.run(main()) 285 | ``` 286 | 287 | ### Screenshots 288 | 289 | Take a screenshot of the pages on crawl when using headless chrome. 290 | 291 | ```py 292 | import asyncio 293 | from spider_rs import Website 294 | 295 | async def main(): 296 | website = ( 297 | Website("https://choosealicense.com", False) 298 | .with_screenshot({ 299 | "params": { 300 | "cdp_params": None, 301 | "full_page": True, 302 | "omit_background": False 303 | }, 304 | "bytes": False, 305 | "save": True, 306 | "output_dir": None 307 | }) 308 | ) 309 | 310 | asyncio.run(main()) 311 | ``` 312 | 313 | ### Http2 Prior Knowledge 314 | 315 | Use http2 to connect if you know the website servers supports this. 316 | 317 | ```py 318 | import asyncio 319 | from spider_rs import Website 320 | 321 | async def main(): 322 | website = Website("https://choosealicense.com").with_http2_prior_knowledge(True) 323 | 324 | asyncio.run(main()) 325 | ``` 326 | 327 | ## Chaining 328 | 329 | You can chain all of the configs together for simple configuration. 330 | 331 | ```py 332 | import asyncio 333 | from spider_rs import Website 334 | 335 | async def main(): 336 | website = Website("https://choosealicense.com").with_subdomains(true).with_tlds(true).with_user_agent("mybot/v1").with_respect_robots_txt(true) 337 | 338 | asyncio.run(main()) 339 | ``` 340 | 341 | ## Raw Content 342 | 343 | Set the second param of the website constructor to `true` to return content without UTF-8. 344 | This will return `rawContent` and leave `content` when using subscriptions or the Page Object. 345 | 346 | ```py 347 | import asyncio 348 | from spider_rs import Website 349 | 350 | async def main(): 351 | website = Website("https://choosealicense.com", True) 352 | website.scrape_url() 353 | 354 | asyncio.run(main()) 355 | ``` 356 | 357 | ## Clearing Crawl Data 358 | 359 | Use `website.clear` to remove the links visited and page data or `website.drain_links` to drain the links visited. 360 | 361 | ```py 362 | import asyncio 363 | from spider_rs import Website 364 | 365 | async def main(): 366 | website = Website("https://choosealicense.com") 367 | website.crawl() 368 | print(website.getLinks()) 369 | website.clear() 370 | print(website.getLinks()) 371 | 372 | asyncio.run(main()) 373 | ``` 374 | 375 | ## Stop crawl 376 | 377 | To stop a crawl you can use `website.stopCrawl(id)`, pass in the crawl id to stop a run or leave empty for all crawls to stop. 
378 | 379 | ```py 380 | import asyncio 381 | from spider_rs import Website 382 | 383 | class Subscription: 384 | def __init__(self): 385 | print("Subscription Created...") 386 | def __call__(self, page): 387 | print(page.url + " - status: " + str(page.status_code)) 388 | 389 | async def main(): 390 | website = Website("https://choosealicense.com") 391 | website.crawl(Subscription()) 392 | # sleep for 2s and stop etc 393 | website.stop() 394 | 395 | asyncio.run(main()) 396 | ``` 397 | -------------------------------------------------------------------------------- /cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spider-cloud-cli" 3 | version = "0.1.36" 4 | edition = "2021" 5 | authors = [ "j-mendez "] 6 | description = "The Spider Cloud CLI for web crawling and scraping" 7 | license = "MIT" 8 | readme = "README.md" 9 | keywords = ["crawler", "web-crawler", "web-scraper", "spider", "web-indexer"] 10 | categories = ["web-programming"] 11 | include = ["src/*", "../../LICENSE", "README.md"] 12 | 13 | [dependencies] 14 | clap = { version = "4", features = ["derive"]} 15 | reqwest = { version = "0.12", features = ["json", "stream"] } 16 | tokio = { version = "1", features = ["rt-multi-thread", "macros"] } 17 | spider-client = { path = "../rust", version = "0.1" } 18 | serde = { version = "1", features = ["derive"] } 19 | serde_json = "1" 20 | keyring = { version = "3", features = ["apple-native", "windows-native", "sync-secret-service"] } 21 | -------------------------------------------------------------------------------- /cli/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud CLI 2 | 3 | Spider Cloud CLI is a command-line interface to interact with the [Spider Cloud](https://spider.cloud) web crawler. It allows you to scrape, crawl, search, and perform various other web-related tasks through simple commands. 4 | 5 | ## Installation 6 | 7 | Install the CLI using [`homebrew`](https://brew.sh/) or [`cargo`](https://doc.rust-lang.org/cargo/) from [crates.io](https://crates.io): 8 | 9 | ### Homebrew 10 | 11 | ```sh 12 | brew tap spider-rs/spider-cloud-cli 13 | brew install spider-cloud-cli 14 | ``` 15 | 16 | ### Cargo 17 | 18 | ```sh 19 | cargo install spider-cloud-cli 20 | ``` 21 | 22 | ## Usage 23 | 24 | After installing, you can use the CLI by typing `spider-cloud-cli` followed by a command and its respective arguments. 25 | 26 | ### Authentication 27 | 28 | Before using most of the commands, you need to authenticate by providing an API key: 29 | 30 | ```sh 31 | spider-cloud-cli auth --api_key YOUR_API_KEY 32 | ``` 33 | 34 | ### Commands 35 | 36 | #### Scrape 37 | 38 | Scrape data from a specified URL. 39 | 40 | ```sh 41 | spider-cloud-cli scrape --url http://example.com 42 | ``` 43 | 44 | #### Crawl 45 | 46 | Crawl a specified URL with an optional limit on the number of pages. 47 | 48 | ```sh 49 | spider-cloud-cli crawl --url http://example.com --limit 10 50 | ``` 51 | 52 | #### Links 53 | 54 | Fetch links from a specified URL. 55 | 56 | ```sh 57 | spider-cloud-cli links --url http://example.com 58 | ``` 59 | 60 | #### Screenshot 61 | 62 | Take a screenshot of a specified URL. 63 | 64 | ```sh 65 | spider-cloud-cli screenshot --url http://example.com 66 | ``` 67 | 68 | #### Search 69 | 70 | Search for a query. 71 | 72 | ```sh 73 | spider-cloud-cli search --query "example query" 74 | ``` 75 | 76 | #### Transform 77 | 78 | Transform specified data. 
79 | 80 | ```sh 81 | spider-cloud-cli transform --data "sample data" 82 | ``` 83 | 84 | #### Extract Contacts 85 | 86 | Extract contact information from a specified URL. 87 | 88 | ```sh 89 | spider-cloud-cli extract_contacts --url http://example.com 90 | ``` 91 | 92 | #### Label 93 | 94 | Label data from a specified URL. 95 | 96 | ```sh 97 | spider-cloud-cli label --url http://example.com 98 | ``` 99 | 100 | #### Get Crawl State 101 | 102 | Get the crawl state of a specified URL. 103 | 104 | ```sh 105 | spider-cloud-cli get_crawl_state --url http://example.com 106 | ``` 107 | 108 | #### Query 109 | 110 | Query records of a specified domain. 111 | 112 | ```sh 113 | spider-cloud-cli query --domain example.com 114 | ``` 115 | 116 | #### Get Credits 117 | 118 | Fetch the account credits left. 119 | 120 | ```sh 121 | spider-cloud-cli get_credits 122 | ``` 123 | 124 | ## License 125 | 126 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 127 | 128 | ## Contributing 129 | 130 | Issues and pull requests are welcome! Feel free to check the [issues page](https://github.com/spider-rs/spider-clients/issues) if you have any questions or suggestions. 131 | 132 | ## Acknowledgements 133 | 134 | Special thanks to the developers and contributors of the libraries and tools used in this project. 135 | -------------------------------------------------------------------------------- /cli/src/args.rs: -------------------------------------------------------------------------------- 1 | use clap::{Parser, Subcommand}; 2 | 3 | #[derive(Parser, Debug)] 4 | #[command(name = "Spider CLI")] 5 | #[command(version = "1.0")] 6 | #[command(about = "A CLI interface for the Spider web crawler")] 7 | pub struct Cli { 8 | #[command(subcommand)] 9 | pub command: Commands, 10 | } 11 | 12 | #[derive(Subcommand, Debug)] 13 | pub enum Commands { 14 | /// Scrape a given URL 15 | Scrape { 16 | #[arg(short, long, help = "The URL to scrape")] 17 | url: String, 18 | #[arg( 19 | short, 20 | long, 21 | help = "Returns the link(s) found on the page that match the crawler query.", 22 | required = false 23 | )] 24 | return_page_links: Option, 25 | }, 26 | /// Crawl a given URL with an optional page limit 27 | Crawl { 28 | #[arg(short, long, help = "The URL to crawl")] 29 | url: String, 30 | #[arg( 31 | short, 32 | long, 33 | help = "Limit the number of pages to crawl", 34 | required = false 35 | )] 36 | limit: Option, 37 | #[arg( 38 | short, 39 | long, 40 | help = "Returns the link(s) found on the page that match the crawler query.", 41 | required = false 42 | )] 43 | return_page_links: Option, 44 | }, 45 | /// Fetch all links from a given URL 46 | Links { 47 | #[arg(short, long, help = "The URL to fetch links from")] 48 | url: String, 49 | #[arg( 50 | short, 51 | long, 52 | help = "Limit the number of pages to crawl", 53 | required = false 54 | )] 55 | limit: Option, 56 | #[arg( 57 | short, 58 | long, 59 | help = "Returns the link(s) found on the page that match the crawler query.", 60 | required = false 61 | )] 62 | return_page_links: Option, 63 | }, 64 | /// Take a screenshot of a given URL 65 | Screenshot { 66 | #[arg(short, long, help = "The URL to take a screenshot of")] 67 | url: String, 68 | #[arg( 69 | short, 70 | long, 71 | help = "Limit the number of pages to crawl", 72 | required = false 73 | )] 74 | limit: Option, 75 | #[arg( 76 | short, 77 | long, 78 | help = "Returns the link(s) found on the page that match the crawler query.", 79 | required = false 80 | )] 81 | return_page_links: 
Option, 82 | }, 83 | /// Search using a given query 84 | Search { 85 | #[arg(short, long, help = "The query to search for")] 86 | query: String, 87 | #[arg( 88 | short, 89 | long, 90 | help = "Limit the number of pages to crawl", 91 | required = false 92 | )] 93 | limit: Option, 94 | #[arg( 95 | short, 96 | long, 97 | help = "Returns the link(s) found on the page that match the crawler query.", 98 | required = false 99 | )] 100 | return_page_links: Option, 101 | }, 102 | /// Transform the provided data 103 | Transform { 104 | #[arg(short, long, help = "The data to transform")] 105 | data: String, 106 | }, 107 | /// Extract leads from a given URL 108 | ExtractLeads { 109 | #[arg(short, long, help = "The URL to extract leads from")] 110 | url: String, 111 | #[arg( 112 | short, 113 | long, 114 | help = "Limit the number of pages to crawl", 115 | required = false 116 | )] 117 | limit: Option, 118 | }, 119 | /// Label data from a given URL 120 | Label { 121 | #[arg(short, long, help = "The URL to label data from")] 122 | url: String, 123 | #[arg( 124 | short, 125 | long, 126 | help = "Limit the number of pages to crawl", 127 | required = false 128 | )] 129 | limit: Option, 130 | }, 131 | /// Get the crawl state of a given URL 132 | GetCrawlState { 133 | #[arg(short, long, help = "The URL to get the crawl state of")] 134 | url: String, 135 | }, 136 | /// Query for a domain 137 | Query { 138 | #[arg(short, long, help = "The domain to query")] 139 | domain: String, 140 | }, 141 | /// Get the remaining credits 142 | GetCredits, 143 | /// Authenticate using an API key 144 | Auth { 145 | #[arg(short, long, help = "The API key to authenticate")] 146 | api_key: String, 147 | }, 148 | } 149 | -------------------------------------------------------------------------------- /cli/src/main.rs: -------------------------------------------------------------------------------- 1 | mod args; 2 | use args::{Cli, Commands}; 3 | use clap::Parser; 4 | use keyring::Entry; 5 | use serde_json::json; 6 | use spider_client::{QueryRequest, RequestParams, SearchRequestParams, Spider}; 7 | use std::collections::HashMap; 8 | use tokio; 9 | 10 | const SERVICE_NAME: &str = "spider_client"; 11 | const USERNAME: &str = "default"; 12 | 13 | #[tokio::main] 14 | async fn main() { 15 | let args = Cli::parse(); 16 | let entry = Entry::new(SERVICE_NAME, USERNAME); 17 | 18 | match entry { 19 | Ok(ent) => { 20 | match args.command { 21 | Commands::Auth { ref api_key } => match ent.set_password(&api_key.trim()) { 22 | Ok(_) => println!("API key saved successfully."), 23 | Err(e) => eprintln!("Failed to save API key: {:?}", e), 24 | }, 25 | _ => (), 26 | } 27 | 28 | match ent.get_password() { 29 | Ok(api_key) => { 30 | let spider = Spider::new(Some(api_key.clone())) 31 | .expect("Failed to initialize Spider client."); 32 | 33 | match args.command { 34 | Commands::Scrape { 35 | url, 36 | return_page_links, 37 | } => { 38 | println!("Scraping URL: {}", url); 39 | let mut params = RequestParams::default(); 40 | params.return_page_links = return_page_links; 41 | match spider 42 | .scrape_url(&url, Some(params), "application/json") 43 | .await 44 | { 45 | Ok(data) => println!("{}", json!(data)), 46 | Err(e) => eprintln!("Error scraping URL: {:?}", e), 47 | } 48 | } 49 | Commands::Crawl { 50 | url, 51 | limit, 52 | return_page_links, 53 | } => { 54 | println!("Crawling URL: {}", url); 55 | let mut params = RequestParams::default(); 56 | if let Some(limit) = limit { 57 | params.limit = Some(limit); 58 | } 59 | params.return_page_links = 
return_page_links; 60 | 61 | match spider 62 | .crawl_url( 63 | &url, 64 | Some(params), 65 | false, 66 | "application/json", 67 | None::, 68 | ) 69 | .await 70 | { 71 | Ok(data) => println!("{}", json!(data)), 72 | Err(e) => eprintln!("Error crawling URL: {:?}", e), 73 | } 74 | } 75 | Commands::Links { 76 | url, 77 | return_page_links, 78 | limit, 79 | } => { 80 | println!("Fetching links from URL: {}", url); 81 | let mut params = RequestParams::default(); 82 | if let Some(limit) = limit { 83 | params.limit = Some(limit); 84 | } 85 | params.return_page_links = return_page_links; 86 | 87 | match spider 88 | .links(&url, Some(params), false, "application/json") 89 | .await 90 | { 91 | Ok(data) => println!("{}", json!(data)), 92 | Err(e) => eprintln!("Error fetching links: {:?}", e), 93 | } 94 | } 95 | Commands::Screenshot { 96 | url, 97 | limit, 98 | return_page_links, 99 | } => { 100 | let mut params = RequestParams::default(); 101 | if let Some(limit) = limit { 102 | params.limit = Some(limit); 103 | } 104 | params.return_page_links = return_page_links; 105 | println!("Taking screenshot of URL: {}", url); 106 | match spider 107 | .screenshot(&url, Some(params), false, "application/json") 108 | .await 109 | { 110 | Ok(data) => println!("{}", json!(data)), 111 | Err(e) => eprintln!("Error taking screenshot: {:?}", e), 112 | } 113 | } 114 | Commands::Search { 115 | query, 116 | limit, 117 | return_page_links, 118 | } => { 119 | let mut params = SearchRequestParams::default(); 120 | if let Some(limit) = limit { 121 | params.base.limit = Some(limit); 122 | } 123 | params.base.return_page_links = return_page_links; 124 | println!("Searching for query: {}", query); 125 | match spider 126 | .search(&query, Some(params), false, "application/json") 127 | .await 128 | { 129 | Ok(data) => println!("{}", json!(data)), 130 | Err(e) => eprintln!("Error searching for query: {:?}", e), 131 | } 132 | } 133 | Commands::Transform { data } => { 134 | let data_vec = vec![HashMap::from([("content", data.as_str())])]; 135 | println!("Transforming data: {}", data); 136 | match spider 137 | .transform(data_vec, None, false, "application/json") 138 | .await 139 | { 140 | Ok(data) => println!("{}", json!(data)), 141 | Err(e) => eprintln!("Error transforming data: {:?}", e), 142 | } 143 | } 144 | Commands::ExtractLeads { url, limit } => { 145 | let mut params = RequestParams::default(); 146 | if let Some(limit) = limit { 147 | params.limit = Some(limit); 148 | } 149 | println!("Extracting leads from URL: {}", url); 150 | match spider 151 | .extract_contacts(&url, Some(params), false, "application/json") 152 | .await 153 | { 154 | Ok(data) => println!("{}", json!(data)), 155 | Err(e) => eprintln!("Error extracting leads: {:?}", e), 156 | } 157 | } 158 | Commands::Label { url, limit } => { 159 | let mut params = RequestParams::default(); 160 | if let Some(limit) = limit { 161 | params.limit = Some(limit); 162 | } 163 | println!("Labeling data from URL: {}", url); 164 | match spider 165 | .label(&url, Some(params), false, "application/json") 166 | .await 167 | { 168 | Ok(data) => println!("{}", json!(data)), 169 | Err(e) => eprintln!("Error labeling data: {:?}", e), 170 | } 171 | } 172 | Commands::GetCrawlState { url } => { 173 | println!("Getting crawl state of URL: {}", url); 174 | match spider.get_crawl_state(&url, None, "application/json").await { 175 | Ok(data) => println!("{}", json!(data)), 176 | Err(e) => eprintln!("Error getting crawl state: {:?}", e), 177 | } 178 | } 179 | Commands::Query { domain } => { 180 | 
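// Build a query for the provided domain and print the matching stored record (or an error) as JSON.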
let query = QueryRequest { 181 | domain: Some(domain.to_string()), 182 | ..Default::default() 183 | }; 184 | println!("Querying record for domain: {}", domain); 185 | match spider.query(&query).await { 186 | Ok(data) => println!("{}", json!(data)), 187 | Err(e) => eprintln!("Error querying record: {:?}", e), 188 | } 189 | } 190 | Commands::GetCredits => { 191 | println!("Fetching account credits left."); 192 | match spider.get_credits().await { 193 | Ok(data) => println!("{}", json!(data)), 194 | Err(e) => eprintln!("Error fetching credits: {:?}", e), 195 | } 196 | } 197 | _ => {} 198 | } 199 | } 200 | Err(_) => { 201 | eprintln!( 202 | "No API key found. Please authenticate first using the `auth` command." 203 | ); 204 | } 205 | } 206 | } 207 | _ => (), 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /cli/src/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod args; -------------------------------------------------------------------------------- /javascript/.npmignore: -------------------------------------------------------------------------------- 1 | src/ 2 | **/*.ts 3 | !**/*.d.ts 4 | tests/ 5 | __tests__/ 6 | *.spec.ts 7 | *.test.ts 8 | jest.config.js 9 | tsconfig.json 10 | tslint.json 11 | *.log 12 | *.tlog 13 | *.tmp 14 | *.temp 15 | .DS_Store 16 | Thumbs.db 17 | .idea/ 18 | .vscode/ 19 | *.swp 20 | *.swo 21 | node_modules/ 22 | dist/*.js.map 23 | dist/*.ts.map 24 | npm-debug.log* 25 | yarn-debug.log* 26 | yarn-error.log* 27 | .env 28 | .env.local 29 | coverage/ -------------------------------------------------------------------------------- /javascript/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /javascript/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud JavaScript SDK 2 | 3 | The Spider Cloud JavaScript SDK offers a streamlined set of tools for web scraping and crawling, with capabilities that allow for comprehensive data extraction suitable for interfacing with AI language models. This SDK makes it easy to interact programmatically with the Spider Cloud API from any JavaScript or Node.js application. 4 | 5 | ## Installation 6 | 7 | You can install the Spider Cloud JavaScript SDK via npm: 8 | 9 | ```bash 10 | npm install @spider-cloud/spider-client 11 | ``` 12 | 13 | Or with yarn: 14 | 15 | ```bash 16 | yarn add @spider-cloud/spider-client 17 | ``` 18 | 19 | ## Configuration 20 | 21 | Before using the SDK, you will need to provide it with your API key. Obtain an API key from [spider.cloud](https://spider.cloud) and either pass it directly to the constructor or set it as an environment variable `SPIDER_API_KEY`. 
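For example, both configuration styles can look like this (a minimal sketch; `YOUR_API_KEY` is a placeholder):

```javascript
import { Spider } from "@spider-cloud/spider-client";

// Option 1: pass the key directly to the constructor.
const appWithKey = new Spider({ apiKey: "YOUR_API_KEY" });

// Option 2: export SPIDER_API_KEY in the environment and construct without arguments.
const appFromEnv = new Spider();
```

If neither source provides a key, the constructor throws an error.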
22 | 23 | ## Usage 24 | 25 | Here's a basic example to demonstrate how to use the SDK: 26 | 27 | ```javascript 28 | import { Spider } from "@spider-cloud/spider-client"; 29 | 30 | // Initialize the SDK with your API key 31 | const app = new Spider({ apiKey: "YOUR_API_KEY" }); 32 | 33 | // Scrape a URL 34 | const url = "https://spider.cloud"; 35 | app 36 | .scrapeUrl(url) 37 | .then((data) => { 38 | console.log("Scraped Data:", data); 39 | }) 40 | .catch((error) => { 41 | console.error("Scrape Error:", error); 42 | }); 43 | 44 | // Crawl a website 45 | const crawlParams = { 46 | limit: 5, 47 | proxy_enabled: true, 48 | store_data: false, 49 | metadata: false, 50 | request: "http", 51 | }; 52 | app 53 | .crawlUrl(url, crawlParams) 54 | .then((result) => { 55 | console.log("Crawl Result:", result); 56 | }) 57 | .catch((error) => { 58 | console.error("Crawl Error:", error); 59 | }); 60 | ``` 61 | 62 | A real world crawl example streaming the response. 63 | 64 | ```javascript 65 | import { Spider } from "@spider-cloud/spider-client"; 66 | 67 | // Initialize the SDK with your API key 68 | const app = new Spider({ apiKey: "YOUR_API_KEY" }); 69 | 70 | // The target URL 71 | const url = "https://spider.cloud"; 72 | 73 | // Crawl a website 74 | const crawlParams = { 75 | limit: 5, 76 | store_data: false, 77 | metadata: true, 78 | request: "http", 79 | }; 80 | 81 | const stream = true; 82 | 83 | const streamCallback = (data) => { 84 | console.log(data["url"]); 85 | }; 86 | 87 | app.crawlUrl(url, crawlParams, stream, streamCallback); 88 | ``` 89 | 90 | ### Data Operations 91 | 92 | The Spider client can interact with specific data tables to create, retrieve, and delete data. 93 | 94 | #### Retrieve Data from a Table 95 | 96 | To fetch data from a specified table by applying query parameters, use the `getData` method. Provide the table name and an object containing query parameters: 97 | 98 | ```javascript 99 | const tableName = "pages"; 100 | const queryParams = { limit: 20 }; 101 | spider 102 | .getData(tableName, queryParams) 103 | .then((response) => console.log(response)) 104 | .catch((error) => console.error(error)); 105 | ``` 106 | 107 | This example retrieves data from the 'pages' table, limiting the results to 20 entries. 108 | 109 | #### Delete Data from a Table 110 | 111 | To delete data from a specified table based on certain conditions, use the `deleteData` method. Provide the table name and an object specifying the conditions for deletion: 112 | 113 | ```javascript 114 | const tableName = "websites"; 115 | const deleteParams = { domain: "www.example.com" }; 116 | spider 117 | .deleteData(tableName, deleteParams) 118 | .then((response) => console.log(response)) 119 | .catch((error) => console.error(error)); 120 | ``` 121 | 122 | #### Download storage data 123 | 124 | To download stored data like raw HTML or markdown use the `createSignedUrl` method. Provide the website name and an object containing query parameters: 125 | 126 | ```javascript 127 | const websiteName = "spider.cloud"; 128 | const queryParams = { limit: 20, page: 0 }; 129 | spider 130 | .createSignedUrl(websiteName, queryParams) 131 | .then((response) => console.log(response)) 132 | .catch((error) => console.error(error)); 133 | ``` 134 | 135 | ### Available Methods 136 | 137 | - **`scrapeUrl(url, params)`**: Scrape data from a specified URL. Optional parameters can be passed to customize the scraping behavior. 
138 | - **`crawlUrl(url, params, stream)`**: Begin crawling from a specific URL with optional parameters for customization and an optional streaming response. 139 | - **`search(q, params)`**: Perform a search and gather a list of websites to start crawling and collect resources. 140 | - **`links(url, params)`**: Retrieve all links from the specified URL with optional parameters. 141 | - **`screenshot(url, params)`**: Take a screenshot of the specified URL. 142 | - **`transform(data, params)`**: Perform a fast HTML transformation to markdown or text. 143 | - **`extractContacts(url, params)`**: Extract contact information from the specified URL. 144 | - **`label(url, params)`**: Apply labeling to data extracted from the specified URL. 145 | - **`getCrawlState(url, params)`**: Check the website crawl state. 146 | - **`getCredits()`**: Retrieve account's remaining credits. 147 | - **`getData(table, params)`**: Retrieve data records from the DB. 148 | - **`deleteData(table, params)`**: Delete records from the DB. 149 | - **`createSignedUrl(domain, params)`**: Download the records from the DB. 150 | 151 | ## Error Handling 152 | 153 | The SDK provides robust error handling and will throw exceptions when it encounters critical issues. Always use `.catch()` on promises to handle these errors gracefully. 154 | 155 | ## Contributing 156 | 157 | Contributions are always welcome! Feel free to open an issue or submit a pull request on our GitHub repository. 158 | 159 | ## License 160 | 161 | The Spider Cloud JavaScript SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 162 | -------------------------------------------------------------------------------- /javascript/__tests__/spiderwebai.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, test } from "node:test"; 2 | import assert from "node:assert"; 3 | import { Collection, Spider } from "../src"; 4 | import "dotenv/config"; 5 | import { GenericParams } from "../src/client"; 6 | 7 | describe("Spider JS SDK", () => { 8 | const url = "https://example.com"; 9 | const params: GenericParams = { 10 | limit: 1, 11 | return_format: "markdown", 12 | depth: 2, 13 | cache: true, 14 | }; 15 | 16 | test("should throw error if API key is not provided", () => { 17 | if (!process.env.SPIDER_API_KEY) { 18 | assert.throws(() => new Spider({ apiKey: null })); 19 | } else { 20 | assert.doesNotThrow(() => new Spider({ apiKey: null })); 21 | } 22 | }); 23 | 24 | test("should scrape url with data", async () => { 25 | const spiderClient = new Spider(); 26 | const spiderData = await spiderClient.scrapeUrl(url, params); 27 | 28 | assert(Array.isArray(spiderData)); 29 | assert(spiderData.length > 0); 30 | assert(spiderData[0].content); 31 | assert(spiderData[0].error !== undefined); 32 | assert(spiderData[0].status); 33 | assert(spiderData[0].url); 34 | }); 35 | 36 | test("should crawl url with data", async () => { 37 | const spiderClient = new Spider(); 38 | const spiderData = await spiderClient.crawlUrl(url, params); 39 | 40 | assert(Array.isArray(spiderData)); 41 | assert(spiderData.length > 0); 42 | assert(spiderData[0].content); 43 | assert(spiderData[0].error !== undefined); 44 | assert(spiderData[0].status); 45 | assert(spiderData[0].url); 46 | }); 47 | 48 | test("should crawl url with data streaming", async () => { 49 | const spiderClient = new Spider(); 50 | 51 | const cb = (spiderData: any) => { 52 | assert(spiderData.content); 53 | assert(spiderData.status); 54 | 
assert(spiderData.url); 55 | }; 56 | 57 | await spiderClient.crawlUrl(url, params, true, cb); 58 | }); 59 | 60 | test("should get links", async () => { 61 | const spiderClient = new Spider(); 62 | const linksData = await spiderClient.links(url, params); 63 | 64 | assert(Array.isArray(linksData)); 65 | assert(linksData.length > 0); 66 | assert(linksData[0].error !== undefined); 67 | assert(linksData[0].status); 68 | assert(linksData[0].url); 69 | }); 70 | 71 | test("should take screenshot", async () => { 72 | const spiderClient = new Spider(); 73 | const screenshotData = await spiderClient.screenshot(url, { limit: 1 }); 74 | 75 | assert(Array.isArray(screenshotData)); 76 | }); 77 | 78 | test.skip("should perform search", async () => { 79 | const spiderClient = new Spider(); 80 | const searchData = await spiderClient.search( 81 | "example search query", 82 | params 83 | ); 84 | 85 | assert(Array.isArray(searchData)); 86 | assert(searchData.length > 0); 87 | assert(searchData[0].content); 88 | assert(searchData[0].error !== undefined); 89 | assert(searchData[0].status); 90 | assert(searchData[0].url); 91 | }); 92 | 93 | test.skip("should transform data", async () => { 94 | const spiderClient = new Spider(); 95 | const transformData = [ 96 | { html: "Example", url: url }, 97 | ]; 98 | const transformedData = await spiderClient.transform(transformData, params); 99 | 100 | assert(typeof transformedData === "object"); 101 | assert(transformedData.content); 102 | assert(transformedData.error !== undefined); 103 | assert(transformedData.status); 104 | }); 105 | 106 | test("should extract contacts", async () => { 107 | const spiderClient = new Spider(); 108 | const contactsData = await spiderClient.extractContacts(url, params); 109 | 110 | assert(Array.isArray(contactsData)); 111 | assert(contactsData.length > 0); 112 | assert(contactsData[0].content); 113 | assert(contactsData[0].error !== undefined); 114 | assert(contactsData[0].status); 115 | assert(contactsData[0].url); 116 | }); 117 | 118 | test("should label data", async () => { 119 | const spiderClient = new Spider(); 120 | const labelData = await spiderClient.label(url, params); 121 | 122 | assert(Array.isArray(labelData)); 123 | assert(labelData.length > 0); 124 | assert(labelData[0].content); 125 | assert(labelData[0].error !== undefined); 126 | assert(labelData[0].status); 127 | assert(labelData[0].url); 128 | }); 129 | 130 | test("should get crawl state", async () => { 131 | const spiderClient = new Spider(); 132 | const crawlState = await spiderClient.getCrawlState(url, params); 133 | 134 | assert(typeof crawlState === "object"); 135 | assert(Array.isArray(crawlState.data)); 136 | }); 137 | 138 | test.skip("should query global db", async () => { 139 | const spiderClient = new Spider(); 140 | const crawlState = await spiderClient.query({ domain: "spider.cloud" }); 141 | 142 | assert(typeof crawlState === "object"); 143 | assert(crawlState.content); 144 | }); 145 | 146 | test("should download the file", async () => { 147 | const spiderClient = new Spider(); 148 | const { data } = await spiderClient.getData(Collection.Pages, { 149 | domain: "example.com", 150 | limit: 1, 151 | }); 152 | 153 | // the file might be deleted before hand. we need to not delete the file being used throughout test. 154 | const text = data.length 155 | ? 
await spiderClient.download({ url: data[0].url }, "text") 156 | : ""; 157 | 158 | assert(typeof text === "string"); 159 | }); 160 | 161 | test("should get credits", async () => { 162 | const spiderClient = new Spider(); 163 | const credits = await spiderClient.getCredits(); 164 | 165 | assert(typeof credits === "object"); 166 | }); 167 | 168 | test("should post data", async () => { 169 | const spiderClient = new Spider(); 170 | const postData = { url: url }; 171 | const response = await spiderClient.postData(Collection.Websites, postData); 172 | assert([200, 201].includes(response.status)); 173 | }); 174 | 175 | test("should get data", async () => { 176 | const spiderClient = new Spider(); 177 | const response = await spiderClient.getData(Collection.Websites, params); 178 | 179 | assert(typeof response === "object"); 180 | assert(Array.isArray(response.data)); 181 | }); 182 | 183 | test("should delete data", async () => { 184 | const spiderClient = new Spider(); 185 | const response = await spiderClient.deleteData(Collection.Websites, params); 186 | 187 | assert(response.status >= 200 && response.status <= 299); 188 | }); 189 | 190 | test("should create signed url", async () => { 191 | const spiderClient = new Spider(); 192 | const { fileName, signedUrl } = await spiderClient.createSignedUrl( 193 | "example.com" 194 | ); 195 | 196 | assert(typeof signedUrl === "string"); 197 | assert(typeof fileName === "string"); 198 | }); 199 | }); 200 | -------------------------------------------------------------------------------- /javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-cloud/spider-client", 3 | "version": "0.1.36", 4 | "description": "Isomorphic Javascript SDK for Spider Cloud services", 5 | "scripts": { 6 | "test": "node --import tsx --test __tests__/*test.ts", 7 | "build": "tsc", 8 | "prepublishOnly": "npm test && npm run build" 9 | }, 10 | "main": "dist/index.js", 11 | "types": "dist/client.d.ts", 12 | "files": [ 13 | "dist/**/*" 14 | ], 15 | "keywords": [ 16 | "spider", 17 | "sdk", 18 | "web crawling", 19 | "web scraping", 20 | "api", 21 | "llm scraping" 22 | ], 23 | "author": "Jeff Mendez", 24 | "license": "MIT", 25 | "devDependencies": { 26 | "@types/node": "22.10.7", 27 | "dotenv": "^16.4.7", 28 | "tsx": "^4.19.2", 29 | "typescript": "5.7.3" 30 | }, 31 | "dependencies": { 32 | "exponential-backoff": "^3.1.1" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /javascript/sample.env: -------------------------------------------------------------------------------- 1 | SPIDER_API_KEY= -------------------------------------------------------------------------------- /javascript/src/client.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ChunkCallbackFunction, 3 | Collection, 4 | QueryRequest, 5 | SpiderCoreResponse, 6 | SpiderParams, 7 | APISchema, 8 | APIRoutes, 9 | ApiVersion, 10 | } from "./config"; 11 | import { version } from "../package.json"; 12 | import { streamReader } from "./utils/stream-reader"; 13 | import { backOff } from "exponential-backoff"; 14 | 15 | /** 16 | * Generic params for core request. 17 | */ 18 | export type GenericParams = Omit; 19 | 20 | /** 21 | * Configuration interface for Spider. 22 | */ 23 | export interface SpiderConfig { 24 | apiKey?: string | null; 25 | } 26 | 27 | /** 28 | * A class to interact with the Spider API. 
29 | */ 30 | export class Spider { 31 | private apiKey?: string; 32 | 33 | /** 34 | * Create an instance of Spider. 35 | * @param {string | null} apiKey - The API key used to authenticate to the Spider API. If null, attempts to source from environment variables. 36 | * @throws Will throw an error if the API key is not provided. 37 | */ 38 | constructor(props?: SpiderConfig) { 39 | this.apiKey = props?.apiKey || process?.env?.SPIDER_API_KEY; 40 | 41 | if (!this.apiKey) { 42 | throw new Error("No API key provided"); 43 | } 44 | } 45 | 46 | /** 47 | * Internal method to handle POST requests. 48 | * @param {string} endpoint - The API endpoint to which the POST request should be sent. 49 | * @param {Record} data - The JSON data to be sent in the request body. 50 | * @param {boolean} [stream=false] - Whether to stream the response back without parsing. 51 | * @returns {Promise} The response in JSON if not streamed, or the Response object if streamed. 52 | */ 53 | private async _apiPost( 54 | endpoint: string, 55 | data: Record, 56 | stream?: boolean, 57 | jsonl?: boolean 58 | ) { 59 | const headers = jsonl ? this.prepareHeadersJsonL : this.prepareHeaders; 60 | const response = await backOff( 61 | () => 62 | fetch(`${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, { 63 | method: "POST", 64 | headers: headers, 65 | body: JSON.stringify(data), 66 | }), 67 | { 68 | numOfAttempts: 5, 69 | } 70 | ); 71 | 72 | if (!stream) { 73 | if (response.ok) { 74 | return response.json(); 75 | } else { 76 | this.handleError(response, `post to ${endpoint}`); 77 | } 78 | } 79 | return response; 80 | } 81 | 82 | /** 83 | * Internal method to handle GET requests. 84 | * @param {string} endpoint - The API endpoint from which data should be retrieved. 85 | * @returns {Promise} The data returned from the endpoint in JSON format. 86 | */ 87 | private async _apiGet(endpoint: string) { 88 | const headers = this.prepareHeaders; 89 | const response = await backOff( 90 | () => 91 | fetch(`${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, { 92 | method: "GET", 93 | headers: headers, 94 | }), 95 | { 96 | numOfAttempts: 5, 97 | } 98 | ); 99 | 100 | if (response.ok) { 101 | return response.json(); 102 | } else { 103 | this.handleError(response, `get from ${endpoint}`); 104 | } 105 | } 106 | 107 | /** 108 | * Internal method to handle DELETE requests. 109 | * @param {string} endpoint - The API endpoint from which data should be retrieved. 110 | * @returns {Promise} The data returned from the endpoint in JSON format. 111 | */ 112 | private async _apiDelete(endpoint: string) { 113 | const headers = this.prepareHeaders; 114 | const response = await backOff( 115 | () => 116 | fetch(`${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, { 117 | method: "DELETE", 118 | headers, 119 | body: JSON.stringify({}) 120 | }), 121 | { 122 | numOfAttempts: 5, 123 | } 124 | ); 125 | 126 | if (response.ok) { 127 | return response; 128 | } else { 129 | return this.handleError(response, `delete from ${endpoint}`); 130 | } 131 | } 132 | 133 | /** 134 | * Scrapes data from a specified URL. 135 | * @param {string} url - The URL to scrape. 136 | * @param {GenericParams} [params={}] - Additional parameters for the scraping request. 137 | * @returns {Promise} The scraped data from the URL. 138 | */ 139 | async scrapeUrl(url: string, params: GenericParams = {}) { 140 | return this._apiPost(APIRoutes.Crawl, { url: url, limit: 1, ...params }); 141 | } 142 | 143 | /** 144 | * Initiates a crawling job starting from the specified URL. 
145 | * @param {string} url - The URL to start crawling. 146 | * @param {GenericParams} [params={}] - Additional parameters for the crawl. 147 | * @param {boolean} [stream=false] - Whether to receive the response as a stream. 148 | * @param {function} [callback=function] - The callback function when streaming per chunk. If this is set with stream you will not get a end response. 149 | * @returns {Promise} The result of the crawl, either structured data or a Response object if streaming. 150 | */ 151 | async crawlUrl( 152 | url: string, 153 | params: GenericParams = {}, 154 | stream = false, 155 | cb?: ChunkCallbackFunction 156 | ): Promise { 157 | const jsonl = stream && cb; 158 | const res = await this._apiPost( 159 | APIRoutes.Crawl, 160 | { url, ...params }, 161 | stream, 162 | !!jsonl 163 | ); 164 | 165 | if (jsonl) { 166 | return await streamReader(res, cb); 167 | } 168 | 169 | return res; 170 | } 171 | 172 | /** 173 | * Retrieves all links from the specified URL. 174 | * @param {string} url - The URL from which to gather links. 175 | * @param {GenericParams} [params={}] - Additional parameters for the crawl. 176 | * @param {boolean} [stream=false] - Whether to receive the response as a stream. 177 | * @param {function} [callback=function] - The callback function when streaming per chunk. If this is set with stream you will not get a end response. 178 | * @returns {Promise} The result of the crawl, either structured data or a Response object if streaming. 179 | */ 180 | async links( 181 | url: string, 182 | params: GenericParams = {}, 183 | stream = false, 184 | cb?: ChunkCallbackFunction 185 | ): Promise { 186 | const jsonl = stream && cb; 187 | const res = await this._apiPost( 188 | APIRoutes.Links, 189 | { url, ...params }, 190 | stream, 191 | !!jsonl 192 | ); 193 | 194 | if (jsonl) { 195 | return await streamReader(res, cb); 196 | } 197 | 198 | return res; 199 | } 200 | 201 | /** 202 | * Takes a screenshot of the website starting from this URL. 203 | * @param {string} url - The URL to start the screenshot. 204 | * @param {GenericParams} [params={}] - Configuration parameters for the screenshot. 205 | * @returns {Promise} The screenshot data. 206 | */ 207 | async screenshot(url: string, params: GenericParams = {}) { 208 | return this._apiPost(APIRoutes.Screenshot, { url: url, ...params }); 209 | } 210 | 211 | /** 212 | * Perform a search and gather a list of websites to start crawling and collect resources. 213 | * @param {string} search - The search query. 214 | * @param {GenericParams} [params={}] - Configuration parameters for the search. 215 | * @returns {Promise} The result of the crawl, either structured data or a Response object if streaming. 216 | */ 217 | async search(q: string, params: GenericParams = {}) { 218 | return this._apiPost(APIRoutes.Search, { search: q, ...params }); 219 | } 220 | 221 | /** 222 | * Transform HTML to Markdown or text. You can send up to 10MB of data at once. 223 | * @param {object} data - The data to trasnform, a list of objects with the key 'html' and optional 'url' key for readability. 224 | * @param {object} [params={}] - Configuration parameters for the transformation. 225 | * @returns {Promise} The transformation result. 226 | */ 227 | async transform(data: { html: string; url?: string }[], params = {}) { 228 | return this._apiPost(APIRoutes.Transform, { data, ...params }); 229 | } 230 | 231 | /** 232 | * Extracts leads from a website. 233 | * @param {string} url - The URL from which to extract contacts. 
234 | * @param {GenericParams} [params={}] - Configuration parameters for the extraction. 235 | * @returns {Promise} The contact information extracted. 236 | */ 237 | async extractContacts(url: string, params: GenericParams = {}) { 238 | return this._apiPost(APIRoutes.PiplineExtractLeads, { 239 | url: url, 240 | ...params, 241 | }); 242 | } 243 | 244 | /** 245 | * Applies labeling to data extracted from a specified URL. 246 | * @param {string} url - The URL to label. 247 | * @param {GenericParams} [params={}] - Configuration parameters for labeling. 248 | * @returns {Promise} The labeled data. 249 | */ 250 | async label(url: string, params: GenericParams = {}) { 251 | return this._apiPost(APIRoutes.PiplineLabel, { url: url, ...params }); 252 | } 253 | 254 | /** 255 | * Check the crawl state of the website. 256 | * @param {string} url - The URL to check. 257 | * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query. 258 | * @returns {Promise} The crawl state data. 259 | */ 260 | async getCrawlState(url: string, params: GenericParams = {}) { 261 | return this._apiPost(APIRoutes.DataCrawlState, { url: url, ...params }); 262 | } 263 | 264 | /** 265 | * Create a signed url to download files from the storage. 266 | * @param {string} [domain] - The domain for the user's storage. If not provided, downloads all files. 267 | * @param {Object} [options] - The download options. 268 | * @param {boolean} [raw] - Return the raw response. 269 | 270 | * @returns {Promise} The response containing the file stream. 271 | */ 272 | async createSignedUrl( 273 | url?: string, 274 | options?: { 275 | page?: number; 276 | limit?: number; 277 | expiresIn?: number; 278 | // optional if you do not know the url put the domain and path. 279 | domain?: string; 280 | pathname?: string; 281 | } 282 | ): Promise { 283 | const { page, limit, expiresIn, domain, pathname } = options ?? {}; 284 | 285 | const params = new URLSearchParams({ 286 | ...(url && { url }), 287 | ...(domain && { domain }), 288 | ...(pathname && { pathname }), 289 | ...(page && { page: page.toString() }), 290 | ...(limit && { limit: limit.toString() }), 291 | ...(expiresIn && { expiresIn: expiresIn.toString() }), 292 | }); 293 | const endpoint = `${APISchema["url"]}/${ 294 | APIRoutes.DataSignUrl 295 | }?${params.toString()}`; 296 | const headers = this.prepareHeaders; 297 | 298 | const response = await fetch(endpoint, { 299 | method: "GET", 300 | headers, 301 | }); 302 | 303 | if (response.ok) { 304 | return await response.json(); 305 | } else { 306 | this.handleError(response, `Failed to sign files`); 307 | } 308 | } 309 | 310 | /** 311 | * Retrieves the number of credits available on the account. 312 | * @returns {Promise} The current credit balance. 313 | */ 314 | async getCredits() { 315 | return this._apiGet(APIRoutes.DataCredits); 316 | } 317 | 318 | /** 319 | * Send a POST request to insert data into a specified table. 320 | * @param {string} table - The table name in the database. 321 | * @param {object} data - The data to be inserted. 322 | * @returns {Promise} The response from the server. 323 | */ 324 | async postData( 325 | collection: Collection, 326 | data: GenericParams | Record 327 | ): Promise { 328 | return this._apiPost(`${APIRoutes.Data}/${collection}`, data); 329 | } 330 | 331 | /** 332 | * Send a GET request to retrieve data from a specified table. 333 | * @param {Collection} table - The table name in the database. 
334 | * @param {object} params - The query parameters for data retrieval. 335 | * @returns {Promise} The response from the server. 336 | */ 337 | async getData( 338 | collections: Collection, 339 | params: GenericParams | Record 340 | ): Promise { 341 | return this._apiGet( 342 | `${APIRoutes.Data}/${collections}?${new URLSearchParams( 343 | params as any 344 | ).toString()}` 345 | ); 346 | } 347 | 348 | /** 349 | * Download a record. The url is the path of the storage hash returned and not the exact website url. 350 | * @param {QueryRequest} params - The query parameters for data retrieval. 351 | * @returns {Promise} The download response from the server. 352 | */ 353 | async download(query: QueryRequest, output?: "text" | "blob"): Promise { 354 | const headers = this.prepareHeaders; 355 | const endpoint = `${APIRoutes.DataDownload}?${new URLSearchParams( 356 | query as Record 357 | ).toString()}`; 358 | const response = await fetch( 359 | `${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, 360 | { 361 | method: "GET", 362 | headers, 363 | } 364 | ); 365 | 366 | if (response.ok) { 367 | if (output === "text") { 368 | return await response.text(); 369 | } 370 | return await response.blob(); 371 | } else { 372 | this.handleError(response, `get from ${endpoint}`); 373 | } 374 | } 375 | 376 | /** 377 | * Perform a query to get a document. 378 | * @param {QueryRequest} params - The query parameters for data retrieval. 379 | * @returns {Promise} The response from the server. 380 | */ 381 | async query(query: QueryRequest): Promise { 382 | return this._apiGet( 383 | `${APIRoutes.DataQuery}?${new URLSearchParams( 384 | query as Record 385 | ).toString()}` 386 | ); 387 | } 388 | 389 | /** 390 | * Send a DELETE request to remove data from a specified table. 391 | * @param {Collection} table - The table name in the database. 392 | * @param {object} params - Parameters to identify records to delete. 393 | * @returns {Promise} The response from the server. 394 | */ 395 | async deleteData( 396 | collection: Collection, 397 | params: GenericParams | Record 398 | ): Promise { 399 | return this._apiDelete( 400 | `${APIRoutes.Data}/${collection}?${new URLSearchParams( 401 | params as any 402 | ).toString()}` 403 | ); 404 | } 405 | 406 | /** 407 | * Prepares common headers for each API request. 408 | * @returns {HeadersInit} A headers object for fetch requests. 409 | */ 410 | get prepareHeaders() { 411 | return { 412 | "Content-Type": "application/json", 413 | Authorization: `Bearer ${this.apiKey}`, 414 | "User-Agent": `Spider-Client/${version}`, 415 | }; 416 | } 417 | 418 | /** 419 | * Prepares common headers for each API request with JSONl content-type suitable for streaming. 420 | * @returns {HeadersInit} A headers object for fetch requests. 421 | */ 422 | get prepareHeadersJsonL() { 423 | return { 424 | ...this.prepareHeaders, 425 | "Content-Type": "application/jsonl", 426 | }; 427 | } 428 | 429 | /** 430 | * Handles errors from API requests. 431 | * @param {Response} response - The fetch response object. 432 | * @param {string} action - Description of the attempted action. 433 | * @throws Will throw an error with detailed status information. 434 | */ 435 | handleError(response: Response, action: string) { 436 | throw new Error(`Failed to ${action}. 
Status code: ${response.status}.`); 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /javascript/src/config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Represents viewport dimensions. 3 | */ 4 | export interface Viewport { 5 | width: number; 6 | height: number; 7 | } 8 | 9 | /** 10 | * Represents HTTP headers as a dictionary object. 11 | */ 12 | export interface Headers { 13 | [key: string]: string; 14 | } 15 | 16 | /** 17 | * Represents a budget for various resources. 18 | */ 19 | export interface Budget { 20 | [key: string]: number; 21 | } 22 | 23 | /** 24 | * The chunking algorithm to use. 25 | */ 26 | export type ChunkingAlgType = 27 | | "ByWords" 28 | | "ByLines" 29 | | "ByCharacterLength" 30 | | "BySentence"; 31 | 32 | /** 33 | * The chunking algorithm with the value to chunk by. 34 | */ 35 | export interface ChunkingAlg { 36 | type: ChunkingAlgType; 37 | value: number; 38 | } 39 | 40 | /** 41 | * Represents a timeout configuration. 42 | * @typedef {Object} Timeout 43 | * @property {number} secs - The number of seconds. 44 | * @property {number} nanos - The number of nanoseconds. 45 | */ 46 | interface Timeout { 47 | secs: number; 48 | nanos: number; 49 | } 50 | 51 | /** 52 | * Represents the webhook configuration. 53 | * @typedef {Object} WebhookSettings 54 | * @property {Object} object - The webhook configuration. 55 | */ 56 | interface WebhookSettings { 57 | /** 58 | * The URL or endpoint where the webhook information will be sent. 59 | */ 60 | destination: string; 61 | /** 62 | * Flag to indicate an action should be taken when all credits are depleted. 63 | */ 64 | on_credits_depleted: boolean; 65 | /** 66 | * Flag to indicate an action should be taken when half of the credits are depleted. 67 | */ 68 | on_credits_half_depleted: boolean; 69 | /** 70 | * Flag to trigger a notification on a website status update event. 71 | */ 72 | on_website_status: boolean; 73 | /** 74 | * Flag to send information about a new page find, such as links and data size. 75 | */ 76 | on_find: boolean; 77 | /** 78 | * Flag to handle the metadata of a new page that has been found. 79 | */ 80 | on_find_metadata: boolean; 81 | } 82 | 83 | /** 84 | * Represents the idle network configuration. 85 | * @typedef {Object} IdleNetwork 86 | * @property {Timeout} timeout - The timeout configuration. 87 | */ 88 | interface IdleNetwork { 89 | timeout: Timeout; 90 | } 91 | 92 | /** 93 | * Represents the selector configuration. 94 | * @typedef {Object} Selector 95 | * @property {Timeout} timeout - The timeout configuration. 96 | * @property {string} selector - The CSS selector to wait for. 97 | */ 98 | interface Selector { 99 | timeout: Timeout; 100 | selector: string; 101 | } 102 | 103 | /** 104 | * Represents the delay configuration. 105 | * @typedef {Object} Delay 106 | * @property {Timeout} timeout - The timeout configuration. 107 | */ 108 | interface Delay { 109 | timeout: Timeout; 110 | } 111 | 112 | /** 113 | * Represents the wait_for configuration. 114 | * @typedef {Object} WaitFor 115 | * @property {IdleNetwork} [idle_network] - Configuration to wait for network to be idle. 116 | * @property {Selector} [selector] - Configuration to wait for a CSS selector. 117 | * @property {Delay} [delay] - Configuration to wait for a delay. 118 | * @property {boolean} [page_navigations] - Whether to wait for page navigations. 
119 | */ 120 | interface WaitFor { 121 | idle_network?: IdleNetwork; 122 | selector?: Selector; 123 | delay?: Delay; 124 | page_navigations?: boolean; 125 | } 126 | 127 | /** 128 | * Represents the query API endpoint request to get documents from the global spider collection. 129 | */ 130 | export interface QueryRequest { 131 | /** 132 | * The exact URL to get. 133 | */ 134 | url?: string; 135 | /** 136 | * The domain to get a document from. 137 | */ 138 | domain?: string; 139 | /** 140 | * The path of the webpage to get the document. This is used with the domain key. 141 | */ 142 | pathname?: string; 143 | } 144 | 145 | // Define the CSSSelector type 146 | type CSSSelector = { 147 | // The name of the selector group 148 | name: string; 149 | // An array of CSS selectors 150 | selectors: string[]; 151 | }; 152 | 153 | // Define the CSSExtractionMap type 154 | type CSSExtractionMap = { 155 | // The map keys are strings (paths), and the values are arrays of CSSSelector objects 156 | [path: string]: CSSSelector[]; 157 | }; 158 | 159 | // Web automation using chrome 160 | export type WebAutomation = 161 | | { Evaluate: string } 162 | | { Click: string } 163 | | { Wait: number } 164 | | { WaitForNavigation: boolean } 165 | | { WaitFor: string } 166 | | { WaitForAndClick: string } 167 | | { ScrollX: number } 168 | | { ScrollY: number } 169 | | { Fill: { selector: string; value?: string } } 170 | | { InfiniteScroll: number }; 171 | 172 | export type ReturnFormat = 173 | | "markdown" 174 | | "commonmark" 175 | | "raw" 176 | | "text" 177 | | "html2text" 178 | | "bytes" 179 | | "xml" 180 | | "empty"; 181 | 182 | // Map automation scripts for paths or urls. 183 | export type WebAutomationMap = Record; 184 | // Map execution scripts for paths or urls. 185 | export type ExecutionScriptsMap = Record; 186 | 187 | // The HTTP redirect policy to use. Loose allows all domains and Strict only allows relative requests to the domain. 188 | export enum RedirectPolicy { 189 | Loose = "Loose", 190 | Strict = "Strict", 191 | } 192 | 193 | /** 194 | * Represents the options available for making a spider request. 195 | */ 196 | export interface SpiderParams { 197 | /** 198 | * The URL to be crawled. 199 | */ 200 | url: string; 201 | 202 | /** 203 | * The type of request to be made. 204 | */ 205 | request?: "http" | "chrome" | "smart"; 206 | 207 | /** 208 | * The maximum number of pages the crawler should visit. 209 | */ 210 | limit?: number; 211 | 212 | /** 213 | * The format in which the result should be returned. When setting the return format as an array a object is returned mapping by the name. 214 | */ 215 | return_format?: ReturnFormat | ReturnFormat[]; 216 | 217 | /** 218 | * Specifies whether to only visit the top-level domain. 219 | */ 220 | tld?: boolean; 221 | 222 | /** 223 | * The depth of the crawl. 224 | */ 225 | depth?: number; 226 | 227 | /** 228 | * Specifies whether the request should be cached. 229 | */ 230 | cache?: boolean; 231 | 232 | /** 233 | * The budget for various resources. 234 | */ 235 | budget?: Budget; 236 | 237 | /** 238 | * The blacklist routes to ignore. This can be a Regex string pattern. 239 | */ 240 | blacklist?: string[]; 241 | 242 | /** 243 | * The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing. 244 | */ 245 | whitelist?: string[]; 246 | 247 | /** 248 | * The locale to be used during the crawl. 249 | */ 250 | locale?: string; 251 | 252 | /** 253 | * The cookies to be set for the request, formatted as a single string. 
254 | */ 255 | cookies?: string; 256 | 257 | /** 258 | * Specifies whether to use stealth techniques to avoid detection. 259 | */ 260 | stealth?: boolean; 261 | 262 | /** 263 | * The headers to be used for the request. 264 | */ 265 | headers?: Headers; 266 | 267 | /** 268 | * Specifies whether anti-bot measures should be used. 269 | */ 270 | anti_bot?: boolean; 271 | 272 | /** 273 | * Specifies whether to include metadata in the response. 274 | */ 275 | metadata?: boolean; 276 | 277 | /** 278 | * Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page. 279 | */ 280 | css_extraction_map?: CSSExtractionMap; 281 | 282 | /** 283 | * The dimensions of the viewport. 284 | */ 285 | viewport?: Viewport; 286 | 287 | /** 288 | * The encoding to be used for the request. 289 | */ 290 | encoding?: "UTF-8" | "SHIFT_JIS" | string; 291 | 292 | /** 293 | * Specifies whether to include subdomains in the crawl. 294 | */ 295 | subdomains?: boolean; 296 | 297 | /** 298 | * The user agent string to be used for the request. 299 | */ 300 | user_agent?: string; 301 | 302 | /** 303 | * Specifies whether the response data should be stored. 304 | */ 305 | store_data?: boolean; 306 | 307 | /** 308 | * Use webhooks to send data. 309 | */ 310 | webhooks?: WebhookSettings; 311 | /** 312 | * Configuration settings for GPT (general purpose texture mappings). 313 | */ 314 | gpt_config?: Record; 315 | 316 | /** 317 | * Specifies whether to use fingerprinting protection. 318 | */ 319 | fingerprint?: boolean; 320 | 321 | /** 322 | * Specifies whether to perform the request without using storage. 323 | */ 324 | storageless?: boolean; 325 | 326 | /** 327 | * Specifies whether readability optimizations should be applied. 328 | */ 329 | readability?: boolean; 330 | 331 | /** 332 | * Specifies whether to use a proxy for the request. 333 | */ 334 | proxy_enabled?: boolean; 335 | 336 | /** 337 | * Specifies whether to respect the site's robots.txt file. 338 | */ 339 | respect_robots?: boolean; 340 | 341 | /** 342 | * CSS root selector to be used to filter the content. 343 | */ 344 | root_selector?: string; 345 | 346 | /** 347 | * Specifies whether to load all resources of the crawl target. 348 | */ 349 | full_resources?: boolean; 350 | 351 | /** 352 | * Specifies whether to use the sitemap links. 353 | */ 354 | sitemap?: boolean; 355 | 356 | /** 357 | * Specifies whether to only use the sitemap links. 358 | */ 359 | sitemap_only?: boolean; 360 | 361 | /** 362 | * External domains to include the crawl. 363 | */ 364 | 365 | external_domains?: string[]; 366 | 367 | /** 368 | * Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`. 369 | */ 370 | return_embeddings?: boolean; 371 | 372 | /** 373 | * Returns the HTTP response headers used. 374 | */ 375 | return_headers?: boolean; 376 | 377 | /** 378 | * Returns the link(s) found on the page that match the crawler query. 379 | */ 380 | return_page_links?: boolean; 381 | 382 | /** 383 | * Returns the HTTP response cookies used. 384 | */ 385 | return_cookies?: boolean; 386 | 387 | /** 388 | * The timeout for the request, in milliseconds. 389 | */ 390 | request_timeout?: number; 391 | 392 | /** 393 | * Specifies whether to run the request in the background. 394 | */ 395 | run_in_background?: boolean; 396 | 397 | /** 398 | * Perform an infinite scroll on the page as new content arises. 
The request param also needs to be set to 'chrome' or 'smart'. 399 | */ 400 | 401 | scroll?: number; 402 | 403 | /** 404 | * Specifies whether to skip configuration checks. 405 | */ 406 | skip_config_checks?: boolean; 407 | 408 | /** 409 | * The chunking algorithm to use. 410 | */ 411 | chunking_alg?: ChunkingAlg; 412 | 413 | /** 414 | * The wait for events on the page. You need to make your `request` `chrome` or `smart`. 415 | */ 416 | wait_for?: WaitFor; 417 | 418 | /** 419 | * Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content. 420 | */ 421 | disable_intercept?: boolean; 422 | 423 | /** 424 | * Perform custom web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 425 | */ 426 | automation_scripts?: WebAutomationMap; 427 | 428 | /** 429 | * Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 430 | */ 431 | execution_scripts?: ExecutionScriptsMap; 432 | 433 | /** 434 | * The redirect policy for HTTP request. Set the value to Loose to allow all. 435 | */ 436 | redirect_policy?: RedirectPolicy; 437 | 438 | /** 439 | * Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent. 440 | */ 441 | event_tracker?: { 442 | responses?: true; 443 | requests?: true; 444 | }; 445 | 446 | /** 447 | * The timeout to stop the crawl. 448 | */ 449 | crawl_timeout?: Timeout; 450 | 451 | /** 452 | * Evaluates given script in every frame upon creation (before loading frame's scripts). 453 | */ 454 | evaluate_on_new_document?: string; 455 | } 456 | 457 | // Core actions response type. 458 | export type SpiderCoreResponse = { 459 | // The content of the request like html or transformation markdown etc. 460 | content?: string; 461 | // A detailed message of a response. 462 | message?: string; 463 | // If an error occured. 464 | error?: string; 465 | // The HTTP status code. 466 | status?: number; 467 | // The website url. 468 | url?: string; 469 | }; 470 | 471 | export type ChunkCallbackFunction = (data: SpiderCoreResponse) => void; 472 | 473 | // records that you can query 474 | export enum Collection { 475 | Websites = "websites", 476 | Pages = "pages", 477 | PagesMetadata = "pages_metadata", 478 | // Leads 479 | Contacts = "contacts", 480 | CrawlState = "crawl_state", 481 | CrawlLogs = "crawl_logs", 482 | Profiles = "profiles", 483 | Credits = "credits", 484 | Webhooks = "webhooks", 485 | APIKeys = "api_keys", 486 | } 487 | 488 | // The API version for Spider 489 | export enum ApiVersion { 490 | V1 = "v1", 491 | } 492 | 493 | // The API routes paths. 494 | export enum APIRoutes { 495 | // Crawl a website to collect the contents. Can be one page or many. 496 | Crawl = "crawl", 497 | // Crawl a website to collect the links. Can be one page or many. 498 | Links = "links", 499 | // Crawl a website to collect screenshots. Can be one page or many. 500 | Screenshot = "screenshot", 501 | // Search for something and optionally crawl the pages or get the results of the search. 502 | Search = "search", 503 | // Transform HTML to markdown or text. 504 | Transform = "transform", 505 | // Pipeline extract leads for a website - emails, phones, etc. 506 | PiplineExtractLeads = "pipeline/extract-contacts", 507 | // Pipeline label a website by category using AI and metadata. 
508 | PiplineLabel = "pipeline/label", 509 | // Dynamic collection routes. 510 | Data = "data", 511 | // The last crawl state of a website. 512 | DataCrawlState = "data/crawl_state", 513 | // Sign a file from storage based on the exact url path of the storage or domain - pathname. 514 | DataSignUrl = "data/sign-url", 515 | // Download a file from storage based on the exact url path of the storage or domain - pathname. 516 | DataDownload = "data/download", 517 | // Perform a query on the global database to grab content without crawling if available. 518 | DataQuery = "data/query", 519 | // Get the credits remaining for an account. 520 | DataCredits = "data/credits", 521 | } 522 | 523 | // The base API target info for Spider Cloud. 524 | export const APISchema = { 525 | url: "https://api.spider.cloud", 526 | versions: { 527 | current: ApiVersion.V1, 528 | v1: { 529 | routes: APIRoutes, 530 | end_date: "", 531 | }, 532 | latest: { 533 | routes: APIRoutes, 534 | end_date: "", 535 | }, 536 | }, 537 | }; 538 | 539 | // Adjust the Spider Cloud endpoint. 540 | export const setBaseUrl = (url: string) => { 541 | if (url) { 542 | APISchema["url"] = url; 543 | } 544 | }; 545 | -------------------------------------------------------------------------------- /javascript/src/index.ts: -------------------------------------------------------------------------------- 1 | export { Spider } from "./client"; 2 | export { Collection, setBaseUrl, APISchema } from "./config"; 3 | export type { SpiderParams, Budget, Viewport, QueryRequest } from "./config"; 4 | -------------------------------------------------------------------------------- /javascript/src/utils/process-chunk.ts: -------------------------------------------------------------------------------- 1 | import type { SpiderCoreResponse } from "../config"; 2 | 3 | export const createJsonLineProcessor = ( 4 | cb: (r: SpiderCoreResponse) => void 5 | ) => { 6 | let buffer = ""; 7 | 8 | return (chunk: Buffer | string) => { 9 | buffer += chunk.toString(); 10 | let boundary: number; 11 | 12 | while ((boundary = buffer.indexOf("\n")) !== -1) { 13 | const line = buffer.slice(0, boundary); 14 | buffer = buffer.slice(boundary + 1); 15 | 16 | if (line.trim()) { 17 | try { 18 | cb(JSON.parse(line)); 19 | } catch (_error) {} 20 | } 21 | } 22 | }; 23 | }; 24 | -------------------------------------------------------------------------------- /javascript/src/utils/stream-reader.ts: -------------------------------------------------------------------------------- 1 | import type { ChunkCallbackFunction } from "../config"; 2 | import { createJsonLineProcessor } from "./process-chunk"; 3 | 4 | // Stream the response via callbacks. 
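// The body is read incrementally with a ReadableStream reader, each chunk is decoded as UTF-8,
// and complete JSON lines are parsed and forwarded to the callback (see createJsonLineProcessor);
// a final decode flushes any bytes still buffered in the decoder.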
5 | export const streamReader = async ( 6 | res: Response, 7 | cb: ChunkCallbackFunction 8 | ) => { 9 | if (res.ok) { 10 | const reader = res.body?.getReader(); 11 | const decoder = new TextDecoder(); 12 | const processChunk = createJsonLineProcessor(cb); 13 | 14 | if (reader) { 15 | while (true) { 16 | const { done, value } = await reader.read(); 17 | 18 | if (done) { 19 | break; 20 | } 21 | 22 | const chunk = decoder.decode(value, { stream: true }); 23 | processChunk(chunk); 24 | } 25 | 26 | processChunk(decoder.decode(new Uint8Array(), { stream: false })); 27 | } 28 | } 29 | }; 30 | -------------------------------------------------------------------------------- /javascript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2019", 4 | "module": "commonjs", 5 | "moduleResolution": "node", 6 | "lib": ["es5", "es6", "dom"], 7 | "outDir": "./dist", 8 | "rootDir": "./src", 9 | "strict": true, 10 | "esModuleInterop": true, 11 | "skipLibCheck": true, 12 | "forceConsistentCasingInFileNames": true, 13 | "resolveJsonModule": true, 14 | "allowSyntheticDefaultImports": true, 15 | "experimentalDecorators": true, 16 | "emitDecoratorMetadata": true, 17 | "declaration": true 18 | }, 19 | "include": ["src/**/*"], 20 | "exclude": ["node_modules", "**/*.test.ts"] 21 | } 22 | -------------------------------------------------------------------------------- /python/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud Python SDK 2 | 3 | The Spider Cloud Python SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API. 4 | 5 | ## Installation 6 | 7 | To install the Spider Cloud Python SDK, you can use pip: 8 | 9 | ```bash 10 | pip install spider_client 11 | ``` 12 | 13 | ## Usage 14 | 15 | 1. Get an API key from [spider.cloud](https://spider.cloud) 16 | 2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as a parameter to the `Spider` class. 
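If you prefer the environment-variable route, here is a minimal sketch (the client falls back to `SPIDER_API_KEY` when no key is passed in):

```python
import os

from spider import Spider

# Assumes the key was exported beforehand, e.g. `export SPIDER_API_KEY=your_api_key`.
assert os.getenv("SPIDER_API_KEY"), "SPIDER_API_KEY is not set"

app = Spider()  # picks up SPIDER_API_KEY from the environment
```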
17 | 18 | Here's an example of how to use the SDK: 19 | 20 | ```python 21 | from spider import Spider 22 | 23 | # Initialize the Spider with your API key 24 | app = Spider(api_key='your_api_key') 25 | 26 | # Scrape a single URL 27 | url = 'https://spider.cloud' 28 | scraped_data = app.scrape_url(url) 29 | 30 | # Crawl a website 31 | crawler_params = { 32 | 'limit': 1, 33 | 'proxy_enabled': True, 34 | 'store_data': False, 35 | 'metadata': False, 36 | 'request': 'http' 37 | } 38 | crawl_result = app.crawl_url(url, params=crawler_params) 39 | ``` 40 | 41 | ### Scraping a URL 42 | 43 | To scrape data from a single URL: 44 | 45 | ```python 46 | url = 'https://example.com' 47 | scraped_data = app.scrape_url(url) 48 | ``` 49 | 50 | ### Crawling a Website 51 | 52 | To automate crawling a website: 53 | 54 | ```python 55 | url = 'https://example.com' 56 | crawl_params = { 57 | 'limit': 200, 58 | 'request': 'smart' 59 | } 60 | crawl_result = app.crawl_url(url, params=crawl_params) 61 | ``` 62 | 63 | #### Crawl Streaming 64 | 65 | Stream the crawl results back in chunks to handle large crawls: 66 | 67 | ```python 68 | def handle_json(json_obj: dict) -> None: 69 | assert json_obj["url"] is not None 70 | 71 | url = 'https://example.com' 72 | crawl_params = { 73 | 'limit': 200, 74 | 'store_data': False 75 | } 76 | response = app.crawl_url( 77 | url, 78 | params=crawl_params, 79 | stream=True, 80 | callback=handle_json, 81 | ) 82 | ``` 83 | 84 | ### Search 85 | 86 | Perform a search for websites to crawl or gather search results: 87 | 88 | ```python 89 | query = 'a sports website' 90 | crawl_params = { 91 | 'request': 'smart', 92 | 'search_limit': 5, 93 | 'limit': 5, 94 | 'fetch_page_content': True 95 | } 96 | crawl_result = app.search(query, params=crawl_params) 97 | ``` 98 | 99 | ### Retrieving Links from a URL(s) 100 | 101 | Extract all links from a specified URL: 102 | 103 | ```python 104 | url = 'https://example.com' 105 | links = app.links(url) 106 | ``` 107 | 108 | ### Transform 109 | 110 | Transform HTML to markdown or text lightning fast: 111 | 112 | ```python 113 | data = [ { 'html': '<html><body><h1>Hello world</h1></body></html>
' } ] 114 | params = { 115 | 'readability': False, 116 | 'return_format': 'markdown', 117 | } 118 | result = app.transform(data, params=params) 119 | ``` 120 | 121 | ### Taking Screenshots of a URL(s) 122 | 123 | Capture a screenshot of a given URL: 124 | 125 | ```python 126 | url = 'https://example.com' 127 | screenshot = app.screenshot(url) 128 | ``` 129 | 130 | ### Extracting Contact Information 131 | 132 | Extract contact details from a specified URL: 133 | 134 | ```python 135 | url = 'https://example.com' 136 | contacts = app.extract_contacts(url) 137 | ``` 138 | 139 | ### Labeling Data from a URL(s) 140 | 141 | Label the data extracted from a particular URL: 142 | 143 | ```python 144 | url = 'https://example.com' 145 | labeled_data = app.label(url) 146 | ``` 147 | 148 | ### Checking Crawl State 149 | 150 | You can check the crawl state of the website: 151 | 152 | ```python 153 | url = 'https://example.com' 154 | state = app.get_crawl_state(url) 155 | ``` 156 | 157 | ### Downloading files 158 | 159 | You can download the results of the website: 160 | 161 | ```python 162 | url = 'https://example.com' 163 | params = { 164 | 'page': 0, 165 | 'limit': 100, 166 | 'expiresIn': 3600 # Optional, add if needed 167 | } 168 | stream = True 169 | 170 | state = app.create_signed_url(url, params, stream) 171 | ``` 172 | 173 | ### Checking Available Credits 174 | 175 | You can check the remaining credits on your account: 176 | 177 | ```python 178 | credits = app.get_credits() 179 | ``` 180 | 181 | ### Data Operations 182 | 183 | The Spider client can now interact with specific data tables to create, retrieve, and delete data. 184 | 185 | #### Retrieve Data from a Table 186 | 187 | To fetch data from a specified table by applying query parameters: 188 | 189 | ```python 190 | table_name = 'pages' 191 | query_params = {'limit': 20 } 192 | response = app.data_get(table_name, query_params) 193 | print(response) 194 | ``` 195 | 196 | #### Delete Data from a Table 197 | 198 | To delete data from a specified table based on certain conditions: 199 | 200 | ```python 201 | table_name = 'websites' 202 | delete_params = {'domain': 'www.example.com'} 203 | response = app.data_delete(table_name, delete_params) 204 | print(response) 205 | ``` 206 | 207 | ## Streaming 208 | 209 | If you need to stream the request use the third param: 210 | 211 | ```python 212 | url = 'https://example.com' 213 | 214 | crawler_params = { 215 | 'limit': 1, 216 | 'proxy_enabled': True, 217 | 'store_data': False, 218 | 'metadata': False, 219 | 'request': 'http' 220 | } 221 | 222 | links = app.links(url, crawler_params, True) 223 | ``` 224 | 225 | ## Content-Type 226 | 227 | The following Content-type headers are supported using the fourth param: 228 | 229 | 1. `application/json` 230 | 1. `text/csv` 231 | 1. `application/xml` 232 | 1. `application/jsonl` 233 | 234 | ```python 235 | url = 'https://example.com' 236 | 237 | crawler_params = { 238 | 'limit': 1, 239 | 'proxy_enabled': True, 240 | 'store_data': False, 241 | 'metadata': False, 242 | 'request': 'http' 243 | } 244 | 245 | # stream json lines back to the client 246 | links = app.crawl(url, crawler_params, True, "application/jsonl") 247 | ``` 248 | 249 | ## Error Handling 250 | 251 | The SDK handles errors returned by the Spider Cloud API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. 252 | 253 | ## Contributing 254 | 255 | Contributions to the Spider Cloud Python SDK are welcome! 
If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. 256 | 257 | ## License 258 | 259 | The Spider Cloud Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 260 | -------------------------------------------------------------------------------- /python/example.py: -------------------------------------------------------------------------------- 1 | 2 | from spider import Spider 3 | 4 | # Initialize the Spider with your API key using the env key SPIDER_API_KEY 5 | app = Spider() 6 | 7 | crawler_params = { 8 | 'limit': 5, 9 | 'proxy_enabled': False, 10 | 'store_data': False, 11 | 'metadata': False, 12 | 'request': 'http' 13 | } 14 | crawl_result = app.crawl_url('https://spider.cloud', params=crawler_params) 15 | 16 | print(crawl_result) -------------------------------------------------------------------------------- /python/example_async.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from spider import AsyncSpider 3 | 4 | crawler_params = { 5 | 'limit': 1, 6 | 'proxy_enabled': True, 7 | 'store_data': False, 8 | 'metadata': False, 9 | 'request': 'http' 10 | } 11 | 12 | 13 | # A callback 14 | def process_json(data: dict) -> None: 15 | print("Processing data:") 16 | for key, value in data.items(): 17 | print(f"{key}: {value}") 18 | 19 | 20 | async def crawl_url(): 21 | # Initialize the AsyncSpider 22 | spider = AsyncSpider() 23 | 24 | # URL to crawl 25 | url = 'https://spider.cloud' 26 | 27 | # For non-streaming usage: 28 | print("Non-streaming crawl:") 29 | async for result in spider.crawl_url(url, params=crawler_params, stream=False): 30 | print(result) 31 | 32 | # For streaming usage with a callback: 33 | print("\nStreaming crawl with callback:") 34 | async for _ in spider.crawl_url(url, params=crawler_params, stream=True, callback=process_json): 35 | pass # The callback function handles the data processing 36 | 37 | # For streaming usage without a callback (just prints the response headers): 38 | print("\nStreaming crawl without callback:") 39 | async for chunk in spider.crawl_url(url, params=crawler_params, stream=True): 40 | print(f"Received chunk: {chunk}") 41 | 42 | 43 | asyncio.run(crawl_url()) 44 | 45 | 46 | async def scrape_url(): 47 | # Initialize the AsyncSpider 48 | spider = AsyncSpider() 49 | 50 | # URL to crawl 51 | url = 'https://spider.cloud' 52 | 53 | # For non-streaming usage: 54 | print("Non-streaming scrape:") 55 | async for result in spider.scrape_url(url, params=crawler_params, stream=False): 56 | print(result) 57 | 58 | # For streaming usage without a callback (just prints the response headers): 59 | print("\nStreaming scrape without callback:") 60 | async for chunk in spider.scrape_url(url, params=crawler_params, stream=True): 61 | print(f"Received chunk: {chunk}") 62 | 63 | 64 | async def links(): 65 | # Initialize the AsyncSpider 66 | spider = AsyncSpider() 67 | 68 | # URL to crawl 69 | url = 'https://spider.cloud' 70 | 71 | # For non-streaming usage: 72 | print("Non-streaming links:") 73 | async for result in spider.links(url, params=crawler_params, stream=False): 74 | print(result) 75 | 76 | # For streaming usage without a callback (just prints the response headers): 77 | print("\nStreaming links without callback:") 78 | async for chunk in spider.links(url, params=crawler_params, stream=True): 79 | print(f"Received chunk: {chunk}") 80 | 81 | 82 | async def screenshot(): 83 | # 
Initialize the AsyncSpider 84 | spider = AsyncSpider() 85 | 86 | # URL to crawl 87 | url = 'https://spider.cloud' 88 | 89 | # For non-streaming usage: 90 | print("Non-streaming screenshot:") 91 | async for result in spider.screenshot(url, params=crawler_params, stream=False): 92 | print(result) 93 | 94 | # For streaming usage without a callback (just prints the response headers): 95 | print("\nStreaming screenshot without callback:") 96 | async for chunk in spider.screenshot(url, params=crawler_params, stream=True): 97 | print(f"Received chunk: {chunk}") 98 | 99 | 100 | async def search(): 101 | # Initialize the AsyncSpider 102 | spider = AsyncSpider() 103 | 104 | # Search term 105 | q = "what is spider cloud?" 106 | 107 | # For non-streaming usage: 108 | print("Non-streaming search:") 109 | async for result in spider.search(q=q, params=crawler_params, stream=False): 110 | print(result) 111 | 112 | # For streaming usage without a callback (just prints the response headers): 113 | print("\nStreaming search without callback:") 114 | async for chunk in spider.search(q=q,params=crawler_params, stream=True): 115 | print(f"Received chunk: {chunk}") 116 | 117 | 118 | async def transform(): 119 | # Initialize the AsyncSpider 120 | spider = AsyncSpider() 121 | 122 | # URL to crawl 123 | url = 'https://spider.cloud' 124 | 125 | # Get html 126 | async for result in spider.crawl_url(url=url, params=crawler_params, stream=False): 127 | data = result 128 | 129 | data[0]['html'] = data[0]['content'] # ! Transform endpoint expects html, not content 130 | print("Non-streaming transform:") 131 | async for result in spider.transform(data=data, params=crawler_params, stream=False): 132 | print(result) 133 | 134 | # For streaming usage without a callback (just prints the response headers): 135 | print("\nStreaming transform without callback:") 136 | async for chunk in spider.transform(data=data,params=crawler_params, stream=True): 137 | print(f"Received chunk: {chunk}") 138 | 139 | 140 | async def contacts(): 141 | # Initialize the AsyncSpider 142 | spider = AsyncSpider() 143 | 144 | # URL to crawl 145 | url = 'https://spider.cloud' 146 | 147 | # For non-streaming usage: 148 | print("Non-streaming contacts:") 149 | async for result in spider.extract_contacts(url, params=crawler_params, stream=False): 150 | print(result) 151 | 152 | # For streaming usage without a callback (just prints the response headers): 153 | print("\nStreaming contacts without callback:") 154 | async for chunk in spider.extract_contacts(url, params=crawler_params, stream=True): 155 | print(f"Received chunk: {chunk}") 156 | 157 | 158 | async def credits(): 159 | # Initialize the AsyncSpider 160 | spider = AsyncSpider() 161 | 162 | async for result in spider.get_credits(): 163 | print(result) 164 | 165 | 166 | async def data_get(): 167 | spider = AsyncSpider() 168 | 169 | async for result in spider.data_get("websites", params=crawler_params): 170 | print(result) 171 | 172 | 173 | async def data_delete(): 174 | spider = AsyncSpider() 175 | 176 | async for result in spider.data_delete("websites", params=crawler_params): 177 | print(result) 178 | 179 | if __name__ == "__main__": 180 | asyncio.run(crawl_url()) 181 | asyncio.run(scrape_url()) 182 | asyncio.run(links()) 183 | asyncio.run(screenshot()) 184 | asyncio.run(search()) 185 | asyncio.run(transform()) 186 | asyncio.run(contacts()) 187 | asyncio.run(credits()) 188 | asyncio.run(data_get()) 189 | asyncio.run(data_delete()) 190 | 191 | 192 | 
-------------------------------------------------------------------------------- /python/example_streaming.py: -------------------------------------------------------------------------------- 1 | 2 | from spider import Spider 3 | 4 | # Initialize the Spider with your API key using the env key SPIDER_API_KEY 5 | app = Spider() 6 | 7 | crawler_params = { 8 | 'limit': 1000, 9 | 'proxy_enabled': False, 10 | 'store_data': False, 11 | 'metadata': False, 12 | 'request': 'http' 13 | } 14 | 15 | count = [0] 16 | 17 | def process_json(data: dict) -> None: 18 | print(f"Processing: {count[0]}") 19 | count[0] += 1 20 | for key, value in data.items(): 21 | print(f"{key}: {value}") 22 | 23 | app.crawl_url('https://spider.cloud', params=crawler_params, stream=True, callback=process_json) 24 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-asyncio 3 | python-dotenv 4 | aiohttp 5 | python-dotenv 6 | ijson 7 | tenacity -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | 5 | def read_file(fname): 6 | return open(os.path.join(os.path.dirname(__file__), fname), encoding="utf-8").read() 7 | 8 | 9 | setup( 10 | name="spider_client", 11 | version="0.1.36", 12 | url="https://github.com/spider-rs/spider-clients/tree/main/python", 13 | license="MIT", 14 | author="Spider", 15 | author_email="jeff@spider.cloud", 16 | description="Python SDK for Spider Cloud API", 17 | packages=find_packages(), 18 | install_requires=["requests", "ijson", "tenacity", "aiohttp"], 19 | long_description=read_file("README.md"), 20 | long_description_content_type="text/markdown", 21 | classifiers=[ 22 | "Development Status :: 5 - Production/Stable", 23 | "Intended Audience :: Developers", 24 | "Intended Audience :: Information Technology", 25 | "Topic :: Software Development :: Libraries :: Python Modules", 26 | "Topic :: Internet", 27 | "Topic :: Internet :: WWW/HTTP", 28 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search", 29 | "Operating System :: OS Independent", 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /python/spider/__init__.py: -------------------------------------------------------------------------------- 1 | from .spider import Spider 2 | from .async_spider import AsyncSpider -------------------------------------------------------------------------------- /python/spider/spider.py: -------------------------------------------------------------------------------- 1 | import os, requests, logging, ijson, tenacity 2 | from typing import Optional, Dict 3 | from spider.spider_types import RequestParamsDict, JsonCallback, QueryRequest 4 | 5 | 6 | class Spider: 7 | def __init__(self, api_key: Optional[str] = None): 8 | """ 9 | Initialize the Spider with an API key. 10 | 11 | :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable. 12 | :raises ValueError: If no API key is provided. 
13 | """ 14 | self.api_key = api_key or os.getenv("SPIDER_API_KEY") 15 | if self.api_key is None: 16 | raise ValueError("No API key provided") 17 | 18 | @tenacity.retry( 19 | wait=tenacity.wait_exponential(multiplier=1, min=1, max=60), 20 | stop=tenacity.stop_after_attempt(5) 21 | ) 22 | def api_post( 23 | self, 24 | endpoint: str, 25 | data: dict, 26 | stream: bool = False, 27 | content_type: str = "application/json", 28 | ): 29 | """ 30 | Send a POST request to the specified API endpoint. 31 | 32 | :param endpoint: The API endpoint to which the POST request is sent. 33 | :param data: The data (dictionary) to be sent in the POST request. 34 | :param stream: Boolean indicating if the response should be streamed. 35 | :return: The JSON response or the raw response stream if stream is True. 36 | """ 37 | headers = self._prepare_headers(content_type) 38 | response = self._post_request( 39 | f"https://api.spider.cloud/{endpoint}", data, headers, stream 40 | ) 41 | if stream: 42 | return response 43 | elif 200 <= response.status_code < 300: 44 | return response.json() 45 | else: 46 | self._handle_error(response, f"post to {endpoint}") 47 | 48 | @tenacity.retry( 49 | wait=tenacity.wait_exponential(multiplier=1, min=1, max=60), 50 | stop=tenacity.stop_after_attempt(5) 51 | ) 52 | def api_get( 53 | self, 54 | endpoint: str, 55 | params: Optional[dict] = None, 56 | stream: bool = False, 57 | content_type: str = "application/json", 58 | ): 59 | """ 60 | Send a GET request to the specified endpoint. 61 | 62 | :param endpoint: The API endpoint from which to retrieve data. 63 | :param params: Query parameters to attach to the URL. 64 | :return: The JSON decoded response. 65 | """ 66 | headers = self._prepare_headers(content_type) 67 | response = requests.get( 68 | f"https://api.spider.cloud/{endpoint}", 69 | headers=headers, 70 | params=params, 71 | stream=stream, 72 | ) 73 | if 200 <= response.status_code < 300: 74 | return response.json() 75 | else: 76 | self._handle_error(response, f"get from {endpoint}") 77 | 78 | @tenacity.retry( 79 | wait=tenacity.wait_exponential(multiplier=1, min=1, max=60), 80 | stop=tenacity.stop_after_attempt(5) 81 | ) 82 | def api_delete( 83 | self, 84 | endpoint: str, 85 | params: Optional[RequestParamsDict] = None, 86 | stream: Optional[bool] = False, 87 | content_type: Optional[str] = "application/json", 88 | ): 89 | """ 90 | Send a DELETE request to the specified endpoint. 91 | 92 | :param endpoint: The API endpoint from which to retrieve data. 93 | :param params: Optional parameters to include in the DELETE request. 94 | :param stream: Boolean indicating if the response should be streamed. 95 | :param content_type: The content type of the request. 96 | :return: The JSON decoded response. 97 | """ 98 | headers = self._prepare_headers(content_type) 99 | response = self._delete_request( 100 | f"https://api.spider.cloud/v1/{endpoint}", 101 | headers=headers, 102 | json=params, 103 | stream=stream, 104 | ) 105 | if 200 <= response.status_code < 300: 106 | return response.json() 107 | else: 108 | self._handle_error(response, f"delete from {endpoint}") 109 | 110 | def scrape_url( 111 | self, 112 | url: str, 113 | params: Optional[RequestParamsDict] = None, 114 | stream: bool = False, 115 | content_type: str = "application/json", 116 | ): 117 | """ 118 | Scrape data from the specified URL. 119 | 120 | :param url: The URL from which to scrape data. 121 | :param params: Optional dictionary of additional parameters for the scrape request. 
122 | :return: JSON response containing the scraping results. 123 | """ 124 | return self.api_post( 125 | "crawl", {"url": url, "limit": 1, **(params or {})}, stream, content_type 126 | ) 127 | 128 | def crawl_url( 129 | self, 130 | url: str, 131 | params: Optional[RequestParamsDict], 132 | stream: Optional[bool] = False, 133 | content_type: Optional[str] = "application/json", 134 | callback: Optional[JsonCallback] = None, 135 | ): 136 | """ 137 | Start crawling at the specified URL. 138 | 139 | :param url: The URL to begin crawling. 140 | :param params: Optional dictionary with additional parameters to customize the crawl. 141 | :param stream: Optional Boolean indicating if the response should be streamed. Defaults to False. 142 | :param content_type: Optional str to determine the content-type header of the request. 143 | :param callback: Optional callback to use with streaming. This will only send the data via callback. 144 | 145 | :return: JSON response or the raw response stream if streaming enabled. 146 | """ 147 | jsonl = stream and callable(callback) 148 | 149 | if jsonl: 150 | content_type = "application/jsonl" 151 | 152 | response = self.api_post( 153 | "crawl", {"url": url, **(params or {})}, stream, content_type 154 | ) 155 | 156 | if jsonl: 157 | return self.stream_reader(response, callback) 158 | else: 159 | return response 160 | 161 | def links( 162 | self, 163 | url: str, 164 | params: Optional[RequestParamsDict] = None, 165 | stream: bool = False, 166 | content_type: str = "application/json", 167 | ): 168 | """ 169 | Retrieve links from the specified URL. 170 | 171 | :param url: The URL from which to extract links. 172 | :param params: Optional parameters for the link retrieval request. 173 | :return: JSON response containing the links. 174 | """ 175 | return self.api_post( 176 | "links", {"url": url, **(params or {})}, stream, content_type 177 | ) 178 | 179 | def screenshot( 180 | self, 181 | url: str, 182 | params: Optional[RequestParamsDict] = None, 183 | stream: bool = False, 184 | content_type: str = "application/json", 185 | ): 186 | """ 187 | Take a screenshot of the specified URL. 188 | 189 | :param url: The URL to capture a screenshot from. 190 | :param params: Optional parameters to customize the screenshot capture. 191 | :return: JSON response with screenshot data. 192 | """ 193 | return self.api_post( 194 | "screenshot", {"url": url, **(params or {})}, stream, content_type 195 | ) 196 | 197 | def search( 198 | self, 199 | q: str, 200 | params: Optional[RequestParamsDict] = None, 201 | stream: bool = False, 202 | content_type: str = "application/json", 203 | ): 204 | """ 205 | Perform a search and gather a list of websites to start crawling and collect resources. 206 | 207 | :param search: The search query. 208 | :param params: Optional parameters to customize the search. 209 | :return: JSON response or the raw response stream if streaming enabled. 210 | """ 211 | return self.api_post( 212 | "search", {"search": q, **(params or {})}, stream, content_type 213 | ) 214 | 215 | def transform( 216 | self, data, params=None, stream=False, content_type="application/json" 217 | ): 218 | """ 219 | Transform HTML to Markdown or text. You can send up to 10MB of data at once. 220 | 221 | :param data: The data to transform a list of objects with the 'html' key and an optional 'url' key only used readability mode. 222 | :param params: Optional parameters to customize the search. 223 | :return: JSON response or the raw response stream if streaming enabled. 
224 | """ 225 | return self.api_post( 226 | "transform", {"data": data, **(params or {})}, stream, content_type 227 | ) 228 | 229 | def extract_contacts( 230 | self, 231 | url: str, 232 | params: Optional[RequestParamsDict] = None, 233 | stream: bool = False, 234 | content_type: str = "application/json", 235 | ): 236 | """ 237 | Extract contact information from the specified URL. 238 | 239 | :param url: The URL from which to extract contact information. 240 | :param params: Optional parameters for the contact extraction. 241 | :return: JSON response containing extracted contact details. 242 | """ 243 | return self.api_post( 244 | "pipeline/extract-contacts", 245 | {"url": url, **(params or {})}, 246 | stream, 247 | content_type, 248 | ) 249 | 250 | def label( 251 | self, 252 | url: str, 253 | params: Optional[RequestParamsDict] = None, 254 | stream: bool = False, 255 | content_type: str = "application/json", 256 | ): 257 | """ 258 | Apply labeling to data extracted from the specified URL. 259 | 260 | :param url: The URL to label data from. 261 | :param params: Optional parameters to guide the labeling process. 262 | :return: JSON response with labeled data. 263 | """ 264 | return self.api_post( 265 | "pipeline/label", {"url": url, **(params or {})}, stream, content_type 266 | ) 267 | 268 | def query( 269 | self, 270 | params: QueryRequest = None, 271 | stream: bool = False, 272 | content_type: str = "application/json", 273 | ): 274 | """ 275 | Query a website resource from our database. This costs 1 credit per successful retrieval. 276 | :param params: Optional parameters to guide the labeling process. 277 | :return: The website contents markup. 278 | """ 279 | return self.api_get("data/query", {**(params or {})}, stream, content_type) 280 | 281 | def download( 282 | self, 283 | url: Optional[str] = None, 284 | params: Optional[Dict[str, int]] = None, 285 | stream: Optional[bool] = True, 286 | ): 287 | """ 288 | Download the file from storage. 289 | 290 | :param url: Optional url of the exact path to specify the storage path. 291 | :param params: Optional dictionary containing configuration parameters, such as: 292 | - 'page': Optional page number for pagination. 293 | - 'limit': Optional page limit for pagination. 294 | - 'domain': Optional domain name to use when url is not known. 295 | - 'pathname': Optional pathname to use when urls is not known. 296 | - 'expiresIn': Optional expiration time for the signed URL. 297 | :param stream: Boolean indicating if the response should be streamed. Defaults to True. 298 | :return: The raw response stream if stream is True. 299 | """ 300 | if url: 301 | params["url"] = url 302 | if params: 303 | params.update(params) 304 | 305 | endpoint = "data/download" 306 | headers = self._prepare_headers("application/octet-stream") 307 | response = self._get_request( 308 | f"https://api.spider.cloud/v1/{endpoint}", headers, stream, params=params 309 | ) 310 | if 200 <= response.status_code < 300: 311 | if stream: 312 | return response.raw 313 | else: 314 | return response.content 315 | else: 316 | self._handle_error(response, f"download from {endpoint}") 317 | 318 | def create_signed_url( 319 | self, 320 | url: Optional[str] = None, 321 | params: Optional[Dict[str, int]] = None, 322 | stream: Optional[bool] = True, 323 | ): 324 | """ 325 | Create a signed url to download files from the storage. 326 | 327 | :param url: Optional url of the exact path to specify the storage path. 
328 | :param params: Optional dictionary containing configuration parameters, such as: 329 | - 'page': Optional page number for pagination. 330 | - 'limit': Optional page limit for pagination. 331 | - 'domain': Optional domain name to use when url is not known. 332 | - 'pathname': Optional pathname to use when urls is not known. 333 | - 'expiresIn': Optional expiration time for the signed URL. 334 | :param stream: Boolean indicating if the response should be streamed. Defaults to True. 335 | :return: The raw response stream if stream is True. 336 | """ 337 | if url: 338 | params["url"] = url 339 | if params: 340 | params.update(params) 341 | 342 | endpoint = "data/sign-url" 343 | headers = self._prepare_headers("application/octet-stream") 344 | response = self._get_request( 345 | f"https://api.spider.cloud/v1/{endpoint}", headers, stream, params=params 346 | ) 347 | if 200 <= response.status_code < 300: 348 | if stream: 349 | return response.raw 350 | else: 351 | return response.content 352 | else: 353 | self._handle_error(response, f"download from {endpoint}") 354 | 355 | def get_crawl_state( 356 | self, 357 | url: str, 358 | params: Optional[RequestParamsDict] = None, 359 | stream: Optional[bool] = False, 360 | content_type: Optional[str] = "application/json", 361 | ): 362 | """ 363 | Retrieve the website active crawl state. 364 | 365 | :return: JSON response of the crawl state and credits used. 366 | """ 367 | payload = {"url": url, "stream": stream, "content_type": content_type} 368 | if params: 369 | payload.update(params) 370 | 371 | return self.api_post("data/crawl_state", payload, stream) 372 | 373 | def get_credits(self): 374 | """ 375 | Retrieve the account's remaining credits. 376 | 377 | :return: JSON response containing the number of credits left. 378 | """ 379 | return self.api_get("data/credits") 380 | 381 | def data_post(self, table: str, data: Optional[RequestParamsDict] = None): 382 | """ 383 | Send data to a specific table via POST request. 384 | :param table: The table name to which the data will be posted. 385 | :param data: A dictionary representing the data to be posted. 386 | :return: The JSON response from the server. 387 | """ 388 | return self.api_post(f"data/{table}", data, stream=False) 389 | 390 | def data_get( 391 | self, 392 | table: str, 393 | params: Optional[RequestParamsDict] = None, 394 | ): 395 | """ 396 | Retrieve data from a specific table via GET request. 397 | :param table: The table name from which to retrieve data. 398 | :param params: Optional parameters to modify the query. 399 | :return: The JSON response from the server. 400 | """ 401 | return self.api_get(f"data/{table}", params) 402 | 403 | def data_delete( 404 | self, 405 | table: str, 406 | params: Optional[RequestParamsDict] = None, 407 | ): 408 | """ 409 | Delete data from a specific table via DELETE request. 410 | :param table: The table name from which data will be deleted. 411 | :param params: Parameters to identify which data to delete. 412 | :return: The JSON response from the server. 
413 | """ 414 | return self.api_delete(f"data/{table}", params=params) 415 | 416 | def stream_reader(self, response, callback): 417 | response.raise_for_status() 418 | 419 | try: 420 | for json_obj in ijson.items(response.raw, "", multiple_values=True): 421 | callback(json_obj) 422 | 423 | except Exception as e: 424 | logging.error(f"An error occurred while parsing JSON: {e}") 425 | 426 | def _prepare_headers(self, content_type: str = "application/json"): 427 | return { 428 | "Content-Type": content_type, 429 | "Authorization": f"Bearer {self.api_key}", 430 | "User-Agent": f"Spider-Client/0.1.36", 431 | } 432 | 433 | def _post_request(self, url: str, data, headers, stream=False): 434 | return requests.post(url, headers=headers, json=data, stream=stream) 435 | 436 | def _get_request(self, url: str, headers, stream=False, params=None): 437 | return requests.get(url, headers=headers, stream=stream, params=params) 438 | 439 | def _delete_request(self, url: str, headers, json=None, stream=False): 440 | return requests.delete(url, headers=headers, json=json, stream=stream) 441 | 442 | def _handle_error(self, response, action): 443 | if response.status_code in [402, 409, 500]: 444 | error_message = response.json().get("error", "Unknown error occurred") 445 | raise Exception( 446 | f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}" 447 | ) 448 | else: 449 | raise Exception( 450 | f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}. Here is the response: {response.text}" 451 | ) 452 | -------------------------------------------------------------------------------- /python/spider/spider.pyi: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Any 2 | from spider_types import RequestParamsDict, QueryRequest 3 | 4 | class Spider: 5 | api_key: str 6 | 7 | def __init__(self, api_key: Optional[str] = None) -> None: ... 8 | def api_post( 9 | self, 10 | endpoint: str, 11 | data: dict, 12 | stream: bool, 13 | content_type: str = "application/json", 14 | ) -> Any: ... 15 | def api_get( 16 | self, endpoint: str, stream: bool, content_type: str = "application/json" 17 | ) -> Any: ... 18 | def api_delete( 19 | self, endpoint: str, stream: bool, content_type: str = "application/json" 20 | ) -> Any: ... 21 | def scrape_url( 22 | self, 23 | url: str, 24 | params: Optional[RequestParamsDict] = None, 25 | stream: bool = False, 26 | content_type: str = "application/json", 27 | ) -> Any: ... 28 | def crawl_url( 29 | self, 30 | url: str, 31 | params: Optional[RequestParamsDict] = None, 32 | stream: bool = False, 33 | content_type: str = "application/json", 34 | ) -> Any: ... 35 | def links( 36 | self, 37 | url: str, 38 | params: Optional[RequestParamsDict] = None, 39 | stream: bool = False, 40 | content_type: str = "application/json", 41 | ) -> Any: ... 42 | def screenshot( 43 | self, 44 | url: str, 45 | params: Optional[RequestParamsDict] = None, 46 | stream: bool = False, 47 | content_type: str = "application/json", 48 | ) -> Any: ... 49 | def search( 50 | self, 51 | q: str, 52 | params: Optional[RequestParamsDict] = None, 53 | stream: bool = False, 54 | content_type: str = "application/json", 55 | ) -> Any: ... 56 | def transform( 57 | self, 58 | data: Any, 59 | params: Optional[RequestParamsDict] = None, 60 | stream: bool = False, 61 | content_type: str = "application/json", 62 | ) -> Any: ... 
63 | def extract_contacts( 64 | self, 65 | url: str, 66 | params: Optional[RequestParamsDict] = None, 67 | stream: bool = False, 68 | content_type: str = "application/json", 69 | ) -> Any: ... 70 | def label( 71 | self, 72 | url: str, 73 | params: Optional[RequestParamsDict] = None, 74 | stream: bool = False, 75 | content_type: str = "application/json", 76 | ) -> Any: ... 77 | def get_crawl_state( 78 | self, 79 | url: str, 80 | params: Optional[RequestParamsDict] = None, 81 | stream: bool = False, 82 | content_type: str = "application/json", 83 | ) -> Any: ... 84 | def query( 85 | self, 86 | params: QueryRequest 87 | ) -> Any: ... 88 | def get_credits(self) -> Any: ... 89 | def data_post( 90 | self, table: str, data: Optional[RequestParamsDict] = None 91 | ) -> Any: ... 92 | def create_signed_url( 93 | self, 94 | domain: Optional[str] = None, 95 | params: Optional[Dict[str, int]] = None, 96 | stream: Optional[bool] = True, 97 | ) -> Any: ... 98 | def data_get( 99 | self, 100 | table: str, 101 | params: Optional[RequestParamsDict] = None, 102 | ) -> Any: ... 103 | def data_delete( 104 | self, 105 | table: str, 106 | params: Optional[RequestParamsDict] = None, 107 | ) -> Any: ... 108 | def _prepare_headers( 109 | self, content_type: str = "application/json" 110 | ) -> Dict[str, str]: ... 111 | def _post_request( 112 | self, url: str, data: Any, headers: Dict[str, str], stream: bool = False 113 | ) -> Any: ... 114 | def _get_request( 115 | self, url: str, headers: Dict[str, str], stream: bool = False 116 | ) -> Any: ... 117 | def _delete_request( 118 | self, url: str, headers: Dict[str, str], stream: bool = False 119 | ) -> Any: ... 120 | def _handle_error(self, response: Any, action: str) -> None: ... 121 | -------------------------------------------------------------------------------- /python/spider/spider_types.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Optional, Dict, List, Union, Literal, Callable 2 | from dataclasses import dataclass, field 3 | 4 | @dataclass 5 | class Evaluate: 6 | code: str 7 | type: str = "Evaluate" 8 | 9 | @dataclass 10 | class Click: 11 | selector: str 12 | type: str = "Click" 13 | 14 | @dataclass 15 | class Wait: 16 | duration: int 17 | type: str = "Wait" 18 | 19 | @dataclass 20 | class WaitForNavigation: 21 | type: str = "WaitForNavigation" 22 | 23 | @dataclass 24 | class WaitFor: 25 | selector: str 26 | type: str = "WaitFor" 27 | 28 | @dataclass 29 | class WaitForAndClick: 30 | selector: str 31 | type: str = "WaitForAndClick" 32 | 33 | @dataclass 34 | class ScrollX: 35 | pixels: int 36 | type: str = "ScrollX" 37 | 38 | @dataclass 39 | class ScrollY: 40 | pixels: int 41 | type: str = "ScrollY" 42 | 43 | @dataclass 44 | class Fill: 45 | selector: str 46 | value: str 47 | type: str = "Fill" 48 | 49 | @dataclass 50 | class InfiniteScroll: 51 | times: int 52 | type: str = "InfiniteScroll" 53 | 54 | WebAutomation = Union[ 55 | Evaluate, 56 | Click, 57 | Wait, 58 | WaitForNavigation, 59 | WaitFor, 60 | WaitForAndClick, 61 | ScrollX, 62 | ScrollY, 63 | Fill, 64 | InfiniteScroll, 65 | ] 66 | 67 | WebAutomationMap = Dict[str, List[WebAutomation]] 68 | ExecutionScriptsMap = Dict[str, str] 69 | 70 | RedirectPolicy = Literal[ 71 | "Loose", 72 | "Strict" 73 | ] 74 | 75 | @dataclass 76 | class QueryRequest: 77 | url: Optional[str] = field(default=None) 78 | domain: Optional[str] = field(default=None) 79 | pathname: Optional[str] = field(default=None) 80 | 81 | 82 | class ChunkingAlgDict(TypedDict): 83 
| # The chunking algorithm to use with the value to chunk by. 84 | type: Literal["ByWords", "ByLines", "ByCharacterLength", "BySentence"] 85 | # The amount to chunk by. 86 | value: int 87 | 88 | 89 | class TimeoutDict(TypedDict): 90 | secs: int 91 | nanos: int 92 | 93 | class EventTracker(TypedDict): 94 | responses: bool 95 | requests: bool 96 | 97 | class IdleNetworkDict(TypedDict): 98 | timeout: TimeoutDict 99 | 100 | 101 | class SelectorDict(TypedDict): 102 | timeout: TimeoutDict 103 | selector: str 104 | 105 | 106 | class DelayDict(TypedDict): 107 | timeout: TimeoutDict 108 | 109 | 110 | class WaitForDict(TypedDict, total=False): 111 | idle_network: Optional[IdleNetworkDict] 112 | selector: Optional[SelectorDict] 113 | delay: Optional[DelayDict] 114 | page_navigations: Optional[bool] 115 | 116 | 117 | @dataclass 118 | class WebhookSettings: 119 | # The destination where the webhook data is sent via HTTP POST. 120 | destination: str 121 | # Flag to trigger an action when all credits are depleted 122 | on_credits_depleted: bool 123 | # Flag to trigger when half of the credits are depleted 124 | on_credits_half_depleted: bool 125 | # Flag to notify on website status update events 126 | on_website_status: bool 127 | # Flag to send information (links, bytes) about a new page find 128 | on_find: bool 129 | # Flag to handle the metadata of a found page 130 | on_find_metadata: bool 131 | 132 | class CSSSelector(TypedDict): 133 | """ 134 | Represents a set of CSS selectors grouped under a common name. 135 | """ 136 | 137 | name: str # The name of the selector group (e.g., "headers") 138 | selectors: List[str] # A list of CSS selectors (e.g., ["h1", "h2", "h3"]) 139 | 140 | 141 | # CSSExtractionMap is a dictionary where: 142 | # - Keys are strings representing paths (e.g., "/blog") 143 | # - Values are lists of CSSSelector items 144 | CSSExtractionMap = Dict[str, List[CSSSelector]] 145 | 146 | ReturnFormat = Literal["raw", "markdown", "commonmark", "html2text", "text", "xml", "bytes"]; 147 | 148 | class RequestParamsDict(TypedDict, total=False): 149 | # The URL to be crawled. 150 | url: Optional[str] 151 | 152 | # The type of request to be made. 153 | request: Optional[Literal["http", "chrome", "smart"]] 154 | 155 | # The maximum number of pages the crawler should visit. 156 | limit: Optional[int] 157 | 158 | # The format in which the result should be returned. 159 | return_format: Optional[ 160 | Union[ 161 | ReturnFormat, 162 | List[ReturnFormat], 163 | ] 164 | ] 165 | 166 | # Specifies whether to only visit the top-level domain. 167 | tld: Optional[bool] 168 | 169 | # The depth of the crawl. 170 | depth: Optional[int] 171 | 172 | # Specifies whether the request should be cached. 173 | cache: Optional[bool] 174 | 175 | # The budget for various resources. 176 | budget: Optional[Dict[str, int]] 177 | 178 | # The blacklist routes to ignore. This can be a Regex string pattern. 179 | blacklist: Optional[List[str]] 180 | 181 | # The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing. 182 | whitelist: Optional[List[str]] 183 | 184 | # The locale to be used during the crawl. 185 | locale: Optional[str] 186 | 187 | # The cookies to be set for the request, formatted as a single string. 188 | cookies: Optional[str] 189 | 190 | # Specifies whether to use stealth techniques to avoid detection. 191 | stealth: Optional[bool] 192 | 193 | # The headers to be used for the request. 
194 | headers: Optional[Dict[str, str]] 195 | 196 | # Specifies whether anti-bot measures should be used. 197 | anti_bot: Optional[bool] 198 | 199 | # Specifies whether to include metadata in the response. 200 | metadata: Optional[bool] 201 | 202 | # The dimensions of the viewport. 203 | viewport: Optional[Dict[str, int]] 204 | 205 | # The encoding to be used for the request. 206 | encoding: Optional[str] 207 | 208 | # Specifies whether to include subdomains in the crawl. 209 | subdomains: Optional[bool] 210 | 211 | # The user agent string to be used for the request. 212 | user_agent: Optional[str] 213 | 214 | # Specifies whether the response data should be stored. 215 | store_data: Optional[bool] 216 | 217 | # Configuration settings for GPT (general purpose texture mappings). 218 | gpt_config: Optional[Dict] 219 | 220 | # Specifies whether to use fingerprinting protection. 221 | fingerprint: Optional[bool] 222 | 223 | # Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page. 224 | css_extraction_map: Optional[CSSExtractionMap] 225 | 226 | # Specifies whether to perform the request without using storage. 227 | storageless: Optional[bool] 228 | 229 | # Specifies whether readability optimizations should be applied. 230 | readability: Optional[bool] 231 | 232 | # Specifies whether to use a proxy for the request. 233 | proxy_enabled: Optional[bool] 234 | 235 | # Specifies whether to respect the site's robots.txt file. 236 | respect_robots: Optional[bool] 237 | 238 | # CSS selector to be used to filter the content. 239 | root_selector: Optional[str] 240 | 241 | # Specifies whether to load all resources of the crawl target. 242 | full_resources: Optional[bool] 243 | 244 | # Specifies whether to use the sitemap links. 245 | sitemap: Optional[bool] 246 | 247 | # Specifies whether to only use the sitemap links. 248 | sitemap_only: Optional[bool] 249 | 250 | # External domains to include in the crawl. 251 | external_domains: Optional[List[str]] 252 | 253 | # Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`. 254 | return_embeddings: Optional[bool] 255 | 256 | # Use webhooks to send data to another location via POST. 257 | webhooks: Optional[WebhookSettings] 258 | 259 | # Returns the link(s) found on the page that match the crawler query. 260 | return_page_links: Optional[bool] 261 | 262 | # Returns the HTTP response headers used. 263 | return_headers: Optional[bool] 264 | 265 | # Returns the HTTP response cookies used. 266 | return_cookies: Optional[bool] 267 | 268 | # The timeout for the request, in milliseconds. 269 | request_timeout: Optional[int] 270 | 271 | # Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'. 272 | scroll: Optional[int] 273 | 274 | # Specifies whether to run the request in the background. 275 | run_in_background: Optional[bool] 276 | 277 | # Specifies whether to skip configuration checks. 278 | skip_config_checks: Optional[bool] 279 | 280 | # The chunking algorithm to use. 281 | chunking_alg: Optional[ChunkingAlgDict] 282 | 283 | # Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content. 284 | disable_intercept: Optional[bool] 285 | 286 | # The wait for events on the page. 
You need to make your `request` `chrome` or `smart`. 287 | wait_for: Optional[WaitForDict] 288 | 289 | # Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 290 | exuecution_scripts: Optional[ExecutionScriptsMap] 291 | 292 | # Perform custom web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 293 | automation_scripts: Optional[WebAutomationMap] 294 | 295 | # The redirect policy for HTTP request. Set the value to Loose to allow all. 296 | redirect_policy: Optional[RedirectPolicy] 297 | 298 | # Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent. 299 | event_tracker: Optional[EventTracker] 300 | 301 | # The timeout to stop the crawl. 302 | crawl_timeout: Optional[TimeoutDict] 303 | 304 | # Evaluates given script in every frame upon creation (before loading frame's scripts). 305 | evaluate_on_new_document: Optional[str] 306 | 307 | JsonCallback = Callable[[dict], None] 308 | -------------------------------------------------------------------------------- /python/tests/test_async_spider.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from unittest.mock import patch, AsyncMock 4 | from spider.async_spider import AsyncSpider 5 | from spider.spider_types import RequestParamsDict 6 | from dotenv import load_dotenv 7 | import aiohttp 8 | import json 9 | 10 | load_dotenv() 11 | 12 | @pytest.fixture 13 | def async_spider(): 14 | return AsyncSpider(api_key="test_api_key") 15 | 16 | @pytest.fixture 17 | def url(): 18 | return "http://example.com" 19 | 20 | @pytest.fixture 21 | def params(): 22 | return { 23 | "limit": 1, 24 | "return_format": "markdown", 25 | "depth": 2, 26 | "cache": True, 27 | "domain": "example.com", 28 | } 29 | 30 | def test_init_with_env_variable(): 31 | os.environ["SPIDER_API_KEY"] = "env_api_key" 32 | spider = AsyncSpider() 33 | assert spider.api_key == "env_api_key" 34 | del os.environ["SPIDER_API_KEY"] 35 | 36 | def test_init_without_api_key(): 37 | with pytest.raises(ValueError): 38 | AsyncSpider(api_key=None) 39 | 40 | @pytest.mark.asyncio 41 | async def test_scrape_url(async_spider, url, params): 42 | mock_response = [{"content": "data", "error": None, "status": 200, "url": url}] 43 | 44 | async def mock_request(*args, **kwargs): 45 | yield mock_response 46 | 47 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 48 | async for response in async_spider.scrape_url(url, params=params): 49 | assert isinstance(response, list) 50 | assert len(response) > 0 51 | assert isinstance(response[0], dict) 52 | assert 'content' in response[0] 53 | assert 'error' in response[0] 54 | assert 'status' in response[0] 55 | assert 'url' in response[0] 56 | 57 | @pytest.mark.asyncio 58 | async def test_crawl_url(async_spider, url, params): 59 | mock_response = [{"content": "data", "error": None, "status": 200, "url": url}] 60 | 61 | async def mock_request(*args, **kwargs): 62 | yield mock_response 63 | 64 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 65 | async for response in async_spider.crawl_url(url, params=params): 66 | assert isinstance(response, list) 67 | assert len(response) > 0 68 | assert isinstance(response[0], dict) 69 | assert 'content' in response[0] 70 | assert 'error' in response[0] 71 | assert 'status' in response[0] 72 | assert 'url' in response[0] 73 | 74 | 
@pytest.mark.asyncio 75 | async def test_crawl_url_streaming(async_spider, url, params): 76 | mock_response = b'{"url": "http://example.com"}' 77 | 78 | async def mock_request(*args, **kwargs): 79 | yield mock_response 80 | 81 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 82 | def handle_json(json_obj): 83 | json_obj = json.loads(json_obj.decode('utf-8')) 84 | assert json_obj["url"] == "http://example.com" 85 | 86 | async for response in async_spider.crawl_url(url, params=params, stream=True, content_type="application/jsonl"): 87 | handle_json(response) 88 | 89 | @pytest.mark.asyncio 90 | async def test_links(async_spider, url, params): 91 | mock_response = [{"error": None, "status": 200, "url": url}] 92 | 93 | async def mock_request(*args, **kwargs): 94 | yield mock_response 95 | 96 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 97 | async for response in async_spider.links(url, params=params): 98 | assert isinstance(response, list) 99 | assert len(response) > 0 100 | assert isinstance(response[0], dict) 101 | assert 'error' in response[0] 102 | assert 'status' in response[0] 103 | assert 'url' in response[0] 104 | 105 | @pytest.mark.asyncio 106 | async def test_screenshot(async_spider, url, params): 107 | mock_response = [{"content": "base64_encoded_image", "error": None, "status": 200, "url": url}] 108 | 109 | async def mock_request(*args, **kwargs): 110 | yield mock_response 111 | 112 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 113 | async for response in async_spider.screenshot(url, params=params): 114 | assert isinstance(response, list) 115 | assert len(response) > 0 116 | assert isinstance(response[0], dict) 117 | assert 'content' in response[0] 118 | assert 'error' in response[0] 119 | assert 'status' in response[0] 120 | assert 'url' in response[0] 121 | 122 | @pytest.mark.asyncio 123 | async def test_search(async_spider, params): 124 | mock_response = [{"content": "result", "error": None, "status": 200, "url": "http://example.com"}] 125 | 126 | async def mock_request(*args, **kwargs): 127 | yield mock_response 128 | 129 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 130 | async for response in async_spider.search("example search query", params=params): 131 | assert isinstance(response, list) 132 | assert len(response) > 0 133 | assert isinstance(response[0], dict) 134 | assert 'content' in response[0] 135 | assert 'error' in response[0] 136 | assert 'status' in response[0] 137 | assert 'url' in response[0] 138 | 139 | @pytest.mark.asyncio 140 | async def test_transform(async_spider, url, params): 141 | mock_response = {"content": "transformed", "error": None, "status": 200} 142 | 143 | async def mock_request(*args, **kwargs): 144 | yield mock_response 145 | 146 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 147 | transform_data = [{"html": "Example", "url": url}] 148 | async for response in async_spider.transform(transform_data, params=params): 149 | assert isinstance(response, dict) 150 | assert 'content' in response 151 | assert 'error' in response 152 | assert 'status' in response 153 | 154 | @pytest.mark.asyncio 155 | async def test_extract_contacts(async_spider, url, params): 156 | mock_response = [{"content": "contacts", "error": None, "status": 200, "url": url}] 157 | 158 | async def mock_request(*args, **kwargs): 159 | yield mock_response 160 | 161 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 162 | async for response 
in async_spider.extract_contacts(url, params=params): 163 | assert isinstance(response, list) 164 | assert len(response) > 0 165 | assert isinstance(response[0], dict) 166 | assert 'content' in response[0] 167 | assert 'error' in response[0] 168 | assert 'status' in response[0] 169 | assert 'url' in response[0] 170 | 171 | @pytest.mark.asyncio 172 | async def test_label(async_spider, url, params): 173 | mock_response = [{"content": "labels", "error": None, "status": 200, "url": url}] 174 | 175 | async def mock_request(*args, **kwargs): 176 | yield mock_response 177 | 178 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 179 | async for response in async_spider.label(url, params=params): 180 | assert isinstance(response, list) 181 | assert len(response) > 0 182 | assert isinstance(response[0], dict) 183 | assert 'content' in response[0] 184 | assert 'error' in response[0] 185 | assert 'status' in response[0] 186 | assert 'url' in response[0] 187 | 188 | @pytest.mark.asyncio 189 | async def test_get_crawl_state(async_spider, url, params): 190 | mock_response = {"data": [{"state": "completed", "credits_used": 10}]} 191 | 192 | async def mock_request(*args, **kwargs): 193 | yield mock_response 194 | 195 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 196 | async for response in async_spider.get_crawl_state(url, params=params): 197 | assert isinstance(response, dict) 198 | assert 'data' in response 199 | assert isinstance(response['data'], list) 200 | 201 | @pytest.mark.asyncio 202 | async def test_get_credits(async_spider): 203 | mock_response = {"data": [{"credits": 1000}]} 204 | 205 | async def mock_request(*args, **kwargs): 206 | yield mock_response 207 | 208 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 209 | async for response in async_spider.get_credits(): 210 | assert isinstance(response, dict) 211 | assert 'data' in response 212 | assert isinstance(response['data'], list) 213 | 214 | @pytest.mark.asyncio 215 | async def test_data_post(async_spider, url): 216 | mock_response = None 217 | 218 | async def mock_request(*args, **kwargs): 219 | yield mock_response 220 | 221 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 222 | table = "websites" 223 | post_data: RequestParamsDict = {"url": url} 224 | async for response in async_spider.data_post(table, post_data): 225 | assert response is None 226 | 227 | @pytest.mark.asyncio 228 | async def test_data_get(async_spider, url, params): 229 | mock_response = {"data": [{"url": url}]} 230 | 231 | async def mock_request(*args, **kwargs): 232 | yield mock_response 233 | 234 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 235 | table = "websites" 236 | async for response in async_spider.data_get(table, params=params): 237 | assert isinstance(response['data'], list) 238 | 239 | @pytest.mark.asyncio 240 | async def test_query(async_spider, params): 241 | mock_response = {"data": {"status": 200}} 242 | 243 | async def mock_request(*args, **kwargs): 244 | yield mock_response 245 | 246 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 247 | async for response in async_spider.data_get("query", params=params): 248 | assert isinstance(response['data'], object) 249 | 250 | @pytest.mark.asyncio 251 | async def test_data_delete(async_spider, params): 252 | mock_response = None 253 | 254 | async def mock_request(*args, **kwargs): 255 | yield mock_response 256 | 257 | with patch.object(AsyncSpider, '_request', 
side_effect=mock_request): 258 | table = "websites" 259 | async for response in async_spider.data_delete(table, params=params): 260 | assert response is None 261 | 262 | @pytest.mark.asyncio 263 | async def test_create_signed_url(async_spider): 264 | mock_response = b"mocked raw data" 265 | 266 | async def mock_request(*args, **kwargs): 267 | yield mock_response 268 | 269 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 270 | async for response in async_spider.create_signed_url(params={"domain": "example.com"}): 271 | assert response == b"mocked raw data" 272 | 273 | @pytest.mark.asyncio 274 | async def test_handle_error(): 275 | async_spider = AsyncSpider(api_key="test_api_key") 276 | mock_response = AsyncMock(spec=aiohttp.ClientResponse) 277 | mock_response.status = 402 278 | mock_response.json.return_value = {"error": "Payment Required"} 279 | 280 | with pytest.raises(Exception, match="Failed to test action. Status code: 402. Error: Payment Required"): 281 | await async_spider._handle_error(mock_response, "test action") -------------------------------------------------------------------------------- /python/tests/test_async_spider_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import logging 4 | from spider.async_spider import AsyncSpider 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | @pytest.fixture 13 | def api_key(): 14 | api_key = os.getenv("SPIDER_API_KEY") 15 | if not api_key: 16 | pytest.skip("SPIDER_API_KEY not set in .env file") 17 | return api_key 18 | 19 | @pytest.fixture 20 | def url(): 21 | return "http://example.com" 22 | 23 | @pytest.fixture 24 | def params(): 25 | return { 26 | "limit": 1, 27 | "return_format": "markdown", 28 | "depth": 2, 29 | "cache": True, 30 | "domain": "example.com", 31 | } 32 | 33 | @pytest.mark.asyncio 34 | async def test_scrape_url(api_key, url, params): 35 | async with AsyncSpider(api_key=api_key) as spider: 36 | async for response in spider.scrape_url(url, params=params): 37 | print(type(response)) 38 | logger.info(f"Scrape URL response: {response}") 39 | assert len(response) > 0 40 | assert isinstance(response[0], dict) 41 | assert 'content' in response[0] 42 | assert 'error' in response[0] 43 | assert 'status' in response[0] 44 | assert 'url' in response[0] 45 | 46 | @pytest.mark.asyncio 47 | async def test_crawl_url(api_key, url, params): 48 | async with AsyncSpider(api_key=api_key) as spider: 49 | async for response in spider.crawl_url(url, params=params): 50 | logger.info(f"Crawl URL response: {response}") 51 | assert isinstance(response, list) 52 | assert len(response) > 0 53 | assert isinstance(response[0], dict) 54 | assert 'content' in response[0] 55 | assert 'error' in response[0] 56 | assert 'status' in response[0] 57 | assert 'url' in response[0] 58 | 59 | # TODO "Credits or a valid subscription required to use the API"? 
60 | # @pytest.mark.asyncio 61 | # async def test_crawl_url_streaming(url, params): 62 | # async with AsyncSpider(api_key=api_key) as spider: 63 | # async for response in spider.crawl_url(url, params=params, stream=True): 64 | # print(response) 65 | # json_obj = json.loads(response.decode('utf-8')) 66 | # assert json_obj["url"] == "http://example.com" 67 | 68 | @pytest.mark.asyncio 69 | async def test_links(api_key, url, params): 70 | async with AsyncSpider(api_key=api_key) as spider: 71 | async for response in spider.links(url, params=params): 72 | logger.info(f"Links response: {response}") 73 | assert isinstance(response, list) 74 | assert len(response) > 0 75 | assert isinstance(response[0], dict) 76 | assert 'error' in response[0] 77 | assert 'status' in response[0] 78 | assert 'url' in response[0] 79 | 80 | @pytest.mark.asyncio 81 | async def test_screenshot(api_key, url, params): 82 | async with AsyncSpider(api_key=api_key) as spider: 83 | async for response in spider.screenshot(url, params=params): 84 | logger.info(f"Screenshot response: {response}") 85 | assert isinstance(response, list) 86 | assert len(response) > 0 87 | assert isinstance(response[0], dict) 88 | assert 'content' in response[0] 89 | assert 'error' in response[0] 90 | assert 'status' in response[0] 91 | assert 'url' in response[0] 92 | 93 | @pytest.mark.asyncio 94 | async def test_search(api_key, params): 95 | async with AsyncSpider(api_key=api_key) as spider: 96 | async for response in spider.search("example search query", params=params): 97 | logger.info(f"Search response: {response}") 98 | assert isinstance(response, list) 99 | assert len(response) > 0 100 | assert isinstance(response[0], dict) 101 | assert 'content' in response[0] 102 | assert 'error' in response[0] 103 | assert 'status' in response[0] 104 | assert 'url' in response[0] 105 | 106 | @pytest.mark.asyncio 107 | async def test_transform(api_key, url, params): 108 | async with AsyncSpider(api_key=api_key) as spider: 109 | transform_data = [{"html": "Example", "url": url}] 110 | async for response in spider.transform(transform_data, params=params): 111 | logger.info(f"Transform response: {response}") 112 | assert isinstance(response, dict) 113 | assert 'content' in response 114 | assert 'error' in response 115 | assert 'status' in response 116 | 117 | @pytest.mark.asyncio 118 | async def test_extract_contacts(api_key, url, params): 119 | async with AsyncSpider(api_key=api_key) as spider: 120 | async for response in spider.extract_contacts(url, params=params): 121 | logger.info(f"Extract contacts response: {response}") 122 | assert isinstance(response, list) 123 | assert len(response) > 0 124 | assert isinstance(response[0], dict) 125 | assert 'content' in response[0] 126 | assert 'error' in response[0] 127 | assert 'status' in response[0] 128 | assert 'url' in response[0] 129 | 130 | @pytest.mark.asyncio 131 | async def test_label(api_key, url, params): 132 | async with AsyncSpider(api_key=api_key) as spider: 133 | async for response in spider.label(url, params=params): 134 | logger.info(f"Label response: {response}") 135 | assert isinstance(response, list) 136 | assert len(response) > 0 137 | assert isinstance(response[0], dict) 138 | assert 'content' in response[0] 139 | assert 'error' in response[0] 140 | assert 'status' in response[0] 141 | assert 'url' in response[0] 142 | 143 | @pytest.mark.asyncio 144 | async def test_get_crawl_state(api_key, url, params): 145 | async with AsyncSpider(api_key=api_key) as spider: 146 | async for response in 
spider.get_crawl_state(url, params=params): 147 | logger.info(f"Get crawl state response: {response}") 148 | assert isinstance(response, dict) 149 | assert 'data' in response 150 | assert isinstance(response['data'], list) 151 | 152 | @pytest.mark.asyncio 153 | async def test_get_credits(api_key): 154 | async with AsyncSpider(api_key=api_key) as spider: 155 | async for response in spider.get_credits(): 156 | logger.info(f"Get credits response: {response}") 157 | assert isinstance(response, dict) 158 | assert 'data' in response 159 | assert isinstance(response['data'], list) 160 | 161 | @pytest.mark.asyncio 162 | async def test_data_post(api_key, url): 163 | async with AsyncSpider(api_key=api_key) as spider: 164 | table = "websites" 165 | post_data = {"url": url} 166 | async for response in spider.data_post(table, post_data): 167 | logger.info(f"Data post response: {response}") 168 | assert 200 <= response['status'] < 300 169 | assert response['data']['created_at'] is not None 170 | 171 | # TODO 500 error 172 | # @pytest.mark.asyncio 173 | # async def test_data_get(api_key, url, params): 174 | # async with AsyncSpider(api_key=api_key) as spider: 175 | # table = "websites" 176 | # async for response in spider.data_get(table, params=params): 177 | # logger.info(f"Data get response: {response}") 178 | # print(response) 179 | # assert isinstance(response['data'], list) 180 | 181 | @pytest.mark.asyncio 182 | async def test_data_delete(api_key, url, params): 183 | async with AsyncSpider(api_key=api_key) as spider: 184 | table = "websites" 185 | async for response in spider.data_delete(table, params=params): 186 | logger.info(f"Data delete response: {response}") 187 | print(response) 188 | assert response['message'] == 'ok' 189 | 190 | @pytest.mark.asyncio 191 | async def test_create_signed_url(api_key): 192 | async with AsyncSpider(api_key=api_key) as spider: 193 | async for response in spider.create_signed_url(params={"domain": "example.com"}): 194 | logger.info(f"Create signed URL response: {response}") 195 | assert isinstance(response, bytes) -------------------------------------------------------------------------------- /python/tests/test_spider.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from io import BytesIO 4 | from unittest.mock import patch, MagicMock 5 | from spider.spider import Spider 6 | from spider.spider_types import RequestParamsDict 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | @pytest.fixture 12 | def spider(): 13 | return Spider(api_key="test_api_key") 14 | 15 | @pytest.fixture 16 | def url(): 17 | return "http://example.com" 18 | 19 | @pytest.fixture 20 | def params(): 21 | return { 22 | "limit": 1, 23 | "return_format": "markdown", 24 | "depth": 2, 25 | "cache": True, 26 | "domain": "example.com", 27 | } 28 | 29 | def test_init_with_env_variable(): 30 | os.environ["SPIDER_API_KEY"] = "env_api_key" 31 | spider = Spider() 32 | assert spider.api_key == "env_api_key" 33 | del os.environ["SPIDER_API_KEY"] 34 | 35 | def test_init_without_api_key(): 36 | with pytest.raises(ValueError): 37 | Spider(api_key=None) 38 | 39 | @patch('requests.post') 40 | def test_scrape_url(mock_post, spider, url, params): 41 | mock_response = MagicMock() 42 | mock_response.status_code = 200 43 | mock_response.json.return_value = [{"content": "data", "error": None, "status": 200, "url": url}] 44 | mock_post.return_value = mock_response 45 | 46 | response = spider.scrape_url(url, params=params) 47 | assert 
isinstance(response, list) 48 | assert len(response) > 0 49 | assert isinstance(response[0], dict) 50 | assert 'content' in response[0] 51 | assert 'error' in response[0] 52 | assert 'status' in response[0] 53 | assert 'url' in response[0] 54 | mock_post.assert_called_once() 55 | 56 | @patch('requests.post') 57 | def test_crawl_url(mock_post, spider, url, params): 58 | mock_response = MagicMock() 59 | mock_response.status_code = 200 60 | mock_response.json.return_value = [{"content": "data", "error": None, "status": 200, "url": url}] 61 | mock_post.return_value = mock_response 62 | 63 | response = spider.crawl_url(url, params=params) 64 | assert isinstance(response, list) 65 | assert len(response) > 0 66 | assert isinstance(response[0], dict) 67 | assert 'content' in response[0] 68 | assert 'error' in response[0] 69 | assert 'status' in response[0] 70 | assert 'url' in response[0] 71 | mock_post.assert_called_once() 72 | 73 | @patch('requests.post') 74 | def test_crawl_url_streaming(mock_post, spider, url, params): 75 | mock_response = MagicMock() 76 | mock_response.status_code = 200 77 | mock_response.iter_content.return_value = [b'{"url": "http://example.com"}'] 78 | mock_post.return_value = mock_response 79 | 80 | def handle_json(json_obj): 81 | assert json_obj["url"] == "http://example.com" 82 | 83 | spider.crawl_url(url, params=params, stream=True, content_type="application/jsonl", callback=handle_json) 84 | mock_post.assert_called_once() 85 | 86 | @patch('requests.post') 87 | def test_links(mock_post, spider, url, params): 88 | mock_response = MagicMock() 89 | mock_response.status_code = 200 90 | mock_response.json.return_value = [{"error": None, "status": 200, "url": url}] 91 | mock_post.return_value = mock_response 92 | 93 | response = spider.links(url, params=params) 94 | assert isinstance(response, list) 95 | assert len(response) > 0 96 | assert isinstance(response[0], dict) 97 | assert 'error' in response[0] 98 | assert 'status' in response[0] 99 | assert 'url' in response[0] 100 | mock_post.assert_called_once() 101 | 102 | @patch('requests.post') 103 | def test_screenshot(mock_post, spider, url, params): 104 | mock_response = MagicMock() 105 | mock_response.status_code = 200 106 | mock_response.json.return_value = [{"content": "base64_encoded_image", "error": None, "status": 200, "url": url}] 107 | mock_post.return_value = mock_response 108 | 109 | response = spider.screenshot(url, params=params) 110 | assert isinstance(response, list) 111 | assert len(response) > 0 112 | assert isinstance(response[0], dict) 113 | assert 'content' in response[0] 114 | assert 'error' in response[0] 115 | assert 'status' in response[0] 116 | assert 'url' in response[0] 117 | mock_post.assert_called_once() 118 | 119 | @patch('requests.post') 120 | def test_search(mock_post, spider, params): 121 | mock_response = MagicMock() 122 | mock_response.status_code = 200 123 | mock_response.json.return_value = [{"content": "result", "error": None, "status": 200, "url": "http://example.com"}] 124 | mock_post.return_value = mock_response 125 | 126 | response = spider.search("example search query", params=params) 127 | assert isinstance(response, list) 128 | assert len(response) > 0 129 | assert isinstance(response[0], dict) 130 | assert 'content' in response[0] 131 | assert 'error' in response[0] 132 | assert 'status' in response[0] 133 | assert 'url' in response[0] 134 | mock_post.assert_called_once() 135 | 136 | @patch('requests.post') 137 | def test_transform(mock_post, spider, url, params): 138 | 
mock_response = MagicMock() 139 | mock_response.status_code = 200 140 | mock_response.json.return_value = {"content": "transformed", "error": None, "status": 200} 141 | mock_post.return_value = mock_response 142 | 143 | transform_data = [{"html": "Example", "url": url}] 144 | response = spider.transform(transform_data, params=params) 145 | assert isinstance(response, dict) 146 | assert 'content' in response 147 | assert 'error' in response 148 | assert 'status' in response 149 | mock_post.assert_called_once() 150 | 151 | @patch('requests.post') 152 | def test_extract_contacts(mock_post, spider, url, params): 153 | mock_response = MagicMock() 154 | mock_response.status_code = 200 155 | mock_response.json.return_value = [{"content": "contacts", "error": None, "status": 200, "url": url}] 156 | mock_post.return_value = mock_response 157 | 158 | response = spider.extract_contacts(url, params=params) 159 | assert isinstance(response, list) 160 | assert len(response) > 0 161 | assert isinstance(response[0], dict) 162 | assert 'content' in response[0] 163 | assert 'error' in response[0] 164 | assert 'status' in response[0] 165 | assert 'url' in response[0] 166 | mock_post.assert_called_once() 167 | 168 | @patch('requests.post') 169 | def test_label(mock_post, spider, url, params): 170 | mock_response = MagicMock() 171 | mock_response.status_code = 200 172 | mock_response.json.return_value = [{"content": "labels", "error": None, "status": 200, "url": url}] 173 | mock_post.return_value = mock_response 174 | 175 | response = spider.label(url, params=params) 176 | assert isinstance(response, list) 177 | assert len(response) > 0 178 | assert isinstance(response[0], dict) 179 | assert 'content' in response[0] 180 | assert 'error' in response[0] 181 | assert 'status' in response[0] 182 | assert 'url' in response[0] 183 | mock_post.assert_called_once() 184 | 185 | @patch('requests.post') 186 | def test_get_crawl_state(mock_post, spider, url, params): 187 | mock_response = MagicMock() 188 | mock_response.status_code = 200 189 | mock_response.json.return_value = {"data": [{"state": "completed", "credits_used": 10}]} 190 | mock_post.return_value = mock_response 191 | 192 | response = spider.get_crawl_state(url, params=params) 193 | assert isinstance(response, dict) 194 | assert 'data' in response 195 | assert isinstance(response['data'], list) 196 | mock_post.assert_called_once() 197 | 198 | @patch('requests.get') 199 | def test_get_credits(mock_get, spider): 200 | mock_response = MagicMock() 201 | mock_response.status_code = 200 202 | mock_response.json.return_value = {"data": [{"credits": 1000}]} 203 | mock_get.return_value = mock_response 204 | 205 | response = spider.get_credits() 206 | assert isinstance(response, dict) 207 | assert 'data' in response 208 | assert isinstance(response['data'], list) 209 | mock_get.assert_called_once() 210 | 211 | @patch('requests.post') 212 | def test_data_post(mock_post, spider, url): 213 | mock_response = MagicMock() 214 | mock_response.status_code = 204 215 | mock_post.return_value = mock_response 216 | 217 | table = "websites" 218 | post_data: RequestParamsDict = {"url": url} 219 | response = spider.data_post(table, post_data) 220 | assert response is not None 221 | mock_post.assert_called_once() 222 | 223 | @patch('requests.get') 224 | def test_data_get(mock_get, spider, url, params): 225 | mock_response = MagicMock() 226 | mock_response.status_code = 200 227 | mock_response.json.return_value = {"data": [{"url": url}]} 228 | mock_get.return_value = mock_response 229 
| 230 | table = "websites" 231 | response = spider.data_get(table, params=params) 232 | assert isinstance(response['data'], list) 233 | mock_get.assert_called_once() 234 | 235 | @patch('requests.get') 236 | def test_query(mock_get, spider, params): 237 | mock_response = MagicMock() 238 | mock_response.status_code = 200 239 | mock_response.json.return_value = {"data": {"status": 200}} 240 | mock_get.return_value = mock_response 241 | response = spider.data_get("query", params=params) 242 | assert isinstance(response['data'], object) 243 | mock_get.assert_called_once() 244 | 245 | @patch('requests.delete') 246 | def test_data_delete(mock_delete, spider, params): 247 | mock_response = MagicMock() 248 | mock_response.status_code = 204 249 | mock_delete.return_value = mock_response 250 | 251 | table = "websites" 252 | response = spider.data_delete(table, params=params) 253 | assert response is not None 254 | mock_delete.assert_called_once() 255 | 256 | @patch('requests.get') 257 | def test_create_signed_url(mock_get, spider): 258 | mock_response = MagicMock() 259 | mock_response.status_code = 200 260 | mock_response.raw = b"mocked raw data" 261 | mock_get.return_value = mock_response 262 | 263 | response = spider.create_signed_url(params={"domain": "example.com"}) 264 | assert response == b"mocked raw data" 265 | mock_get.assert_called_once() 266 | 267 | def test_stream_reader(): 268 | spider = Spider(api_key="test_api_key") 269 | mock_response = MagicMock() 270 | raw_data = b'{"key": "value"}\n{"key2": "value2"}\n' 271 | mock_response = MagicMock() 272 | mock_response.raw = BytesIO(raw_data) 273 | 274 | callback_data = [] 275 | def callback(json_obj): 276 | callback_data.append(json_obj) 277 | 278 | spider.stream_reader(mock_response, callback) 279 | 280 | assert len(callback_data) == 2 281 | assert callback_data[0] == {"key": "value"} 282 | assert callback_data[1] == {"key2": "value2"} 283 | 284 | def test_handle_error(): 285 | spider = Spider(api_key="test_api_key") 286 | mock_response = MagicMock() 287 | mock_response.status_code = 402 288 | mock_response.json.return_value = {"error": "Payment Required"} 289 | 290 | with pytest.raises(Exception, match="Failed to test action. Status code: 402. 
Error: Payment Required"): 291 | spider._handle_error(mock_response, "test action") 292 | -------------------------------------------------------------------------------- /python/tests/test_spider_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import logging 4 | from spider.spider import Spider 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | @pytest.fixture 13 | def spider(): 14 | api_key = os.getenv("SPIDER_API_KEY") 15 | if not api_key: 16 | pytest.skip("SPIDER_API_KEY not set in .env file") 17 | return Spider(api_key=api_key) 18 | 19 | @pytest.fixture 20 | def url(): 21 | return "http://example.com" 22 | 23 | @pytest.fixture 24 | def params(): 25 | return { 26 | "limit": 1, 27 | "return_format": "markdown", 28 | "depth": 2, 29 | "cache": True, 30 | "domain": "example.com", 31 | } 32 | 33 | def test_scrape_url(spider, url, params): 34 | response = spider.scrape_url(url, params=params) 35 | logger.info(f"Scrape URL response: {response}") 36 | assert isinstance(response, list) 37 | assert len(response) > 0 38 | assert isinstance(response[0], dict) 39 | assert 'content' in response[0] 40 | assert 'error' in response[0] 41 | assert 'status' in response[0] 42 | assert 'url' in response[0] 43 | 44 | def test_crawl_url(spider, url, params): 45 | response = spider.crawl_url(url, params=params) 46 | logger.info(f"Crawl URL response: {response}") 47 | assert isinstance(response, list) 48 | assert len(response) > 0 49 | assert isinstance(response[0], dict) 50 | assert 'content' in response[0] 51 | assert 'error' in response[0] 52 | assert 'status' in response[0] 53 | assert 'url' in response[0] 54 | 55 | def test_links(spider, url, params): 56 | response = spider.links(url, params=params) 57 | logger.info(f"Links response: {response}") 58 | assert isinstance(response, list) 59 | assert len(response) > 0 60 | assert isinstance(response[0], dict) 61 | assert 'error' in response[0] 62 | assert 'status' in response[0] 63 | assert 'url' in response[0] 64 | 65 | def test_screenshot(spider, url, params): 66 | response = spider.screenshot(url, params=params) 67 | logger.info(f"Screenshot response: {response}") 68 | assert isinstance(response, list) 69 | assert len(response) > 0 70 | assert isinstance(response[0], dict) 71 | assert 'content' in response[0] 72 | assert 'error' in response[0] 73 | assert 'status' in response[0] 74 | assert 'url' in response[0] 75 | 76 | def test_search(spider, params): 77 | response = spider.search("example search query", params=params) 78 | logger.info(f"Search response: {response}") 79 | assert isinstance(response, list) 80 | assert len(response) > 0 81 | assert isinstance(response[0], dict) 82 | assert 'content' in response[0] 83 | assert 'error' in response[0] 84 | assert 'status' in response[0] 85 | assert 'url' in response[0] 86 | 87 | def test_transform(spider, url, params): 88 | transform_data = [{"html": "Example", "url": url}] 89 | response = spider.transform(transform_data, params=params) 90 | logger.info(f"Transform response: {response}") 91 | assert isinstance(response, dict) 92 | assert 'content' in response 93 | assert 'error' in response 94 | assert 'status' in response 95 | 96 | def test_extract_contacts(spider, url, params): 97 | response = spider.extract_contacts(url, params=params) 98 | logger.info(f"Extract contacts response: {response}") 99 | assert 
isinstance(response, list) 100 | assert len(response) > 0 101 | assert isinstance(response[0], dict) 102 | assert 'content' in response[0] 103 | assert 'error' in response[0] 104 | assert 'status' in response[0] 105 | assert 'url' in response[0] 106 | 107 | def test_label(spider, url, params): 108 | response = spider.label(url, params=params) 109 | logger.info(f"Label response: {response}") 110 | assert isinstance(response, list) 111 | assert len(response) > 0 112 | assert isinstance(response[0], dict) 113 | assert 'content' in response[0] 114 | assert 'error' in response[0] 115 | assert 'status' in response[0] 116 | assert 'url' in response[0] 117 | 118 | def test_get_crawl_state(spider, url, params): 119 | response = spider.get_crawl_state(url, params=params) 120 | logger.info(f"Get crawl state response: {response}") 121 | assert isinstance(response, dict) 122 | assert 'data' in response 123 | assert isinstance(response['data'], list) 124 | 125 | def test_get_credits(spider): 126 | response = spider.get_credits() 127 | logger.info(f"Get credits response: {response}") 128 | assert isinstance(response, dict) 129 | assert 'data' in response 130 | assert isinstance(response['data'], list) 131 | 132 | def test_data_post(spider, url): 133 | table = "websites" 134 | post_data = {"url": url} 135 | response = spider.data_post(table, post_data) 136 | logger.info(f"Data post response: {response}") 137 | assert isinstance(response['data'], dict) 138 | assert response['data']['url'] == url 139 | assert response['data']['domain'] == url.replace("http://", "").replace("https://", "") 140 | assert response['error'] == None 141 | 142 | # TODO: 500 error. 143 | # def test_data_get(spider, params): 144 | # table = "websites" 145 | # response = spider.data_get(table, params=params) 146 | # logger.info(f"Data get response: {response}") 147 | # assert isinstance(response['data'], list) 148 | 149 | def test_data_delete(spider, params): 150 | table = "websites" 151 | response = spider.data_delete(table, params=params) 152 | logger.info(f"Data delete response: {response}") 153 | assert response['message'] == 'ok' 154 | 155 | # TODO: 500 error. 
156 | # def test_create_signed_url(spider): 157 | # response = spider.create_signed_url(domain="example.com", options={"page": 1, "limit": 10}) 158 | # logger.info(f"Create signed URL response: {response}") 159 | # assert isinstance(response, bytes) 160 | -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spider-client" 3 | version = "0.1.36" 4 | edition = "2021" 5 | authors = [ "j-mendez "] 6 | description = "Spider Cloud client" 7 | license = "MIT" 8 | readme = "README.md" 9 | repository = "https://github.com/spider-rs/spider-clients" 10 | keywords = ["crawler", "web-crawler", "web-scraper", "spider", "web-indexer"] 11 | categories = ["web-programming"] 12 | include = ["src/*", "../../LICENSE", "README.md"] 13 | 14 | [dependencies] 15 | reqwest = { version = "0.12", features = ["json", "stream"] } 16 | bytes = "1" 17 | tokio = { version = "1", features = ["rt-multi-thread", "macros"] } 18 | serde = { version = "1", features = ["derive"] } 19 | serde_json = { version = "1" } 20 | tokio-stream = "0.1" 21 | backon = { version = "1", features = ["tokio-sleep"] } 22 | tokio-util = "0.7" 23 | 24 | [dev-dependencies] 25 | dotenv = "0.15.0" 26 | lazy_static = "1.5.0" 27 | -------------------------------------------------------------------------------- /rust/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud Rust SDK 2 | 3 | The Spider Cloud Rust SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API. 4 | 5 | ## Installation 6 | 7 | To use the Spider Cloud Rust SDK, include the following in your `Cargo.toml`: 8 | 9 | ```toml 10 | [dependencies] 11 | spider-client = "0.1" 12 | ``` 13 | 14 | ## Usage 15 | 16 | 1. Get an API key from [spider.cloud](https://spider.cloud) 17 | 2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as an argument when creating an instance of the `Spider` struct. 
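If you pass the key directly, a minimal sketch looks like this (the `spider_client` module path is inferred from the crate name, and the exact argument type accepted by `Spider::new` is an assumption; the full example below uses the environment-variable route):

```rust
use spider_client::Spider;

// Pass the API key explicitly instead of relying on the SPIDER_API_KEY environment variable.
let spider = Spider::new(Some("your_api_key".to_string())).expect("API key must be provided");
```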
18 | 19 | Here's an example of how to use the SDK: 20 | 21 | ```rust 22 | use serde_json::json; 23 | use std::env; 24 | 25 | #[tokio::main] 26 | async fn main() { 27 | // Set the API key as an environment variable 28 | env::set_var("SPIDER_API_KEY", "your_api_key"); 29 | 30 | // Initialize the Spider with your API key 31 | let spider = Spider::new(None).expect("API key must be provided"); 32 | 33 | let url = "https://spider.cloud"; 34 | 35 | // Scrape a single URL 36 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 37 | 38 | println!("Scraped Data: {:?}", scraped_data); 39 | 40 | // Crawl a website 41 | let crawler_params = RequestParams { 42 | limit: Some(1), 43 | proxy_enabled: Some(true), 44 | store_data: Some(false), 45 | metadata: Some(false), 46 | request: Some(RequestType::Http), 47 | ..Default::default() 48 | }; 49 | 50 | let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 51 | 52 | println!("Crawl Result: {:?}", crawl_result); 53 | } 54 | ``` 55 | 56 | ### Scraping a URL 57 | 58 | To scrape data from a single URL: 59 | 60 | ```rust 61 | let url = "https://example.com"; 62 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 63 | ``` 64 | 65 | ### Crawling a Website 66 | 67 | To automate crawling a website: 68 | 69 | ```rust 70 | let url = "https://example.com"; 71 | let crawl_params = RequestParams { 72 | limit: Some(200), 73 | request: Some(RequestType::Smart), 74 | ..Default::default() 75 | }; 76 | let crawl_result = spider.crawl_url(url, Some(crawl_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 77 | ``` 78 | 79 | #### Crawl Streaming 80 | 81 | Stream crawl the website in chunks to scale with a callback: 82 | 83 | ```rust 84 | fn handle_json(json_obj: serde_json::Value) { 85 | println!("Received chunk: {:?}", json_obj); 86 | } 87 | 88 | let url = "https://example.com"; 89 | let crawl_params = RequestParams { 90 | limit: Some(200), 91 | store_data: Some(false), 92 | ..Default::default() 93 | }; 94 | 95 | spider.crawl_url( 96 | url, 97 | Some(crawl_params), 98 | true, 99 | "application/json", 100 | Some(handle_json) 101 | ).await.expect("Failed to crawl the URL"); 102 | ``` 103 | 104 | ### Search 105 | 106 | Perform a search for websites to crawl or gather search results: 107 | 108 | ```rust 109 | let query = "a sports website"; 110 | let crawl_params = RequestParams { 111 | request: Some(RequestType::Smart), 112 | search_limit: Some(5), 113 | limit: Some(5), 114 | fetch_page_content: Some(true), 115 | ..Default::default() 116 | }; 117 | let crawl_result = spider.search(query, Some(crawl_params), false, "application/json").await.expect("Failed to perform search"); 118 | ``` 119 | 120 | ### Retrieving Links from a URL(s) 121 | 122 | Extract all links from a specified URL: 123 | 124 | ```rust 125 | let url = "https://example.com"; 126 | let links = spider.links(url, None, false, "application/json").await.expect("Failed to retrieve links from URL"); 127 | ``` 128 | 129 | ### Transform 130 | 131 | Transform HTML to markdown or text lightning fast: 132 | 133 | ```rust 134 | let data = vec![json!({"html": "

<html><body><h1>Hello world</h1></body></html>
"})]; 135 | let params = RequestParams { 136 | readability: Some(false), 137 | return_format: Some(ReturnFormat::Markdown), 138 | ..Default::default() 139 | }; 140 | let result = spider.transform(data, Some(params), false, "application/json").await.expect("Failed to transform HTML to markdown"); 141 | println!("Transformed Data: {:?}", result); 142 | ``` 143 | 144 | ### Taking Screenshots of a URL(s) 145 | 146 | Capture a screenshot of a given URL: 147 | 148 | ```rust 149 | let url = "https://example.com"; 150 | let screenshot = spider.screenshot(url, None, false, "application/json").await.expect("Failed to take screenshot of URL"); 151 | ``` 152 | 153 | ### Extracting Contact Information 154 | 155 | Extract contact details from a specified URL: 156 | 157 | ```rust 158 | let url = "https://example.com"; 159 | let contacts = spider.extract_contacts(url, None, false, "application/json").await.expect("Failed to extract contacts from URL"); 160 | println!("Extracted Contacts: {:?}", contacts); 161 | ``` 162 | 163 | ### Labeling Data from a URL(s) 164 | 165 | Label the data extracted from a particular URL: 166 | 167 | ```rust 168 | let url = "https://example.com"; 169 | let labeled_data = spider.label(url, None, false, "application/json").await.expect("Failed to label data from URL"); 170 | println!("Labeled Data: {:?}", labeled_data); 171 | ``` 172 | 173 | ### Checking Crawl State 174 | 175 | You can check the crawl state of a specific URL: 176 | 177 | ```rust 178 | let url = "https://example.com"; 179 | let state = spider.get_crawl_state(url, None, false, "application/json").await.expect("Failed to get crawl state for URL"); 180 | println!("Crawl State: {:?}", state); 181 | ``` 182 | 183 | ### Downloading Files 184 | 185 | You can download the results of the website: 186 | 187 | ```rust 188 | let url = "https://example.com"; 189 | let options = hashmap!{ 190 | "page" => 0, 191 | "limit" => 100, 192 | "expiresIn" => 3600 // Optional, add if needed 193 | }; 194 | let response = spider.create_signed_url(Some(url), Some(options)).await.expect("Failed to create signed URL"); 195 | println!("Download URL: {:?}", response); 196 | ``` 197 | 198 | ### Checking Available Credits 199 | 200 | You can check the remaining credits on your account: 201 | 202 | ```rust 203 | let credits = spider.get_credits().await.expect("Failed to get credits"); 204 | println!("Remaining Credits: {:?}", credits); 205 | ``` 206 | 207 | ### Data Operations 208 | 209 | The Spider client can now interact with specific data tables to create, retrieve, and delete data. 
210 | 211 | #### Retrieve Data from a Table 212 | 213 | To fetch data from a specified table by applying query parameters: 214 | 215 | ```rust 216 | let table_name = "pages"; 217 | let query_params = RequestParams { 218 | limit: Some(20), 219 | ..Default::default() 220 | }; 221 | let response = spider.data_get(table_name, Some(query_params)).await.expect("Failed to retrieve data from table"); 222 | println!("Data from table: {:?}", response); 223 | ``` 224 | 225 | #### Delete Data from a Table 226 | 227 | To delete data from a specified table based on certain conditions: 228 | 229 | ```rust 230 | let table_name = "websites"; 231 | let delete_params = RequestParams { 232 | domain: Some("www.example.com".to_string()), 233 | ..Default::default() 234 | }; 235 | let response = spider.data_delete(table_name, Some(delete_params)).await.expect("Failed to delete data from table"); 236 | println!("Delete Response: {:?}", response); 237 | ``` 238 | 239 | ## Streaming 240 | 241 | If you need to use streaming, set the `stream` parameter to `true` and provide a callback function: 242 | 243 | ```rust 244 | fn handle_json(json_obj: serde_json::Value) { 245 | println!("Received chunk: {:?}", json_obj); 246 | } 247 | 248 | let url = "https://example.com"; 249 | let crawler_params = RequestParams { 250 | limit: Some(1), 251 | proxy_enabled: Some(true), 252 | store_data: Some(false), 253 | metadata: Some(false), 254 | request: Some(RequestType::Http), 255 | ..Default::default() 256 | }; 257 | 258 | spider.links(url, Some(crawler_params), true, "application/json").await.expect("Failed to retrieve links from URL"); 259 | ``` 260 | 261 | ## Content-Type 262 | 263 | The following Content-type headers are supported using the `content_type` parameter: 264 | 265 | - `application/json` 266 | - `text/csv` 267 | - `application/xml` 268 | - `application/jsonl` 269 | 270 | ```rust 271 | let url = "https://example.com"; 272 | 273 | let crawler_params = RequestParams { 274 | limit: Some(1), 275 | proxy_enabled: Some(true), 276 | store_data: Some(false), 277 | metadata: Some(false), 278 | request: Some(RequestType::Http), 279 | ..Default::default() 280 | }; 281 | 282 | // Stream JSON lines back to the client 283 | spider.crawl_url(url, Some(crawler_params), true, "application/jsonl", None::).await.expect("Failed to crawl the URL"); 284 | ``` 285 | 286 | ## Error Handling 287 | 288 | The SDK handles errors returned by the Spider Cloud API and raises appropriate exceptions. If an error occurs during a request, it will be propagated to the caller with a descriptive error message. 289 | 290 | ## Contributing 291 | 292 | Contributions to the Spider Cloud Rust SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. 293 | 294 | ## License 295 | 296 | The Spider Cloud Rust SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 297 | --------------------------------------------------------------------------------