├── .github └── workflows │ ├── book.yml │ ├── nodejs.yml │ ├── python.yml │ └── rust.yml ├── .gitignore ├── LICENSE ├── README.md ├── book ├── .gitignore ├── book.toml └── src │ ├── README.md │ ├── SUMMARY.md │ ├── cli │ └── getting-started.md │ ├── env.md │ ├── javascript │ ├── crawl.md │ ├── getting-started.md │ └── scrape.md │ ├── python │ ├── async-crawl.md │ ├── crawl.md │ ├── getting-started.md │ └── scrape.md │ ├── rust │ └── getting-started.md │ ├── simple-example.md │ └── website.md ├── cli ├── Cargo.lock ├── Cargo.toml ├── README.md └── src │ ├── args.rs │ ├── main.rs │ └── mod.rs ├── javascript ├── .npmignore ├── LICENSE ├── README.md ├── __tests__ │ └── spiderwebai.test.ts ├── package-lock.json ├── package.json ├── sample.env ├── src │ ├── client.ts │ ├── config.ts │ ├── index.ts │ └── utils │ │ ├── process-chunk.ts │ │ └── stream-reader.ts └── tsconfig.json ├── python ├── LICENSE ├── README.md ├── example.py ├── example_async.py ├── example_streaming.py ├── requirements.txt ├── setup.py ├── spider │ ├── __init__.py │ ├── async_spider.py │ ├── spider.py │ ├── spider.pyi │ └── spider_types.py └── tests │ ├── test_async_spider.py │ ├── test_async_spider_integration.py │ ├── test_spider.py │ └── test_spider_integration.py └── rust ├── Cargo.lock ├── Cargo.toml ├── README.md └── src └── lib.rs /.github/workflows/book.yml: -------------------------------------------------------------------------------- 1 | name: github pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-latest 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }} 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Setup mdBook 18 | uses: peaceiris/actions-mdbook@v2 19 | with: 20 | mdbook-version: 'latest' 21 | 22 | - run: cd book && mdbook build 23 | 24 | - name: Deploy 25 | uses: peaceiris/actions-gh-pages@v4 26 | if: ${{ github.ref == 'refs/heads/main' }} 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | publish_dir: ./book/book -------------------------------------------------------------------------------- /.github/workflows/nodejs.yml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | node-version: [18.x, 20.x] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Use Node.js ${{ matrix.node-version }} 21 | uses: actions/setup-node@v3 22 | with: 23 | node-version: ${{ matrix.node-version }} 24 | cache: "npm" 25 | cache-dependency-path: ./javascript/package-lock.json 26 | 27 | - run: npm ci 28 | working-directory: ./javascript 29 | 30 | - run: npm run build --if-present 31 | working-directory: ./javascript 32 | 33 | - run: npm test 34 | working-directory: ./javascript 35 | env: 36 | SPIDER_API_KEY: ${{ secrets.SPIDER_API_KEY }} 37 | SPIDER_EMAIL: ${{ secrets.SPIDER_EMAIL }} 38 | SPIDER_PASSWORD: ${{ secrets.SPIDER_PASSWORD }} 39 | SUPABASE_AUTO_REFRESH_TOKEN: "false" -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python CI 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | python-version: 
[3.11, 3.12] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install . 29 | pip install -r requirements.txt 30 | working-directory: ./python 31 | 32 | - name: Run tests 33 | run: | 34 | pytest 35 | working-directory: ./python/tests 36 | env: 37 | SPIDER_API_KEY: ${{ secrets.SPIDER_API_KEY }} 38 | SPIDER_EMAIL: ${{ secrets.SPIDER_EMAIL }} 39 | SPIDER_PASSWORD: ${{ secrets.SPIDER_PASSWORD }} 40 | SUPABASE_AUTO_REFRESH_TOKEN: "false" -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust CI 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | branches: ["main"] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | matrix: 15 | rust-version: [stable, beta] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Rust ${{ matrix.rust-version }} 21 | uses: dtolnay/rust-toolchain@stable 22 | with: 23 | toolchain: ${{ matrix.rust-version }} 24 | 25 | - name: Cache cargo registry 26 | uses: actions/cache@v3 27 | with: 28 | path: ~/.cargo/registry 29 | key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} 30 | restore-keys: | 31 | ${{ runner.os }}-cargo-registry- 32 | 33 | - name: Cache cargo index 34 | uses: actions/cache@v3 35 | with: 36 | path: ~/.cargo/git 37 | key: ${{ runner.os }}-cargo-git-${{ hashFiles('**/Cargo.lock') }} 38 | restore-keys: | 39 | ${{ runner.os }}-cargo-git- 40 | 41 | - name: Check toolchain 42 | run: rustc --version 43 | 44 | - name: Build 45 | run: cargo build --verbose 46 | working-directory: ./rust 47 | 48 | - name: Run tests 49 | run: cargo test --verbose --lib --release 50 | working-directory: ./rust 51 | env: 52 | SPIDER_API_KEY: ${{ secrets.SPIDER_API_KEY }} 53 | SPIDER_EMAIL: ${{ secrets.SPIDER_EMAIL }} 54 | SPIDER_PASSWORD: ${{ secrets.SPIDER_PASSWORD }} 55 | SUPABASE_AUTO_REFRESH_TOKEN: "false" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | dist 4 | target 5 | python/build 6 | python/spiderwebai_py.egg-info 7 | javascript/coverage 8 | # Compiled Python files 9 | *.pyc 10 | *.pyo 11 | __pycache__/ 12 | 13 | # Distribution / packaging 14 | dist/ 15 | build/ 16 | *.egg-info/ 17 | *.egg 18 | 19 | # Virtual environment 20 | venv/ 21 | .venv/ 22 | env/ 23 | .env/ 24 | .env 25 | 26 | # IDE and editor files 27 | .vscode/ 28 | .idea/ 29 | *.sublime-project 30 | *.sublime-workspace 31 | 32 | # Testing and coverage 33 | .coverage 34 | .pytest_cache/ 35 | htmlcov/ 36 | 37 | # Documentation 38 | docs/_build/ 39 | docs/api/ 40 | 41 | # Miscellaneous 42 | *.log 43 | .DS_Store 44 | Thumbs.db 45 | deploy.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Spider Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, 
including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spider Clients 2 | 3 | Discover the ultimate toolkit for integrating the fastest and most efficient web crawler **Spider** into your projects. This repository provides client libraries designed to streamline your use of [Spider Cloud](https://spider.cloud) services from various programming environments. Whether you're tackling web crawling or data indexing, our high-performance solutions have you covered. 4 | 5 | ## Python 6 | 7 | Leverage the power of Spider in your Python applications. Navigate to our [Python client library directory](./python/) for installation instructions, usage guides, and examples. Get ready to supercharge your data extraction tasks with the efficiency and speed of Spider within your Python environment. 8 | 9 | ## JavaScript 10 | 11 | Integrate Spider effortlessly into your JavaScript projects. Visit our [JavaScript client library directory](./javascript/) to explore how you can utilize Spider in Node.js or browser environments. Enhance your web scraping capabilities and improve data collection strategies with our cutting-edge technology. 12 | 13 | ## Rust 14 | 15 | Incorporate Spider smoothly into your Rust projects. Visit our [Rust client library directory](./rust/) to learn how to use Spider in your applications. Enhance your web scraping capabilities and unlock new possibilities with our advanced technology. 16 | 17 | ## CLI 18 | 19 | Integrate Spider into your CLI with ease. Visit our [CLI client library directory](./cli/) to explore how you can utilize Spider in your command-line applications. 20 | 21 | --- 22 | 23 | ### Features 24 | 25 | - **Concurrent Crawling:** Maximize your data extraction efficiency with Spider's advanced concurrency models. 26 | - **Streaming:** Stream crawled data in real-time to ensure timely processing and analysis. 27 | - **Headless Chrome Rendering:** Capture JavaScript-rendered page contents with ease. 28 | - **HTTP Proxies Support:** Navigate anonymously and bypass content restrictions. 29 | - **Cron Jobs:** Schedule your crawling tasks to run automatically, saving time and resources. 30 | - **Smart Mode:** Automate crawling tasks with AI-driven strategies for smarter data collection. 31 | - **Blacklisting, Whitelisting, and Budgeting Depth:** Fine-tune your crawls to focus on relevant data and manage resource utilization. 32 | - **Dynamic AI Prompt Scripting Headless:** Use AI to script dynamic interactions with web pages, simulating real user behavior. 
33 | 34 | ### Getting Started 35 | 36 | Dive into the world of high-speed web crawling with Spider. Whether you're looking to deploy Spider locally or utilize our hosted services, we've got you covered. Start by exploring our client libraries above, or visit the main [Spider repository](https://github.com/spider-rs/spider) for comprehensive documentation, installation guides, and more. 37 | 38 | ### Support & Contribution 39 | 40 | Your feedback and contributions are highly valued. Should you encounter any issues or have suggestions for improvements, please feel free to open an issue or submit a pull request. Visit our [Contributing Guidelines](https://github.com/spider-rs/spider/blob/master/CONTRIBUTING.md) for more information on how you can contribute to the Spider project. 41 | 42 | We're on a mission to make web crawling faster, smarter, and more accessible than ever before. Join us in redefining the boundaries of data extraction and indexing with **Spider**. 43 | -------------------------------------------------------------------------------- /book/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Jeff Mendez "] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "spider-client" 7 | 8 | [output.html] 9 | git-repository-url = "https://github.com/spider-rs/spider-clients/tree/main/book" 10 | edit-url-template = "https://github.com/spider-rs/spider-clients/edit/main/book/{path}" 11 | -------------------------------------------------------------------------------- /book/src/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `spider-client` is a client library to use with the [Spider Cloud](https://spider.cloud) web crawler and scraper. 4 | 5 | - Concurrent 6 | - Streaming 7 | - Headless Chrome 8 | - HTTP Proxies 9 | - Cron Jobs 10 | - Subscriptions 11 | - AI Scraping and Event Driven Actions 12 | - Blacklisting and Budgeting Depth 13 | - Exponential Backoff -------------------------------------------------------------------------------- /book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [Introduction](./README.md) 4 | 5 | # User Guide 6 | 7 | - [A Simple Example](./simple-example.md) 8 | 9 | # Python 10 | 11 | - [Getting Started](./python/getting-started.md) 12 | - [Crawl](./python/crawl.md) 13 | - [Scrape](./python/scrape.md) 14 | - [Async Crawl](./python/async-crawl.md) 15 | 16 | # Javascript 17 | 18 | - [Getting Started](./javascript/getting-started.md) 19 | - [Crawl](./javascript/crawl.md) 20 | - [Scrape](./javascript/scrape.md) 21 | 22 | # Rust 23 | 24 | - [Getting Started](./rust/getting-started.md) 25 | 26 | # CLI 27 | 28 | - [Getting Started](./cli/getting-started.md) 29 | -------------------------------------------------------------------------------- /book/src/cli/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Spider Cloud CLI is a command-line interface to interact with the [Spider Cloud](https://spider.cloud) web crawler. It allows you to scrape, crawl, search, and perform various other web-related tasks through simple commands. 
4 | 5 | ## Installation 6 | 7 | Install the CLI using [`homebrew`](https://brew.sh/) or [`cargo`](https://doc.rust-lang.org/cargo/) from [crates.io](https://crates.io): 8 | 9 | ### Homebrew 10 | 11 | ```sh 12 | brew tap spider-rs/spider-cloud-cli 13 | brew install spider-cloud-cli 14 | ``` 15 | 16 | ### Cargo 17 | 18 | ```sh 19 | cargo install spider-cloud-cli 20 | ``` 21 | 22 | ## Usage 23 | 24 | After installing, you can use the CLI by typing `spider-cloud-cli` followed by a command and its respective arguments. 25 | 26 | ### Authentication 27 | 28 | Before using most of the commands, you need to authenticate by providing an API key: 29 | 30 | ```sh 31 | spider-cloud-cli auth --api_key YOUR_API_KEY 32 | ``` 33 | 34 | ### Commands 35 | 36 | #### Scrape 37 | 38 | Scrape data from a specified URL. 39 | 40 | ```sh 41 | spider-cloud-cli scrape --url http://example.com 42 | ``` 43 | 44 | #### Crawl 45 | 46 | Crawl a specified URL with an optional limit on the number of pages. 47 | 48 | ```sh 49 | spider-cloud-cli crawl --url http://example.com --limit 10 50 | ``` 51 | 52 | #### Links 53 | 54 | Fetch links from a specified URL. 55 | 56 | ```sh 57 | spider-cloud-cli links --url http://example.com 58 | ``` 59 | 60 | #### Screenshot 61 | 62 | Take a screenshot of a specified URL. 63 | 64 | ```sh 65 | spider-cloud-cli screenshot --url http://example.com 66 | ``` 67 | 68 | #### Search 69 | 70 | Search for a query. 71 | 72 | ```sh 73 | spider-cloud-cli search --query "example query" 74 | ``` 75 | 76 | #### Transform 77 | 78 | Transform specified data. 79 | 80 | ```sh 81 | spider-cloud-cli transform --data "sample data" 82 | ``` 83 | 84 | #### Extract Contacts 85 | 86 | Extract contact information from a specified URL. 87 | 88 | ```sh 89 | spider-cloud-cli extract_contacts --url http://example.com 90 | ``` 91 | 92 | #### Label 93 | 94 | Label data from a specified URL. 95 | 96 | ```sh 97 | spider-cloud-cli label --url http://example.com 98 | ``` 99 | 100 | #### Get Crawl State 101 | 102 | Get the crawl state of a specified URL. 103 | 104 | ```sh 105 | spider-cloud-cli get_crawl_state --url http://example.com 106 | ``` 107 | 108 | #### Query 109 | 110 | Query records of a specified domain. 111 | 112 | ```sh 113 | spider-cloud-cli query --domain example.com 114 | ``` 115 | 116 | #### Get Credits 117 | 118 | Fetch the account credits left. 119 | 120 | ```sh 121 | spider-cloud-cli get_credits 122 | ``` -------------------------------------------------------------------------------- /book/src/env.md: -------------------------------------------------------------------------------- 1 | # Environment 2 | 3 | Env variables to adjust the project. 4 | 5 | ## SPIDER_API_KEY 6 | 7 | Set this value to the API key you create at [Spider Cloud API Keys](https://spider.cloud/api-keys) after create an account and adding credits. 8 | 9 | ```sh 10 | SPIDER_API_KEY=sk-myspiderkey 11 | ``` 12 | -------------------------------------------------------------------------------- /book/src/javascript/crawl.md: -------------------------------------------------------------------------------- 1 | # Crawl 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Crawl a website and return the content. 
6 | 7 | ```javascript 8 | import { Spider } from "@spider-cloud/spider-client"; 9 | 10 | const app = new Spider(); 11 | const url = "https://spider.cloud"; 12 | const scrapedData = await app.crawlUrl(url, { limit: 10 }); 13 | console.log(scrapedData); 14 | ``` 15 | 16 | The `crawlUrl` method returns the content of the website in markdown format as default. We set the `limit` parameter to 10 to limit the number of pages to crawl. The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 17 | 18 | Next we will see how to crawl with with different parameters. 19 | 20 | ## Crawl with different parameters 21 | 22 | The `crawlUrl` method has the following parameters: 23 | 24 | - `url` (str): The URL of the website to crawl. 25 | 26 | the following are recommended parameters and can be set in the `params` dictionary: 27 | 28 | - `limit` (int): The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 29 | - `request_timeout` (int): The maximum amount of time to wait for a response from the website. 30 | - `stealth` (bool): Whether to use stealth mode. Default is `False` on chrome. 31 | - visit the [documentation](https://spider.cloud/docs/api?ref=javascript-sdk-book) for more parameters. 32 | 33 | ```javascript 34 | import { Spider } from "@spider-cloud/spider-client"; 35 | 36 | const app = new Spider(); 37 | const url = "https://spider.cloud"; 38 | const scrapedData = await app.crawlUrl(url, { 39 | limit: 10, 40 | anti_bot: true, 41 | return_format: "raw", 42 | }); 43 | console.log(scrapedData); 44 | ``` 45 | 46 | If you have a lot of params, setting them inside the `crawlUrl` method can be cumbersome. You can set them in a seperate `params` variable that has the `SpiderParams` type which is also available in the `spider` package. You will have to use Typescript if you want type annotations. 47 | 48 | ```ts 49 | import { Spider } from "@spider-cloud/spider-client"; 50 | import type { SpiderParams } from "@spider-cloud/spider-client/dist/config"; 51 | 52 | const app = new Spider(); 53 | const url = "https://spider.cloud"; 54 | const params: SpiderParams = { 55 | return_format: ["raw", "markdown"], 56 | anti_bot: true, 57 | }; 58 | const scrapedData = await app.crawlUrl(url, params); 59 | console.log(scrapedData); 60 | ``` 61 | -------------------------------------------------------------------------------- /book/src/javascript/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | To be able to use the javascript SDK you will (of course) have to install it. You can do so with your package manager of choice. 4 | 5 | ```bash 6 | npm install @spider-cloud/spider-client 7 | ``` 8 | 9 | ```bash 10 | yarn add @spider-cloud/spider-client 11 | ``` 12 | 13 | [Here](https://www.npmjs.com/package/@spider-cloud/spider-client) is the link to the package on npm. 14 | 15 | ## Setting & Getting Api Key 16 | 17 | To use the SDK you will need an API key. You can get one by signing up on [spider.cloud](https://spider.cloud?ref=javascript-sdk-book). 18 | 19 | Then you need to set the API key in your environment variables. 20 | 21 | ```bash 22 | export SPIDER_API_KEY=your_api_key 23 | ``` 24 | 25 | if you don't want to set the API key in your environment variables you can pass it as an argument to the `Spider` class. 
26 | 27 | ```javascript 28 | import { Spider } from "@spider-cloud/spider-client"; 29 | ``` 30 | 31 | We recommend setting the API key in your environment variables. 32 | -------------------------------------------------------------------------------- /book/src/javascript/scrape.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Scrape a website and return the content. 6 | 7 | ```javascript 8 | import { Spider } from "@spider-cloud/spider-client"; 9 | 10 | const app = new Spider(); 11 | const url = "https://spider.cloud"; 12 | const scrapedData = await app.scrapeUrl(url); 13 | console.log(scrapedData); 14 | ``` 15 | 16 | The `scrapeUrl` method returns the content of the website in markdown format as default. Next we will see how to scrape with with different parameters. 17 | 18 | ## Scrape with different parameters 19 | 20 | The `scrapeUrl` method has the following parameters: 21 | 22 | - `url` (str): The URL of the website to scrape. 23 | 24 | the following are optional parameters and can be set in the `params` dictionary: 25 | 26 | - `request` ("http", "chrome", "smart") : The type of request to make. Default is "http". 27 | - `return_format` ("raw", "markdown", "commonmark", "html2text", "text", "bytes") : The format in which to return the scraped data. Default is "markdown". 28 | - `stealth`, `anti_bot` and a ton of other parameters that you can find in the [documentation](https://spider.cloud/docs/api?ref=javascript-sdk-book). 29 | 30 | ```javascript 31 | import { Spider } from "@spider-cloud/spider-client"; 32 | 33 | const app = new Spider(); 34 | const url = "https://spider.cloud"; 35 | const scrapedData = await app.scrapeUrl(url, { 36 | return_format: "raw", 37 | anti_bot: true, 38 | }); 39 | console.log(scrapedData); 40 | ``` 41 | 42 | If you have a lot of params, setting them inside the `scrapeUrl` method can be cumbersome. You can set them in a seperate `params` variable that has the `SpiderParams` type which is also available in the `spider` package. You will have to use Typescript if you want type annotations. 43 | 44 | ```ts 45 | import { Spider } from "@spider-cloud/spider-client"; 46 | import type { SpiderParams } from "@spider-cloud/spider-client/dist/config"; 47 | 48 | const app = new Spider(); 49 | const url = "https://spider.cloud"; 50 | const params: SpiderParams = { 51 | return_format: "raw", 52 | anti_bot: true, 53 | }; 54 | const scrapedData = await app.scrapeUrl(url, params); 55 | console.log(scrapedData); 56 | ``` 57 | -------------------------------------------------------------------------------- /book/src/python/async-crawl.md: -------------------------------------------------------------------------------- 1 | # Async Crawl 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Crawl a website asynchronously and return the content. 
6 | 7 | ```python 8 | import asyncio 9 | 10 | from spider import AsyncSpider 11 | 12 | url = "https://spider.cloud" 13 | 14 | 15 | async def async_crawl_url(url, params): 16 | async with AsyncSpider() as app: 17 | crawled_data = [] 18 | async for data in app.crawl_url(url, params=params): 19 | crawled_data.append(data) 20 | return crawled_data 21 | 22 | 23 | result = asyncio.run(async_crawl_url(url, params={"limit": 10})) 24 | print(result) 25 | ``` 26 | 27 | We use the `AsyncSpider` class to create an asynchronous instance of the Spider class. We then use the `async for` loop to iterate over the results of the `crawl_url` method. The `crawl_url` method returns a generator that yields the crawled data. We append the data to a list and return it. Simsalabim, we have crawled a website asynchronously. 28 | 29 | Next we will see how to crawl asynchronously with different parameters. 30 | 31 | ## Async Crawl with different parameters 32 | 33 | The `crawl_url` method has the following parameters: 34 | 35 | - `url` (str): The URL of the website to crawl. 36 | 37 | the following are recommended parameters and can be set in the `params` dictionary: 38 | 39 | - `limit` (int): The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 40 | - `request_timeout` (int): The maximum amount of time to wait for a response from the website. 41 | - `stealth` (bool): Whether to use stealth mode. Default is `False` on chrome. 42 | - a ton more, visit the [documentation](https://spider.cloud/docs/api?ref=python-sdk-book) for more parameters. 43 | 44 | ```python 45 | import asyncio 46 | 47 | from spider import AsyncSpider 48 | 49 | url = "https://spider.cloud" 50 | 51 | 52 | async def async_crawl_url(url, params): 53 | async with AsyncSpider() as app: 54 | crawled_data = [] 55 | async for data in app.crawl_url(url, params=params): 56 | crawled_data.append(data) 57 | return crawled_data 58 | 59 | 60 | result = asyncio.run( 61 | async_crawl_url( 62 | url, 63 | params={ 64 | "limit": 10, 65 | "request_timeout": 10, 66 | "stealth": True, 67 | "return_format": "html", 68 | }, 69 | ) 70 | ) 71 | print(result) 72 | ``` 73 | 74 | If you have a lot of params, setting them inside the `crawl_url` method can be cumbersome. You can set them in a seperate `params` variable that has the `RequestParams` type which is also available in the `spider` package. 75 | 76 | ```python 77 | import asyncio 78 | 79 | from spider import AsyncSpider, spider_types 80 | 81 | url = "https://spider.cloud" 82 | 83 | 84 | async def async_crawl_url(url, params): 85 | async with AsyncSpider() as app: 86 | crawled_data = [] 87 | async for data in app.crawl_url(url, params=params): 88 | crawled_data.append(data) 89 | return crawled_data 90 | 91 | 92 | params: spider_types.RequestParamsDict = { 93 | "limit": 10, 94 | "request_timeout": 10, 95 | "stealth": True, 96 | # Easier to read and intellisense will help you with the available options 97 | } 98 | 99 | result = asyncio.run(async_crawl_url(url, params=params)) 100 | print(result) 101 | ``` 102 | -------------------------------------------------------------------------------- /book/src/python/crawl.md: -------------------------------------------------------------------------------- 1 | # Crawl 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Crawl a website and return the content. 
6 | 7 | ```python 8 | from spider import Spider 9 | 10 | app = Spider() 11 | url = "https://spider.cloud" 12 | crawled_data = app.crawl_url(url, params={"limit": 10}) 13 | print(crawled_data) 14 | ``` 15 | 16 | The `crawl_url` method returns the content of the website in markdown format as default. We set the `limit` parameter to 10 to limit the number of pages to crawl. The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 17 | 18 | Next we will see how to crawl with with different parameters. 19 | 20 | ## Crawl with different parameters 21 | 22 | The `crawl_url` method has the following parameters: 23 | 24 | - `url` (str): The URL of the website to crawl. 25 | 26 | the following are recommended parameters and can be set in the `params` dictionary: 27 | 28 | - `limit` (int): The maximum amount of pages allowed to crawl per website. Remove the value or set it to `0` to crawl all pages. 29 | - `request_timeout` (int): The maximum amount of time to wait for a response from the website. 30 | - `stealth` (bool): Whether to use stealth mode. Default is `False` on chrome. 31 | - visit the [documentation](https://spider.cloud/docs/api?ref=python-sdk-book) for more parameters. 32 | 33 | ```python 34 | from spider import Spider 35 | 36 | app = Spider() 37 | url = "https://spider.cloud" 38 | crawled_data = app.crawl_url( 39 | url, params={"limit": 10, "request_timeout": 10, "stealth": True} 40 | ) 41 | 42 | print(crawled_data) 43 | ``` 44 | 45 | If you have a lot of params, setting them inside the `crawl_url` method can be cumbersome. You can set them in a seperate `params` variable that has the `RequestParams` type which is also available in the `spider` package. 46 | 47 | ```python 48 | from spider import Spider, spider_types 49 | 50 | params: spider_types.RequestParamsDict = { 51 | "limit": 10, 52 | "request_timeout": 10, 53 | "stealth": True, 54 | "return_format": [ "raw", "markdown" ], 55 | # Easier to read and intellisense will help you with the available options 56 | } 57 | 58 | app = Spider() 59 | url = "https://spider.cloud" 60 | crawled_data = app.crawl_url(url, params) 61 | 62 | print(crawled_data) 63 | ``` 64 | -------------------------------------------------------------------------------- /book/src/python/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | 3 | To use the python SDK you will (of course) have to install it :) 4 | 5 | ```bash 6 | pip install spider-client 7 | ``` 8 | 9 | [Here](https://pypi.org/project/spider-client/) is the link to the package on PyPi. 10 | 11 | ## Setting & Getting Api Key 12 | 13 | To use the SDK you will need an API key. You can get one by signing up on [spider.cloud](https://spider.cloud?ref=python-sdk-book). 14 | 15 | Then you need to set the API key in your environment variables. 16 | 17 | ```bash 18 | export SPIDER_API_KEY=your_api_key 19 | ``` 20 | 21 | if you don't want to set the API key in your environment variables you can pass it as an argument to the `Spider` class. 22 | 23 | ```python 24 | from spider import Spider 25 | app = Spider(api_key='your_api_key') 26 | ``` 27 | 28 | We recommend setting the API key in your environment variables. 
29 | -------------------------------------------------------------------------------- /book/src/python/scrape.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | We will assume that you have installed the Spider package and exported your API key as an environment variable. If you haven't, please refer to the [Getting Started](./getting-started.md) guide. 4 | 5 | Scrape a website and return the content. 6 | 7 | ```python 8 | from spider import Spider 9 | 10 | app = Spider() 11 | url = 'https://spider.cloud' 12 | scraped_data = app.scrape_url(url) 13 | 14 | print(scraped_data) 15 | ``` 16 | 17 | The `scrape_url` method returns the content of the website in markdown format as default. Next we will see how to scrape with with different parameters. 18 | 19 | ## Scrape with different parameters 20 | 21 | The `scrape_url` method has the following parameters: 22 | 23 | - `url` (str): The URL of the website to scrape. 24 | 25 | the following are optional parameters and can be set in the `params` dictionary: 26 | 27 | - `request` ("http", "chrome", "smart") : The type of request to make. Default is "http". 28 | - `return_format` ("raw", "markdown", "commonmark", "html2text", "text", "bytes") : The format in which to return the scraped data. Default is "markdown". 29 | - `stealth`, `anti_bot` and a ton of other parameters that you can find in the [documentation](https://spider.cloud/docs/api?ref=python-sdk-book). 30 | 31 | ```python 32 | from spider import Spider 33 | 34 | app = Spider() 35 | url = "https://spider.cloud" 36 | scraped_data = app.scrape_url(url, params={"request_timeout": 10, "stealth": True}) 37 | 38 | print(scraped_data) 39 | ``` 40 | 41 | If you have a lot of params, setting them inside the `scrape_url` method can be cumbersome. You can set them in a seperate `params` variable that has the `RequestParams` type which is also available in the `spider` package. 42 | 43 | ```python 44 | from spider import Spider, spider_types 45 | 46 | params: spider_types.RequestParamsDict = { 47 | "request_timeout": 10, 48 | "stealth": True, 49 | # Easier to read and intellisense will help you with the available options 50 | } 51 | 52 | app = Spider() 53 | url = "https://spider.cloud" 54 | scraped_data = app.scrape_url(url, params) 55 | 56 | print(scraped_data) 57 | ``` 58 | -------------------------------------------------------------------------------- /book/src/rust/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | The Spider Cloud Rust SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API. 4 | 5 | ## Installation 6 | 7 | To use the Spider Cloud Rust SDK, include the following in your `Cargo.toml`: 8 | 9 | ```toml 10 | [dependencies] 11 | spider-client = "0.1" 12 | ``` 13 | 14 | ## Usage 15 | 16 | 1. Get an API key from [spider.cloud](https://spider.cloud) 17 | 2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as an argument when creating an instance of the `Spider` struct. 
18 | 19 | Here's an example of how to use the SDK: 20 | 21 | ```rust 22 | use serde_json::json; 23 | use std::env; 24 | 25 | #[tokio::main] 26 | async fn main() { 27 | // Set the API key as an environment variable 28 | env::set_var("SPIDER_API_KEY", "your_api_key"); 29 | 30 | // Initialize the Spider with your API key 31 | let spider = Spider::new(None).expect("API key must be provided"); 32 | 33 | let url = "https://spider.cloud"; 34 | 35 | // Scrape a single URL 36 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 37 | 38 | println!("Scraped Data: {:?}", scraped_data); 39 | 40 | // Crawl a website 41 | let crawler_params = RequestParams { 42 | limit: Some(1), 43 | proxy_enabled: Some(true), 44 | store_data: Some(false), 45 | metadata: Some(false), 46 | request: Some(RequestType::Http), 47 | ..Default::default() 48 | }; 49 | 50 | let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 51 | 52 | println!("Crawl Result: {:?}", crawl_result); 53 | } 54 | ``` 55 | 56 | ### Scraping a URL 57 | 58 | To scrape data from a single URL: 59 | 60 | ```rust 61 | let url = "https://example.com"; 62 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 63 | ``` 64 | 65 | ### Crawling a Website 66 | 67 | To automate crawling a website: 68 | 69 | ```rust 70 | let url = "https://example.com"; 71 | let crawl_params = RequestParams { 72 | limit: Some(200), 73 | request: Some(RequestType::Smart), 74 | ..Default::default() 75 | }; 76 | let crawl_result = spider.crawl_url(url, Some(crawl_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 77 | ``` 78 | 79 | #### Crawl Streaming 80 | 81 | Stream crawl the website in chunks to scale with a callback: 82 | 83 | ```rust 84 | fn handle_json(json_obj: serde_json::Value) { 85 | println!("Received chunk: {:?}", json_obj); 86 | } 87 | 88 | let url = "https://example.com"; 89 | let crawl_params = RequestParams { 90 | limit: Some(200), 91 | store_data: Some(false), 92 | ..Default::default() 93 | }; 94 | 95 | spider.crawl_url( 96 | url, 97 | Some(crawl_params), 98 | true, 99 | "application/json", 100 | Some(handle_json) 101 | ).await.expect("Failed to crawl the URL"); 102 | ``` 103 | 104 | ### Search 105 | 106 | Perform a search for websites to crawl or gather search results: 107 | 108 | ```rust 109 | let query = "a sports website"; 110 | let crawl_params = RequestParams { 111 | request: Some(RequestType::Smart), 112 | search_limit: Some(5), 113 | limit: Some(5), 114 | fetch_page_content: Some(true), 115 | ..Default::default() 116 | }; 117 | let crawl_result = spider.search(query, Some(crawl_params), false, "application/json").await.expect("Failed to perform search"); 118 | ``` 119 | 120 | ### Retrieving Links from a URL(s) 121 | 122 | Extract all links from a specified URL: 123 | 124 | ```rust 125 | let url = "https://example.com"; 126 | let links = spider.links(url, None, false, "application/json").await.expect("Failed to retrieve links from URL"); 127 | ``` 128 | 129 | ### Transform 130 | 131 | Transform HTML to markdown or text lightning fast: 132 | 133 | ```rust 134 | let data = vec![json!({"html": "

<html><body><h1>Hello world</h1></body></html>
"})]; 135 | let params = RequestParams { 136 | readability: Some(false), 137 | return_format: Some(ReturnFormat::Markdown), 138 | ..Default::default() 139 | }; 140 | let result = spider.transform(data, Some(params), false, "application/json").await.expect("Failed to transform HTML to markdown"); 141 | println!("Transformed Data: {:?}", result); 142 | ``` 143 | 144 | ### Taking Screenshots of a URL(s) 145 | 146 | Capture a screenshot of a given URL: 147 | 148 | ```rust 149 | let url = "https://example.com"; 150 | let screenshot = spider.screenshot(url, None, false, "application/json").await.expect("Failed to take screenshot of URL"); 151 | ``` 152 | 153 | ### Extracting Contact Information 154 | 155 | Extract contact details from a specified URL: 156 | 157 | ```rust 158 | let url = "https://example.com"; 159 | let contacts = spider.extract_contacts(url, None, false, "application/json").await.expect("Failed to extract contacts from URL"); 160 | println!("Extracted Contacts: {:?}", contacts); 161 | ``` 162 | 163 | ### Labeling Data from a URL(s) 164 | 165 | Label the data extracted from a particular URL: 166 | 167 | ```rust 168 | let url = "https://example.com"; 169 | let labeled_data = spider.label(url, None, false, "application/json").await.expect("Failed to label data from URL"); 170 | println!("Labeled Data: {:?}", labeled_data); 171 | ``` 172 | 173 | ### Checking Crawl State 174 | 175 | You can check the crawl state of a specific URL: 176 | 177 | ```rust 178 | let url = "https://example.com"; 179 | let state = spider.get_crawl_state(url, None, false, "application/json").await.expect("Failed to get crawl state for URL"); 180 | println!("Crawl State: {:?}", state); 181 | ``` 182 | 183 | ### Downloading Files 184 | 185 | You can download the results of the website: 186 | 187 | ```rust 188 | let url = "https://example.com"; 189 | let options = hashmap!{ 190 | "page" => 0, 191 | "limit" => 100, 192 | "expiresIn" => 3600 // Optional, add if needed 193 | }; 194 | let response = spider.create_signed_url(Some(url), Some(options)).await.expect("Failed to create signed URL"); 195 | println!("Download URL: {:?}", response); 196 | ``` 197 | 198 | ### Checking Available Credits 199 | 200 | You can check the remaining credits on your account: 201 | 202 | ```rust 203 | let credits = spider.get_credits().await.expect("Failed to get credits"); 204 | println!("Remaining Credits: {:?}", credits); 205 | ``` 206 | 207 | ### Data Operations 208 | 209 | The Spider client can now interact with specific data tables to create, retrieve, and delete data. 
210 | 211 | #### Retrieve Data from a Table 212 | 213 | To fetch data from a specified table by applying query parameters: 214 | 215 | ```rust 216 | let table_name = "pages"; 217 | let query_params = RequestParams { 218 | limit: Some(20), 219 | ..Default::default() 220 | }; 221 | let response = spider.data_get(table_name, Some(query_params)).await.expect("Failed to retrieve data from table"); 222 | println!("Data from table: {:?}", response); 223 | ``` 224 | 225 | #### Delete Data from a Table 226 | 227 | To delete data from a specified table based on certain conditions: 228 | 229 | ```rust 230 | let table_name = "websites"; 231 | let delete_params = RequestParams { 232 | domain: Some("www.example.com".to_string()), 233 | ..Default::default() 234 | }; 235 | let response = spider.data_delete(table_name, Some(delete_params)).await.expect("Failed to delete data from table"); 236 | println!("Delete Response: {:?}", response); 237 | ``` 238 | 239 | ## Streaming 240 | 241 | If you need to use streaming, set the `stream` parameter to `true` and provide a callback function: 242 | 243 | ```rust 244 | fn handle_json(json_obj: serde_json::Value) { 245 | println!("Received chunk: {:?}", json_obj); 246 | } 247 | 248 | let url = "https://example.com"; 249 | let crawler_params = RequestParams { 250 | limit: Some(1), 251 | proxy_enabled: Some(true), 252 | store_data: Some(false), 253 | metadata: Some(false), 254 | request: Some(RequestType::Http), 255 | ..Default::default() 256 | }; 257 | 258 | spider.links(url, Some(crawler_params), true, "application/json").await.expect("Failed to retrieve links from URL"); 259 | ``` 260 | 261 | ## Content-Type 262 | 263 | The following Content-type headers are supported using the `content_type` parameter: 264 | 265 | - `application/json` 266 | - `text/csv` 267 | - `application/xml` 268 | - `application/jsonl` 269 | 270 | ```rust 271 | let url = "https://example.com"; 272 | 273 | let crawler_params = RequestParams { 274 | limit: Some(1), 275 | proxy_enabled: Some(true), 276 | store_data: Some(false), 277 | metadata: Some(false), 278 | request: Some(RequestType::Http), 279 | ..Default::default() 280 | }; 281 | 282 | // Stream JSON lines back to the client 283 | spider.crawl_url(url, Some(crawler_params), true, "application/jsonl", None::).await.expect("Failed to crawl the URL"); 284 | ``` 285 | 286 | ## Error Handling 287 | 288 | The SDK handles errors returned by the Spider Cloud API and raises appropriate exceptions. If an error occurs during a request, it will be propagated to the caller with a descriptive error message. By default request use a Exponential Backoff to retry as needed. -------------------------------------------------------------------------------- /book/src/simple-example.md: -------------------------------------------------------------------------------- 1 | # Simple Example 2 | 3 | This is a simple example of what you can do with the `spider-client` library. 4 | 5 | ## Installation 6 | 7 | To install the library, you can use `pip` for Python or `npm` (make sure to have [node](https://nodejs.org/en) installed) for JavaScript.: 8 | 9 | ```bash 10 | # for python 11 | pip install spider-client 12 | ``` 13 | 14 | ```bash 15 | # for javascript 16 | npm install @spider-cloud/spider-client 17 | ``` 18 | 19 | ## Usage 20 | 21 | Here is an example of how you can use the library, make sure to replace `your_api_key` with your actual API key which you can get from the [spider.cloud](https://spider.cloud) website. 
22 | 23 | ```python 24 | from spider import Spider 25 | 26 | app = Spider(api_key='your_api_key') 27 | url = 'https://spider.cloud' 28 | scraped_data = app.scrape_url(url) 29 | ``` 30 | 31 | ```javascript 32 | import { Spider } from "@spider-cloud/spider-client"; 33 | 34 | const app = new Spider({ apiKey: "your-api-key" }); 35 | const url = "https://spider.cloud"; 36 | const scrapedData = await app.scrapeUrl(url); 37 | console.log(scrapedData); 38 | ``` 39 | -------------------------------------------------------------------------------- /book/src/website.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | The Website class is the foundations to the spider. 4 | 5 | ## Builder pattern 6 | 7 | We use the builder pattern to configure the website for crawling. 8 | 9 | \*note: Replace `https://choosealicense.com` from the examples below with your website target URL. 10 | 11 | ```py 12 | import asyncio 13 | from spider_rs import Website 14 | 15 | async def main(): 16 | website = Website("https://choosealicense.com") 17 | website.crawl() 18 | print(website.get_links()) 19 | 20 | asyncio.run(main()) 21 | ``` 22 | 23 | ### Custom Headers 24 | 25 | Add custom HTTP headers to use when crawling/scraping. 26 | 27 | ```py 28 | import asyncio 29 | from spider_rs import Website 30 | 31 | async def main(): 32 | website = Website("https://choosealicense.com").with_headers({ "authorization": "mytoken"}) 33 | 34 | asyncio.run(main()) 35 | ``` 36 | 37 | ### Blacklist 38 | 39 | Prevent crawling a set path, url, or pattern with Regex. 40 | 41 | ```py 42 | import asyncio 43 | from spider_rs import Website 44 | 45 | async def main(): 46 | website = Website("https://choosealicense.com").with_blacklist_url(["/blog", "/resume"]) 47 | 48 | asyncio.run(main()) 49 | ``` 50 | 51 | ### Whitelist 52 | 53 | Only crawl set paths, url, or pattern with Regex. 54 | 55 | ```py 56 | import asyncio 57 | from spider_rs import Website 58 | 59 | async def main(): 60 | website = Website("https://choosealicense.com").with_whitelist_url(["/licenses"]) 61 | 62 | asyncio.run(main()) 63 | ``` 64 | 65 | ### Crons 66 | 67 | Setup a cron job that can run at any time in the background using cron-syntax. 68 | 69 | ```py 70 | import asyncio 71 | from spider_rs import Website 72 | 73 | async def main(): 74 | website = Website("https://choosealicense.com").with_cron("1/5 * * * * *") 75 | 76 | asyncio.run(main()) 77 | ``` 78 | 79 | View the [cron](./cron-job.md) section for details how to use the cron. 80 | 81 | ### Budget 82 | 83 | Add a crawl budget that prevents crawling `x` amount of pages. 84 | 85 | ```py 86 | import asyncio 87 | from spider_rs import Website 88 | 89 | async def main(): 90 | website = Website("https://choosealicense.com").with_budget({ 91 | "*": 1, 92 | }) 93 | 94 | asyncio.run(main()) 95 | ``` 96 | 97 | ### Subdomains 98 | 99 | Include subdomains in request. 100 | 101 | ```py 102 | import asyncio 103 | from spider_rs import Website 104 | 105 | async def main(): 106 | website = Website("https://choosealicense.com").with_subdomains(True) 107 | 108 | asyncio.run(main()) 109 | ``` 110 | 111 | ### TLD 112 | 113 | Include TLDs in request. 114 | 115 | ```py 116 | import asyncio 117 | from spider_rs import Website 118 | 119 | async def main(): 120 | website = Website("https://choosealicense.com").with_tld(True) 121 | 122 | asyncio.run(main()) 123 | ``` 124 | 125 | ### External Domains 126 | 127 | Add external domains to include with the website. 
128 | 129 | ```py 130 | import asyncio 131 | from spider_rs import Website 132 | 133 | async def main(): 134 | website = Website("https://choosealicense.com").with_external_domains(["https://www.myotherdomain.com"]) 135 | 136 | asyncio.run(main()) 137 | ``` 138 | 139 | ### Proxy 140 | 141 | Use a proxy to crawl a website. 142 | 143 | ```py 144 | import asyncio 145 | from spider_rs import Website 146 | 147 | async def main(): 148 | website = Website("https://choosealicense.com").with_proxies(["https://www.myproxy.com"]) 149 | 150 | asyncio.run(main()) 151 | ``` 152 | 153 | ### Depth Limit 154 | 155 | Set the depth limit for the amount of forward pages. 156 | 157 | ```ts 158 | import asyncio 159 | from spider_rs import Website 160 | 161 | async def main(): 162 | website = Website("https://choosealicense.com").with_depth(3) 163 | 164 | asyncio.run(main()) 165 | ``` 166 | 167 | ### Cache 168 | 169 | Enable HTTP caching, this useful when using the spider on a server. 170 | 171 | ```py 172 | import asyncio 173 | from spider_rs import Website 174 | 175 | async def main(): 176 | website = Website("https://choosealicense.com").with_caching(True) 177 | 178 | asyncio.run(main()) 179 | ``` 180 | 181 | ### Delays 182 | 183 | Add delays between pages. Defaults to none. 184 | 185 | ```py 186 | import asyncio 187 | from spider_rs import Website 188 | 189 | async def main(): 190 | website = Website("https://choosealicense.com").with_delays(200) 191 | 192 | asyncio.run(main()) 193 | ``` 194 | 195 | ### User-Agent 196 | 197 | Use a custom User-Agent. 198 | 199 | ```py 200 | import asyncio 201 | from spider_rs import Website 202 | 203 | async def main(): 204 | website = Website("https://choosealicense.com").with_user_agent("mybot/v1") 205 | 206 | asyncio.run(main()) 207 | ``` 208 | 209 | ### Request Timeout 210 | 211 | Add a request timeout per page in miliseconds. Example shows 30 seconds. 212 | 213 | ```py 214 | import asyncio 215 | from spider_rs import Website 216 | 217 | async def main(): 218 | website = Website("https://choosealicense.com").with_request_timeout(30000) 219 | 220 | asyncio.run(main()) 221 | ``` 222 | 223 | ### Wait For Idle Network 224 | 225 | You can wait for the Network to become idle when using chrome. This helps load all the data from client side scripts. 226 | The first param is whether to enable or not and the second is the duration max timeout in milliseconds. 227 | 228 | ```py 229 | import asyncio 230 | from spider_rs import Website 231 | 232 | async def main(): 233 | website = Website("https://choosealicense.com").with_wait_for_idle_network(True, 12000) 234 | 235 | asyncio.run(main()) 236 | ``` 237 | 238 | ### Respect Robots 239 | 240 | Respect the robots.txt file. 241 | 242 | ```py 243 | import asyncio 244 | from spider_rs import Website 245 | 246 | async def main(): 247 | website = Website("https://choosealicense.com").with_respect_robots_txt(True) 248 | 249 | asyncio.run(main()) 250 | ``` 251 | 252 | ### Collect Full Resources 253 | 254 | Collect all resources found not just valid web pages. 255 | 256 | ```py 257 | import asyncio 258 | from spider_rs import Website 259 | 260 | async def main(): 261 | website = Website("https://choosealicense.com").with_full_resources(True) 262 | 263 | asyncio.run(main()) 264 | ``` 265 | 266 | ### OpenAI 267 | 268 | Use OpenAI to generate dynamic scripts to use with headless. Make sure to set the `OPENAI_API_KEY` env variable. 
269 | 270 | ```py 271 | import asyncio 272 | from spider_rs import Website 273 | 274 | async def main(): 275 | website = ( 276 | Website("https://google.com") 277 | .with_openai({ 278 | "model": "gpt-3.5-turbo", 279 | "prompt": "Search for movies", 280 | "maxTokens": 300 281 | }) 282 | ) 283 | 284 | asyncio.run(main()) 285 | ``` 286 | 287 | ### Screenshots 288 | 289 | Take a screenshot of the pages on crawl when using headless chrome. 290 | 291 | ```py 292 | import asyncio 293 | from spider_rs import Website 294 | 295 | async def main(): 296 | website = ( 297 | Website("https://choosealicense.com", False) 298 | .with_screenshot({ 299 | "params": { 300 | "cdp_params": None, 301 | "full_page": True, 302 | "omit_background": False 303 | }, 304 | "bytes": False, 305 | "save": True, 306 | "output_dir": None 307 | }) 308 | ) 309 | 310 | asyncio.run(main()) 311 | ``` 312 | 313 | ### Http2 Prior Knowledge 314 | 315 | Use http2 to connect if you know the website servers supports this. 316 | 317 | ```py 318 | import asyncio 319 | from spider_rs import Website 320 | 321 | async def main(): 322 | website = Website("https://choosealicense.com").with_http2_prior_knowledge(True) 323 | 324 | asyncio.run(main()) 325 | ``` 326 | 327 | ## Chaining 328 | 329 | You can chain all of the configs together for simple configuration. 330 | 331 | ```py 332 | import asyncio 333 | from spider_rs import Website 334 | 335 | async def main(): 336 | website = Website("https://choosealicense.com").with_subdomains(true).with_tlds(true).with_user_agent("mybot/v1").with_respect_robots_txt(true) 337 | 338 | asyncio.run(main()) 339 | ``` 340 | 341 | ## Raw Content 342 | 343 | Set the second param of the website constructor to `true` to return content without UTF-8. 344 | This will return `rawContent` and leave `content` when using subscriptions or the Page Object. 345 | 346 | ```py 347 | import asyncio 348 | from spider_rs import Website 349 | 350 | async def main(): 351 | website = Website("https://choosealicense.com", True) 352 | website.scrape_url() 353 | 354 | asyncio.run(main()) 355 | ``` 356 | 357 | ## Clearing Crawl Data 358 | 359 | Use `website.clear` to remove the links visited and page data or `website.drain_links` to drain the links visited. 360 | 361 | ```py 362 | import asyncio 363 | from spider_rs import Website 364 | 365 | async def main(): 366 | website = Website("https://choosealicense.com") 367 | website.crawl() 368 | print(website.getLinks()) 369 | website.clear() 370 | print(website.getLinks()) 371 | 372 | asyncio.run(main()) 373 | ``` 374 | 375 | ## Stop crawl 376 | 377 | To stop a crawl you can use `website.stopCrawl(id)`, pass in the crawl id to stop a run or leave empty for all crawls to stop. 
378 | 379 | ```py 380 | import asyncio 381 | from spider_rs import Website 382 | 383 | class Subscription: 384 | def __init__(self): 385 | print("Subscription Created...") 386 | def __call__(self, page): 387 | print(page.url + " - status: " + str(page.status_code)) 388 | 389 | async def main(): 390 | website = Website("https://choosealicense.com") 391 | website.crawl(Subscription()) 392 | # sleep for 2s and stop etc 393 | website.stop() 394 | 395 | asyncio.run(main()) 396 | ``` 397 | -------------------------------------------------------------------------------- /cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spider-cloud-cli" 3 | version = "0.1.36" 4 | edition = "2021" 5 | authors = [ "j-mendez "] 6 | description = "The Spider Cloud CLI for web crawling and scraping" 7 | license = "MIT" 8 | readme = "README.md" 9 | keywords = ["crawler", "web-crawler", "web-scraper", "spider", "web-indexer"] 10 | categories = ["web-programming"] 11 | include = ["src/*", "../../LICENSE", "README.md"] 12 | 13 | [dependencies] 14 | clap = { version = "4", features = ["derive"]} 15 | reqwest = { version = "0.12", features = ["json", "stream"] } 16 | tokio = { version = "1", features = ["rt-multi-thread", "macros"] } 17 | spider-client = { path = "../rust", version = "0.1" } 18 | serde = { version = "1", features = ["derive"] } 19 | serde_json = "1" 20 | keyring = { version = "3", features = ["apple-native", "windows-native", "sync-secret-service"] } 21 | -------------------------------------------------------------------------------- /cli/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud CLI 2 | 3 | Spider Cloud CLI is a command-line interface to interact with the [Spider Cloud](https://spider.cloud) web crawler. It allows you to scrape, crawl, search, and perform various other web-related tasks through simple commands. 4 | 5 | ## Installation 6 | 7 | Install the CLI using [`homebrew`](https://brew.sh/) or [`cargo`](https://doc.rust-lang.org/cargo/) from [crates.io](https://crates.io): 8 | 9 | ### Homebrew 10 | 11 | ```sh 12 | brew tap spider-rs/spider-cloud-cli 13 | brew install spider-cloud-cli 14 | ``` 15 | 16 | ### Cargo 17 | 18 | ```sh 19 | cargo install spider-cloud-cli 20 | ``` 21 | 22 | ## Usage 23 | 24 | After installing, you can use the CLI by typing `spider-cloud-cli` followed by a command and its respective arguments. 25 | 26 | ### Authentication 27 | 28 | Before using most of the commands, you need to authenticate by providing an API key: 29 | 30 | ```sh 31 | spider-cloud-cli auth --api_key YOUR_API_KEY 32 | ``` 33 | 34 | ### Commands 35 | 36 | #### Scrape 37 | 38 | Scrape data from a specified URL. 39 | 40 | ```sh 41 | spider-cloud-cli scrape --url http://example.com 42 | ``` 43 | 44 | #### Crawl 45 | 46 | Crawl a specified URL with an optional limit on the number of pages. 47 | 48 | ```sh 49 | spider-cloud-cli crawl --url http://example.com --limit 10 50 | ``` 51 | 52 | #### Links 53 | 54 | Fetch links from a specified URL. 55 | 56 | ```sh 57 | spider-cloud-cli links --url http://example.com 58 | ``` 59 | 60 | #### Screenshot 61 | 62 | Take a screenshot of a specified URL. 63 | 64 | ```sh 65 | spider-cloud-cli screenshot --url http://example.com 66 | ``` 67 | 68 | #### Search 69 | 70 | Search for a query. 71 | 72 | ```sh 73 | spider-cloud-cli search --query "example query" 74 | ``` 75 | 76 | #### Transform 77 | 78 | Transform specified data. 
79 | 80 | ```sh 81 | spider-cloud-cli transform --data "sample data" 82 | ``` 83 | 84 | #### Extract Contacts 85 | 86 | Extract contact information from a specified URL. 87 | 88 | ```sh 89 | spider-cloud-cli extract_contacts --url http://example.com 90 | ``` 91 | 92 | #### Label 93 | 94 | Label data from a specified URL. 95 | 96 | ```sh 97 | spider-cloud-cli label --url http://example.com 98 | ``` 99 | 100 | #### Get Crawl State 101 | 102 | Get the crawl state of a specified URL. 103 | 104 | ```sh 105 | spider-cloud-cli get_crawl_state --url http://example.com 106 | ``` 107 | 108 | #### Query 109 | 110 | Query records of a specified domain. 111 | 112 | ```sh 113 | spider-cloud-cli query --domain example.com 114 | ``` 115 | 116 | #### Get Credits 117 | 118 | Fetch the account credits left. 119 | 120 | ```sh 121 | spider-cloud-cli get_credits 122 | ``` 123 | 124 | ## License 125 | 126 | This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. 127 | 128 | ## Contributing 129 | 130 | Issues and pull requests are welcome! Feel free to check the [issues page](https://github.com/spider-rs/spider-clients/issues) if you have any questions or suggestions. 131 | 132 | ## Acknowledgements 133 | 134 | Special thanks to the developers and contributors of the libraries and tools used in this project. 135 | -------------------------------------------------------------------------------- /cli/src/args.rs: -------------------------------------------------------------------------------- 1 | use clap::{Parser, Subcommand}; 2 | 3 | #[derive(Parser, Debug)] 4 | #[command(name = "Spider CLI")] 5 | #[command(version = "1.0")] 6 | #[command(about = "A CLI interface for the Spider web crawler")] 7 | pub struct Cli { 8 | #[command(subcommand)] 9 | pub command: Commands, 10 | } 11 | 12 | #[derive(Subcommand, Debug)] 13 | pub enum Commands { 14 | /// Scrape a given URL 15 | Scrape { 16 | #[arg(short, long, help = "The URL to scrape")] 17 | url: String, 18 | #[arg( 19 | short, 20 | long, 21 | help = "Returns the link(s) found on the page that match the crawler query.", 22 | required = false 23 | )] 24 | return_page_links: Option, 25 | }, 26 | /// Crawl a given URL with an optional page limit 27 | Crawl { 28 | #[arg(short, long, help = "The URL to crawl")] 29 | url: String, 30 | #[arg( 31 | short, 32 | long, 33 | help = "Limit the number of pages to crawl", 34 | required = false 35 | )] 36 | limit: Option, 37 | #[arg( 38 | short, 39 | long, 40 | help = "Returns the link(s) found on the page that match the crawler query.", 41 | required = false 42 | )] 43 | return_page_links: Option, 44 | }, 45 | /// Fetch all links from a given URL 46 | Links { 47 | #[arg(short, long, help = "The URL to fetch links from")] 48 | url: String, 49 | #[arg( 50 | short, 51 | long, 52 | help = "Limit the number of pages to crawl", 53 | required = false 54 | )] 55 | limit: Option, 56 | #[arg( 57 | short, 58 | long, 59 | help = "Returns the link(s) found on the page that match the crawler query.", 60 | required = false 61 | )] 62 | return_page_links: Option, 63 | }, 64 | /// Take a screenshot of a given URL 65 | Screenshot { 66 | #[arg(short, long, help = "The URL to take a screenshot of")] 67 | url: String, 68 | #[arg( 69 | short, 70 | long, 71 | help = "Limit the number of pages to crawl", 72 | required = false 73 | )] 74 | limit: Option, 75 | #[arg( 76 | short, 77 | long, 78 | help = "Returns the link(s) found on the page that match the crawler query.", 79 | required = false 80 | )] 81 | return_page_links: 
Option, 82 | }, 83 | /// Search using a given query 84 | Search { 85 | #[arg(short, long, help = "The query to search for")] 86 | query: String, 87 | #[arg( 88 | short, 89 | long, 90 | help = "Limit the number of pages to crawl", 91 | required = false 92 | )] 93 | limit: Option, 94 | #[arg( 95 | short, 96 | long, 97 | help = "Returns the link(s) found on the page that match the crawler query.", 98 | required = false 99 | )] 100 | return_page_links: Option, 101 | }, 102 | /// Transform the provided data 103 | Transform { 104 | #[arg(short, long, help = "The data to transform")] 105 | data: String, 106 | }, 107 | /// Extract leads from a given URL 108 | ExtractLeads { 109 | #[arg(short, long, help = "The URL to extract leads from")] 110 | url: String, 111 | #[arg( 112 | short, 113 | long, 114 | help = "Limit the number of pages to crawl", 115 | required = false 116 | )] 117 | limit: Option, 118 | }, 119 | /// Label data from a given URL 120 | Label { 121 | #[arg(short, long, help = "The URL to label data from")] 122 | url: String, 123 | #[arg( 124 | short, 125 | long, 126 | help = "Limit the number of pages to crawl", 127 | required = false 128 | )] 129 | limit: Option, 130 | }, 131 | /// Get the crawl state of a given URL 132 | GetCrawlState { 133 | #[arg(short, long, help = "The URL to get the crawl state of")] 134 | url: String, 135 | }, 136 | /// Query for a domain 137 | Query { 138 | #[arg(short, long, help = "The domain to query")] 139 | domain: String, 140 | }, 141 | /// Get the remaining credits 142 | GetCredits, 143 | /// Authenticate using an API key 144 | Auth { 145 | #[arg(short, long, help = "The API key to authenticate")] 146 | api_key: String, 147 | }, 148 | } 149 | -------------------------------------------------------------------------------- /cli/src/main.rs: -------------------------------------------------------------------------------- 1 | mod args; 2 | use args::{Cli, Commands}; 3 | use clap::Parser; 4 | use keyring::Entry; 5 | use serde_json::json; 6 | use spider_client::{QueryRequest, RequestParams, SearchRequestParams, Spider}; 7 | use std::collections::HashMap; 8 | use tokio; 9 | 10 | const SERVICE_NAME: &str = "spider_client"; 11 | const USERNAME: &str = "default"; 12 | 13 | #[tokio::main] 14 | async fn main() { 15 | let args = Cli::parse(); 16 | let entry = Entry::new(SERVICE_NAME, USERNAME); 17 | 18 | match entry { 19 | Ok(ent) => { 20 | match args.command { 21 | Commands::Auth { ref api_key } => match ent.set_password(&api_key.trim()) { 22 | Ok(_) => println!("API key saved successfully."), 23 | Err(e) => eprintln!("Failed to save API key: {:?}", e), 24 | }, 25 | _ => (), 26 | } 27 | 28 | match ent.get_password() { 29 | Ok(api_key) => { 30 | let spider = Spider::new(Some(api_key.clone())) 31 | .expect("Failed to initialize Spider client."); 32 | 33 | match args.command { 34 | Commands::Scrape { 35 | url, 36 | return_page_links, 37 | } => { 38 | println!("Scraping URL: {}", url); 39 | let mut params = RequestParams::default(); 40 | params.return_page_links = return_page_links; 41 | match spider 42 | .scrape_url(&url, Some(params), "application/json") 43 | .await 44 | { 45 | Ok(data) => println!("{}", json!(data)), 46 | Err(e) => eprintln!("Error scraping URL: {:?}", e), 47 | } 48 | } 49 | Commands::Crawl { 50 | url, 51 | limit, 52 | return_page_links, 53 | } => { 54 | println!("Crawling URL: {}", url); 55 | let mut params = RequestParams::default(); 56 | if let Some(limit) = limit { 57 | params.limit = Some(limit); 58 | } 59 | params.return_page_links = 
return_page_links; 60 | 61 | match spider 62 | .crawl_url( 63 | &url, 64 | Some(params), 65 | false, 66 | "application/json", 67 | None::, 68 | ) 69 | .await 70 | { 71 | Ok(data) => println!("{}", json!(data)), 72 | Err(e) => eprintln!("Error crawling URL: {:?}", e), 73 | } 74 | } 75 | Commands::Links { 76 | url, 77 | return_page_links, 78 | limit, 79 | } => { 80 | println!("Fetching links from URL: {}", url); 81 | let mut params = RequestParams::default(); 82 | if let Some(limit) = limit { 83 | params.limit = Some(limit); 84 | } 85 | params.return_page_links = return_page_links; 86 | 87 | match spider 88 | .links(&url, Some(params), false, "application/json") 89 | .await 90 | { 91 | Ok(data) => println!("{}", json!(data)), 92 | Err(e) => eprintln!("Error fetching links: {:?}", e), 93 | } 94 | } 95 | Commands::Screenshot { 96 | url, 97 | limit, 98 | return_page_links, 99 | } => { 100 | let mut params = RequestParams::default(); 101 | if let Some(limit) = limit { 102 | params.limit = Some(limit); 103 | } 104 | params.return_page_links = return_page_links; 105 | println!("Taking screenshot of URL: {}", url); 106 | match spider 107 | .screenshot(&url, Some(params), false, "application/json") 108 | .await 109 | { 110 | Ok(data) => println!("{}", json!(data)), 111 | Err(e) => eprintln!("Error taking screenshot: {:?}", e), 112 | } 113 | } 114 | Commands::Search { 115 | query, 116 | limit, 117 | return_page_links, 118 | } => { 119 | let mut params = SearchRequestParams::default(); 120 | if let Some(limit) = limit { 121 | params.base.limit = Some(limit); 122 | } 123 | params.base.return_page_links = return_page_links; 124 | println!("Searching for query: {}", query); 125 | match spider 126 | .search(&query, Some(params), false, "application/json") 127 | .await 128 | { 129 | Ok(data) => println!("{}", json!(data)), 130 | Err(e) => eprintln!("Error searching for query: {:?}", e), 131 | } 132 | } 133 | Commands::Transform { data } => { 134 | let data_vec = vec![HashMap::from([("content", data.as_str())])]; 135 | println!("Transforming data: {}", data); 136 | match spider 137 | .transform(data_vec, None, false, "application/json") 138 | .await 139 | { 140 | Ok(data) => println!("{}", json!(data)), 141 | Err(e) => eprintln!("Error transforming data: {:?}", e), 142 | } 143 | } 144 | Commands::ExtractLeads { url, limit } => { 145 | let mut params = RequestParams::default(); 146 | if let Some(limit) = limit { 147 | params.limit = Some(limit); 148 | } 149 | println!("Extracting leads from URL: {}", url); 150 | match spider 151 | .extract_contacts(&url, Some(params), false, "application/json") 152 | .await 153 | { 154 | Ok(data) => println!("{}", json!(data)), 155 | Err(e) => eprintln!("Error extracting leads: {:?}", e), 156 | } 157 | } 158 | Commands::Label { url, limit } => { 159 | let mut params = RequestParams::default(); 160 | if let Some(limit) = limit { 161 | params.limit = Some(limit); 162 | } 163 | println!("Labeling data from URL: {}", url); 164 | match spider 165 | .label(&url, Some(params), false, "application/json") 166 | .await 167 | { 168 | Ok(data) => println!("{}", json!(data)), 169 | Err(e) => eprintln!("Error labeling data: {:?}", e), 170 | } 171 | } 172 | Commands::GetCrawlState { url } => { 173 | println!("Getting crawl state of URL: {}", url); 174 | match spider.get_crawl_state(&url, None, "application/json").await { 175 | Ok(data) => println!("{}", json!(data)), 176 | Err(e) => eprintln!("Error getting crawl state: {:?}", e), 177 | } 178 | } 179 | Commands::Query { domain } => { 180 | 
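// Build a query for the provided domain and print the matching stored record (or an error) as JSON.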
let query = QueryRequest { 181 | domain: Some(domain.to_string()), 182 | ..Default::default() 183 | }; 184 | println!("Querying record for domain: {}", domain); 185 | match spider.query(&query).await { 186 | Ok(data) => println!("{}", json!(data)), 187 | Err(e) => eprintln!("Error querying record: {:?}", e), 188 | } 189 | } 190 | Commands::GetCredits => { 191 | println!("Fetching account credits left."); 192 | match spider.get_credits().await { 193 | Ok(data) => println!("{}", json!(data)), 194 | Err(e) => eprintln!("Error fetching credits: {:?}", e), 195 | } 196 | } 197 | _ => {} 198 | } 199 | } 200 | Err(_) => { 201 | eprintln!( 202 | "No API key found. Please authenticate first using the `auth` command." 203 | ); 204 | } 205 | } 206 | } 207 | _ => (), 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /cli/src/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod args; -------------------------------------------------------------------------------- /javascript/.npmignore: -------------------------------------------------------------------------------- 1 | src/ 2 | **/*.ts 3 | !**/*.d.ts 4 | tests/ 5 | __tests__/ 6 | *.spec.ts 7 | *.test.ts 8 | jest.config.js 9 | tsconfig.json 10 | tslint.json 11 | *.log 12 | *.tlog 13 | *.tmp 14 | *.temp 15 | .DS_Store 16 | Thumbs.db 17 | .idea/ 18 | .vscode/ 19 | *.swp 20 | *.swo 21 | node_modules/ 22 | dist/*.js.map 23 | dist/*.ts.map 24 | npm-debug.log* 25 | yarn-debug.log* 26 | yarn-error.log* 27 | .env 28 | .env.local 29 | coverage/ -------------------------------------------------------------------------------- /javascript/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /javascript/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud JavaScript SDK 2 | 3 | The Spider Cloud JavaScript SDK offers a streamlined set of tools for web scraping and crawling, with capabilities that allow for comprehensive data extraction suitable for interfacing with AI language models. This SDK makes it easy to interact programmatically with the Spider Cloud API from any JavaScript or Node.js application. 4 | 5 | ## Installation 6 | 7 | You can install the Spider Cloud JavaScript SDK via npm: 8 | 9 | ```bash 10 | npm install @spider-cloud/spider-client 11 | ``` 12 | 13 | Or with yarn: 14 | 15 | ```bash 16 | yarn add @spider-cloud/spider-client 17 | ``` 18 | 19 | ## Configuration 20 | 21 | Before using the SDK, you will need to provide it with your API key. Obtain an API key from [spider.cloud](https://spider.cloud) and either pass it directly to the constructor or set it as an environment variable `SPIDER_API_KEY`. 
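For example, both configuration styles can look like this (a minimal sketch; `YOUR_API_KEY` is a placeholder):

```javascript
import { Spider } from "@spider-cloud/spider-client";

// Option 1: pass the key directly to the constructor.
const appWithKey = new Spider({ apiKey: "YOUR_API_KEY" });

// Option 2: export SPIDER_API_KEY in the environment and construct without arguments.
const appFromEnv = new Spider();
```

If neither source provides a key, the constructor throws an error.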
22 | 23 | ## Usage 24 | 25 | Here's a basic example to demonstrate how to use the SDK: 26 | 27 | ```javascript 28 | import { Spider } from "@spider-cloud/spider-client"; 29 | 30 | // Initialize the SDK with your API key 31 | const app = new Spider({ apiKey: "YOUR_API_KEY" }); 32 | 33 | // Scrape a URL 34 | const url = "https://spider.cloud"; 35 | app 36 | .scrapeUrl(url) 37 | .then((data) => { 38 | console.log("Scraped Data:", data); 39 | }) 40 | .catch((error) => { 41 | console.error("Scrape Error:", error); 42 | }); 43 | 44 | // Crawl a website 45 | const crawlParams = { 46 | limit: 5, 47 | proxy_enabled: true, 48 | store_data: false, 49 | metadata: false, 50 | request: "http", 51 | }; 52 | app 53 | .crawlUrl(url, crawlParams) 54 | .then((result) => { 55 | console.log("Crawl Result:", result); 56 | }) 57 | .catch((error) => { 58 | console.error("Crawl Error:", error); 59 | }); 60 | ``` 61 | 62 | A real world crawl example streaming the response. 63 | 64 | ```javascript 65 | import { Spider } from "@spider-cloud/spider-client"; 66 | 67 | // Initialize the SDK with your API key 68 | const app = new Spider({ apiKey: "YOUR_API_KEY" }); 69 | 70 | // The target URL 71 | const url = "https://spider.cloud"; 72 | 73 | // Crawl a website 74 | const crawlParams = { 75 | limit: 5, 76 | store_data: false, 77 | metadata: true, 78 | request: "http", 79 | }; 80 | 81 | const stream = true; 82 | 83 | const streamCallback = (data) => { 84 | console.log(data["url"]); 85 | }; 86 | 87 | app.crawlUrl(url, crawlParams, stream, streamCallback); 88 | ``` 89 | 90 | ### Data Operations 91 | 92 | The Spider client can interact with specific data tables to create, retrieve, and delete data. 93 | 94 | #### Retrieve Data from a Table 95 | 96 | To fetch data from a specified table by applying query parameters, use the `getData` method. Provide the table name and an object containing query parameters: 97 | 98 | ```javascript 99 | const tableName = "pages"; 100 | const queryParams = { limit: 20 }; 101 | spider 102 | .getData(tableName, queryParams) 103 | .then((response) => console.log(response)) 104 | .catch((error) => console.error(error)); 105 | ``` 106 | 107 | This example retrieves data from the 'pages' table, limiting the results to 20 entries. 108 | 109 | #### Delete Data from a Table 110 | 111 | To delete data from a specified table based on certain conditions, use the `deleteData` method. Provide the table name and an object specifying the conditions for deletion: 112 | 113 | ```javascript 114 | const tableName = "websites"; 115 | const deleteParams = { domain: "www.example.com" }; 116 | spider 117 | .deleteData(tableName, deleteParams) 118 | .then((response) => console.log(response)) 119 | .catch((error) => console.error(error)); 120 | ``` 121 | 122 | #### Download storage data 123 | 124 | To download stored data like raw HTML or markdown use the `createSignedUrl` method. Provide the website name and an object containing query parameters: 125 | 126 | ```javascript 127 | const websiteName = "spider.cloud"; 128 | const queryParams = { limit: 20, page: 0 }; 129 | spider 130 | .createSignedUrl(websiteName, queryParams) 131 | .then((response) => console.log(response)) 132 | .catch((error) => console.error(error)); 133 | ``` 134 | 135 | ### Available Methods 136 | 137 | - **`scrapeUrl(url, params)`**: Scrape data from a specified URL. Optional parameters can be passed to customize the scraping behavior. 
138 | - **`crawlUrl(url, params, stream)`**: Begin crawling from a specific URL with optional parameters for customization and an optional streaming response. 139 | - **`search(q, params)`**: Perform a search and gather a list of websites to start crawling and collect resources. 140 | - **`links(url, params)`**: Retrieve all links from the specified URL with optional parameters. 141 | - **`screenshot(url, params)`**: Take a screenshot of the specified URL. 142 | - **`transform(data, params)`**: Perform a fast HTML transformation to markdown or text. 143 | - **`extractContacts(url, params)`**: Extract contact information from the specified URL. 144 | - **`label(url, params)`**: Apply labeling to data extracted from the specified URL. 145 | - **`getCrawlState(url, params)`**: Check the website crawl state. 146 | - **`getCredits()`**: Retrieve account's remaining credits. 147 | - **`getData(table, params)`**: Retrieve data records from the DB. 148 | - **`deleteData(table, params)`**: Delete records from the DB. 149 | - **`createSignedUrl(domain, params)`**: Download the records from the DB. 150 | 151 | ## Error Handling 152 | 153 | The SDK provides robust error handling and will throw exceptions when it encounters critical issues. Always use `.catch()` on promises to handle these errors gracefully. 154 | 155 | ## Contributing 156 | 157 | Contributions are always welcome! Feel free to open an issue or submit a pull request on our GitHub repository. 158 | 159 | ## License 160 | 161 | The Spider Cloud JavaScript SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 162 | -------------------------------------------------------------------------------- /javascript/__tests__/spiderwebai.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, test } from "node:test"; 2 | import assert from "node:assert"; 3 | import { Collection, Spider } from "../src"; 4 | import "dotenv/config"; 5 | import { GenericParams } from "../src/client"; 6 | 7 | describe("Spider JS SDK", () => { 8 | const url = "https://example.com"; 9 | const params: GenericParams = { 10 | limit: 1, 11 | return_format: "markdown", 12 | depth: 2, 13 | cache: true, 14 | }; 15 | 16 | test("should throw error if API key is not provided", () => { 17 | if (!process.env.SPIDER_API_KEY) { 18 | assert.throws(() => new Spider({ apiKey: null })); 19 | } else { 20 | assert.doesNotThrow(() => new Spider({ apiKey: null })); 21 | } 22 | }); 23 | 24 | test("should scrape url with data", async () => { 25 | const spiderClient = new Spider(); 26 | const spiderData = await spiderClient.scrapeUrl(url, params); 27 | 28 | assert(Array.isArray(spiderData)); 29 | assert(spiderData.length > 0); 30 | assert(spiderData[0].content); 31 | assert(spiderData[0].error !== undefined); 32 | assert(spiderData[0].status); 33 | assert(spiderData[0].url); 34 | }); 35 | 36 | test("should crawl url with data", async () => { 37 | const spiderClient = new Spider(); 38 | const spiderData = await spiderClient.crawlUrl(url, params); 39 | 40 | assert(Array.isArray(spiderData)); 41 | assert(spiderData.length > 0); 42 | assert(spiderData[0].content); 43 | assert(spiderData[0].error !== undefined); 44 | assert(spiderData[0].status); 45 | assert(spiderData[0].url); 46 | }); 47 | 48 | test("should crawl url with data streaming", async () => { 49 | const spiderClient = new Spider(); 50 | 51 | const cb = (spiderData: any) => { 52 | assert(spiderData.content); 53 | assert(spiderData.status); 54 | 
assert(spiderData.url); 55 | }; 56 | 57 | await spiderClient.crawlUrl(url, params, true, cb); 58 | }); 59 | 60 | test("should get links", async () => { 61 | const spiderClient = new Spider(); 62 | const linksData = await spiderClient.links(url, params); 63 | 64 | assert(Array.isArray(linksData)); 65 | assert(linksData.length > 0); 66 | assert(linksData[0].error !== undefined); 67 | assert(linksData[0].status); 68 | assert(linksData[0].url); 69 | }); 70 | 71 | test("should take screenshot", async () => { 72 | const spiderClient = new Spider(); 73 | const screenshotData = await spiderClient.screenshot(url, { limit: 1 }); 74 | 75 | assert(Array.isArray(screenshotData)); 76 | }); 77 | 78 | test.skip("should perform search", async () => { 79 | const spiderClient = new Spider(); 80 | const searchData = await spiderClient.search( 81 | "example search query", 82 | params 83 | ); 84 | 85 | assert(Array.isArray(searchData)); 86 | assert(searchData.length > 0); 87 | assert(searchData[0].content); 88 | assert(searchData[0].error !== undefined); 89 | assert(searchData[0].status); 90 | assert(searchData[0].url); 91 | }); 92 | 93 | test.skip("should transform data", async () => { 94 | const spiderClient = new Spider(); 95 | const transformData = [ 96 | { html: "Example", url: url }, 97 | ]; 98 | const transformedData = await spiderClient.transform(transformData, params); 99 | 100 | assert(typeof transformedData === "object"); 101 | assert(transformedData.content); 102 | assert(transformedData.error !== undefined); 103 | assert(transformedData.status); 104 | }); 105 | 106 | test("should extract contacts", async () => { 107 | const spiderClient = new Spider(); 108 | const contactsData = await spiderClient.extractContacts(url, params); 109 | 110 | assert(Array.isArray(contactsData)); 111 | assert(contactsData.length > 0); 112 | assert(contactsData[0].content); 113 | assert(contactsData[0].error !== undefined); 114 | assert(contactsData[0].status); 115 | assert(contactsData[0].url); 116 | }); 117 | 118 | test("should label data", async () => { 119 | const spiderClient = new Spider(); 120 | const labelData = await spiderClient.label(url, params); 121 | 122 | assert(Array.isArray(labelData)); 123 | assert(labelData.length > 0); 124 | assert(labelData[0].content); 125 | assert(labelData[0].error !== undefined); 126 | assert(labelData[0].status); 127 | assert(labelData[0].url); 128 | }); 129 | 130 | test("should get crawl state", async () => { 131 | const spiderClient = new Spider(); 132 | const crawlState = await spiderClient.getCrawlState(url, params); 133 | 134 | assert(typeof crawlState === "object"); 135 | assert(Array.isArray(crawlState.data)); 136 | }); 137 | 138 | test.skip("should query global db", async () => { 139 | const spiderClient = new Spider(); 140 | const crawlState = await spiderClient.query({ domain: "spider.cloud" }); 141 | 142 | assert(typeof crawlState === "object"); 143 | assert(crawlState.content); 144 | }); 145 | 146 | test("should download the file", async () => { 147 | const spiderClient = new Spider(); 148 | const { data } = await spiderClient.getData(Collection.Pages, { 149 | domain: "example.com", 150 | limit: 1, 151 | }); 152 | 153 | // the file might be deleted before hand. we need to not delete the file being used throughout test. 154 | const text = data.length 155 | ? 
await spiderClient.download({ url: data[0].url }, "text") 156 | : ""; 157 | 158 | assert(typeof text === "string"); 159 | }); 160 | 161 | test("should get credits", async () => { 162 | const spiderClient = new Spider(); 163 | const credits = await spiderClient.getCredits(); 164 | 165 | assert(typeof credits === "object"); 166 | }); 167 | 168 | test("should post data", async () => { 169 | const spiderClient = new Spider(); 170 | const postData = { url: url }; 171 | const response = await spiderClient.postData(Collection.Websites, postData); 172 | assert([200, 201].includes(response.status)); 173 | }); 174 | 175 | test("should get data", async () => { 176 | const spiderClient = new Spider(); 177 | const response = await spiderClient.getData(Collection.Websites, params); 178 | 179 | assert(typeof response === "object"); 180 | assert(Array.isArray(response.data)); 181 | }); 182 | 183 | test("should delete data", async () => { 184 | const spiderClient = new Spider(); 185 | const response = await spiderClient.deleteData(Collection.Websites, params); 186 | 187 | assert(response.status >= 200 && response.status <= 299); 188 | }); 189 | 190 | test("should create signed url", async () => { 191 | const spiderClient = new Spider(); 192 | const { fileName, signedUrl } = await spiderClient.createSignedUrl( 193 | "example.com" 194 | ); 195 | 196 | assert(typeof signedUrl === "string"); 197 | assert(typeof fileName === "string"); 198 | }); 199 | }); 200 | -------------------------------------------------------------------------------- /javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-cloud/spider-client", 3 | "version": "0.1.36", 4 | "description": "Isomorphic Javascript SDK for Spider Cloud services", 5 | "scripts": { 6 | "test": "node --import tsx --test __tests__/*test.ts", 7 | "build": "tsc", 8 | "prepublishOnly": "npm test && npm run build" 9 | }, 10 | "main": "dist/index.js", 11 | "types": "dist/client.d.ts", 12 | "files": [ 13 | "dist/**/*" 14 | ], 15 | "keywords": [ 16 | "spider", 17 | "sdk", 18 | "web crawling", 19 | "web scraping", 20 | "api", 21 | "llm scraping" 22 | ], 23 | "author": "Jeff Mendez", 24 | "license": "MIT", 25 | "devDependencies": { 26 | "@types/node": "22.10.7", 27 | "dotenv": "^16.4.7", 28 | "tsx": "^4.19.2", 29 | "typescript": "5.7.3" 30 | }, 31 | "dependencies": { 32 | "exponential-backoff": "^3.1.1" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /javascript/sample.env: -------------------------------------------------------------------------------- 1 | SPIDER_API_KEY= -------------------------------------------------------------------------------- /javascript/src/client.ts: -------------------------------------------------------------------------------- 1 | import { 2 | ChunkCallbackFunction, 3 | Collection, 4 | QueryRequest, 5 | SpiderCoreResponse, 6 | SpiderParams, 7 | APISchema, 8 | APIRoutes, 9 | ApiVersion, 10 | } from "./config"; 11 | import { version } from "../package.json"; 12 | import { streamReader } from "./utils/stream-reader"; 13 | import { backOff } from "exponential-backoff"; 14 | 15 | /** 16 | * Generic params for core request. 17 | */ 18 | export type GenericParams = Omit; 19 | 20 | /** 21 | * Configuration interface for Spider. 22 | */ 23 | export interface SpiderConfig { 24 | apiKey?: string | null; 25 | } 26 | 27 | /** 28 | * A class to interact with the Spider API. 
29 | */ 30 | export class Spider { 31 | private apiKey?: string; 32 | 33 | /** 34 | * Create an instance of Spider. 35 | * @param {string | null} apiKey - The API key used to authenticate to the Spider API. If null, attempts to source from environment variables. 36 | * @throws Will throw an error if the API key is not provided. 37 | */ 38 | constructor(props?: SpiderConfig) { 39 | this.apiKey = props?.apiKey || process?.env?.SPIDER_API_KEY; 40 | 41 | if (!this.apiKey) { 42 | throw new Error("No API key provided"); 43 | } 44 | } 45 | 46 | /** 47 | * Internal method to handle POST requests. 48 | * @param {string} endpoint - The API endpoint to which the POST request should be sent. 49 | * @param {Record} data - The JSON data to be sent in the request body. 50 | * @param {boolean} [stream=false] - Whether to stream the response back without parsing. 51 | * @returns {Promise} The response in JSON if not streamed, or the Response object if streamed. 52 | */ 53 | private async _apiPost( 54 | endpoint: string, 55 | data: Record, 56 | stream?: boolean, 57 | jsonl?: boolean 58 | ) { 59 | const headers = jsonl ? this.prepareHeadersJsonL : this.prepareHeaders; 60 | const response = await backOff( 61 | () => 62 | fetch(`${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, { 63 | method: "POST", 64 | headers: headers, 65 | body: JSON.stringify(data), 66 | }), 67 | { 68 | numOfAttempts: 5, 69 | } 70 | ); 71 | 72 | if (!stream) { 73 | if (response.ok) { 74 | return response.json(); 75 | } else { 76 | this.handleError(response, `post to ${endpoint}`); 77 | } 78 | } 79 | return response; 80 | } 81 | 82 | /** 83 | * Internal method to handle GET requests. 84 | * @param {string} endpoint - The API endpoint from which data should be retrieved. 85 | * @returns {Promise} The data returned from the endpoint in JSON format. 86 | */ 87 | private async _apiGet(endpoint: string) { 88 | const headers = this.prepareHeaders; 89 | const response = await backOff( 90 | () => 91 | fetch(`${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, { 92 | method: "GET", 93 | headers: headers, 94 | }), 95 | { 96 | numOfAttempts: 5, 97 | } 98 | ); 99 | 100 | if (response.ok) { 101 | return response.json(); 102 | } else { 103 | this.handleError(response, `get from ${endpoint}`); 104 | } 105 | } 106 | 107 | /** 108 | * Internal method to handle DELETE requests. 109 | * @param {string} endpoint - The API endpoint from which data should be retrieved. 110 | * @returns {Promise} The data returned from the endpoint in JSON format. 111 | */ 112 | private async _apiDelete(endpoint: string) { 113 | const headers = this.prepareHeaders; 114 | const response = await backOff( 115 | () => 116 | fetch(`${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, { 117 | method: "DELETE", 118 | headers, 119 | body: JSON.stringify({}) 120 | }), 121 | { 122 | numOfAttempts: 5, 123 | } 124 | ); 125 | 126 | if (response.ok) { 127 | return response; 128 | } else { 129 | return this.handleError(response, `delete from ${endpoint}`); 130 | } 131 | } 132 | 133 | /** 134 | * Scrapes data from a specified URL. 135 | * @param {string} url - The URL to scrape. 136 | * @param {GenericParams} [params={}] - Additional parameters for the scraping request. 137 | * @returns {Promise} The scraped data from the URL. 138 | */ 139 | async scrapeUrl(url: string, params: GenericParams = {}) { 140 | return this._apiPost(APIRoutes.Crawl, { url: url, limit: 1, ...params }); 141 | } 142 | 143 | /** 144 | * Initiates a crawling job starting from the specified URL. 
145 | * @param {string} url - The URL to start crawling. 146 | * @param {GenericParams} [params={}] - Additional parameters for the crawl. 147 | * @param {boolean} [stream=false] - Whether to receive the response as a stream. 148 | * @param {function} [callback=function] - The callback function when streaming per chunk. If this is set with stream you will not get a end response. 149 | * @returns {Promise} The result of the crawl, either structured data or a Response object if streaming. 150 | */ 151 | async crawlUrl( 152 | url: string, 153 | params: GenericParams = {}, 154 | stream = false, 155 | cb?: ChunkCallbackFunction 156 | ): Promise { 157 | const jsonl = stream && cb; 158 | const res = await this._apiPost( 159 | APIRoutes.Crawl, 160 | { url, ...params }, 161 | stream, 162 | !!jsonl 163 | ); 164 | 165 | if (jsonl) { 166 | return await streamReader(res, cb); 167 | } 168 | 169 | return res; 170 | } 171 | 172 | /** 173 | * Retrieves all links from the specified URL. 174 | * @param {string} url - The URL from which to gather links. 175 | * @param {GenericParams} [params={}] - Additional parameters for the crawl. 176 | * @param {boolean} [stream=false] - Whether to receive the response as a stream. 177 | * @param {function} [callback=function] - The callback function when streaming per chunk. If this is set with stream you will not get a end response. 178 | * @returns {Promise} The result of the crawl, either structured data or a Response object if streaming. 179 | */ 180 | async links( 181 | url: string, 182 | params: GenericParams = {}, 183 | stream = false, 184 | cb?: ChunkCallbackFunction 185 | ): Promise { 186 | const jsonl = stream && cb; 187 | const res = await this._apiPost( 188 | APIRoutes.Links, 189 | { url, ...params }, 190 | stream, 191 | !!jsonl 192 | ); 193 | 194 | if (jsonl) { 195 | return await streamReader(res, cb); 196 | } 197 | 198 | return res; 199 | } 200 | 201 | /** 202 | * Takes a screenshot of the website starting from this URL. 203 | * @param {string} url - The URL to start the screenshot. 204 | * @param {GenericParams} [params={}] - Configuration parameters for the screenshot. 205 | * @returns {Promise} The screenshot data. 206 | */ 207 | async screenshot(url: string, params: GenericParams = {}) { 208 | return this._apiPost(APIRoutes.Screenshot, { url: url, ...params }); 209 | } 210 | 211 | /** 212 | * Perform a search and gather a list of websites to start crawling and collect resources. 213 | * @param {string} search - The search query. 214 | * @param {GenericParams} [params={}] - Configuration parameters for the search. 215 | * @returns {Promise} The result of the crawl, either structured data or a Response object if streaming. 216 | */ 217 | async search(q: string, params: GenericParams = {}) { 218 | return this._apiPost(APIRoutes.Search, { search: q, ...params }); 219 | } 220 | 221 | /** 222 | * Transform HTML to Markdown or text. You can send up to 10MB of data at once. 223 | * @param {object} data - The data to trasnform, a list of objects with the key 'html' and optional 'url' key for readability. 224 | * @param {object} [params={}] - Configuration parameters for the transformation. 225 | * @returns {Promise} The transformation result. 226 | */ 227 | async transform(data: { html: string; url?: string }[], params = {}) { 228 | return this._apiPost(APIRoutes.Transform, { data, ...params }); 229 | } 230 | 231 | /** 232 | * Extracts leads from a website. 233 | * @param {string} url - The URL from which to extract contacts. 
234 | * @param {GenericParams} [params={}] - Configuration parameters for the extraction. 235 | * @returns {Promise} The contact information extracted. 236 | */ 237 | async extractContacts(url: string, params: GenericParams = {}) { 238 | return this._apiPost(APIRoutes.PiplineExtractLeads, { 239 | url: url, 240 | ...params, 241 | }); 242 | } 243 | 244 | /** 245 | * Applies labeling to data extracted from a specified URL. 246 | * @param {string} url - The URL to label. 247 | * @param {GenericParams} [params={}] - Configuration parameters for labeling. 248 | * @returns {Promise} The labeled data. 249 | */ 250 | async label(url: string, params: GenericParams = {}) { 251 | return this._apiPost(APIRoutes.PiplineLabel, { url: url, ...params }); 252 | } 253 | 254 | /** 255 | * Check the crawl state of the website. 256 | * @param {string} url - The URL to check. 257 | * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query. 258 | * @returns {Promise} The crawl state data. 259 | */ 260 | async getCrawlState(url: string, params: GenericParams = {}) { 261 | return this._apiPost(APIRoutes.DataCrawlState, { url: url, ...params }); 262 | } 263 | 264 | /** 265 | * Create a signed url to download files from the storage. 266 | * @param {string} [domain] - The domain for the user's storage. If not provided, downloads all files. 267 | * @param {Object} [options] - The download options. 268 | * @param {boolean} [raw] - Return the raw response. 269 | 270 | * @returns {Promise} The response containing the file stream. 271 | */ 272 | async createSignedUrl( 273 | url?: string, 274 | options?: { 275 | page?: number; 276 | limit?: number; 277 | expiresIn?: number; 278 | // optional if you do not know the url put the domain and path. 279 | domain?: string; 280 | pathname?: string; 281 | } 282 | ): Promise { 283 | const { page, limit, expiresIn, domain, pathname } = options ?? {}; 284 | 285 | const params = new URLSearchParams({ 286 | ...(url && { url }), 287 | ...(domain && { domain }), 288 | ...(pathname && { pathname }), 289 | ...(page && { page: page.toString() }), 290 | ...(limit && { limit: limit.toString() }), 291 | ...(expiresIn && { expiresIn: expiresIn.toString() }), 292 | }); 293 | const endpoint = `${APISchema["url"]}/${ 294 | APIRoutes.DataSignUrl 295 | }?${params.toString()}`; 296 | const headers = this.prepareHeaders; 297 | 298 | const response = await fetch(endpoint, { 299 | method: "GET", 300 | headers, 301 | }); 302 | 303 | if (response.ok) { 304 | return await response.json(); 305 | } else { 306 | this.handleError(response, `Failed to sign files`); 307 | } 308 | } 309 | 310 | /** 311 | * Retrieves the number of credits available on the account. 312 | * @returns {Promise} The current credit balance. 313 | */ 314 | async getCredits() { 315 | return this._apiGet(APIRoutes.DataCredits); 316 | } 317 | 318 | /** 319 | * Send a POST request to insert data into a specified table. 320 | * @param {string} table - The table name in the database. 321 | * @param {object} data - The data to be inserted. 322 | * @returns {Promise} The response from the server. 323 | */ 324 | async postData( 325 | collection: Collection, 326 | data: GenericParams | Record 327 | ): Promise { 328 | return this._apiPost(`${APIRoutes.Data}/${collection}`, data); 329 | } 330 | 331 | /** 332 | * Send a GET request to retrieve data from a specified table. 333 | * @param {Collection} table - The table name in the database. 
334 | * @param {object} params - The query parameters for data retrieval. 335 | * @returns {Promise} The response from the server. 336 | */ 337 | async getData( 338 | collections: Collection, 339 | params: GenericParams | Record 340 | ): Promise { 341 | return this._apiGet( 342 | `${APIRoutes.Data}/${collections}?${new URLSearchParams( 343 | params as any 344 | ).toString()}` 345 | ); 346 | } 347 | 348 | /** 349 | * Download a record. The url is the path of the storage hash returned and not the exact website url. 350 | * @param {QueryRequest} params - The query parameters for data retrieval. 351 | * @returns {Promise} The download response from the server. 352 | */ 353 | async download(query: QueryRequest, output?: "text" | "blob"): Promise { 354 | const headers = this.prepareHeaders; 355 | const endpoint = `${APIRoutes.DataDownload}?${new URLSearchParams( 356 | query as Record 357 | ).toString()}`; 358 | const response = await fetch( 359 | `${APISchema["url"]}/${ApiVersion.V1}/${endpoint}`, 360 | { 361 | method: "GET", 362 | headers, 363 | } 364 | ); 365 | 366 | if (response.ok) { 367 | if (output === "text") { 368 | return await response.text(); 369 | } 370 | return await response.blob(); 371 | } else { 372 | this.handleError(response, `get from ${endpoint}`); 373 | } 374 | } 375 | 376 | /** 377 | * Perform a query to get a document. 378 | * @param {QueryRequest} params - The query parameters for data retrieval. 379 | * @returns {Promise} The response from the server. 380 | */ 381 | async query(query: QueryRequest): Promise { 382 | return this._apiGet( 383 | `${APIRoutes.DataQuery}?${new URLSearchParams( 384 | query as Record 385 | ).toString()}` 386 | ); 387 | } 388 | 389 | /** 390 | * Send a DELETE request to remove data from a specified table. 391 | * @param {Collection} table - The table name in the database. 392 | * @param {object} params - Parameters to identify records to delete. 393 | * @returns {Promise} The response from the server. 394 | */ 395 | async deleteData( 396 | collection: Collection, 397 | params: GenericParams | Record 398 | ): Promise { 399 | return this._apiDelete( 400 | `${APIRoutes.Data}/${collection}?${new URLSearchParams( 401 | params as any 402 | ).toString()}` 403 | ); 404 | } 405 | 406 | /** 407 | * Prepares common headers for each API request. 408 | * @returns {HeadersInit} A headers object for fetch requests. 409 | */ 410 | get prepareHeaders() { 411 | return { 412 | "Content-Type": "application/json", 413 | Authorization: `Bearer ${this.apiKey}`, 414 | "User-Agent": `Spider-Client/${version}`, 415 | }; 416 | } 417 | 418 | /** 419 | * Prepares common headers for each API request with JSONl content-type suitable for streaming. 420 | * @returns {HeadersInit} A headers object for fetch requests. 421 | */ 422 | get prepareHeadersJsonL() { 423 | return { 424 | ...this.prepareHeaders, 425 | "Content-Type": "application/jsonl", 426 | }; 427 | } 428 | 429 | /** 430 | * Handles errors from API requests. 431 | * @param {Response} response - The fetch response object. 432 | * @param {string} action - Description of the attempted action. 433 | * @throws Will throw an error with detailed status information. 434 | */ 435 | handleError(response: Response, action: string) { 436 | throw new Error(`Failed to ${action}. 
Status code: ${response.status}.`); 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /javascript/src/config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Represents viewport dimensions. 3 | */ 4 | export interface Viewport { 5 | width: number; 6 | height: number; 7 | } 8 | 9 | /** 10 | * Represents HTTP headers as a dictionary object. 11 | */ 12 | export interface Headers { 13 | [key: string]: string; 14 | } 15 | 16 | /** 17 | * Represents a budget for various resources. 18 | */ 19 | export interface Budget { 20 | [key: string]: number; 21 | } 22 | 23 | /** 24 | * The chunking algorithm to use. 25 | */ 26 | export type ChunkingAlgType = 27 | | "ByWords" 28 | | "ByLines" 29 | | "ByCharacterLength" 30 | | "BySentence"; 31 | 32 | /** 33 | * The chunking algorithm with the value to chunk by. 34 | */ 35 | export interface ChunkingAlg { 36 | type: ChunkingAlgType; 37 | value: number; 38 | } 39 | 40 | /** 41 | * Represents a timeout configuration. 42 | * @typedef {Object} Timeout 43 | * @property {number} secs - The number of seconds. 44 | * @property {number} nanos - The number of nanoseconds. 45 | */ 46 | interface Timeout { 47 | secs: number; 48 | nanos: number; 49 | } 50 | 51 | /** 52 | * Represents the webhook configuration. 53 | * @typedef {Object} WebhookSettings 54 | * @property {Object} object - The webhook configuration. 55 | */ 56 | interface WebhookSettings { 57 | /** 58 | * The URL or endpoint where the webhook information will be sent. 59 | */ 60 | destination: string; 61 | /** 62 | * Flag to indicate an action should be taken when all credits are depleted. 63 | */ 64 | on_credits_depleted: boolean; 65 | /** 66 | * Flag to indicate an action should be taken when half of the credits are depleted. 67 | */ 68 | on_credits_half_depleted: boolean; 69 | /** 70 | * Flag to trigger a notification on a website status update event. 71 | */ 72 | on_website_status: boolean; 73 | /** 74 | * Flag to send information about a new page find, such as links and data size. 75 | */ 76 | on_find: boolean; 77 | /** 78 | * Flag to handle the metadata of a new page that has been found. 79 | */ 80 | on_find_metadata: boolean; 81 | } 82 | 83 | /** 84 | * Represents the idle network configuration. 85 | * @typedef {Object} IdleNetwork 86 | * @property {Timeout} timeout - The timeout configuration. 87 | */ 88 | interface IdleNetwork { 89 | timeout: Timeout; 90 | } 91 | 92 | /** 93 | * Represents the selector configuration. 94 | * @typedef {Object} Selector 95 | * @property {Timeout} timeout - The timeout configuration. 96 | * @property {string} selector - The CSS selector to wait for. 97 | */ 98 | interface Selector { 99 | timeout: Timeout; 100 | selector: string; 101 | } 102 | 103 | /** 104 | * Represents the delay configuration. 105 | * @typedef {Object} Delay 106 | * @property {Timeout} timeout - The timeout configuration. 107 | */ 108 | interface Delay { 109 | timeout: Timeout; 110 | } 111 | 112 | /** 113 | * Represents the wait_for configuration. 114 | * @typedef {Object} WaitFor 115 | * @property {IdleNetwork} [idle_network] - Configuration to wait for network to be idle. 116 | * @property {Selector} [selector] - Configuration to wait for a CSS selector. 117 | * @property {Delay} [delay] - Configuration to wait for a delay. 118 | * @property {boolean} [page_navigations] - Whether to wait for page navigations. 
119 | */ 120 | interface WaitFor { 121 | idle_network?: IdleNetwork; 122 | selector?: Selector; 123 | delay?: Delay; 124 | page_navigations?: boolean; 125 | } 126 | 127 | /** 128 | * Represents the query API endpoint request to get documents from the global spider collection. 129 | */ 130 | export interface QueryRequest { 131 | /** 132 | * The exact URL to get. 133 | */ 134 | url?: string; 135 | /** 136 | * The domain to get a document from. 137 | */ 138 | domain?: string; 139 | /** 140 | * The path of the webpage to get the document. This is used with the domain key. 141 | */ 142 | pathname?: string; 143 | } 144 | 145 | // Define the CSSSelector type 146 | type CSSSelector = { 147 | // The name of the selector group 148 | name: string; 149 | // An array of CSS selectors 150 | selectors: string[]; 151 | }; 152 | 153 | // Define the CSSExtractionMap type 154 | type CSSExtractionMap = { 155 | // The map keys are strings (paths), and the values are arrays of CSSSelector objects 156 | [path: string]: CSSSelector[]; 157 | }; 158 | 159 | // Web automation using chrome 160 | export type WebAutomation = 161 | | { Evaluate: string } 162 | | { Click: string } 163 | | { Wait: number } 164 | | { WaitForNavigation: boolean } 165 | | { WaitFor: string } 166 | | { WaitForAndClick: string } 167 | | { ScrollX: number } 168 | | { ScrollY: number } 169 | | { Fill: { selector: string; value?: string } } 170 | | { InfiniteScroll: number }; 171 | 172 | export type ReturnFormat = 173 | | "markdown" 174 | | "commonmark" 175 | | "raw" 176 | | "text" 177 | | "html2text" 178 | | "bytes" 179 | | "xml" 180 | | "empty"; 181 | 182 | // Map automation scripts for paths or urls. 183 | export type WebAutomationMap = Record; 184 | // Map execution scripts for paths or urls. 185 | export type ExecutionScriptsMap = Record; 186 | 187 | // The HTTP redirect policy to use. Loose allows all domains and Strict only allows relative requests to the domain. 188 | export enum RedirectPolicy { 189 | Loose = "Loose", 190 | Strict = "Strict", 191 | } 192 | 193 | /** 194 | * Represents the options available for making a spider request. 195 | */ 196 | export interface SpiderParams { 197 | /** 198 | * The URL to be crawled. 199 | */ 200 | url: string; 201 | 202 | /** 203 | * The type of request to be made. 204 | */ 205 | request?: "http" | "chrome" | "smart"; 206 | 207 | /** 208 | * The maximum number of pages the crawler should visit. 209 | */ 210 | limit?: number; 211 | 212 | /** 213 | * The format in which the result should be returned. When setting the return format as an array a object is returned mapping by the name. 214 | */ 215 | return_format?: ReturnFormat | ReturnFormat[]; 216 | 217 | /** 218 | * Specifies whether to only visit the top-level domain. 219 | */ 220 | tld?: boolean; 221 | 222 | /** 223 | * The depth of the crawl. 224 | */ 225 | depth?: number; 226 | 227 | /** 228 | * Specifies whether the request should be cached. 229 | */ 230 | cache?: boolean; 231 | 232 | /** 233 | * The budget for various resources. 234 | */ 235 | budget?: Budget; 236 | 237 | /** 238 | * The blacklist routes to ignore. This can be a Regex string pattern. 239 | */ 240 | blacklist?: string[]; 241 | 242 | /** 243 | * The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing. 244 | */ 245 | whitelist?: string[]; 246 | 247 | /** 248 | * The locale to be used during the crawl. 249 | */ 250 | locale?: string; 251 | 252 | /** 253 | * The cookies to be set for the request, formatted as a single string. 
254 | */ 255 | cookies?: string; 256 | 257 | /** 258 | * Specifies whether to use stealth techniques to avoid detection. 259 | */ 260 | stealth?: boolean; 261 | 262 | /** 263 | * The headers to be used for the request. 264 | */ 265 | headers?: Headers; 266 | 267 | /** 268 | * Specifies whether anti-bot measures should be used. 269 | */ 270 | anti_bot?: boolean; 271 | 272 | /** 273 | * Specifies whether to include metadata in the response. 274 | */ 275 | metadata?: boolean; 276 | 277 | /** 278 | * Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page. 279 | */ 280 | css_extraction_map?: CSSExtractionMap; 281 | 282 | /** 283 | * The dimensions of the viewport. 284 | */ 285 | viewport?: Viewport; 286 | 287 | /** 288 | * The encoding to be used for the request. 289 | */ 290 | encoding?: "UTF-8" | "SHIFT_JIS" | string; 291 | 292 | /** 293 | * Specifies whether to include subdomains in the crawl. 294 | */ 295 | subdomains?: boolean; 296 | 297 | /** 298 | * The user agent string to be used for the request. 299 | */ 300 | user_agent?: string; 301 | 302 | /** 303 | * Specifies whether the response data should be stored. 304 | */ 305 | store_data?: boolean; 306 | 307 | /** 308 | * Use webhooks to send data. 309 | */ 310 | webhooks?: WebhookSettings; 311 | /** 312 | * Configuration settings for GPT (general purpose texture mappings). 313 | */ 314 | gpt_config?: Record; 315 | 316 | /** 317 | * Specifies whether to use fingerprinting protection. 318 | */ 319 | fingerprint?: boolean; 320 | 321 | /** 322 | * Specifies whether to perform the request without using storage. 323 | */ 324 | storageless?: boolean; 325 | 326 | /** 327 | * Specifies whether readability optimizations should be applied. 328 | */ 329 | readability?: boolean; 330 | 331 | /** 332 | * Specifies whether to use a proxy for the request. 333 | */ 334 | proxy_enabled?: boolean; 335 | 336 | /** 337 | * Specifies whether to respect the site's robots.txt file. 338 | */ 339 | respect_robots?: boolean; 340 | 341 | /** 342 | * CSS root selector to be used to filter the content. 343 | */ 344 | root_selector?: string; 345 | 346 | /** 347 | * Specifies whether to load all resources of the crawl target. 348 | */ 349 | full_resources?: boolean; 350 | 351 | /** 352 | * Specifies whether to use the sitemap links. 353 | */ 354 | sitemap?: boolean; 355 | 356 | /** 357 | * Specifies whether to only use the sitemap links. 358 | */ 359 | sitemap_only?: boolean; 360 | 361 | /** 362 | * External domains to include the crawl. 363 | */ 364 | 365 | external_domains?: string[]; 366 | 367 | /** 368 | * Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`. 369 | */ 370 | return_embeddings?: boolean; 371 | 372 | /** 373 | * Returns the HTTP response headers used. 374 | */ 375 | return_headers?: boolean; 376 | 377 | /** 378 | * Returns the link(s) found on the page that match the crawler query. 379 | */ 380 | return_page_links?: boolean; 381 | 382 | /** 383 | * Returns the HTTP response cookies used. 384 | */ 385 | return_cookies?: boolean; 386 | 387 | /** 388 | * The timeout for the request, in milliseconds. 389 | */ 390 | request_timeout?: number; 391 | 392 | /** 393 | * Specifies whether to run the request in the background. 394 | */ 395 | run_in_background?: boolean; 396 | 397 | /** 398 | * Perform an infinite scroll on the page as new content arises. 
The request param also needs to be set to 'chrome' or 'smart'. 399 | */ 400 | 401 | scroll?: number; 402 | 403 | /** 404 | * Specifies whether to skip configuration checks. 405 | */ 406 | skip_config_checks?: boolean; 407 | 408 | /** 409 | * The chunking algorithm to use. 410 | */ 411 | chunking_alg?: ChunkingAlg; 412 | 413 | /** 414 | * The wait for events on the page. You need to make your `request` `chrome` or `smart`. 415 | */ 416 | wait_for?: WaitFor; 417 | 418 | /** 419 | * Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content. 420 | */ 421 | disable_intercept?: boolean; 422 | 423 | /** 424 | * Perform custom web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 425 | */ 426 | automation_scripts?: WebAutomationMap; 427 | 428 | /** 429 | * Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 430 | */ 431 | execution_scripts?: ExecutionScriptsMap; 432 | 433 | /** 434 | * The redirect policy for HTTP request. Set the value to Loose to allow all. 435 | */ 436 | redirect_policy?: RedirectPolicy; 437 | 438 | /** 439 | * Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent. 440 | */ 441 | event_tracker?: { 442 | responses?: true; 443 | requests?: true; 444 | }; 445 | 446 | /** 447 | * The timeout to stop the crawl. 448 | */ 449 | crawl_timeout?: Timeout; 450 | 451 | /** 452 | * Evaluates given script in every frame upon creation (before loading frame's scripts). 453 | */ 454 | evaluate_on_new_document?: string; 455 | } 456 | 457 | // Core actions response type. 458 | export type SpiderCoreResponse = { 459 | // The content of the request like html or transformation markdown etc. 460 | content?: string; 461 | // A detailed message of a response. 462 | message?: string; 463 | // If an error occured. 464 | error?: string; 465 | // The HTTP status code. 466 | status?: number; 467 | // The website url. 468 | url?: string; 469 | }; 470 | 471 | export type ChunkCallbackFunction = (data: SpiderCoreResponse) => void; 472 | 473 | // records that you can query 474 | export enum Collection { 475 | Websites = "websites", 476 | Pages = "pages", 477 | PagesMetadata = "pages_metadata", 478 | // Leads 479 | Contacts = "contacts", 480 | CrawlState = "crawl_state", 481 | CrawlLogs = "crawl_logs", 482 | Profiles = "profiles", 483 | Credits = "credits", 484 | Webhooks = "webhooks", 485 | APIKeys = "api_keys", 486 | } 487 | 488 | // The API version for Spider 489 | export enum ApiVersion { 490 | V1 = "v1", 491 | } 492 | 493 | // The API routes paths. 494 | export enum APIRoutes { 495 | // Crawl a website to collect the contents. Can be one page or many. 496 | Crawl = "crawl", 497 | // Crawl a website to collect the links. Can be one page or many. 498 | Links = "links", 499 | // Crawl a website to collect screenshots. Can be one page or many. 500 | Screenshot = "screenshot", 501 | // Search for something and optionally crawl the pages or get the results of the search. 502 | Search = "search", 503 | // Transform HTML to markdown or text. 504 | Transform = "transform", 505 | // Pipeline extract leads for a website - emails, phones, etc. 506 | PiplineExtractLeads = "pipeline/extract-contacts", 507 | // Pipeline label a website by category using AI and metadata. 
508 | PiplineLabel = "pipeline/label", 509 | // Dynamic collection routes. 510 | Data = "data", 511 | // The last crawl state of a website. 512 | DataCrawlState = "data/crawl_state", 513 | // Sign a file from storage based on the exact url path of the storage or domain - pathname. 514 | DataSignUrl = "data/sign-url", 515 | // Download a file from storage based on the exact url path of the storage or domain - pathname. 516 | DataDownload = "data/download", 517 | // Perform a query on the global database to grab content without crawling if available. 518 | DataQuery = "data/query", 519 | // Get the credits remaining for an account. 520 | DataCredits = "data/credits", 521 | } 522 | 523 | // The base API target info for Spider Cloud. 524 | export const APISchema = { 525 | url: "https://api.spider.cloud", 526 | versions: { 527 | current: ApiVersion.V1, 528 | v1: { 529 | routes: APIRoutes, 530 | end_date: "", 531 | }, 532 | latest: { 533 | routes: APIRoutes, 534 | end_date: "", 535 | }, 536 | }, 537 | }; 538 | 539 | // Adjust the Spider Cloud endpoint. 540 | export const setBaseUrl = (url: string) => { 541 | if (url) { 542 | APISchema["url"] = url; 543 | } 544 | }; 545 | -------------------------------------------------------------------------------- /javascript/src/index.ts: -------------------------------------------------------------------------------- 1 | export { Spider } from "./client"; 2 | export { Collection, setBaseUrl, APISchema } from "./config"; 3 | export type { SpiderParams, Budget, Viewport, QueryRequest } from "./config"; 4 | -------------------------------------------------------------------------------- /javascript/src/utils/process-chunk.ts: -------------------------------------------------------------------------------- 1 | import type { SpiderCoreResponse } from "../config"; 2 | 3 | export const createJsonLineProcessor = ( 4 | cb: (r: SpiderCoreResponse) => void 5 | ) => { 6 | let buffer = ""; 7 | 8 | return (chunk: Buffer | string) => { 9 | buffer += chunk.toString(); 10 | let boundary: number; 11 | 12 | while ((boundary = buffer.indexOf("\n")) !== -1) { 13 | const line = buffer.slice(0, boundary); 14 | buffer = buffer.slice(boundary + 1); 15 | 16 | if (line.trim()) { 17 | try { 18 | cb(JSON.parse(line)); 19 | } catch (_error) {} 20 | } 21 | } 22 | }; 23 | }; 24 | -------------------------------------------------------------------------------- /javascript/src/utils/stream-reader.ts: -------------------------------------------------------------------------------- 1 | import type { ChunkCallbackFunction } from "../config"; 2 | import { createJsonLineProcessor } from "./process-chunk"; 3 | 4 | // Stream the response via callbacks. 
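// The body is read incrementally with a ReadableStream reader, each chunk is decoded as UTF-8,
// and complete JSON lines are parsed and forwarded to the callback (see createJsonLineProcessor);
// a final decode flushes any bytes still buffered in the decoder.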
5 | export const streamReader = async ( 6 | res: Response, 7 | cb: ChunkCallbackFunction 8 | ) => { 9 | if (res.ok) { 10 | const reader = res.body?.getReader(); 11 | const decoder = new TextDecoder(); 12 | const processChunk = createJsonLineProcessor(cb); 13 | 14 | if (reader) { 15 | while (true) { 16 | const { done, value } = await reader.read(); 17 | 18 | if (done) { 19 | break; 20 | } 21 | 22 | const chunk = decoder.decode(value, { stream: true }); 23 | processChunk(chunk); 24 | } 25 | 26 | processChunk(decoder.decode(new Uint8Array(), { stream: false })); 27 | } 28 | } 29 | }; 30 | -------------------------------------------------------------------------------- /javascript/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2019", 4 | "module": "commonjs", 5 | "moduleResolution": "node", 6 | "lib": ["es5", "es6", "dom"], 7 | "outDir": "./dist", 8 | "rootDir": "./src", 9 | "strict": true, 10 | "esModuleInterop": true, 11 | "skipLibCheck": true, 12 | "forceConsistentCasingInFileNames": true, 13 | "resolveJsonModule": true, 14 | "allowSyntheticDefaultImports": true, 15 | "experimentalDecorators": true, 16 | "emitDecoratorMetadata": true, 17 | "declaration": true 18 | }, 19 | "include": ["src/**/*"], 20 | "exclude": ["node_modules", "**/*.test.ts"] 21 | } 22 | -------------------------------------------------------------------------------- /python/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud Python SDK 2 | 3 | The Spider Cloud Python SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API. 4 | 5 | ## Installation 6 | 7 | To install the Spider Cloud Python SDK, you can use pip: 8 | 9 | ```bash 10 | pip install spider_client 11 | ``` 12 | 13 | ## Usage 14 | 15 | 1. Get an API key from [spider.cloud](https://spider.cloud) 16 | 2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as a parameter to the `Spider` class. 
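If you prefer the environment-variable route, here is a minimal sketch (the client falls back to `SPIDER_API_KEY` when no key is passed in):

```python
import os

from spider import Spider

# Assumes the key was exported beforehand, e.g. `export SPIDER_API_KEY=your_api_key`.
assert os.getenv("SPIDER_API_KEY"), "SPIDER_API_KEY is not set"

app = Spider()  # picks up SPIDER_API_KEY from the environment
```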
17 | 18 | Here's an example of how to use the SDK: 19 | 20 | ```python 21 | from spider import Spider 22 | 23 | # Initialize the Spider with your API key 24 | app = Spider(api_key='your_api_key') 25 | 26 | # Scrape a single URL 27 | url = 'https://spider.cloud' 28 | scraped_data = app.scrape_url(url) 29 | 30 | # Crawl a website 31 | crawler_params = { 32 | 'limit': 1, 33 | 'proxy_enabled': True, 34 | 'store_data': False, 35 | 'metadata': False, 36 | 'request': 'http' 37 | } 38 | crawl_result = app.crawl_url(url, params=crawler_params) 39 | ``` 40 | 41 | ### Scraping a URL 42 | 43 | To scrape data from a single URL: 44 | 45 | ```python 46 | url = 'https://example.com' 47 | scraped_data = app.scrape_url(url) 48 | ``` 49 | 50 | ### Crawling a Website 51 | 52 | To automate crawling a website: 53 | 54 | ```python 55 | url = 'https://example.com' 56 | crawl_params = { 57 | 'limit': 200, 58 | 'request': 'smart' 59 | } 60 | crawl_result = app.crawl_url(url, params=crawl_params) 61 | ``` 62 | 63 | #### Crawl Streaming 64 | 65 | Stream the crawl results back in chunks to handle large crawls: 66 | 67 | ```python 68 | def handle_json(json_obj: dict) -> None: 69 | assert json_obj["url"] is not None 70 | 71 | url = 'https://example.com' 72 | crawl_params = { 73 | 'limit': 200, 74 | 'store_data': False 75 | } 76 | response = app.crawl_url( 77 | url, 78 | params=crawl_params, 79 | stream=True, 80 | callback=handle_json, 81 | ) 82 | ``` 83 | 84 | ### Search 85 | 86 | Perform a search for websites to crawl or gather search results: 87 | 88 | ```python 89 | query = 'a sports website' 90 | crawl_params = { 91 | 'request': 'smart', 92 | 'search_limit': 5, 93 | 'limit': 5, 94 | 'fetch_page_content': True 95 | } 96 | crawl_result = app.search(query, params=crawl_params) 97 | ``` 98 | 99 | ### Retrieving Links from a URL(s) 100 | 101 | Extract all links from a specified URL: 102 | 103 | ```python 104 | url = 'https://example.com' 105 | links = app.links(url) 106 | ``` 107 | 108 | ### Transform 109 | 110 | Transform HTML to markdown or text lightning fast: 111 | 112 | ```python 113 | data = [ { 'html': '<html><body><h1>Hello world</h1></body></html>
' } ] 114 | params = { 115 | 'readability': False, 116 | 'return_format': 'markdown', 117 | } 118 | result = app.transform(data, params=params) 119 | ``` 120 | 121 | ### Taking Screenshots of a URL(s) 122 | 123 | Capture a screenshot of a given URL: 124 | 125 | ```python 126 | url = 'https://example.com' 127 | screenshot = app.screenshot(url) 128 | ``` 129 | 130 | ### Extracting Contact Information 131 | 132 | Extract contact details from a specified URL: 133 | 134 | ```python 135 | url = 'https://example.com' 136 | contacts = app.extract_contacts(url) 137 | ``` 138 | 139 | ### Labeling Data from a URL(s) 140 | 141 | Label the data extracted from a particular URL: 142 | 143 | ```python 144 | url = 'https://example.com' 145 | labeled_data = app.label(url) 146 | ``` 147 | 148 | ### Checking Crawl State 149 | 150 | You can check the crawl state of the website: 151 | 152 | ```python 153 | url = 'https://example.com' 154 | state = app.get_crawl_state(url) 155 | ``` 156 | 157 | ### Downloading files 158 | 159 | You can download the results of the website: 160 | 161 | ```python 162 | url = 'https://example.com' 163 | params = { 164 | 'page': 0, 165 | 'limit': 100, 166 | 'expiresIn': 3600 # Optional, add if needed 167 | } 168 | stream = True 169 | 170 | state = app.create_signed_url(url, params, stream) 171 | ``` 172 | 173 | ### Checking Available Credits 174 | 175 | You can check the remaining credits on your account: 176 | 177 | ```python 178 | credits = app.get_credits() 179 | ``` 180 | 181 | ### Data Operations 182 | 183 | The Spider client can now interact with specific data tables to create, retrieve, and delete data. 184 | 185 | #### Retrieve Data from a Table 186 | 187 | To fetch data from a specified table by applying query parameters: 188 | 189 | ```python 190 | table_name = 'pages' 191 | query_params = {'limit': 20 } 192 | response = app.data_get(table_name, query_params) 193 | print(response) 194 | ``` 195 | 196 | #### Delete Data from a Table 197 | 198 | To delete data from a specified table based on certain conditions: 199 | 200 | ```python 201 | table_name = 'websites' 202 | delete_params = {'domain': 'www.example.com'} 203 | response = app.data_delete(table_name, delete_params) 204 | print(response) 205 | ``` 206 | 207 | ## Streaming 208 | 209 | If you need to stream the request use the third param: 210 | 211 | ```python 212 | url = 'https://example.com' 213 | 214 | crawler_params = { 215 | 'limit': 1, 216 | 'proxy_enabled': True, 217 | 'store_data': False, 218 | 'metadata': False, 219 | 'request': 'http' 220 | } 221 | 222 | links = app.links(url, crawler_params, True) 223 | ``` 224 | 225 | ## Content-Type 226 | 227 | The following Content-type headers are supported using the fourth param: 228 | 229 | 1. `application/json` 230 | 1. `text/csv` 231 | 1. `application/xml` 232 | 1. `application/jsonl` 233 | 234 | ```python 235 | url = 'https://example.com' 236 | 237 | crawler_params = { 238 | 'limit': 1, 239 | 'proxy_enabled': True, 240 | 'store_data': False, 241 | 'metadata': False, 242 | 'request': 'http' 243 | } 244 | 245 | # stream json lines back to the client 246 | links = app.crawl(url, crawler_params, True, "application/jsonl") 247 | ``` 248 | 249 | ## Error Handling 250 | 251 | The SDK handles errors returned by the Spider Cloud API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. 252 | 253 | ## Contributing 254 | 255 | Contributions to the Spider Cloud Python SDK are welcome! 
If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. 256 | 257 | ## License 258 | 259 | The Spider Cloud Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 260 | -------------------------------------------------------------------------------- /python/example.py: -------------------------------------------------------------------------------- 1 | 2 | from spider import Spider 3 | 4 | # Initialize the Spider with your API key using the env key SPIDER_API_KEY 5 | app = Spider() 6 | 7 | crawler_params = { 8 | 'limit': 5, 9 | 'proxy_enabled': False, 10 | 'store_data': False, 11 | 'metadata': False, 12 | 'request': 'http' 13 | } 14 | crawl_result = app.crawl_url('https://spider.cloud', params=crawler_params) 15 | 16 | print(crawl_result) -------------------------------------------------------------------------------- /python/example_async.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from spider import AsyncSpider 3 | 4 | crawler_params = { 5 | 'limit': 1, 6 | 'proxy_enabled': True, 7 | 'store_data': False, 8 | 'metadata': False, 9 | 'request': 'http' 10 | } 11 | 12 | 13 | # A callback 14 | def process_json(data: dict) -> None: 15 | print("Processing data:") 16 | for key, value in data.items(): 17 | print(f"{key}: {value}") 18 | 19 | 20 | async def crawl_url(): 21 | # Initialize the AsyncSpider 22 | spider = AsyncSpider() 23 | 24 | # URL to crawl 25 | url = 'https://spider.cloud' 26 | 27 | # For non-streaming usage: 28 | print("Non-streaming crawl:") 29 | async for result in spider.crawl_url(url, params=crawler_params, stream=False): 30 | print(result) 31 | 32 | # For streaming usage with a callback: 33 | print("\nStreaming crawl with callback:") 34 | async for _ in spider.crawl_url(url, params=crawler_params, stream=True, callback=process_json): 35 | pass # The callback function handles the data processing 36 | 37 | # For streaming usage without a callback (just prints the response headers): 38 | print("\nStreaming crawl without callback:") 39 | async for chunk in spider.crawl_url(url, params=crawler_params, stream=True): 40 | print(f"Received chunk: {chunk}") 41 | 42 | 43 | asyncio.run(crawl_url()) 44 | 45 | 46 | async def scrape_url(): 47 | # Initialize the AsyncSpider 48 | spider = AsyncSpider() 49 | 50 | # URL to crawl 51 | url = 'https://spider.cloud' 52 | 53 | # For non-streaming usage: 54 | print("Non-streaming scrape:") 55 | async for result in spider.scrape_url(url, params=crawler_params, stream=False): 56 | print(result) 57 | 58 | # For streaming usage without a callback (just prints the response headers): 59 | print("\nStreaming scrape without callback:") 60 | async for chunk in spider.scrape_url(url, params=crawler_params, stream=True): 61 | print(f"Received chunk: {chunk}") 62 | 63 | 64 | async def links(): 65 | # Initialize the AsyncSpider 66 | spider = AsyncSpider() 67 | 68 | # URL to crawl 69 | url = 'https://spider.cloud' 70 | 71 | # For non-streaming usage: 72 | print("Non-streaming links:") 73 | async for result in spider.links(url, params=crawler_params, stream=False): 74 | print(result) 75 | 76 | # For streaming usage without a callback (just prints the response headers): 77 | print("\nStreaming links without callback:") 78 | async for chunk in spider.links(url, params=crawler_params, stream=True): 79 | print(f"Received chunk: {chunk}") 80 | 81 | 82 | async def screenshot(): 83 | # 
Initialize the AsyncSpider 84 | spider = AsyncSpider() 85 | 86 | # URL to crawl 87 | url = 'https://spider.cloud' 88 | 89 | # For non-streaming usage: 90 | print("Non-streaming screenshot:") 91 | async for result in spider.screenshot(url, params=crawler_params, stream=False): 92 | print(result) 93 | 94 | # For streaming usage without a callback (just prints the response headers): 95 | print("\nStreaming screenshot without callback:") 96 | async for chunk in spider.screenshot(url, params=crawler_params, stream=True): 97 | print(f"Received chunk: {chunk}") 98 | 99 | 100 | async def search(): 101 | # Initialize the AsyncSpider 102 | spider = AsyncSpider() 103 | 104 | # Search term 105 | q = "what is spider cloud?" 106 | 107 | # For non-streaming usage: 108 | print("Non-streaming search:") 109 | async for result in spider.search(q=q, params=crawler_params, stream=False): 110 | print(result) 111 | 112 | # For streaming usage without a callback (just prints the response headers): 113 | print("\nStreaming search without callback:") 114 | async for chunk in spider.search(q=q,params=crawler_params, stream=True): 115 | print(f"Received chunk: {chunk}") 116 | 117 | 118 | async def transform(): 119 | # Initialize the AsyncSpider 120 | spider = AsyncSpider() 121 | 122 | # URL to crawl 123 | url = 'https://spider.cloud' 124 | 125 | # Get html 126 | async for result in spider.crawl_url(url=url, params=crawler_params, stream=False): 127 | data = result 128 | 129 | data[0]['html'] = data[0]['content'] # ! Transform endpoint expects html, not content 130 | print("Non-streaming transform:") 131 | async for result in spider.transform(data=data, params=crawler_params, stream=False): 132 | print(result) 133 | 134 | # For streaming usage without a callback (just prints the response headers): 135 | print("\nStreaming transform without callback:") 136 | async for chunk in spider.transform(data=data,params=crawler_params, stream=True): 137 | print(f"Received chunk: {chunk}") 138 | 139 | 140 | async def contacts(): 141 | # Initialize the AsyncSpider 142 | spider = AsyncSpider() 143 | 144 | # URL to crawl 145 | url = 'https://spider.cloud' 146 | 147 | # For non-streaming usage: 148 | print("Non-streaming contacts:") 149 | async for result in spider.extract_contacts(url, params=crawler_params, stream=False): 150 | print(result) 151 | 152 | # For streaming usage without a callback (just prints the response headers): 153 | print("\nStreaming contacts without callback:") 154 | async for chunk in spider.extract_contacts(url, params=crawler_params, stream=True): 155 | print(f"Received chunk: {chunk}") 156 | 157 | 158 | async def credits(): 159 | # Initialize the AsyncSpider 160 | spider = AsyncSpider() 161 | 162 | async for result in spider.get_credits(): 163 | print(result) 164 | 165 | 166 | async def data_get(): 167 | spider = AsyncSpider() 168 | 169 | async for result in spider.data_get("websites", params=crawler_params): 170 | print(result) 171 | 172 | 173 | async def data_delete(): 174 | spider = AsyncSpider() 175 | 176 | async for result in spider.data_delete("websites", params=crawler_params): 177 | print(result) 178 | 179 | if __name__ == "__main__": 180 | asyncio.run(crawl_url()) 181 | asyncio.run(scrape_url()) 182 | asyncio.run(links()) 183 | asyncio.run(screenshot()) 184 | asyncio.run(search()) 185 | asyncio.run(transform()) 186 | asyncio.run(contacts()) 187 | asyncio.run(credits()) 188 | asyncio.run(data_get()) 189 | asyncio.run(data_delete()) 190 | 191 | 192 | 
-------------------------------------------------------------------------------- /python/example_streaming.py: -------------------------------------------------------------------------------- 1 | 2 | from spider import Spider 3 | 4 | # Initialize the Spider with your API key using the env key SPIDER_API_KEY 5 | app = Spider() 6 | 7 | crawler_params = { 8 | 'limit': 1000, 9 | 'proxy_enabled': False, 10 | 'store_data': False, 11 | 'metadata': False, 12 | 'request': 'http' 13 | } 14 | 15 | count = [0] 16 | 17 | def process_json(data: dict) -> None: 18 | print(f"Processing: {count[0]}") 19 | count[0] += 1 20 | for key, value in data.items(): 21 | print(f"{key}: {value}") 22 | 23 | app.crawl_url('https://spider.cloud', params=crawler_params, stream=True, callback=process_json) 24 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-asyncio 3 | python-dotenv 4 | aiohttp 5 | python-dotenv 6 | ijson 7 | tenacity -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | 5 | def read_file(fname): 6 | return open(os.path.join(os.path.dirname(__file__), fname), encoding="utf-8").read() 7 | 8 | 9 | setup( 10 | name="spider_client", 11 | version="0.1.36", 12 | url="https://github.com/spider-rs/spider-clients/tree/main/python", 13 | license="MIT", 14 | author="Spider", 15 | author_email="jeff@spider.cloud", 16 | description="Python SDK for Spider Cloud API", 17 | packages=find_packages(), 18 | install_requires=["requests", "ijson", "tenacity", "aiohttp"], 19 | long_description=read_file("README.md"), 20 | long_description_content_type="text/markdown", 21 | classifiers=[ 22 | "Development Status :: 5 - Production/Stable", 23 | "Intended Audience :: Developers", 24 | "Intended Audience :: Information Technology", 25 | "Topic :: Software Development :: Libraries :: Python Modules", 26 | "Topic :: Internet", 27 | "Topic :: Internet :: WWW/HTTP", 28 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search", 29 | "Operating System :: OS Independent", 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /python/spider/__init__.py: -------------------------------------------------------------------------------- 1 | from .spider import Spider 2 | from .async_spider import AsyncSpider -------------------------------------------------------------------------------- /python/spider/spider.py: -------------------------------------------------------------------------------- 1 | import os, requests, logging, ijson, tenacity 2 | from typing import Optional, Dict 3 | from spider.spider_types import RequestParamsDict, JsonCallback, QueryRequest 4 | 5 | 6 | class Spider: 7 | def __init__(self, api_key: Optional[str] = None): 8 | """ 9 | Initialize the Spider with an API key. 10 | 11 | :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable. 12 | :raises ValueError: If no API key is provided. 
13 | """ 14 | self.api_key = api_key or os.getenv("SPIDER_API_KEY") 15 | if self.api_key is None: 16 | raise ValueError("No API key provided") 17 | 18 | @tenacity.retry( 19 | wait=tenacity.wait_exponential(multiplier=1, min=1, max=60), 20 | stop=tenacity.stop_after_attempt(5) 21 | ) 22 | def api_post( 23 | self, 24 | endpoint: str, 25 | data: dict, 26 | stream: bool = False, 27 | content_type: str = "application/json", 28 | ): 29 | """ 30 | Send a POST request to the specified API endpoint. 31 | 32 | :param endpoint: The API endpoint to which the POST request is sent. 33 | :param data: The data (dictionary) to be sent in the POST request. 34 | :param stream: Boolean indicating if the response should be streamed. 35 | :return: The JSON response or the raw response stream if stream is True. 36 | """ 37 | headers = self._prepare_headers(content_type) 38 | response = self._post_request( 39 | f"https://api.spider.cloud/{endpoint}", data, headers, stream 40 | ) 41 | if stream: 42 | return response 43 | elif 200 <= response.status_code < 300: 44 | return response.json() 45 | else: 46 | self._handle_error(response, f"post to {endpoint}") 47 | 48 | @tenacity.retry( 49 | wait=tenacity.wait_exponential(multiplier=1, min=1, max=60), 50 | stop=tenacity.stop_after_attempt(5) 51 | ) 52 | def api_get( 53 | self, 54 | endpoint: str, 55 | params: Optional[dict] = None, 56 | stream: bool = False, 57 | content_type: str = "application/json", 58 | ): 59 | """ 60 | Send a GET request to the specified endpoint. 61 | 62 | :param endpoint: The API endpoint from which to retrieve data. 63 | :param params: Query parameters to attach to the URL. 64 | :return: The JSON decoded response. 65 | """ 66 | headers = self._prepare_headers(content_type) 67 | response = requests.get( 68 | f"https://api.spider.cloud/{endpoint}", 69 | headers=headers, 70 | params=params, 71 | stream=stream, 72 | ) 73 | if 200 <= response.status_code < 300: 74 | return response.json() 75 | else: 76 | self._handle_error(response, f"get from {endpoint}") 77 | 78 | @tenacity.retry( 79 | wait=tenacity.wait_exponential(multiplier=1, min=1, max=60), 80 | stop=tenacity.stop_after_attempt(5) 81 | ) 82 | def api_delete( 83 | self, 84 | endpoint: str, 85 | params: Optional[RequestParamsDict] = None, 86 | stream: Optional[bool] = False, 87 | content_type: Optional[str] = "application/json", 88 | ): 89 | """ 90 | Send a DELETE request to the specified endpoint. 91 | 92 | :param endpoint: The API endpoint from which to retrieve data. 93 | :param params: Optional parameters to include in the DELETE request. 94 | :param stream: Boolean indicating if the response should be streamed. 95 | :param content_type: The content type of the request. 96 | :return: The JSON decoded response. 97 | """ 98 | headers = self._prepare_headers(content_type) 99 | response = self._delete_request( 100 | f"https://api.spider.cloud/v1/{endpoint}", 101 | headers=headers, 102 | json=params, 103 | stream=stream, 104 | ) 105 | if 200 <= response.status_code < 300: 106 | return response.json() 107 | else: 108 | self._handle_error(response, f"delete from {endpoint}") 109 | 110 | def scrape_url( 111 | self, 112 | url: str, 113 | params: Optional[RequestParamsDict] = None, 114 | stream: bool = False, 115 | content_type: str = "application/json", 116 | ): 117 | """ 118 | Scrape data from the specified URL. 119 | 120 | :param url: The URL from which to scrape data. 121 | :param params: Optional dictionary of additional parameters for the scrape request. 
122 | :return: JSON response containing the scraping results. 123 | """ 124 | return self.api_post( 125 | "crawl", {"url": url, "limit": 1, **(params or {})}, stream, content_type 126 | ) 127 | 128 | def crawl_url( 129 | self, 130 | url: str, 131 | params: Optional[RequestParamsDict], 132 | stream: Optional[bool] = False, 133 | content_type: Optional[str] = "application/json", 134 | callback: Optional[JsonCallback] = None, 135 | ): 136 | """ 137 | Start crawling at the specified URL. 138 | 139 | :param url: The URL to begin crawling. 140 | :param params: Optional dictionary with additional parameters to customize the crawl. 141 | :param stream: Optional Boolean indicating if the response should be streamed. Defaults to False. 142 | :param content_type: Optional str to determine the content-type header of the request. 143 | :param callback: Optional callback to use with streaming. This will only send the data via callback. 144 | 145 | :return: JSON response or the raw response stream if streaming enabled. 146 | """ 147 | jsonl = stream and callable(callback) 148 | 149 | if jsonl: 150 | content_type = "application/jsonl" 151 | 152 | response = self.api_post( 153 | "crawl", {"url": url, **(params or {})}, stream, content_type 154 | ) 155 | 156 | if jsonl: 157 | return self.stream_reader(response, callback) 158 | else: 159 | return response 160 | 161 | def links( 162 | self, 163 | url: str, 164 | params: Optional[RequestParamsDict] = None, 165 | stream: bool = False, 166 | content_type: str = "application/json", 167 | ): 168 | """ 169 | Retrieve links from the specified URL. 170 | 171 | :param url: The URL from which to extract links. 172 | :param params: Optional parameters for the link retrieval request. 173 | :return: JSON response containing the links. 174 | """ 175 | return self.api_post( 176 | "links", {"url": url, **(params or {})}, stream, content_type 177 | ) 178 | 179 | def screenshot( 180 | self, 181 | url: str, 182 | params: Optional[RequestParamsDict] = None, 183 | stream: bool = False, 184 | content_type: str = "application/json", 185 | ): 186 | """ 187 | Take a screenshot of the specified URL. 188 | 189 | :param url: The URL to capture a screenshot from. 190 | :param params: Optional parameters to customize the screenshot capture. 191 | :return: JSON response with screenshot data. 192 | """ 193 | return self.api_post( 194 | "screenshot", {"url": url, **(params or {})}, stream, content_type 195 | ) 196 | 197 | def search( 198 | self, 199 | q: str, 200 | params: Optional[RequestParamsDict] = None, 201 | stream: bool = False, 202 | content_type: str = "application/json", 203 | ): 204 | """ 205 | Perform a search and gather a list of websites to start crawling and collect resources. 206 | 207 | :param search: The search query. 208 | :param params: Optional parameters to customize the search. 209 | :return: JSON response or the raw response stream if streaming enabled. 210 | """ 211 | return self.api_post( 212 | "search", {"search": q, **(params or {})}, stream, content_type 213 | ) 214 | 215 | def transform( 216 | self, data, params=None, stream=False, content_type="application/json" 217 | ): 218 | """ 219 | Transform HTML to Markdown or text. You can send up to 10MB of data at once. 220 | 221 | :param data: The data to transform a list of objects with the 'html' key and an optional 'url' key only used readability mode. 222 | :param params: Optional parameters to customize the search. 223 | :return: JSON response or the raw response stream if streaming enabled. 
224 | """ 225 | return self.api_post( 226 | "transform", {"data": data, **(params or {})}, stream, content_type 227 | ) 228 | 229 | def extract_contacts( 230 | self, 231 | url: str, 232 | params: Optional[RequestParamsDict] = None, 233 | stream: bool = False, 234 | content_type: str = "application/json", 235 | ): 236 | """ 237 | Extract contact information from the specified URL. 238 | 239 | :param url: The URL from which to extract contact information. 240 | :param params: Optional parameters for the contact extraction. 241 | :return: JSON response containing extracted contact details. 242 | """ 243 | return self.api_post( 244 | "pipeline/extract-contacts", 245 | {"url": url, **(params or {})}, 246 | stream, 247 | content_type, 248 | ) 249 | 250 | def label( 251 | self, 252 | url: str, 253 | params: Optional[RequestParamsDict] = None, 254 | stream: bool = False, 255 | content_type: str = "application/json", 256 | ): 257 | """ 258 | Apply labeling to data extracted from the specified URL. 259 | 260 | :param url: The URL to label data from. 261 | :param params: Optional parameters to guide the labeling process. 262 | :return: JSON response with labeled data. 263 | """ 264 | return self.api_post( 265 | "pipeline/label", {"url": url, **(params or {})}, stream, content_type 266 | ) 267 | 268 | def query( 269 | self, 270 | params: QueryRequest = None, 271 | stream: bool = False, 272 | content_type: str = "application/json", 273 | ): 274 | """ 275 | Query a website resource from our database. This costs 1 credit per successful retrieval. 276 | :param params: Optional parameters to guide the labeling process. 277 | :return: The website contents markup. 278 | """ 279 | return self.api_get("data/query", {**(params or {})}, stream, content_type) 280 | 281 | def download( 282 | self, 283 | url: Optional[str] = None, 284 | params: Optional[Dict[str, int]] = None, 285 | stream: Optional[bool] = True, 286 | ): 287 | """ 288 | Download the file from storage. 289 | 290 | :param url: Optional url of the exact path to specify the storage path. 291 | :param params: Optional dictionary containing configuration parameters, such as: 292 | - 'page': Optional page number for pagination. 293 | - 'limit': Optional page limit for pagination. 294 | - 'domain': Optional domain name to use when url is not known. 295 | - 'pathname': Optional pathname to use when urls is not known. 296 | - 'expiresIn': Optional expiration time for the signed URL. 297 | :param stream: Boolean indicating if the response should be streamed. Defaults to True. 298 | :return: The raw response stream if stream is True. 299 | """ 300 | if url: 301 | params["url"] = url 302 | if params: 303 | params.update(params) 304 | 305 | endpoint = "data/download" 306 | headers = self._prepare_headers("application/octet-stream") 307 | response = self._get_request( 308 | f"https://api.spider.cloud/v1/{endpoint}", headers, stream, params=params 309 | ) 310 | if 200 <= response.status_code < 300: 311 | if stream: 312 | return response.raw 313 | else: 314 | return response.content 315 | else: 316 | self._handle_error(response, f"download from {endpoint}") 317 | 318 | def create_signed_url( 319 | self, 320 | url: Optional[str] = None, 321 | params: Optional[Dict[str, int]] = None, 322 | stream: Optional[bool] = True, 323 | ): 324 | """ 325 | Create a signed url to download files from the storage. 326 | 327 | :param url: Optional url of the exact path to specify the storage path. 
328 | :param params: Optional dictionary containing configuration parameters, such as: 329 | - 'page': Optional page number for pagination. 330 | - 'limit': Optional page limit for pagination. 331 | - 'domain': Optional domain name to use when url is not known. 332 | - 'pathname': Optional pathname to use when urls is not known. 333 | - 'expiresIn': Optional expiration time for the signed URL. 334 | :param stream: Boolean indicating if the response should be streamed. Defaults to True. 335 | :return: The raw response stream if stream is True. 336 | """ 337 | if url: 338 | params["url"] = url 339 | if params: 340 | params.update(params) 341 | 342 | endpoint = "data/sign-url" 343 | headers = self._prepare_headers("application/octet-stream") 344 | response = self._get_request( 345 | f"https://api.spider.cloud/v1/{endpoint}", headers, stream, params=params 346 | ) 347 | if 200 <= response.status_code < 300: 348 | if stream: 349 | return response.raw 350 | else: 351 | return response.content 352 | else: 353 | self._handle_error(response, f"download from {endpoint}") 354 | 355 | def get_crawl_state( 356 | self, 357 | url: str, 358 | params: Optional[RequestParamsDict] = None, 359 | stream: Optional[bool] = False, 360 | content_type: Optional[str] = "application/json", 361 | ): 362 | """ 363 | Retrieve the website active crawl state. 364 | 365 | :return: JSON response of the crawl state and credits used. 366 | """ 367 | payload = {"url": url, "stream": stream, "content_type": content_type} 368 | if params: 369 | payload.update(params) 370 | 371 | return self.api_post("data/crawl_state", payload, stream) 372 | 373 | def get_credits(self): 374 | """ 375 | Retrieve the account's remaining credits. 376 | 377 | :return: JSON response containing the number of credits left. 378 | """ 379 | return self.api_get("data/credits") 380 | 381 | def data_post(self, table: str, data: Optional[RequestParamsDict] = None): 382 | """ 383 | Send data to a specific table via POST request. 384 | :param table: The table name to which the data will be posted. 385 | :param data: A dictionary representing the data to be posted. 386 | :return: The JSON response from the server. 387 | """ 388 | return self.api_post(f"data/{table}", data, stream=False) 389 | 390 | def data_get( 391 | self, 392 | table: str, 393 | params: Optional[RequestParamsDict] = None, 394 | ): 395 | """ 396 | Retrieve data from a specific table via GET request. 397 | :param table: The table name from which to retrieve data. 398 | :param params: Optional parameters to modify the query. 399 | :return: The JSON response from the server. 400 | """ 401 | return self.api_get(f"data/{table}", params) 402 | 403 | def data_delete( 404 | self, 405 | table: str, 406 | params: Optional[RequestParamsDict] = None, 407 | ): 408 | """ 409 | Delete data from a specific table via DELETE request. 410 | :param table: The table name from which data will be deleted. 411 | :param params: Parameters to identify which data to delete. 412 | :return: The JSON response from the server. 
413 | """ 414 | return self.api_delete(f"data/{table}", params=params) 415 | 416 | def stream_reader(self, response, callback): 417 | response.raise_for_status() 418 | 419 | try: 420 | for json_obj in ijson.items(response.raw, "", multiple_values=True): 421 | callback(json_obj) 422 | 423 | except Exception as e: 424 | logging.error(f"An error occurred while parsing JSON: {e}") 425 | 426 | def _prepare_headers(self, content_type: str = "application/json"): 427 | return { 428 | "Content-Type": content_type, 429 | "Authorization": f"Bearer {self.api_key}", 430 | "User-Agent": f"Spider-Client/0.1.36", 431 | } 432 | 433 | def _post_request(self, url: str, data, headers, stream=False): 434 | return requests.post(url, headers=headers, json=data, stream=stream) 435 | 436 | def _get_request(self, url: str, headers, stream=False, params=None): 437 | return requests.get(url, headers=headers, stream=stream, params=params) 438 | 439 | def _delete_request(self, url: str, headers, json=None, stream=False): 440 | return requests.delete(url, headers=headers, json=json, stream=stream) 441 | 442 | def _handle_error(self, response, action): 443 | if response.status_code in [402, 409, 500]: 444 | error_message = response.json().get("error", "Unknown error occurred") 445 | raise Exception( 446 | f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}" 447 | ) 448 | else: 449 | raise Exception( 450 | f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}. Here is the response: {response.text}" 451 | ) 452 | -------------------------------------------------------------------------------- /python/spider/spider.pyi: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Any 2 | from spider_types import RequestParamsDict, QueryRequest 3 | 4 | class Spider: 5 | api_key: str 6 | 7 | def __init__(self, api_key: Optional[str] = None) -> None: ... 8 | def api_post( 9 | self, 10 | endpoint: str, 11 | data: dict, 12 | stream: bool, 13 | content_type: str = "application/json", 14 | ) -> Any: ... 15 | def api_get( 16 | self, endpoint: str, stream: bool, content_type: str = "application/json" 17 | ) -> Any: ... 18 | def api_delete( 19 | self, endpoint: str, stream: bool, content_type: str = "application/json" 20 | ) -> Any: ... 21 | def scrape_url( 22 | self, 23 | url: str, 24 | params: Optional[RequestParamsDict] = None, 25 | stream: bool = False, 26 | content_type: str = "application/json", 27 | ) -> Any: ... 28 | def crawl_url( 29 | self, 30 | url: str, 31 | params: Optional[RequestParamsDict] = None, 32 | stream: bool = False, 33 | content_type: str = "application/json", 34 | ) -> Any: ... 35 | def links( 36 | self, 37 | url: str, 38 | params: Optional[RequestParamsDict] = None, 39 | stream: bool = False, 40 | content_type: str = "application/json", 41 | ) -> Any: ... 42 | def screenshot( 43 | self, 44 | url: str, 45 | params: Optional[RequestParamsDict] = None, 46 | stream: bool = False, 47 | content_type: str = "application/json", 48 | ) -> Any: ... 49 | def search( 50 | self, 51 | q: str, 52 | params: Optional[RequestParamsDict] = None, 53 | stream: bool = False, 54 | content_type: str = "application/json", 55 | ) -> Any: ... 56 | def transform( 57 | self, 58 | data: Any, 59 | params: Optional[RequestParamsDict] = None, 60 | stream: bool = False, 61 | content_type: str = "application/json", 62 | ) -> Any: ... 
63 | def extract_contacts( 64 | self, 65 | url: str, 66 | params: Optional[RequestParamsDict] = None, 67 | stream: bool = False, 68 | content_type: str = "application/json", 69 | ) -> Any: ... 70 | def label( 71 | self, 72 | url: str, 73 | params: Optional[RequestParamsDict] = None, 74 | stream: bool = False, 75 | content_type: str = "application/json", 76 | ) -> Any: ... 77 | def get_crawl_state( 78 | self, 79 | url: str, 80 | params: Optional[RequestParamsDict] = None, 81 | stream: bool = False, 82 | content_type: str = "application/json", 83 | ) -> Any: ... 84 | def query( 85 | self, 86 | params: QueryRequest 87 | ) -> Any: ... 88 | def get_credits(self) -> Any: ... 89 | def data_post( 90 | self, table: str, data: Optional[RequestParamsDict] = None 91 | ) -> Any: ... 92 | def create_signed_url( 93 | self, 94 | domain: Optional[str] = None, 95 | params: Optional[Dict[str, int]] = None, 96 | stream: Optional[bool] = True, 97 | ) -> Any: ... 98 | def data_get( 99 | self, 100 | table: str, 101 | params: Optional[RequestParamsDict] = None, 102 | ) -> Any: ... 103 | def data_delete( 104 | self, 105 | table: str, 106 | params: Optional[RequestParamsDict] = None, 107 | ) -> Any: ... 108 | def _prepare_headers( 109 | self, content_type: str = "application/json" 110 | ) -> Dict[str, str]: ... 111 | def _post_request( 112 | self, url: str, data: Any, headers: Dict[str, str], stream: bool = False 113 | ) -> Any: ... 114 | def _get_request( 115 | self, url: str, headers: Dict[str, str], stream: bool = False 116 | ) -> Any: ... 117 | def _delete_request( 118 | self, url: str, headers: Dict[str, str], stream: bool = False 119 | ) -> Any: ... 120 | def _handle_error(self, response: Any, action: str) -> None: ... 121 | -------------------------------------------------------------------------------- /python/spider/spider_types.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Optional, Dict, List, Union, Literal, Callable 2 | from dataclasses import dataclass, field 3 | 4 | @dataclass 5 | class Evaluate: 6 | code: str 7 | type: str = "Evaluate" 8 | 9 | @dataclass 10 | class Click: 11 | selector: str 12 | type: str = "Click" 13 | 14 | @dataclass 15 | class Wait: 16 | duration: int 17 | type: str = "Wait" 18 | 19 | @dataclass 20 | class WaitForNavigation: 21 | type: str = "WaitForNavigation" 22 | 23 | @dataclass 24 | class WaitFor: 25 | selector: str 26 | type: str = "WaitFor" 27 | 28 | @dataclass 29 | class WaitForAndClick: 30 | selector: str 31 | type: str = "WaitForAndClick" 32 | 33 | @dataclass 34 | class ScrollX: 35 | pixels: int 36 | type: str = "ScrollX" 37 | 38 | @dataclass 39 | class ScrollY: 40 | pixels: int 41 | type: str = "ScrollY" 42 | 43 | @dataclass 44 | class Fill: 45 | selector: str 46 | value: str 47 | type: str = "Fill" 48 | 49 | @dataclass 50 | class InfiniteScroll: 51 | times: int 52 | type: str = "InfiniteScroll" 53 | 54 | WebAutomation = Union[ 55 | Evaluate, 56 | Click, 57 | Wait, 58 | WaitForNavigation, 59 | WaitFor, 60 | WaitForAndClick, 61 | ScrollX, 62 | ScrollY, 63 | Fill, 64 | InfiniteScroll, 65 | ] 66 | 67 | WebAutomationMap = Dict[str, List[WebAutomation]] 68 | ExecutionScriptsMap = Dict[str, str] 69 | 70 | RedirectPolicy = Literal[ 71 | "Loose", 72 | "Strict" 73 | ] 74 | 75 | @dataclass 76 | class QueryRequest: 77 | url: Optional[str] = field(default=None) 78 | domain: Optional[str] = field(default=None) 79 | pathname: Optional[str] = field(default=None) 80 | 81 | 82 | class ChunkingAlgDict(TypedDict): 83 
| # The chunking algorithm to use with the value to chunk by. 84 | type: Literal["ByWords", "ByLines", "ByCharacterLength", "BySentence"] 85 | # The amount to chunk by. 86 | value: int 87 | 88 | 89 | class TimeoutDict(TypedDict): 90 | secs: int 91 | nanos: int 92 | 93 | class EventTracker(TypedDict): 94 | responses: bool 95 | requests: bool 96 | 97 | class IdleNetworkDict(TypedDict): 98 | timeout: TimeoutDict 99 | 100 | 101 | class SelectorDict(TypedDict): 102 | timeout: TimeoutDict 103 | selector: str 104 | 105 | 106 | class DelayDict(TypedDict): 107 | timeout: TimeoutDict 108 | 109 | 110 | class WaitForDict(TypedDict, total=False): 111 | idle_network: Optional[IdleNetworkDict] 112 | selector: Optional[SelectorDict] 113 | delay: Optional[DelayDict] 114 | page_navigations: Optional[bool] 115 | 116 | 117 | @dataclass 118 | class WebhookSettings: 119 | # The destination where the webhook data is sent via HTTP POST. 120 | destination: str 121 | # Flag to trigger an action when all credits are depleted 122 | on_credits_depleted: bool 123 | # Flag to trigger when half of the credits are depleted 124 | on_credits_half_depleted: bool 125 | # Flag to notify on website status update events 126 | on_website_status: bool 127 | # Flag to send information (links, bytes) about a new page find 128 | on_find: bool 129 | # Flag to handle the metadata of a found page 130 | on_find_metadata: bool 131 | 132 | class CSSSelector(TypedDict): 133 | """ 134 | Represents a set of CSS selectors grouped under a common name. 135 | """ 136 | 137 | name: str # The name of the selector group (e.g., "headers") 138 | selectors: List[str] # A list of CSS selectors (e.g., ["h1", "h2", "h3"]) 139 | 140 | 141 | # CSSExtractionMap is a dictionary where: 142 | # - Keys are strings representing paths (e.g., "/blog") 143 | # - Values are lists of CSSSelector items 144 | CSSExtractionMap = Dict[str, List[CSSSelector]] 145 | 146 | ReturnFormat = Literal["raw", "markdown", "commonmark", "html2text", "text", "xml", "bytes"]; 147 | 148 | class RequestParamsDict(TypedDict, total=False): 149 | # The URL to be crawled. 150 | url: Optional[str] 151 | 152 | # The type of request to be made. 153 | request: Optional[Literal["http", "chrome", "smart"]] 154 | 155 | # The maximum number of pages the crawler should visit. 156 | limit: Optional[int] 157 | 158 | # The format in which the result should be returned. 159 | return_format: Optional[ 160 | Union[ 161 | ReturnFormat, 162 | List[ReturnFormat], 163 | ] 164 | ] 165 | 166 | # Specifies whether to only visit the top-level domain. 167 | tld: Optional[bool] 168 | 169 | # The depth of the crawl. 170 | depth: Optional[int] 171 | 172 | # Specifies whether the request should be cached. 173 | cache: Optional[bool] 174 | 175 | # The budget for various resources. 176 | budget: Optional[Dict[str, int]] 177 | 178 | # The blacklist routes to ignore. This can be a Regex string pattern. 179 | blacklist: Optional[List[str]] 180 | 181 | # The whitelist routes to only crawl. This can be a Regex string pattern and used with black_listing. 182 | whitelist: Optional[List[str]] 183 | 184 | # The locale to be used during the crawl. 185 | locale: Optional[str] 186 | 187 | # The cookies to be set for the request, formatted as a single string. 188 | cookies: Optional[str] 189 | 190 | # Specifies whether to use stealth techniques to avoid detection. 191 | stealth: Optional[bool] 192 | 193 | # The headers to be used for the request. 
194 | headers: Optional[Dict[str, str]] 195 | 196 | # Specifies whether anti-bot measures should be used. 197 | anti_bot: Optional[bool] 198 | 199 | # Specifies whether to include metadata in the response. 200 | metadata: Optional[bool] 201 | 202 | # The dimensions of the viewport. 203 | viewport: Optional[Dict[str, int]] 204 | 205 | # The encoding to be used for the request. 206 | encoding: Optional[str] 207 | 208 | # Specifies whether to include subdomains in the crawl. 209 | subdomains: Optional[bool] 210 | 211 | # The user agent string to be used for the request. 212 | user_agent: Optional[str] 213 | 214 | # Specifies whether the response data should be stored. 215 | store_data: Optional[bool] 216 | 217 | # Configuration settings for GPT (general purpose texture mappings). 218 | gpt_config: Optional[Dict] 219 | 220 | # Specifies whether to use fingerprinting protection. 221 | fingerprint: Optional[bool] 222 | 223 | # Use CSS query selectors to scrape contents from the web page. Set the paths and the CSS extraction object map to perform extractions per path or page. 224 | css_extraction_map: Optional[CSSExtractionMap] 225 | 226 | # Specifies whether to perform the request without using storage. 227 | storageless: Optional[bool] 228 | 229 | # Specifies whether readability optimizations should be applied. 230 | readability: Optional[bool] 231 | 232 | # Specifies whether to use a proxy for the request. 233 | proxy_enabled: Optional[bool] 234 | 235 | # Specifies whether to respect the site's robots.txt file. 236 | respect_robots: Optional[bool] 237 | 238 | # CSS selector to be used to filter the content. 239 | root_selector: Optional[str] 240 | 241 | # Specifies whether to load all resources of the crawl target. 242 | full_resources: Optional[bool] 243 | 244 | # Specifies whether to use the sitemap links. 245 | sitemap: Optional[bool] 246 | 247 | # Specifies whether to only use the sitemap links. 248 | sitemap_only: Optional[bool] 249 | 250 | # External domains to include in the crawl. 251 | external_domains: Optional[List[str]] 252 | 253 | # Returns the OpenAI embeddings for the title and description. Other values, such as keywords, may also be included. Requires the `metadata` parameter to be set to `true`. 254 | return_embeddings: Optional[bool] 255 | 256 | # Use webhooks to send data to another location via POST. 257 | webhooks: Optional[WebhookSettings] 258 | 259 | # Returns the link(s) found on the page that match the crawler query. 260 | return_page_links: Optional[bool] 261 | 262 | # Returns the HTTP response headers used. 263 | return_headers: Optional[bool] 264 | 265 | # Returns the HTTP response cookies used. 266 | return_cookies: Optional[bool] 267 | 268 | # The timeout for the request, in milliseconds. 269 | request_timeout: Optional[int] 270 | 271 | # Perform an infinite scroll on the page as new content arises. The request param also needs to be set to 'chrome' or 'smart'. 272 | scroll: Optional[int] 273 | 274 | # Specifies whether to run the request in the background. 275 | run_in_background: Optional[bool] 276 | 277 | # Specifies whether to skip configuration checks. 278 | skip_config_checks: Optional[bool] 279 | 280 | # The chunking algorithm to use. 281 | chunking_alg: Optional[ChunkingAlgDict] 282 | 283 | # Disable request interception when running 'request' as 'chrome' or 'smart'. This can help when the page uses 3rd party or external scripts to load content. 284 | disable_intercept: Optional[bool] 285 | 286 | # The wait for events on the page. 
You need to make your `request` `chrome` or `smart`. 287 | wait_for: Optional[WaitForDict] 288 | 289 | # Perform custom Javascript tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 290 | exuecution_scripts: Optional[ExecutionScriptsMap] 291 | 292 | # Perform custom web automated tasks on a url or url path. You need to make your `request` `chrome` or `smart`. 293 | automation_scripts: Optional[WebAutomationMap] 294 | 295 | # The redirect policy for HTTP request. Set the value to Loose to allow all. 296 | redirect_policy: Optional[RedirectPolicy] 297 | 298 | # Track the request sent and responses received for `chrome` or `smart`. The responses will track the bytes used and the requests will have the monotime sent. 299 | event_tracker: Optional[EventTracker] 300 | 301 | # The timeout to stop the crawl. 302 | crawl_timeout: Optional[TimeoutDict] 303 | 304 | # Evaluates given script in every frame upon creation (before loading frame's scripts). 305 | evaluate_on_new_document: Optional[str] 306 | 307 | JsonCallback = Callable[[dict], None] 308 | -------------------------------------------------------------------------------- /python/tests/test_async_spider.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from unittest.mock import patch, AsyncMock 4 | from spider.async_spider import AsyncSpider 5 | from spider.spider_types import RequestParamsDict 6 | from dotenv import load_dotenv 7 | import aiohttp 8 | import json 9 | 10 | load_dotenv() 11 | 12 | @pytest.fixture 13 | def async_spider(): 14 | return AsyncSpider(api_key="test_api_key") 15 | 16 | @pytest.fixture 17 | def url(): 18 | return "http://example.com" 19 | 20 | @pytest.fixture 21 | def params(): 22 | return { 23 | "limit": 1, 24 | "return_format": "markdown", 25 | "depth": 2, 26 | "cache": True, 27 | "domain": "example.com", 28 | } 29 | 30 | def test_init_with_env_variable(): 31 | os.environ["SPIDER_API_KEY"] = "env_api_key" 32 | spider = AsyncSpider() 33 | assert spider.api_key == "env_api_key" 34 | del os.environ["SPIDER_API_KEY"] 35 | 36 | def test_init_without_api_key(): 37 | with pytest.raises(ValueError): 38 | AsyncSpider(api_key=None) 39 | 40 | @pytest.mark.asyncio 41 | async def test_scrape_url(async_spider, url, params): 42 | mock_response = [{"content": "data", "error": None, "status": 200, "url": url}] 43 | 44 | async def mock_request(*args, **kwargs): 45 | yield mock_response 46 | 47 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 48 | async for response in async_spider.scrape_url(url, params=params): 49 | assert isinstance(response, list) 50 | assert len(response) > 0 51 | assert isinstance(response[0], dict) 52 | assert 'content' in response[0] 53 | assert 'error' in response[0] 54 | assert 'status' in response[0] 55 | assert 'url' in response[0] 56 | 57 | @pytest.mark.asyncio 58 | async def test_crawl_url(async_spider, url, params): 59 | mock_response = [{"content": "data", "error": None, "status": 200, "url": url}] 60 | 61 | async def mock_request(*args, **kwargs): 62 | yield mock_response 63 | 64 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 65 | async for response in async_spider.crawl_url(url, params=params): 66 | assert isinstance(response, list) 67 | assert len(response) > 0 68 | assert isinstance(response[0], dict) 69 | assert 'content' in response[0] 70 | assert 'error' in response[0] 71 | assert 'status' in response[0] 72 | assert 'url' in response[0] 73 | 74 | 
@pytest.mark.asyncio 75 | async def test_crawl_url_streaming(async_spider, url, params): 76 | mock_response = b'{"url": "http://example.com"}' 77 | 78 | async def mock_request(*args, **kwargs): 79 | yield mock_response 80 | 81 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 82 | def handle_json(json_obj): 83 | json_obj = json.loads(json_obj.decode('utf-8')) 84 | assert json_obj["url"] == "http://example.com" 85 | 86 | async for response in async_spider.crawl_url(url, params=params, stream=True, content_type="application/jsonl"): 87 | handle_json(response) 88 | 89 | @pytest.mark.asyncio 90 | async def test_links(async_spider, url, params): 91 | mock_response = [{"error": None, "status": 200, "url": url}] 92 | 93 | async def mock_request(*args, **kwargs): 94 | yield mock_response 95 | 96 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 97 | async for response in async_spider.links(url, params=params): 98 | assert isinstance(response, list) 99 | assert len(response) > 0 100 | assert isinstance(response[0], dict) 101 | assert 'error' in response[0] 102 | assert 'status' in response[0] 103 | assert 'url' in response[0] 104 | 105 | @pytest.mark.asyncio 106 | async def test_screenshot(async_spider, url, params): 107 | mock_response = [{"content": "base64_encoded_image", "error": None, "status": 200, "url": url}] 108 | 109 | async def mock_request(*args, **kwargs): 110 | yield mock_response 111 | 112 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 113 | async for response in async_spider.screenshot(url, params=params): 114 | assert isinstance(response, list) 115 | assert len(response) > 0 116 | assert isinstance(response[0], dict) 117 | assert 'content' in response[0] 118 | assert 'error' in response[0] 119 | assert 'status' in response[0] 120 | assert 'url' in response[0] 121 | 122 | @pytest.mark.asyncio 123 | async def test_search(async_spider, params): 124 | mock_response = [{"content": "result", "error": None, "status": 200, "url": "http://example.com"}] 125 | 126 | async def mock_request(*args, **kwargs): 127 | yield mock_response 128 | 129 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 130 | async for response in async_spider.search("example search query", params=params): 131 | assert isinstance(response, list) 132 | assert len(response) > 0 133 | assert isinstance(response[0], dict) 134 | assert 'content' in response[0] 135 | assert 'error' in response[0] 136 | assert 'status' in response[0] 137 | assert 'url' in response[0] 138 | 139 | @pytest.mark.asyncio 140 | async def test_transform(async_spider, url, params): 141 | mock_response = {"content": "transformed", "error": None, "status": 200} 142 | 143 | async def mock_request(*args, **kwargs): 144 | yield mock_response 145 | 146 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 147 | transform_data = [{"html": "Example", "url": url}] 148 | async for response in async_spider.transform(transform_data, params=params): 149 | assert isinstance(response, dict) 150 | assert 'content' in response 151 | assert 'error' in response 152 | assert 'status' in response 153 | 154 | @pytest.mark.asyncio 155 | async def test_extract_contacts(async_spider, url, params): 156 | mock_response = [{"content": "contacts", "error": None, "status": 200, "url": url}] 157 | 158 | async def mock_request(*args, **kwargs): 159 | yield mock_response 160 | 161 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 162 | async for response 
in async_spider.extract_contacts(url, params=params): 163 | assert isinstance(response, list) 164 | assert len(response) > 0 165 | assert isinstance(response[0], dict) 166 | assert 'content' in response[0] 167 | assert 'error' in response[0] 168 | assert 'status' in response[0] 169 | assert 'url' in response[0] 170 | 171 | @pytest.mark.asyncio 172 | async def test_label(async_spider, url, params): 173 | mock_response = [{"content": "labels", "error": None, "status": 200, "url": url}] 174 | 175 | async def mock_request(*args, **kwargs): 176 | yield mock_response 177 | 178 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 179 | async for response in async_spider.label(url, params=params): 180 | assert isinstance(response, list) 181 | assert len(response) > 0 182 | assert isinstance(response[0], dict) 183 | assert 'content' in response[0] 184 | assert 'error' in response[0] 185 | assert 'status' in response[0] 186 | assert 'url' in response[0] 187 | 188 | @pytest.mark.asyncio 189 | async def test_get_crawl_state(async_spider, url, params): 190 | mock_response = {"data": [{"state": "completed", "credits_used": 10}]} 191 | 192 | async def mock_request(*args, **kwargs): 193 | yield mock_response 194 | 195 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 196 | async for response in async_spider.get_crawl_state(url, params=params): 197 | assert isinstance(response, dict) 198 | assert 'data' in response 199 | assert isinstance(response['data'], list) 200 | 201 | @pytest.mark.asyncio 202 | async def test_get_credits(async_spider): 203 | mock_response = {"data": [{"credits": 1000}]} 204 | 205 | async def mock_request(*args, **kwargs): 206 | yield mock_response 207 | 208 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 209 | async for response in async_spider.get_credits(): 210 | assert isinstance(response, dict) 211 | assert 'data' in response 212 | assert isinstance(response['data'], list) 213 | 214 | @pytest.mark.asyncio 215 | async def test_data_post(async_spider, url): 216 | mock_response = None 217 | 218 | async def mock_request(*args, **kwargs): 219 | yield mock_response 220 | 221 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 222 | table = "websites" 223 | post_data: RequestParamsDict = {"url": url} 224 | async for response in async_spider.data_post(table, post_data): 225 | assert response is None 226 | 227 | @pytest.mark.asyncio 228 | async def test_data_get(async_spider, url, params): 229 | mock_response = {"data": [{"url": url}]} 230 | 231 | async def mock_request(*args, **kwargs): 232 | yield mock_response 233 | 234 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 235 | table = "websites" 236 | async for response in async_spider.data_get(table, params=params): 237 | assert isinstance(response['data'], list) 238 | 239 | @pytest.mark.asyncio 240 | async def test_query(async_spider, params): 241 | mock_response = {"data": {"status": 200}} 242 | 243 | async def mock_request(*args, **kwargs): 244 | yield mock_response 245 | 246 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 247 | async for response in async_spider.data_get("query", params=params): 248 | assert isinstance(response['data'], object) 249 | 250 | @pytest.mark.asyncio 251 | async def test_data_delete(async_spider, params): 252 | mock_response = None 253 | 254 | async def mock_request(*args, **kwargs): 255 | yield mock_response 256 | 257 | with patch.object(AsyncSpider, '_request', 
side_effect=mock_request): 258 | table = "websites" 259 | async for response in async_spider.data_delete(table, params=params): 260 | assert response is None 261 | 262 | @pytest.mark.asyncio 263 | async def test_create_signed_url(async_spider): 264 | mock_response = b"mocked raw data" 265 | 266 | async def mock_request(*args, **kwargs): 267 | yield mock_response 268 | 269 | with patch.object(AsyncSpider, '_request', side_effect=mock_request): 270 | async for response in async_spider.create_signed_url(params={"domain": "example.com"}): 271 | assert response == b"mocked raw data" 272 | 273 | @pytest.mark.asyncio 274 | async def test_handle_error(): 275 | async_spider = AsyncSpider(api_key="test_api_key") 276 | mock_response = AsyncMock(spec=aiohttp.ClientResponse) 277 | mock_response.status = 402 278 | mock_response.json.return_value = {"error": "Payment Required"} 279 | 280 | with pytest.raises(Exception, match="Failed to test action. Status code: 402. Error: Payment Required"): 281 | await async_spider._handle_error(mock_response, "test action") -------------------------------------------------------------------------------- /python/tests/test_async_spider_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import logging 4 | from spider.async_spider import AsyncSpider 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | @pytest.fixture 13 | def api_key(): 14 | api_key = os.getenv("SPIDER_API_KEY") 15 | if not api_key: 16 | pytest.skip("SPIDER_API_KEY not set in .env file") 17 | return api_key 18 | 19 | @pytest.fixture 20 | def url(): 21 | return "http://example.com" 22 | 23 | @pytest.fixture 24 | def params(): 25 | return { 26 | "limit": 1, 27 | "return_format": "markdown", 28 | "depth": 2, 29 | "cache": True, 30 | "domain": "example.com", 31 | } 32 | 33 | @pytest.mark.asyncio 34 | async def test_scrape_url(api_key, url, params): 35 | async with AsyncSpider(api_key=api_key) as spider: 36 | async for response in spider.scrape_url(url, params=params): 37 | print(type(response)) 38 | logger.info(f"Scrape URL response: {response}") 39 | assert len(response) > 0 40 | assert isinstance(response[0], dict) 41 | assert 'content' in response[0] 42 | assert 'error' in response[0] 43 | assert 'status' in response[0] 44 | assert 'url' in response[0] 45 | 46 | @pytest.mark.asyncio 47 | async def test_crawl_url(api_key, url, params): 48 | async with AsyncSpider(api_key=api_key) as spider: 49 | async for response in spider.crawl_url(url, params=params): 50 | logger.info(f"Crawl URL response: {response}") 51 | assert isinstance(response, list) 52 | assert len(response) > 0 53 | assert isinstance(response[0], dict) 54 | assert 'content' in response[0] 55 | assert 'error' in response[0] 56 | assert 'status' in response[0] 57 | assert 'url' in response[0] 58 | 59 | # TODO "Credits or a valid subscription required to use the API"? 
60 | # @pytest.mark.asyncio 61 | # async def test_crawl_url_streaming(url, params): 62 | # async with AsyncSpider(api_key=api_key) as spider: 63 | # async for response in spider.crawl_url(url, params=params, stream=True): 64 | # print(response) 65 | # json_obj = json.loads(response.decode('utf-8')) 66 | # assert json_obj["url"] == "http://example.com" 67 | 68 | @pytest.mark.asyncio 69 | async def test_links(api_key, url, params): 70 | async with AsyncSpider(api_key=api_key) as spider: 71 | async for response in spider.links(url, params=params): 72 | logger.info(f"Links response: {response}") 73 | assert isinstance(response, list) 74 | assert len(response) > 0 75 | assert isinstance(response[0], dict) 76 | assert 'error' in response[0] 77 | assert 'status' in response[0] 78 | assert 'url' in response[0] 79 | 80 | @pytest.mark.asyncio 81 | async def test_screenshot(api_key, url, params): 82 | async with AsyncSpider(api_key=api_key) as spider: 83 | async for response in spider.screenshot(url, params=params): 84 | logger.info(f"Screenshot response: {response}") 85 | assert isinstance(response, list) 86 | assert len(response) > 0 87 | assert isinstance(response[0], dict) 88 | assert 'content' in response[0] 89 | assert 'error' in response[0] 90 | assert 'status' in response[0] 91 | assert 'url' in response[0] 92 | 93 | @pytest.mark.asyncio 94 | async def test_search(api_key, params): 95 | async with AsyncSpider(api_key=api_key) as spider: 96 | async for response in spider.search("example search query", params=params): 97 | logger.info(f"Search response: {response}") 98 | assert isinstance(response, list) 99 | assert len(response) > 0 100 | assert isinstance(response[0], dict) 101 | assert 'content' in response[0] 102 | assert 'error' in response[0] 103 | assert 'status' in response[0] 104 | assert 'url' in response[0] 105 | 106 | @pytest.mark.asyncio 107 | async def test_transform(api_key, url, params): 108 | async with AsyncSpider(api_key=api_key) as spider: 109 | transform_data = [{"html": "Example", "url": url}] 110 | async for response in spider.transform(transform_data, params=params): 111 | logger.info(f"Transform response: {response}") 112 | assert isinstance(response, dict) 113 | assert 'content' in response 114 | assert 'error' in response 115 | assert 'status' in response 116 | 117 | @pytest.mark.asyncio 118 | async def test_extract_contacts(api_key, url, params): 119 | async with AsyncSpider(api_key=api_key) as spider: 120 | async for response in spider.extract_contacts(url, params=params): 121 | logger.info(f"Extract contacts response: {response}") 122 | assert isinstance(response, list) 123 | assert len(response) > 0 124 | assert isinstance(response[0], dict) 125 | assert 'content' in response[0] 126 | assert 'error' in response[0] 127 | assert 'status' in response[0] 128 | assert 'url' in response[0] 129 | 130 | @pytest.mark.asyncio 131 | async def test_label(api_key, url, params): 132 | async with AsyncSpider(api_key=api_key) as spider: 133 | async for response in spider.label(url, params=params): 134 | logger.info(f"Label response: {response}") 135 | assert isinstance(response, list) 136 | assert len(response) > 0 137 | assert isinstance(response[0], dict) 138 | assert 'content' in response[0] 139 | assert 'error' in response[0] 140 | assert 'status' in response[0] 141 | assert 'url' in response[0] 142 | 143 | @pytest.mark.asyncio 144 | async def test_get_crawl_state(api_key, url, params): 145 | async with AsyncSpider(api_key=api_key) as spider: 146 | async for response in 
spider.get_crawl_state(url, params=params): 147 | logger.info(f"Get crawl state response: {response}") 148 | assert isinstance(response, dict) 149 | assert 'data' in response 150 | assert isinstance(response['data'], list) 151 | 152 | @pytest.mark.asyncio 153 | async def test_get_credits(api_key): 154 | async with AsyncSpider(api_key=api_key) as spider: 155 | async for response in spider.get_credits(): 156 | logger.info(f"Get credits response: {response}") 157 | assert isinstance(response, dict) 158 | assert 'data' in response 159 | assert isinstance(response['data'], list) 160 | 161 | @pytest.mark.asyncio 162 | async def test_data_post(api_key, url): 163 | async with AsyncSpider(api_key=api_key) as spider: 164 | table = "websites" 165 | post_data = {"url": url} 166 | async for response in spider.data_post(table, post_data): 167 | logger.info(f"Data post response: {response}") 168 | assert 200 <= response['status'] < 300 169 | assert response['data']['created_at'] is not None 170 | 171 | # TODO 500 error 172 | # @pytest.mark.asyncio 173 | # async def test_data_get(api_key, url, params): 174 | # async with AsyncSpider(api_key=api_key) as spider: 175 | # table = "websites" 176 | # async for response in spider.data_get(table, params=params): 177 | # logger.info(f"Data get response: {response}") 178 | # print(response) 179 | # assert isinstance(response['data'], list) 180 | 181 | @pytest.mark.asyncio 182 | async def test_data_delete(api_key, url, params): 183 | async with AsyncSpider(api_key=api_key) as spider: 184 | table = "websites" 185 | async for response in spider.data_delete(table, params=params): 186 | logger.info(f"Data delete response: {response}") 187 | print(response) 188 | assert response['message'] == 'ok' 189 | 190 | @pytest.mark.asyncio 191 | async def test_create_signed_url(api_key): 192 | async with AsyncSpider(api_key=api_key) as spider: 193 | async for response in spider.create_signed_url(params={"domain": "example.com"}): 194 | logger.info(f"Create signed URL response: {response}") 195 | assert isinstance(response, bytes) -------------------------------------------------------------------------------- /python/tests/test_spider.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from io import BytesIO 4 | from unittest.mock import patch, MagicMock 5 | from spider.spider import Spider 6 | from spider.spider_types import RequestParamsDict 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | @pytest.fixture 12 | def spider(): 13 | return Spider(api_key="test_api_key") 14 | 15 | @pytest.fixture 16 | def url(): 17 | return "http://example.com" 18 | 19 | @pytest.fixture 20 | def params(): 21 | return { 22 | "limit": 1, 23 | "return_format": "markdown", 24 | "depth": 2, 25 | "cache": True, 26 | "domain": "example.com", 27 | } 28 | 29 | def test_init_with_env_variable(): 30 | os.environ["SPIDER_API_KEY"] = "env_api_key" 31 | spider = Spider() 32 | assert spider.api_key == "env_api_key" 33 | del os.environ["SPIDER_API_KEY"] 34 | 35 | def test_init_without_api_key(): 36 | with pytest.raises(ValueError): 37 | Spider(api_key=None) 38 | 39 | @patch('requests.post') 40 | def test_scrape_url(mock_post, spider, url, params): 41 | mock_response = MagicMock() 42 | mock_response.status_code = 200 43 | mock_response.json.return_value = [{"content": "data", "error": None, "status": 200, "url": url}] 44 | mock_post.return_value = mock_response 45 | 46 | response = spider.scrape_url(url, params=params) 47 | assert 
isinstance(response, list) 48 | assert len(response) > 0 49 | assert isinstance(response[0], dict) 50 | assert 'content' in response[0] 51 | assert 'error' in response[0] 52 | assert 'status' in response[0] 53 | assert 'url' in response[0] 54 | mock_post.assert_called_once() 55 | 56 | @patch('requests.post') 57 | def test_crawl_url(mock_post, spider, url, params): 58 | mock_response = MagicMock() 59 | mock_response.status_code = 200 60 | mock_response.json.return_value = [{"content": "data", "error": None, "status": 200, "url": url}] 61 | mock_post.return_value = mock_response 62 | 63 | response = spider.crawl_url(url, params=params) 64 | assert isinstance(response, list) 65 | assert len(response) > 0 66 | assert isinstance(response[0], dict) 67 | assert 'content' in response[0] 68 | assert 'error' in response[0] 69 | assert 'status' in response[0] 70 | assert 'url' in response[0] 71 | mock_post.assert_called_once() 72 | 73 | @patch('requests.post') 74 | def test_crawl_url_streaming(mock_post, spider, url, params): 75 | mock_response = MagicMock() 76 | mock_response.status_code = 200 77 | mock_response.iter_content.return_value = [b'{"url": "http://example.com"}'] 78 | mock_post.return_value = mock_response 79 | 80 | def handle_json(json_obj): 81 | assert json_obj["url"] == "http://example.com" 82 | 83 | spider.crawl_url(url, params=params, stream=True, content_type="application/jsonl", callback=handle_json) 84 | mock_post.assert_called_once() 85 | 86 | @patch('requests.post') 87 | def test_links(mock_post, spider, url, params): 88 | mock_response = MagicMock() 89 | mock_response.status_code = 200 90 | mock_response.json.return_value = [{"error": None, "status": 200, "url": url}] 91 | mock_post.return_value = mock_response 92 | 93 | response = spider.links(url, params=params) 94 | assert isinstance(response, list) 95 | assert len(response) > 0 96 | assert isinstance(response[0], dict) 97 | assert 'error' in response[0] 98 | assert 'status' in response[0] 99 | assert 'url' in response[0] 100 | mock_post.assert_called_once() 101 | 102 | @patch('requests.post') 103 | def test_screenshot(mock_post, spider, url, params): 104 | mock_response = MagicMock() 105 | mock_response.status_code = 200 106 | mock_response.json.return_value = [{"content": "base64_encoded_image", "error": None, "status": 200, "url": url}] 107 | mock_post.return_value = mock_response 108 | 109 | response = spider.screenshot(url, params=params) 110 | assert isinstance(response, list) 111 | assert len(response) > 0 112 | assert isinstance(response[0], dict) 113 | assert 'content' in response[0] 114 | assert 'error' in response[0] 115 | assert 'status' in response[0] 116 | assert 'url' in response[0] 117 | mock_post.assert_called_once() 118 | 119 | @patch('requests.post') 120 | def test_search(mock_post, spider, params): 121 | mock_response = MagicMock() 122 | mock_response.status_code = 200 123 | mock_response.json.return_value = [{"content": "result", "error": None, "status": 200, "url": "http://example.com"}] 124 | mock_post.return_value = mock_response 125 | 126 | response = spider.search("example search query", params=params) 127 | assert isinstance(response, list) 128 | assert len(response) > 0 129 | assert isinstance(response[0], dict) 130 | assert 'content' in response[0] 131 | assert 'error' in response[0] 132 | assert 'status' in response[0] 133 | assert 'url' in response[0] 134 | mock_post.assert_called_once() 135 | 136 | @patch('requests.post') 137 | def test_transform(mock_post, spider, url, params): 138 | 
mock_response = MagicMock() 139 | mock_response.status_code = 200 140 | mock_response.json.return_value = {"content": "transformed", "error": None, "status": 200} 141 | mock_post.return_value = mock_response 142 | 143 | transform_data = [{"html": "Example", "url": url}] 144 | response = spider.transform(transform_data, params=params) 145 | assert isinstance(response, dict) 146 | assert 'content' in response 147 | assert 'error' in response 148 | assert 'status' in response 149 | mock_post.assert_called_once() 150 | 151 | @patch('requests.post') 152 | def test_extract_contacts(mock_post, spider, url, params): 153 | mock_response = MagicMock() 154 | mock_response.status_code = 200 155 | mock_response.json.return_value = [{"content": "contacts", "error": None, "status": 200, "url": url}] 156 | mock_post.return_value = mock_response 157 | 158 | response = spider.extract_contacts(url, params=params) 159 | assert isinstance(response, list) 160 | assert len(response) > 0 161 | assert isinstance(response[0], dict) 162 | assert 'content' in response[0] 163 | assert 'error' in response[0] 164 | assert 'status' in response[0] 165 | assert 'url' in response[0] 166 | mock_post.assert_called_once() 167 | 168 | @patch('requests.post') 169 | def test_label(mock_post, spider, url, params): 170 | mock_response = MagicMock() 171 | mock_response.status_code = 200 172 | mock_response.json.return_value = [{"content": "labels", "error": None, "status": 200, "url": url}] 173 | mock_post.return_value = mock_response 174 | 175 | response = spider.label(url, params=params) 176 | assert isinstance(response, list) 177 | assert len(response) > 0 178 | assert isinstance(response[0], dict) 179 | assert 'content' in response[0] 180 | assert 'error' in response[0] 181 | assert 'status' in response[0] 182 | assert 'url' in response[0] 183 | mock_post.assert_called_once() 184 | 185 | @patch('requests.post') 186 | def test_get_crawl_state(mock_post, spider, url, params): 187 | mock_response = MagicMock() 188 | mock_response.status_code = 200 189 | mock_response.json.return_value = {"data": [{"state": "completed", "credits_used": 10}]} 190 | mock_post.return_value = mock_response 191 | 192 | response = spider.get_crawl_state(url, params=params) 193 | assert isinstance(response, dict) 194 | assert 'data' in response 195 | assert isinstance(response['data'], list) 196 | mock_post.assert_called_once() 197 | 198 | @patch('requests.get') 199 | def test_get_credits(mock_get, spider): 200 | mock_response = MagicMock() 201 | mock_response.status_code = 200 202 | mock_response.json.return_value = {"data": [{"credits": 1000}]} 203 | mock_get.return_value = mock_response 204 | 205 | response = spider.get_credits() 206 | assert isinstance(response, dict) 207 | assert 'data' in response 208 | assert isinstance(response['data'], list) 209 | mock_get.assert_called_once() 210 | 211 | @patch('requests.post') 212 | def test_data_post(mock_post, spider, url): 213 | mock_response = MagicMock() 214 | mock_response.status_code = 204 215 | mock_post.return_value = mock_response 216 | 217 | table = "websites" 218 | post_data: RequestParamsDict = {"url": url} 219 | response = spider.data_post(table, post_data) 220 | assert response is not None 221 | mock_post.assert_called_once() 222 | 223 | @patch('requests.get') 224 | def test_data_get(mock_get, spider, url, params): 225 | mock_response = MagicMock() 226 | mock_response.status_code = 200 227 | mock_response.json.return_value = {"data": [{"url": url}]} 228 | mock_get.return_value = mock_response 229 
| 230 | table = "websites" 231 | response = spider.data_get(table, params=params) 232 | assert isinstance(response['data'], list) 233 | mock_get.assert_called_once() 234 | 235 | @patch('requests.get') 236 | def test_query(mock_get, spider, params): 237 | mock_response = MagicMock() 238 | mock_response.status_code = 200 239 | mock_response.json.return_value = {"data": {"status": 200}} 240 | mock_get.return_value = mock_response 241 | response = spider.data_get("query", params=params) 242 | assert isinstance(response['data'], object) 243 | mock_get.assert_called_once() 244 | 245 | @patch('requests.delete') 246 | def test_data_delete(mock_delete, spider, params): 247 | mock_response = MagicMock() 248 | mock_response.status_code = 204 249 | mock_delete.return_value = mock_response 250 | 251 | table = "websites" 252 | response = spider.data_delete(table, params=params) 253 | assert response is not None 254 | mock_delete.assert_called_once() 255 | 256 | @patch('requests.get') 257 | def test_create_signed_url(mock_get, spider): 258 | mock_response = MagicMock() 259 | mock_response.status_code = 200 260 | mock_response.raw = b"mocked raw data" 261 | mock_get.return_value = mock_response 262 | 263 | response = spider.create_signed_url(params={"domain": "example.com"}) 264 | assert response == b"mocked raw data" 265 | mock_get.assert_called_once() 266 | 267 | def test_stream_reader(): 268 | spider = Spider(api_key="test_api_key") 269 | mock_response = MagicMock() 270 | raw_data = b'{"key": "value"}\n{"key2": "value2"}\n' 271 | mock_response = MagicMock() 272 | mock_response.raw = BytesIO(raw_data) 273 | 274 | callback_data = [] 275 | def callback(json_obj): 276 | callback_data.append(json_obj) 277 | 278 | spider.stream_reader(mock_response, callback) 279 | 280 | assert len(callback_data) == 2 281 | assert callback_data[0] == {"key": "value"} 282 | assert callback_data[1] == {"key2": "value2"} 283 | 284 | def test_handle_error(): 285 | spider = Spider(api_key="test_api_key") 286 | mock_response = MagicMock() 287 | mock_response.status_code = 402 288 | mock_response.json.return_value = {"error": "Payment Required"} 289 | 290 | with pytest.raises(Exception, match="Failed to test action. Status code: 402. 
Error: Payment Required"): 291 | spider._handle_error(mock_response, "test action") 292 | -------------------------------------------------------------------------------- /python/tests/test_spider_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import logging 4 | from spider.spider import Spider 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | @pytest.fixture 13 | def spider(): 14 | api_key = os.getenv("SPIDER_API_KEY") 15 | if not api_key: 16 | pytest.skip("SPIDER_API_KEY not set in .env file") 17 | return Spider(api_key=api_key) 18 | 19 | @pytest.fixture 20 | def url(): 21 | return "http://example.com" 22 | 23 | @pytest.fixture 24 | def params(): 25 | return { 26 | "limit": 1, 27 | "return_format": "markdown", 28 | "depth": 2, 29 | "cache": True, 30 | "domain": "example.com", 31 | } 32 | 33 | def test_scrape_url(spider, url, params): 34 | response = spider.scrape_url(url, params=params) 35 | logger.info(f"Scrape URL response: {response}") 36 | assert isinstance(response, list) 37 | assert len(response) > 0 38 | assert isinstance(response[0], dict) 39 | assert 'content' in response[0] 40 | assert 'error' in response[0] 41 | assert 'status' in response[0] 42 | assert 'url' in response[0] 43 | 44 | def test_crawl_url(spider, url, params): 45 | response = spider.crawl_url(url, params=params) 46 | logger.info(f"Crawl URL response: {response}") 47 | assert isinstance(response, list) 48 | assert len(response) > 0 49 | assert isinstance(response[0], dict) 50 | assert 'content' in response[0] 51 | assert 'error' in response[0] 52 | assert 'status' in response[0] 53 | assert 'url' in response[0] 54 | 55 | def test_links(spider, url, params): 56 | response = spider.links(url, params=params) 57 | logger.info(f"Links response: {response}") 58 | assert isinstance(response, list) 59 | assert len(response) > 0 60 | assert isinstance(response[0], dict) 61 | assert 'error' in response[0] 62 | assert 'status' in response[0] 63 | assert 'url' in response[0] 64 | 65 | def test_screenshot(spider, url, params): 66 | response = spider.screenshot(url, params=params) 67 | logger.info(f"Screenshot response: {response}") 68 | assert isinstance(response, list) 69 | assert len(response) > 0 70 | assert isinstance(response[0], dict) 71 | assert 'content' in response[0] 72 | assert 'error' in response[0] 73 | assert 'status' in response[0] 74 | assert 'url' in response[0] 75 | 76 | def test_search(spider, params): 77 | response = spider.search("example search query", params=params) 78 | logger.info(f"Search response: {response}") 79 | assert isinstance(response, list) 80 | assert len(response) > 0 81 | assert isinstance(response[0], dict) 82 | assert 'content' in response[0] 83 | assert 'error' in response[0] 84 | assert 'status' in response[0] 85 | assert 'url' in response[0] 86 | 87 | def test_transform(spider, url, params): 88 | transform_data = [{"html": "Example", "url": url}] 89 | response = spider.transform(transform_data, params=params) 90 | logger.info(f"Transform response: {response}") 91 | assert isinstance(response, dict) 92 | assert 'content' in response 93 | assert 'error' in response 94 | assert 'status' in response 95 | 96 | def test_extract_contacts(spider, url, params): 97 | response = spider.extract_contacts(url, params=params) 98 | logger.info(f"Extract contacts response: {response}") 99 | assert 
isinstance(response, list) 100 | assert len(response) > 0 101 | assert isinstance(response[0], dict) 102 | assert 'content' in response[0] 103 | assert 'error' in response[0] 104 | assert 'status' in response[0] 105 | assert 'url' in response[0] 106 | 107 | def test_label(spider, url, params): 108 | response = spider.label(url, params=params) 109 | logger.info(f"Label response: {response}") 110 | assert isinstance(response, list) 111 | assert len(response) > 0 112 | assert isinstance(response[0], dict) 113 | assert 'content' in response[0] 114 | assert 'error' in response[0] 115 | assert 'status' in response[0] 116 | assert 'url' in response[0] 117 | 118 | def test_get_crawl_state(spider, url, params): 119 | response = spider.get_crawl_state(url, params=params) 120 | logger.info(f"Get crawl state response: {response}") 121 | assert isinstance(response, dict) 122 | assert 'data' in response 123 | assert isinstance(response['data'], list) 124 | 125 | def test_get_credits(spider): 126 | response = spider.get_credits() 127 | logger.info(f"Get credits response: {response}") 128 | assert isinstance(response, dict) 129 | assert 'data' in response 130 | assert isinstance(response['data'], list) 131 | 132 | def test_data_post(spider, url): 133 | table = "websites" 134 | post_data = {"url": url} 135 | response = spider.data_post(table, post_data) 136 | logger.info(f"Data post response: {response}") 137 | assert isinstance(response['data'], dict) 138 | assert response['data']['url'] == url 139 | assert response['data']['domain'] == url.replace("http://", "").replace("https://", "") 140 | assert response['error'] == None 141 | 142 | # TODO: 500 error. 143 | # def test_data_get(spider, params): 144 | # table = "websites" 145 | # response = spider.data_get(table, params=params) 146 | # logger.info(f"Data get response: {response}") 147 | # assert isinstance(response['data'], list) 148 | 149 | def test_data_delete(spider, params): 150 | table = "websites" 151 | response = spider.data_delete(table, params=params) 152 | logger.info(f"Data delete response: {response}") 153 | assert response['message'] == 'ok' 154 | 155 | # TODO: 500 error. 
156 | # def test_create_signed_url(spider): 157 | # response = spider.create_signed_url(domain="example.com", options={"page": 1, "limit": 10}) 158 | # logger.info(f"Create signed URL response: {response}") 159 | # assert isinstance(response, bytes) 160 | -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spider-client" 3 | version = "0.1.36" 4 | edition = "2021" 5 | authors = [ "j-mendez "] 6 | description = "Spider Cloud client" 7 | license = "MIT" 8 | readme = "README.md" 9 | repository = "https://github.com/spider-rs/spider-clients" 10 | keywords = ["crawler", "web-crawler", "web-scraper", "spider", "web-indexer"] 11 | categories = ["web-programming"] 12 | include = ["src/*", "../../LICENSE", "README.md"] 13 | 14 | [dependencies] 15 | reqwest = { version = "0.12", features = ["json", "stream"] } 16 | bytes = "1" 17 | tokio = { version = "1", features = ["rt-multi-thread", "macros"] } 18 | serde = { version = "1", features = ["derive"] } 19 | serde_json = { version = "1" } 20 | tokio-stream = "0.1" 21 | backon = { version = "1", features = ["tokio-sleep"] } 22 | tokio-util = "0.7" 23 | 24 | [dev-dependencies] 25 | dotenv = "0.15.0" 26 | lazy_static = "1.5.0" 27 | -------------------------------------------------------------------------------- /rust/README.md: -------------------------------------------------------------------------------- 1 | # Spider Cloud Rust SDK 2 | 3 | The Spider Cloud Rust SDK offers a toolkit for straightforward website scraping, crawling at scale, and other utilities like extracting links and taking screenshots, enabling you to collect data formatted for compatibility with language models (LLMs). It features a user-friendly interface for seamless integration with the Spider Cloud API. 4 | 5 | ## Installation 6 | 7 | To use the Spider Cloud Rust SDK, include the following in your `Cargo.toml`: 8 | 9 | ```toml 10 | [dependencies] 11 | spider-client = "0.1" 12 | ``` 13 | 14 | ## Usage 15 | 16 | 1. Get an API key from [spider.cloud](https://spider.cloud) 17 | 2. Set the API key as an environment variable named `SPIDER_API_KEY` or pass it as an argument when creating an instance of the `Spider` struct. 
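If you pass the key directly, a minimal sketch looks like this (the `spider_client` module path is inferred from the crate name, and the exact argument type accepted by `Spider::new` is an assumption; the full example below uses the environment-variable route):

```rust
use spider_client::Spider;

// Pass the API key explicitly instead of relying on the SPIDER_API_KEY environment variable.
let spider = Spider::new(Some("your_api_key".to_string())).expect("API key must be provided");
```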
18 | 19 | Here's an example of how to use the SDK: 20 | 21 | ```rust 22 | use serde_json::json; 23 | use std::env; 24 | 25 | #[tokio::main] 26 | async fn main() { 27 | // Set the API key as an environment variable 28 | env::set_var("SPIDER_API_KEY", "your_api_key"); 29 | 30 | // Initialize the Spider with your API key 31 | let spider = Spider::new(None).expect("API key must be provided"); 32 | 33 | let url = "https://spider.cloud"; 34 | 35 | // Scrape a single URL 36 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 37 | 38 | println!("Scraped Data: {:?}", scraped_data); 39 | 40 | // Crawl a website 41 | let crawler_params = RequestParams { 42 | limit: Some(1), 43 | proxy_enabled: Some(true), 44 | store_data: Some(false), 45 | metadata: Some(false), 46 | request: Some(RequestType::Http), 47 | ..Default::default() 48 | }; 49 | 50 | let crawl_result = spider.crawl_url(url, Some(crawler_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 51 | 52 | println!("Crawl Result: {:?}", crawl_result); 53 | } 54 | ``` 55 | 56 | ### Scraping a URL 57 | 58 | To scrape data from a single URL: 59 | 60 | ```rust 61 | let url = "https://example.com"; 62 | let scraped_data = spider.scrape_url(url, None, false, "application/json").await.expect("Failed to scrape the URL"); 63 | ``` 64 | 65 | ### Crawling a Website 66 | 67 | To automate crawling a website: 68 | 69 | ```rust 70 | let url = "https://example.com"; 71 | let crawl_params = RequestParams { 72 | limit: Some(200), 73 | request: Some(RequestType::Smart), 74 | ..Default::default() 75 | }; 76 | let crawl_result = spider.crawl_url(url, Some(crawl_params), false, "application/json", None::).await.expect("Failed to crawl the URL"); 77 | ``` 78 | 79 | #### Crawl Streaming 80 | 81 | Stream crawl the website in chunks to scale with a callback: 82 | 83 | ```rust 84 | fn handle_json(json_obj: serde_json::Value) { 85 | println!("Received chunk: {:?}", json_obj); 86 | } 87 | 88 | let url = "https://example.com"; 89 | let crawl_params = RequestParams { 90 | limit: Some(200), 91 | store_data: Some(false), 92 | ..Default::default() 93 | }; 94 | 95 | spider.crawl_url( 96 | url, 97 | Some(crawl_params), 98 | true, 99 | "application/json", 100 | Some(handle_json) 101 | ).await.expect("Failed to crawl the URL"); 102 | ``` 103 | 104 | ### Search 105 | 106 | Perform a search for websites to crawl or gather search results: 107 | 108 | ```rust 109 | let query = "a sports website"; 110 | let crawl_params = RequestParams { 111 | request: Some(RequestType::Smart), 112 | search_limit: Some(5), 113 | limit: Some(5), 114 | fetch_page_content: Some(true), 115 | ..Default::default() 116 | }; 117 | let crawl_result = spider.search(query, Some(crawl_params), false, "application/json").await.expect("Failed to perform search"); 118 | ``` 119 | 120 | ### Retrieving Links from a URL(s) 121 | 122 | Extract all links from a specified URL: 123 | 124 | ```rust 125 | let url = "https://example.com"; 126 | let links = spider.links(url, None, false, "application/json").await.expect("Failed to retrieve links from URL"); 127 | ``` 128 | 129 | ### Transform 130 | 131 | Transform HTML to markdown or text lightning fast: 132 | 133 | ```rust 134 | let data = vec![json!({"html": "

<html><body><h1>Hello world</h1></body></html>
"})]; 135 | let params = RequestParams { 136 | readability: Some(false), 137 | return_format: Some(ReturnFormat::Markdown), 138 | ..Default::default() 139 | }; 140 | let result = spider.transform(data, Some(params), false, "application/json").await.expect("Failed to transform HTML to markdown"); 141 | println!("Transformed Data: {:?}", result); 142 | ``` 143 | 144 | ### Taking Screenshots of a URL(s) 145 | 146 | Capture a screenshot of a given URL: 147 | 148 | ```rust 149 | let url = "https://example.com"; 150 | let screenshot = spider.screenshot(url, None, false, "application/json").await.expect("Failed to take screenshot of URL"); 151 | ``` 152 | 153 | ### Extracting Contact Information 154 | 155 | Extract contact details from a specified URL: 156 | 157 | ```rust 158 | let url = "https://example.com"; 159 | let contacts = spider.extract_contacts(url, None, false, "application/json").await.expect("Failed to extract contacts from URL"); 160 | println!("Extracted Contacts: {:?}", contacts); 161 | ``` 162 | 163 | ### Labeling Data from a URL(s) 164 | 165 | Label the data extracted from a particular URL: 166 | 167 | ```rust 168 | let url = "https://example.com"; 169 | let labeled_data = spider.label(url, None, false, "application/json").await.expect("Failed to label data from URL"); 170 | println!("Labeled Data: {:?}", labeled_data); 171 | ``` 172 | 173 | ### Checking Crawl State 174 | 175 | You can check the crawl state of a specific URL: 176 | 177 | ```rust 178 | let url = "https://example.com"; 179 | let state = spider.get_crawl_state(url, None, false, "application/json").await.expect("Failed to get crawl state for URL"); 180 | println!("Crawl State: {:?}", state); 181 | ``` 182 | 183 | ### Downloading Files 184 | 185 | You can download the results of the website: 186 | 187 | ```rust 188 | let url = "https://example.com"; 189 | let options = hashmap!{ 190 | "page" => 0, 191 | "limit" => 100, 192 | "expiresIn" => 3600 // Optional, add if needed 193 | }; 194 | let response = spider.create_signed_url(Some(url), Some(options)).await.expect("Failed to create signed URL"); 195 | println!("Download URL: {:?}", response); 196 | ``` 197 | 198 | ### Checking Available Credits 199 | 200 | You can check the remaining credits on your account: 201 | 202 | ```rust 203 | let credits = spider.get_credits().await.expect("Failed to get credits"); 204 | println!("Remaining Credits: {:?}", credits); 205 | ``` 206 | 207 | ### Data Operations 208 | 209 | The Spider client can now interact with specific data tables to create, retrieve, and delete data. 
210 | 211 | #### Retrieve Data from a Table 212 | 213 | To fetch data from a specified table by applying query parameters: 214 | 215 | ```rust 216 | let table_name = "pages"; 217 | let query_params = RequestParams { 218 | limit: Some(20), 219 | ..Default::default() 220 | }; 221 | let response = spider.data_get(table_name, Some(query_params)).await.expect("Failed to retrieve data from table"); 222 | println!("Data from table: {:?}", response); 223 | ``` 224 | 225 | #### Delete Data from a Table 226 | 227 | To delete data from a specified table based on certain conditions: 228 | 229 | ```rust 230 | let table_name = "websites"; 231 | let delete_params = RequestParams { 232 | domain: Some("www.example.com".to_string()), 233 | ..Default::default() 234 | }; 235 | let response = spider.data_delete(table_name, Some(delete_params)).await.expect("Failed to delete data from table"); 236 | println!("Delete Response: {:?}", response); 237 | ``` 238 | 239 | ## Streaming 240 | 241 | If you need to use streaming, set the `stream` parameter to `true` and provide a callback function: 242 | 243 | ```rust 244 | fn handle_json(json_obj: serde_json::Value) { 245 | println!("Received chunk: {:?}", json_obj); 246 | } 247 | 248 | let url = "https://example.com"; 249 | let crawler_params = RequestParams { 250 | limit: Some(1), 251 | proxy_enabled: Some(true), 252 | store_data: Some(false), 253 | metadata: Some(false), 254 | request: Some(RequestType::Http), 255 | ..Default::default() 256 | }; 257 | 258 | spider.links(url, Some(crawler_params), true, "application/json").await.expect("Failed to retrieve links from URL"); 259 | ``` 260 | 261 | ## Content-Type 262 | 263 | The following Content-type headers are supported using the `content_type` parameter: 264 | 265 | - `application/json` 266 | - `text/csv` 267 | - `application/xml` 268 | - `application/jsonl` 269 | 270 | ```rust 271 | let url = "https://example.com"; 272 | 273 | let crawler_params = RequestParams { 274 | limit: Some(1), 275 | proxy_enabled: Some(true), 276 | store_data: Some(false), 277 | metadata: Some(false), 278 | request: Some(RequestType::Http), 279 | ..Default::default() 280 | }; 281 | 282 | // Stream JSON lines back to the client 283 | spider.crawl_url(url, Some(crawler_params), true, "application/jsonl", None::).await.expect("Failed to crawl the URL"); 284 | ``` 285 | 286 | ## Error Handling 287 | 288 | The SDK handles errors returned by the Spider Cloud API and raises appropriate exceptions. If an error occurs during a request, it will be propagated to the caller with a descriptive error message. 289 | 290 | ## Contributing 291 | 292 | Contributions to the Spider Cloud Rust SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. 293 | 294 | ## License 295 | 296 | The Spider Cloud Rust SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT). 297 | --------------------------------------------------------------------------------