├── .cargo └── config.toml ├── .editorconfig ├── .eslintrc.yml ├── .github └── workflows │ ├── CI.yml │ ├── bench.yml │ └── book.yml ├── .gitignore ├── .npmignore ├── .prettierignore ├── .taplo.toml ├── .vscode └── settings.json ├── .yarn └── releases │ └── yarn-3.6.4.cjs ├── .yarnrc.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── __test__ └── index.spec.ts ├── bench ├── README.md ├── base.ts ├── case │ ├── crawlee.ts │ └── spider.ts ├── compare.ts ├── crawlee.ts ├── oss.ts ├── package-lock.json └── package.json ├── book ├── .gitignore ├── book.toml └── src │ ├── README.md │ ├── SUMMARY.md │ ├── benchmarks.md │ ├── crawl.md │ ├── cron-job.md │ ├── env.md │ ├── getting-started.md │ ├── page.md │ ├── scrape.md │ ├── simple.md │ ├── storing-data.md │ └── website.md ├── build.rs ├── examples ├── basic.mjs ├── cron.mjs ├── openai.mjs └── subscription.mjs ├── index.d.ts ├── index.js ├── npm ├── android-arm-eabi │ ├── README.md │ └── package.json ├── android-arm64 │ ├── README.md │ └── package.json ├── darwin-arm64 │ ├── README.md │ └── package.json ├── darwin-universal │ ├── README.md │ └── package.json ├── darwin-x64 │ ├── README.md │ └── package.json ├── freebsd-x64 │ ├── README.md │ └── package.json ├── linux-arm-gnueabihf │ ├── README.md │ └── package.json ├── linux-arm64-gnu │ ├── README.md │ └── package.json ├── linux-arm64-musl │ ├── README.md │ └── package.json ├── linux-x64-gnu │ ├── README.md │ └── package.json ├── linux-x64-musl │ ├── README.md │ └── package.json ├── win32-arm64-msvc │ ├── README.md │ └── package.json ├── win32-ia32-msvc │ ├── README.md │ └── package.json └── win32-x64-msvc │ ├── README.md │ └── package.json ├── package.json ├── rustfmt.toml ├── src ├── conversions.rs ├── lib.rs ├── npage.rs ├── nwebsite.rs ├── page.rs ├── shortcut.rs └── website.rs ├── tsconfig.json └── yarn.lock /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.aarch64-unknown-linux-musl] 2 | linker = "aarch64-linux-musl-gcc" 3 | rustflags = ["-C", "target-feature=-crt-static"] -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors or IDEs 3 | # http://editorconfig.org 4 | root = true 5 | 6 | [*] 7 | indent_style = space 8 | indent_size = 2 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false -------------------------------------------------------------------------------- /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | parser: '@typescript-eslint/parser' 2 | 3 | parserOptions: 4 | ecmaFeatures: 5 | jsx: true 6 | ecmaVersion: latest 7 | sourceType: module 8 | project: ./tsconfig.json 9 | 10 | env: 11 | browser: true 12 | es6: true 13 | node: true 14 | jest: true 15 | 16 | ignorePatterns: ['index.js'] 17 | 18 | plugins: 19 | - import 20 | - '@typescript-eslint' 21 | 22 | extends: 23 | - eslint:recommended 24 | - plugin:prettier/recommended 25 | 26 | rules: 27 | # 0 = off, 1 = warn, 2 = error 28 | 'space-before-function-paren': 0 29 | 'no-useless-constructor': 0 30 | 'no-undef': 2 31 | 'no-console': [2, { allow: ['error', 'warn', 'info', 'assert'] }] 32 | 'comma-dangle': ['error', 'only-multiline'] 33 | 'no-unused-vars': 0 34 | 'no-var': 2 35 
| 'one-var-declaration-per-line': 2 36 | 'prefer-const': 2 37 | 'no-const-assign': 2 38 | 'no-duplicate-imports': 2 39 | 'no-use-before-define': [2, { 'functions': false, 'classes': false }] 40 | 'eqeqeq': [2, 'always', { 'null': 'ignore' }] 41 | 'no-case-declarations': 0 42 | 'no-restricted-syntax': 43 | [ 44 | 2, 45 | { 46 | 'selector': 'BinaryExpression[operator=/(==|===|!=|!==)/][left.raw=true], BinaryExpression[operator=/(==|===|!=|!==)/][right.raw=true]', 47 | 'message': Don't compare for equality against boolean literals, 48 | }, 49 | ] 50 | 51 | # https://github.com/benmosher/eslint-plugin-import/pull/334 52 | 'import/no-duplicates': 2 53 | 'import/first': 2 54 | 'import/newline-after-import': 2 55 | 'import/order': 56 | [ 57 | 2, 58 | { 59 | 'newlines-between': 'always', 60 | 'alphabetize': { 'order': 'asc' }, 61 | 'groups': ['builtin', 'external', 'internal', 'parent', 'sibling', 'index'], 62 | }, 63 | ] 64 | 65 | overrides: 66 | - files: 67 | - ./**/*{.ts,.tsx} 68 | rules: 69 | 'no-unused-vars': [2, { varsIgnorePattern: '^_', argsIgnorePattern: '^_', ignoreRestSiblings: true }] 70 | 'no-undef': 0 71 | # TypeScript declare merge 72 | 'no-redeclare': 0 73 | 'no-useless-constructor': 0 74 | 'no-dupe-class-members': 0 75 | 'no-case-declarations': 0 76 | 'no-duplicate-imports': 0 77 | # TypeScript Interface and Type 78 | 'no-use-before-define': 0 79 | 80 | '@typescript-eslint/adjacent-overload-signatures': 2 81 | '@typescript-eslint/await-thenable': 2 82 | '@typescript-eslint/consistent-type-assertions': 2 83 | '@typescript-eslint/ban-types': 84 | [ 85 | 'error', 86 | { 87 | 'types': 88 | { 89 | 'String': { 'message': 'Use string instead', 'fixWith': 'string' }, 90 | 'Number': { 'message': 'Use number instead', 'fixWith': 'number' }, 91 | 'Boolean': { 'message': 'Use boolean instead', 'fixWith': 'boolean' }, 92 | 'Function': { 'message': 'Use explicit type instead' }, 93 | }, 94 | }, 95 | ] 96 | '@typescript-eslint/explicit-member-accessibility': 97 | [ 98 | 'error', 99 | { 100 | accessibility: 'explicit', 101 | overrides: 102 | { 103 | accessors: 'no-public', 104 | constructors: 'no-public', 105 | methods: 'no-public', 106 | properties: 'no-public', 107 | parameterProperties: 'explicit', 108 | }, 109 | }, 110 | ] 111 | '@typescript-eslint/method-signature-style': 2 112 | '@typescript-eslint/no-floating-promises': 2 113 | '@typescript-eslint/no-implied-eval': 2 114 | '@typescript-eslint/no-for-in-array': 2 115 | '@typescript-eslint/no-inferrable-types': 2 116 | '@typescript-eslint/no-invalid-void-type': 2 117 | '@typescript-eslint/no-misused-new': 2 118 | '@typescript-eslint/no-misused-promises': 2 119 | '@typescript-eslint/no-namespace': 2 120 | '@typescript-eslint/no-non-null-asserted-optional-chain': 2 121 | '@typescript-eslint/no-throw-literal': 2 122 | '@typescript-eslint/no-unnecessary-boolean-literal-compare': 2 123 | '@typescript-eslint/prefer-for-of': 2 124 | '@typescript-eslint/prefer-nullish-coalescing': 2 125 | '@typescript-eslint/switch-exhaustiveness-check': 2 126 | '@typescript-eslint/prefer-optional-chain': 2 127 | '@typescript-eslint/prefer-readonly': 2 128 | '@typescript-eslint/prefer-string-starts-ends-with': 0 129 | '@typescript-eslint/no-array-constructor': 2 130 | '@typescript-eslint/require-await': 2 131 | '@typescript-eslint/return-await': 2 132 | '@typescript-eslint/ban-ts-comment': 133 | [2, { 'ts-expect-error': false, 'ts-ignore': true, 'ts-nocheck': true, 'ts-check': false }] 134 | '@typescript-eslint/naming-convention': 135 | [ 136 | 2, 137 | { 138 | 
selector: 'memberLike', 139 | format: ['camelCase', 'PascalCase'], 140 | modifiers: ['private'], 141 | leadingUnderscore: 'forbid', 142 | }, 143 | ] 144 | '@typescript-eslint/no-unused-vars': 145 | [2, { varsIgnorePattern: '^_', argsIgnorePattern: '^_', ignoreRestSiblings: true }] 146 | '@typescript-eslint/member-ordering': 147 | [ 148 | 2, 149 | { 150 | default: 151 | [ 152 | 'public-static-field', 153 | 'protected-static-field', 154 | 'private-static-field', 155 | 'public-static-method', 156 | 'protected-static-method', 157 | 'private-static-method', 158 | 'public-instance-field', 159 | 'protected-instance-field', 160 | 'private-instance-field', 161 | 'public-constructor', 162 | 'protected-constructor', 163 | 'private-constructor', 164 | 'public-instance-method', 165 | 'protected-instance-method', 166 | 'private-instance-method', 167 | ], 168 | }, 169 | ] 170 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | env: 3 | DEBUG: napi:* 4 | APP_NAME: spider-rs 5 | MACOSX_DEPLOYMENT_TARGET: '10.13' 6 | permissions: 7 | contents: write 8 | id-token: write 9 | on: 10 | push: 11 | branches: 12 | - main 13 | tags-ignore: 14 | - '**' 15 | paths-ignore: 16 | - '**/*.md' 17 | - LICENSE 18 | - '**/*.gitignore' 19 | - .editorconfig 20 | - docs/** 21 | pull_request: null 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | build: 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | settings: 32 | - host: macos-latest 33 | target: x86_64-apple-darwin 34 | build: yarn build --target x86_64-apple-darwin 35 | - host: windows-latest 36 | target: x86_64-pc-windows-msvc 37 | build: yarn build --target x86_64-pc-windows-msvc 38 | - host: windows-latest 39 | target: i686-pc-windows-msvc 40 | build: | 41 | choco install openssl.light 42 | set OPENSSL_LIB_DIR=C:\Program Files\OpenSSL\lib 43 | set OPENSSL_INCLUDE_DIR=C:\Program Files\OpenSSL\include 44 | yarn build --target i686-pc-windows-msvc 45 | # timeout issue - signals not working with swc core 46 | # yarn test 47 | - host: ubuntu-latest 48 | target: x86_64-unknown-linux-gnu 49 | setup: | 50 | sudo apt-get update 51 | sudo apt-get install -y gcc build-essential cmake openssl libssl-dev ca-certificates libc6 perl 52 | build: yarn build --target x86_64-unknown-linux-gnu 53 | # - host: ubuntu-latest 54 | # target: x86_64-unknown-linux-musl 55 | # setup: | 56 | # sudo apt-get update && sudo apt-get install -y build-essential pkg-config cmake musl-tools musl-dev openssl libssl-dev ca-certificates gcc g++ libc6 57 | # export CC=musl-gcc 58 | # docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine 59 | # build: yarn build --target x86_64-unknown-linux-musl 60 | # env: 61 | # CXXFLAGS: '--stdlib=libc++ -L/usr/lib/llvm-18/lib -static' 62 | - host: macos-latest 63 | target: aarch64-apple-darwin 64 | build: yarn build --target aarch64-apple-darwin 65 | - host: ubuntu-latest 66 | target: aarch64-unknown-linux-gnu 67 | docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-debian-aarch64 68 | setup: | 69 | sudo apt-get update 70 | sudo apt-get install -y gcc-aarch64-linux-gnu build-essential cmake openssl libssl-dev ca-certificates gcc libc6 perl pkg-config 71 | build: yarn build --target aarch64-unknown-linux-gnu 72 | # - host: ubuntu-latest 73 | # target: armv7-unknown-linux-gnueabihf 74 | # setup: | 75 | # sudo apt-get update 76 | # sudo apt-get 
install build-essential pkg-config perl gcc cmake libc6 ca-certificates openssl libssl-dev gcc-arm-linux-gnueabihf -y 77 | # build: yarn build --target armv7-unknown-linux-gnueabihf 78 | # env: 79 | # CXXFLAGS: '--stdlib=libc++ -L/usr/lib/llvm-18/lib -static' 80 | - host: ubuntu-latest 81 | target: aarch64-linux-android 82 | setup: | 83 | sudo apt-get update 84 | sudo apt-get install -y build-essential cmake openssl libssl-dev openssl 85 | build: yarn build --target aarch64-linux-android 86 | - host: ubuntu-latest 87 | target: armv7-linux-androideabi 88 | setup: | 89 | sudo apt-get update 90 | sudo apt-get install -y build-essential cmake openssl libssl-dev perl libc6 gcc ca-certificates 91 | build: yarn build --target armv7-linux-androideabi 92 | # - host: ubuntu-latest 93 | # target: aarch64-unknown-linux-musl 94 | # setup: | 95 | # sudo apt-get update && sudo apt-get install -y pkg-config clang perl-utils build-essential musl-tools musl-dev ca-certificates gcc g++ libc6 perl openssl libssl-dev 96 | # export CC=musl-gcc 97 | # docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine 98 | # build: | 99 | # set -e && 100 | # rustup target add aarch64-unknown-linux-musl && 101 | # yarn build --target aarch64-unknown-linux-musl 102 | # env: 103 | # CXXFLAGS: '--stdlib=libc++ -L/usr/lib/llvm-18/lib -static' 104 | - host: windows-latest 105 | target: aarch64-pc-windows-msvc 106 | build: | 107 | choco install openssl.light 108 | set OPENSSL_LIB_DIR=C:\Program Files\OpenSSL-Win64\lib 109 | set OPENSSL_INCLUDE_DIR=C:\Program Files\OpenSSL-Win64\include 110 | yarn build --target aarch64-pc-windows-msvc 111 | name: stable - ${{ matrix.settings.target }} - node@20 112 | runs-on: ${{ matrix.settings.host }} 113 | steps: 114 | - uses: actions/checkout@v4 115 | - name: Setup node 116 | uses: actions/setup-node@v4 117 | if: ${{ !matrix.settings.docker }} 118 | with: 119 | node-version: 20 120 | cache: yarn 121 | - name: Install 122 | uses: dtolnay/rust-toolchain@stable 123 | if: ${{ !matrix.settings.docker }} 124 | with: 125 | toolchain: stable 126 | targets: ${{ matrix.settings.target }} 127 | - name: Cache cargo 128 | uses: actions/cache@v4 129 | with: 130 | path: | 131 | ~/.cargo/registry/index/ 132 | ~/.cargo/registry/cache/ 133 | ~/.cargo/git/db/ 134 | .cargo-cache 135 | target/ 136 | key: ${{ matrix.settings.target }}-cargo-${{ matrix.settings.host }} 137 | - uses: goto-bus-stop/setup-zig@v2 138 | if: ${{ matrix.settings.target == 'armv7-unknown-linux-gnueabihf' }} 139 | with: 140 | version: 0.12.0 141 | - name: Setup toolchain 142 | run: ${{ matrix.settings.setup }} 143 | if: ${{ matrix.settings.setup }} 144 | shell: bash 145 | - name: Install dependencies 146 | run: yarn --no-immutable 147 | - name: Setup node x86 148 | uses: actions/setup-node@v4 149 | if: matrix.settings.target == 'i686-pc-windows-msvc' 150 | with: 151 | node-version: 20 152 | cache: yarn 153 | architecture: x86 154 | - name: Build in docker 155 | uses: addnab/docker-run-action@v3 156 | if: ${{ matrix.settings.docker }} 157 | with: 158 | image: ${{ matrix.settings.docker }} 159 | options: '--user 0:0 -v ${{ github.workspace }}/.cargo-cache/git/db:/usr/local/cargo/git/db -v ${{ github.workspace }}/.cargo/registry/cache:/usr/local/cargo/registry/cache -v ${{ github.workspace }}/.cargo/registry/index:/usr/local/cargo/registry/index -v ${{ github.workspace }}:/build -w /build' 160 | run: ${{ matrix.settings.build }} 161 | - name: Build 162 | run: ${{ matrix.settings.build }} 163 | if: ${{ !matrix.settings.docker }} 164 | shell: bash 
165 | - name: Upload artifact 166 | uses: actions/upload-artifact@v4 167 | with: 168 | name: bindings-${{ matrix.settings.target }} 169 | path: ${{ env.APP_NAME }}.*.node 170 | if-no-files-found: error 171 | 172 | # build-freebsd: 173 | # runs-on: macos-13 174 | # name: Build FreeBSD 175 | # steps: 176 | # - uses: actions/checkout@v4 177 | # - name: Build 178 | # id: build 179 | # uses: cross-platform-actions/action@v0.25.0 180 | # env: 181 | # DEBUG: napi:* 182 | # RUSTUP_IO_THREADS: 1 183 | # with: 184 | # operating_system: freebsd 185 | # version: '13.2' 186 | # memory: 13G 187 | # cpu_count: 3 188 | # environment_variables: DEBUG RUSTUP_IO_THREADS 189 | # shell: bash 190 | # run: | 191 | # sudo pkg install -y -f curl node libnghttp2 npm openssl 192 | # sudo npm install -g yarn --ignore-scripts 193 | # curl https://sh.rustup.rs -sSf --output rustup.sh 194 | # sh rustup.sh -y --profile minimal --default-toolchain stable 195 | # source "$HOME/.cargo/env" 196 | # echo "~~~~ rustc --version ~~~~" 197 | # rustc --version 198 | # echo "~~~~ node -v ~~~~" 199 | # node -v 200 | # echo "~~~~ yarn --version ~~~~" 201 | # yarn --version 202 | # pwd 203 | # ls -lah 204 | # whoami 205 | # env 206 | # freebsd-version 207 | # yarn install 208 | # yarn build 209 | # strip -x *.node 210 | # yarn test 211 | # rm -rf node_modules 212 | # rm -rf target 213 | # rm -rf .yarn/cache 214 | # - name: Upload artifact 215 | # uses: actions/upload-artifact@v3 216 | # with: 217 | # name: bindings-freebsd 218 | # path: ${{ env.APP_NAME }}.*.node 219 | # if-no-files-found: error 220 | 221 | test-macOS-windows-binding: 222 | name: Test bindings on ${{ matrix.settings.target }} - node@${{ matrix.node }} 223 | needs: 224 | - build 225 | strategy: 226 | fail-fast: false 227 | matrix: 228 | settings: 229 | - host: windows-latest 230 | target: x86_64-pc-windows-msvc 231 | architecture: x64 232 | - host: macos-latest 233 | target: aarch64-apple-darwin 234 | architecture: arm64 235 | - host: macos-latest 236 | target: x86_64-apple-darwin 237 | architecture: x64 238 | node: 239 | - '18' 240 | - '20' 241 | runs-on: ${{ matrix.settings.host }} 242 | steps: 243 | - uses: actions/checkout@v4 244 | - name: Setup node 245 | uses: actions/setup-node@v4 246 | with: 247 | node-version: ${{ matrix.node }} 248 | cache: yarn 249 | architecture: ${{ matrix.settings.architecture }} 250 | - name: Install dependencies 251 | run: yarn --no-immutable 252 | - name: Download artifacts 253 | uses: actions/download-artifact@v4 254 | with: 255 | name: bindings-${{ matrix.settings.target }} 256 | path: . 257 | - name: List packages 258 | run: ls -R . 259 | shell: bash 260 | 261 | test-linux-x64-gnu-binding: 262 | name: Test bindings on Linux-x64-gnu - node@${{ matrix.node }} 263 | needs: 264 | - build 265 | strategy: 266 | fail-fast: false 267 | matrix: 268 | node: 269 | - '18' 270 | - '20' 271 | runs-on: ubuntu-latest 272 | steps: 273 | - uses: actions/checkout@v4 274 | - name: Setup node 275 | uses: actions/setup-node@v4 276 | with: 277 | node-version: ${{ matrix.node }} 278 | cache: yarn 279 | - name: Install dependencies 280 | run: yarn --no-immutable 281 | - name: Download artifacts 282 | uses: actions/download-artifact@v4 283 | with: 284 | name: bindings-x86_64-unknown-linux-gnu 285 | path: . 286 | - name: List packages 287 | run: ls -R . 
288 | shell: bash 289 | 290 | test-linux-x64-musl-binding: 291 | name: Test bindings on x86_64-unknown-linux-musl - node@${{ matrix.node }} 292 | needs: 293 | - build 294 | strategy: 295 | fail-fast: false 296 | matrix: 297 | node: 298 | - '18' 299 | - '20' 300 | runs-on: ubuntu-latest 301 | steps: 302 | - uses: actions/checkout@v4 303 | - name: Setup node 304 | uses: actions/setup-node@v4 305 | with: 306 | node-version: ${{ matrix.node }} 307 | cache: yarn 308 | - name: Install dependencies 309 | run: | 310 | yarn config set supportedArchitectures.libc "musl" 311 | yarn --no-immutable 312 | - name: Download artifacts 313 | uses: actions/download-artifact@v4 314 | with: 315 | name: bindings-x86_64-unknown-linux-musl 316 | path: . 317 | - name: List packages 318 | run: ls -R . 319 | shell: bash 320 | 321 | test-linux-aarch64-gnu-binding: 322 | name: Test bindings on aarch64-unknown-linux-gnu - node@${{ matrix.node }} 323 | needs: 324 | - build 325 | strategy: 326 | fail-fast: false 327 | matrix: 328 | node: 329 | - '18' 330 | - '20' 331 | runs-on: ubuntu-latest 332 | steps: 333 | - uses: actions/checkout@v4 334 | - name: Download artifacts 335 | uses: actions/download-artifact@v4 336 | with: 337 | name: bindings-aarch64-unknown-linux-gnu 338 | path: . 339 | - name: List packages 340 | run: ls -R . 341 | shell: bash 342 | - name: Install dependencies 343 | run: | 344 | yarn config set supportedArchitectures.cpu "arm64" 345 | yarn config set supportedArchitectures.libc "glibc" 346 | yarn --no-immutable 347 | - name: Set up QEMU 348 | uses: docker/setup-qemu-action@v3 349 | with: 350 | platforms: arm64 351 | - run: docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 352 | - name: Setup and run tests 353 | uses: addnab/docker-run-action@v3 354 | with: 355 | image: node:${{ matrix.node }}-slim 356 | options: '--platform linux/arm64 -v ${{ github.workspace }}:/build -w /build' 357 | run: | 358 | set -e 359 | yarn test 360 | ls -la 361 | 362 | test-linux-aarch64-musl-binding: 363 | name: Test bindings on aarch64-unknown-linux-musl - node@lts 364 | needs: 365 | - build 366 | runs-on: ubuntu-latest 367 | steps: 368 | - uses: actions/checkout@v4 369 | - name: Download artifacts 370 | uses: actions/download-artifact@v4 371 | with: 372 | name: bindings-aarch64-unknown-linux-musl 373 | path: . 374 | - name: List packages 375 | run: ls -R . 376 | shell: bash 377 | - name: Install dependencies 378 | run: | 379 | yarn config set supportedArchitectures.cpu "arm64" 380 | yarn config set supportedArchitectures.libc "musl" 381 | yarn --no-immutable 382 | - name: Set up QEMU 383 | uses: docker/setup-qemu-action@v3 384 | with: 385 | platforms: arm64 386 | - run: docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 387 | - name: Setup and run tests 388 | uses: addnab/docker-run-action@v3 389 | with: 390 | image: node:lts-alpine 391 | options: '--platform linux/arm64 -v ${{ github.workspace }}:/build -w /build' 392 | run: | 393 | set -e 394 | yarn test 395 | 396 | test-linux-arm-gnueabihf-binding: 397 | name: Test bindings on armv7-unknown-linux-gnueabihf - node@${{ matrix.node }} 398 | needs: 399 | - build 400 | strategy: 401 | fail-fast: false 402 | matrix: 403 | node: 404 | - '18' 405 | - '20' 406 | runs-on: ubuntu-latest 407 | steps: 408 | - uses: actions/checkout@v4 409 | - name: Download artifacts 410 | uses: actions/download-artifact@v4 411 | with: 412 | name: bindings-armv7-unknown-linux-gnueabihf 413 | path: . 414 | - name: List packages 415 | run: ls -R . 
416 | shell: bash 417 | - name: Install dependencies 418 | run: | 419 | yarn config set supportedArchitectures.cpu "arm" 420 | yarn --no-immutable 421 | - name: Set up QEMU 422 | uses: docker/setup-qemu-action@v3 423 | with: 424 | platforms: arm 425 | - run: docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 426 | - name: Setup and run tests 427 | uses: addnab/docker-run-action@v3 428 | with: 429 | image: node:${{ matrix.node }}-bullseye-slim 430 | options: '--platform linux/arm/v7 -v ${{ github.workspace }}:/build -w /build' 431 | run: | 432 | set -e 433 | yarn test 434 | ls -la 435 | 436 | publish: 437 | name: Publish 438 | runs-on: ubuntu-latest 439 | needs: 440 | - test-macOS-windows-binding 441 | - test-linux-x64-gnu-binding 442 | # - build-freebsd 443 | # - test-linux-x64-musl-binding 444 | # - test-linux-aarch64-gnu-binding 445 | # - test-linux-aarch64-musl-binding 446 | # - test-linux-arm-gnueabihf-binding 447 | steps: 448 | - uses: actions/checkout@v4 449 | - name: Setup node 450 | uses: actions/setup-node@v4 451 | with: 452 | node-version: 20 453 | cache: yarn 454 | - name: Install dependencies 455 | run: yarn --no-immutable 456 | - name: Download all artifacts 457 | uses: actions/download-artifact@v4 458 | with: 459 | path: artifacts 460 | - name: Move artifacts 461 | run: yarn artifacts 462 | - name: List packages 463 | run: ls -R ./npm 464 | shell: bash 465 | - name: Publish 466 | run: | 467 | npm config set provenance true 468 | if git log -1 --pretty=%B | grep "^[0-9]\+\.[0-9]\+\.[0-9]\+$"; 469 | then 470 | echo "//registry.npmjs.org/:_authToken=$NPM_TOKEN" >> ~/.npmrc 471 | npm publish --access public 472 | elif git log -1 --pretty=%B | grep "^[0-9]\+\.[0-9]\+\.[0-9]\+"; 473 | then 474 | echo "//registry.npmjs.org/:_authToken=$NPM_TOKEN" >> ~/.npmrc 475 | npm publish --tag next --access public 476 | else 477 | echo "Not a release, skipping publish" 478 | fi 479 | env: 480 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 481 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 482 | -------------------------------------------------------------------------------- /.github/workflows/bench.yml: -------------------------------------------------------------------------------- 1 | name: Bench Compare 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | checkout_and_test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | include: 17 | - node-version: 18.x 18 | - node-version: latest 19 | 20 | steps: 21 | - name: Checkout code from ${{ github.repository }} 22 | uses: actions/checkout@v4 23 | 24 | - name: Install OpenSSL 25 | run: sudo apt-get update && sudo apt-get install -y openssl 26 | 27 | - name: Install 28 | run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 29 | 30 | - name: Setup node 31 | uses: actions/setup-node@v4 32 | with: 33 | node-version: ${{ matrix.node-version }} 34 | cache: 'yarn' 35 | 36 | - name: Install yarn 37 | run: corepack enable && corepack prepare yarn@stable --activate 38 | 39 | - name: Install Deps 40 | run: yarn --no-immutable && yarn build && cd bench && npm i 41 | 42 | - name: Run Bench @spider-rs/spider-rs 43 | run: yarn bench 44 | 45 | - name: Run Bench OSS 46 | run: yarn bench:oss 47 | -------------------------------------------------------------------------------- /.github/workflows/book.yml: -------------------------------------------------------------------------------- 1 | name: github pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 
pull_request: 8 | 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-20.04 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }} 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Setup mdBook 18 | uses: peaceiris/actions-mdbook@v1 19 | with: 20 | mdbook-version: 'latest' 21 | 22 | - run: cd book && mdbook build 23 | 24 | - name: Deploy 25 | uses: peaceiris/actions-gh-pages@v3 26 | if: ${{ github.ref == 'refs/heads/main' }} 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | publish_dir: ./book/book 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/node 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 3 | 4 | ### Node ### 5 | # Logs 6 | logs 7 | *.log 8 | npm-debug.log* 9 | yarn-debug.log* 10 | yarn-error.log* 11 | lerna-debug.log* 12 | 13 | # Diagnostic reports (https://nodejs.org/api/report.html) 14 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 15 | 16 | # Runtime data 17 | pids 18 | *.pid 19 | *.seed 20 | *.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | lib-cov 24 | 25 | # Coverage directory used by tools like istanbul 26 | coverage 27 | *.lcov 28 | 29 | # nyc test coverage 30 | .nyc_output 31 | 32 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 33 | .grunt 34 | 35 | # Bower dependency directory (https://bower.io/) 36 | bower_components 37 | 38 | # node-waf configuration 39 | .lock-wscript 40 | 41 | # Compiled binary addons (https://nodejs.org/api/addons.html) 42 | build/Release 43 | 44 | # Dependency directories 45 | node_modules/ 46 | jspm_packages/ 47 | 48 | # TypeScript v1 declaration files 49 | typings/ 50 | 51 | # TypeScript cache 52 | *.tsbuildinfo 53 | 54 | # Optional npm cache directory 55 | .npm 56 | 57 | # Optional eslint cache 58 | .eslintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variables file 76 | .env 77 | .env.test 78 | 79 | # parcel-bundler cache (https://parceljs.org/) 80 | .cache 81 | 82 | # Next.js build output 83 | .next 84 | 85 | # Nuxt.js build / generate output 86 | .nuxt 87 | dist 88 | 89 | # Gatsby files 90 | .cache/ 91 | # Comment in the public line in if your project uses Gatsby and not Next.js 92 | # https://nextjs.org/blog/next-9-1#public-directory-support 93 | # public 94 | 95 | # vuepress build output 96 | .vuepress/dist 97 | 98 | # Serverless directories 99 | .serverless/ 100 | 101 | # FuseBox cache 102 | .fusebox/ 103 | 104 | # DynamoDB Local files 105 | .dynamodb/ 106 | 107 | # TernJS port file 108 | .tern-port 109 | 110 | # Stores VSCode versions used for testing VSCode extensions 111 | .vscode-test 112 | 113 | # End of https://www.toptal.com/developers/gitignore/api/node 114 | 115 | # Created by https://www.toptal.com/developers/gitignore/api/macos 116 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos 117 | 118 | ### macOS ### 119 | # General 120 | .DS_Store 121 | .AppleDouble 122 | .LSOverride 123 | 124 | # Icon must end with two 125 | Icon 126 | 127 | 128 | # Thumbnails 129 | ._* 130 | 131 | # Files that might appear in the root of a volume 
132 | .DocumentRevisions-V100 133 | .fseventsd 134 | .Spotlight-V100 135 | .TemporaryItems 136 | .Trashes 137 | .VolumeIcon.icns 138 | .com.apple.timemachine.donotpresent 139 | 140 | # Directories potentially created on remote AFP share 141 | .AppleDB 142 | .AppleDesktop 143 | Network Trash Folder 144 | Temporary Items 145 | .apdisk 146 | 147 | ### macOS Patch ### 148 | # iCloud generated files 149 | *.icloud 150 | 151 | # End of https://www.toptal.com/developers/gitignore/api/macos 152 | 153 | # Created by https://www.toptal.com/developers/gitignore/api/windows 154 | # Edit at https://www.toptal.com/developers/gitignore?templates=windows 155 | 156 | ### Windows ### 157 | # Windows thumbnail cache files 158 | Thumbs.db 159 | Thumbs.db:encryptable 160 | ehthumbs.db 161 | ehthumbs_vista.db 162 | 163 | # Dump file 164 | *.stackdump 165 | 166 | # Folder config file 167 | [Dd]esktop.ini 168 | 169 | # Recycle Bin used on file shares 170 | $RECYCLE.BIN/ 171 | 172 | # Windows Installer files 173 | *.cab 174 | *.msi 175 | *.msix 176 | *.msm 177 | *.msp 178 | 179 | # Windows shortcuts 180 | *.lnk 181 | 182 | # End of https://www.toptal.com/developers/gitignore/api/windows 183 | 184 | #Added by cargo 185 | 186 | /target 187 | Cargo.lock 188 | 189 | .pnp.* 190 | .yarn/* 191 | !.yarn/patches 192 | !.yarn/plugins 193 | !.yarn/releases 194 | !.yarn/sdks 195 | !.yarn/versions 196 | 197 | *.node 198 | 199 | # index.d.ts 200 | # index.js 201 | __test__/*.js 202 | 203 | /storage 204 | /bench/*.js 205 | /bench/case/**.js 206 | /bench/storage/ -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | .cargo 4 | .github 5 | npm 6 | .eslintrc 7 | .prettierignore 8 | rustfmt.toml 9 | yarn.lock 10 | *.node 11 | .yarn 12 | __test__ 13 | renovate.json 14 | book 15 | examples 16 | build.rs 17 | src 18 | Cargo.toml 19 | .vscode 20 | tsconfig.json 21 | bench -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | target 2 | .yarn -------------------------------------------------------------------------------- /.taplo.toml: -------------------------------------------------------------------------------- 1 | exclude = ["node_modules/**/*.toml"] 2 | 3 | # https://taplo.tamasfe.dev/configuration/formatter-options.html 4 | [formatting] 5 | align_entries = true 6 | indent_tables = true 7 | reorder_keys = true -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.procMacro.ignored": { "napi-derive": ["napi"] } 3 | } 4 | -------------------------------------------------------------------------------- /.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | 3 | npmAuditRegistry: https://registry.npmjs.org 4 | 5 | yarnPath: .yarn/releases/yarn-3.6.4.cjs 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2021" 3 | name = "spider-rs_spider-rs" 4 | version = "0.0.0" 5 | description = "The fastest web crawler written in Rust ported to nodejs." 
6 | repository = "https://github.com/spider-rs/spider-nodejs" 7 | authors = ["j-mendez "] 8 | 9 | [lib] 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | indexmap = "2" 14 | napi = { version = "2", default-features = false, features = ["napi4", "async", "tokio_rt", "serde-json"] } 15 | napi-derive = "2" 16 | num_cpus = "1" 17 | serde = "1" 18 | serde_json = "1" 19 | spider = { version = "2", default-features = false, features = [ 20 | "cron", 21 | "regex", 22 | "cookies", 23 | "socks", 24 | "chrome", 25 | "control", 26 | "chrome_intercept", 27 | "cache", 28 | "openai", 29 | "serde", 30 | "real_browser", 31 | "headers", 32 | "reqwest_rustls_tls", 33 | "io_uring", 34 | "sync", 35 | "disk", 36 | "cookies", 37 | "ua_generator", 38 | "encoding", 39 | "string_interner_buffer_backend", 40 | "balance" 41 | ] } 42 | spider_scraper = "0.1" 43 | 44 | [build-dependencies] 45 | napi-build = "2" 46 | 47 | [profile.release] 48 | lto = true 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Spider Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spider-rs 2 | 3 | The [spider](https://github.com/spider-rs/spider) project ported to Node.js 4 | 5 | ## Getting Started 6 | 7 | 1. 
`npm i @spider-rs/spider-rs --save` 8 | 9 | ```ts 10 | import { Website, pageTitle } from '@spider-rs/spider-rs' 11 | 12 | const website = new Website('https://rsseau.fr') 13 | .withHeaders({ 14 | authorization: 'somerandomjwt', 15 | }) 16 | .withBudget({ 17 | '*': 20, // limit max request 20 pages for the website 18 | '/docs': 10, // limit only 10 pages on the `/docs` paths 19 | }) 20 | .withBlacklistUrl(['/resume']) // regex or pattern matching to ignore paths 21 | .build() 22 | 23 | // optional: page event handler 24 | const onPageEvent = (_err, page) => { 25 | const title = pageTitle(page) // comment out to increase performance if title not needed 26 | console.info(`Title of ${page.url} is '${title}'`) 27 | website.pushData({ 28 | status: page.statusCode, 29 | html: page.content, 30 | url: page.url, 31 | title, 32 | }) 33 | } 34 | 35 | await website.crawl(onPageEvent) 36 | await website.exportJsonlData('./storage/rsseau.jsonl') 37 | console.log(website.getLinks()) 38 | ``` 39 | 40 | Collect the resources for a website. 41 | 42 | ```ts 43 | import { Website } from '@spider-rs/spider-rs' 44 | 45 | const website = new Website('https://rsseau.fr') 46 | .withBudget({ 47 | '*': 20, 48 | '/docs': 10, 49 | }) 50 | // you can use regex or string matches to ignore paths 51 | .withBlacklistUrl(['/resume']) 52 | .build() 53 | 54 | await website.scrape() 55 | console.log(website.getPages()) 56 | ``` 57 | 58 | Run the crawls in the background on another thread. 59 | 60 | ```ts 61 | import { Website } from '@spider-rs/spider-rs' 62 | 63 | const website = new Website('https://rsseau.fr') 64 | 65 | const onPageEvent = (_err, page) => { 66 | console.log(page) 67 | } 68 | 69 | await website.crawl(onPageEvent, true) 70 | // runs immediately 71 | ``` 72 | 73 | Use headless Chrome rendering for crawls. 74 | 75 | ```ts 76 | import { Website } from '@spider-rs/spider-rs' 77 | 78 | const website = new Website('https://rsseau.fr').withChromeIntercept(true, true) 79 | 80 | const onPageEvent = (_err, page) => { 81 | console.log(page) 82 | } 83 | 84 | // the third param determines headless chrome usage. 85 | await website.crawl(onPageEvent, false, true) 86 | console.log(website.getLinks()) 87 | ``` 88 | 89 | Cron jobs can be done with the following. 90 | 91 | ```ts 92 | import { Website } from '@spider-rs/spider-rs' 93 | 94 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *') 95 | // sleep function to test cron 96 | const stopCron = (time: number, handle) => { 97 | return new Promise((resolve) => { 98 | setTimeout(() => { 99 | resolve(handle.stop()) 100 | }, time) 101 | }) 102 | } 103 | 104 | const links = [] 105 | 106 | const onPageEvent = (err, value) => { 107 | links.push(value) 108 | } 109 | 110 | const handle = await website.runCron(onPageEvent) 111 | 112 | // stop the cron in 4 seconds 113 | await stopCron(4000, handle) 114 | ``` 115 | 116 | Use the crawl shortcut to get the page content and url. 117 | 118 | ```ts 119 | import { crawl } from '@spider-rs/spider-rs' 120 | 121 | const { links, pages } = await crawl('https://rsseau.fr') 122 | console.log(pages) 123 | ``` 124 | 125 | ## Benchmarks 126 | 127 | View the [benchmarks](./bench/README.md) to see a breakdown between libs and platforms. 
128 | 129 | Test url: `https://espn.com` 130 | 131 | | `libraries` | `pages` | `speed` | 132 | | :--------------------------- | :-------- | :------ | 133 | | **`spider(rust): crawl`** | `150,387` | `1m` | 134 | | **`spider(nodejs): crawl`** | `150,387` | `153s` | 135 | | **`spider(python): crawl`** | `150,387` | `186s` | 136 | | **`scrapy(python): crawl`** | `49,598` | `1h` | 137 | | **`crawlee(nodejs): crawl`** | `18,779` | `30m` | 138 | 139 | The benches above were ran on a mac m1, spider on linux arm machines performs about 2-10x faster. 140 | 141 | ## Development 142 | 143 | Install the napi cli `npm i @napi-rs/cli --global`. 144 | 145 | 1. `yarn build:test` 146 | -------------------------------------------------------------------------------- /__test__/index.spec.ts: -------------------------------------------------------------------------------- 1 | import test from 'ava' 2 | import { crawl, Website, Page, type NPage, Cron, pageTitle } from '../index.js' 3 | 4 | const TEST_URL = 'https://choosealicense.com' 5 | 6 | test('shortcut crawl native', async (t) => { 7 | const { links, pages } = await crawl(TEST_URL) 8 | 9 | t.assert(links.length > 1, 'should be more than one link') 10 | t.assert(pages.length > 1, 'should be more than one page') 11 | }) 12 | 13 | test('new website native', async (t) => { 14 | const website = new Website(TEST_URL) 15 | await website.crawl() 16 | 17 | t.assert(website.getLinks().length > 1, 'should be more than one link') 18 | }) 19 | 20 | test('new website scrape native', async (t) => { 21 | const website = new Website(TEST_URL) 22 | await website.scrape() 23 | 24 | t.assert(website.getPages().length > 1, 'should be more than one page') 25 | }) 26 | 27 | test('new website native with custom config', async (t) => { 28 | const website = new Website(TEST_URL) 29 | .withHeaders({ 30 | authorization: 'somerandomjwt', 31 | }) 32 | .build() 33 | 34 | await website.crawl() 35 | 36 | t.assert(website.getLinks().length > 1, 'should be more than one page') 37 | }) 38 | 39 | test('new website native budget one page', async (t) => { 40 | const website = new Website(TEST_URL) 41 | .withBudget({ 42 | '*': 1, 43 | }) 44 | .build() 45 | 46 | await website.crawl() 47 | 48 | t.assert(website.getLinks().length === 1, 'should be one link') 49 | }) 50 | 51 | test('new website native blacklist pages', async (t) => { 52 | const website = new Website(TEST_URL).withBlacklistUrl(['/blog', new RegExp('/books').source, '/resume']).build() 53 | 54 | await website.crawl() 55 | 56 | const links = website.getLinks() 57 | 58 | // should be valid unless new pages and routes are created. 59 | t.assert(links.length > 1 && !links.includes(`${TEST_URL}/blog`), 'should be more than one page') 60 | }) 61 | 62 | test('new website native onPageEvent', async (t) => { 63 | const website = new Website(TEST_URL) 64 | 65 | const links: NPage[] = [] 66 | 67 | const onPageEvent = (err: Error | null, value: NPage) => { 68 | links.push(value) 69 | } 70 | 71 | // running in background can be done with a sleep timer for test. 72 | const backgroundStream = false 73 | 74 | await website.crawl(onPageEvent, backgroundStream) 75 | 76 | // should be valid unless new pages and routes are created. 
77 | t.assert(links.length > 1, 'should be more than one page') 78 | }) 79 | 80 | test('new website native with title selector', async (t) => { 81 | const website = new Website(TEST_URL) 82 | 83 | const links: { url: string; title: string }[] = [] 84 | 85 | const onPageEvent = async (_err: Error | null, page: NPage) => { 86 | const title = pageTitle(page) 87 | links.push({ title, url: page.url }) 88 | } 89 | 90 | await website.crawl(onPageEvent) 91 | 92 | // should be valid unless new pages and routes are created. 93 | t.assert(links.length > 1, 'should be more than one page') 94 | }) 95 | 96 | // experimental - does not work on all platforms most likely due to time differences. 97 | test.skip('new website native cron', async (t) => { 98 | const website = new Website(TEST_URL).withCron('1/5 * * * * *') 99 | // sleep function to test cron 100 | const sleep = (time: number, handle: Cron) => { 101 | return new Promise((resolve) => { 102 | setTimeout(() => { 103 | resolve(handle.stop()) 104 | }, time) 105 | }) 106 | } 107 | 108 | const links: NPage[] = [] 109 | 110 | const onPageEvent = (err: Error | null, value: NPage) => { 111 | links.push(value) 112 | } 113 | 114 | const handle = await website.runCron(onPageEvent) 115 | 116 | await sleep(4000, handle) 117 | 118 | // should be valid unless new pages and routes are created. 119 | t.assert(links.length > 1, 'should be more than one page') 120 | }) 121 | 122 | test('new website native with subscriptions', async (t) => { 123 | const website = new Website(TEST_URL) 124 | 125 | const links: NPage[] = [] 126 | 127 | const onPageEvent = (_err: Error | null, value: NPage) => { 128 | links.push(value) 129 | } 130 | 131 | const id = website.subscribe(onPageEvent) 132 | 133 | await website.crawl() 134 | 135 | website.unsubscribe(id) 136 | 137 | // should be valid unless new pages and routes are created. 138 | t.assert(links.length > 1, 'should be more than one page') 139 | }) 140 | 141 | test('new single page', async (t) => { 142 | const page = new Page(TEST_URL) 143 | await page.fetch() 144 | const links = await page.getLinks() 145 | 146 | // should be valid unless new pages and routes are created. 
147 | t.assert(links.length > 1, 'should be more than one link') 148 | t.assert(page.getHtml().length >= 100, 'should be valid html') 149 | t.assert(page.getBytes().length >= 100, 'should be valid bytes') 150 | }) 151 | 152 | test.skip('new website native headless', async (t) => { 153 | const website = new Website(TEST_URL) 154 | await website.crawl(undefined, false, true) 155 | 156 | t.assert(website.getLinks().length > 1, 'should be more than one link') 157 | }) 158 | 159 | test.skip('new website native smart mode', async (t) => { 160 | const website = new Website(TEST_URL) 161 | await website.crawlSmart(undefined, false) 162 | 163 | t.assert(website.getLinks().length > 1, 'should be more than one link') 164 | }) 165 | 166 | test.skip('new website native headless request interception', async (t) => { 167 | const website = new Website(TEST_URL).withChromeIntercept(true, true) 168 | await website.crawl(undefined, false, true) 169 | 170 | t.assert(website.getLinks().length > 1, 'should be more than one link') 171 | }) 172 | 173 | test('new website native raw content', async (t) => { 174 | const website = new Website(TEST_URL, true) 175 | 176 | const links: Buffer[] = [] 177 | 178 | const onPageEvent = (_err: Error | null, page: NPage) => page.rawContent && links.push(page.rawContent) 179 | 180 | await website.crawl(onPageEvent) 181 | 182 | t.assert(links.length > 1, 'should be more than one page') 183 | }) 184 | 185 | test('new website data store and export', async (t) => { 186 | const { promises } = await import('node:fs') 187 | const readFile = promises.readFile 188 | 189 | const website = new Website(TEST_URL, true) 190 | const outputFile = './storage/test.jsonl' 191 | 192 | const onPageEvent = (_err: Error | null, page: NPage) => website.pushData(page) 193 | 194 | await website.crawl(onPageEvent) 195 | await website.exportJsonlData(outputFile) 196 | 197 | const data = await readFile(outputFile) 198 | 199 | t.assert(!!data, 'should contain valid json file') 200 | }) 201 | 202 | test('new website stop', async (t) => { 203 | const website = new Website(TEST_URL) 204 | 205 | const onPageEvent = async (_err: Error | null, page: NPage) => { 206 | if (website.size >= 2) { 207 | await website.stop() 208 | } 209 | } 210 | 211 | await website.crawl(onPageEvent) 212 | 213 | t.assert(website.size < 30, 'should only have crawled a couple pages concurrently') 214 | }) 215 | 216 | test('new website stop background', async (t) => { 217 | const sleep = (time: number) => { 218 | return new Promise((resolve) => { 219 | setTimeout(() => { 220 | resolve(true) 221 | }, time) 222 | }) 223 | } 224 | 225 | const website = new Website(TEST_URL) 226 | let count = 0 227 | 228 | const onPageEvent = async (_err: Error | null, page: NPage) => { 229 | if (count) { 230 | await website.stop() 231 | } 232 | count++ 233 | } 234 | 235 | // lets wait for all other test since background shutsdown all crawls matching the url 236 | await sleep(2000) 237 | await website.crawl(onPageEvent, true) 238 | await sleep(2000) 239 | 240 | t.assert(count < 15, 'should only have crawled a couple pages concurrently in the background') 241 | }) 242 | -------------------------------------------------------------------------------- /bench/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | ```sh 4 | Linux 5 | 8-core CPU 6 | 32 GB of RAM memory 7 | ----------------------- 8 | ``` 9 | 10 | Test url: `https://choosealicense.com` (small) 11 | 32 pages 12 | 13 | | `libraries` | 
`speed` | 14 | | :-------------------------------- | :------ | 15 | | **`spider-rs: crawl 10 samples`** | `76ms` | 16 | | **`crawlee: crawl 10 samples`** | `1s` | 17 | 18 | Test url: `https://rsseau.fr` (medium) 19 | 211 pages 20 | 21 | | `libraries` | `speed` | 22 | | :-------------------------------- | :------ | 23 | | **`spider-rs: crawl 10 samples`** | `0.5s` | 24 | | **`crawlee: crawl 10 samples`** | `72s` | 25 | 26 | ```sh 27 | ---------------------- 28 | mac Apple M1 Max 29 | 10-core CPU 30 | 64 GB of RAM memory 31 | ----------------------- 32 | ``` 33 | 34 | Test url: `https://choosealicense.com` (small) 35 | 32 pages 36 | 37 | | `libraries` | `speed` | 38 | | :-------------------------------- | :------ | 39 | | **`spider-rs: crawl 10 samples`** | `286ms` | 40 | | **`crawlee: crawl 10 samples`** | `1.7s` | 41 | 42 | Test url: `https://rsseau.fr` (medium) 43 | 211 pages 44 | 45 | | `libraries` | `speed` | 46 | | :-------------------------------- | :------ | 47 | | **`spider-rs: crawl 10 samples`** | `2.5s` | 48 | | **`crawlee: crawl 10 samples`** | `75s` | 49 | 50 | The performance scales the larger the website and if throttling is needed. Linux benchmarks are about 10x faster than macOS for spider-rs. 51 | -------------------------------------------------------------------------------- /bench/base.ts: -------------------------------------------------------------------------------- 1 | export const iterations = process.env.BENCH_COUNT ? parseInt(process.env.BENCH_COUNT, 10) : 20 2 | 3 | export const TEST_URL = 'https://choosealicense.com' 4 | export const TEST_URL_MEDIUM = 'https://rsseau.fr' 5 | export const TEST_URL_LARGE = 'https://espn.com' 6 | 7 | export enum BenchSizes { 8 | SMALL = 'SMALL', 9 | MEDIUM = 'MEDIUM', 10 | LARGE = 'LARGE', 11 | } 12 | -------------------------------------------------------------------------------- /bench/case/crawlee.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | import { CheerioCrawler } from 'crawlee' 3 | import { TEST_URL, iterations } from '../base' 4 | 5 | export async function bench(url = TEST_URL, size = 'SMALL') { 6 | // @ts-ignore 7 | const crawler = new CheerioCrawler({ 8 | // @ts-ignore 9 | async requestHandler({ enqueueLinks }) { 10 | // @ts-ignore 11 | await enqueueLinks() 12 | }, 13 | }) 14 | 15 | let duration = 0 16 | 17 | const run = async () => { 18 | const startTime = performance.now() 19 | // @ts-ignore 20 | await crawler.run([url]) 21 | duration += performance.now() - startTime 22 | } 23 | 24 | const bm = async (cb: () => Promise, i = 0) => { 25 | await cb() 26 | if (i < iterations) { 27 | await bm(cb, i + 1) 28 | } 29 | } 30 | 31 | await bm(run) 32 | 33 | console.log( 34 | JSON.stringify([ 35 | { 36 | name: `crawlee - OPS/S [${size}:PAGE]`, 37 | unit: 'OPS/S', 38 | value: 1000 / (duration / iterations), 39 | }, 40 | ]), 41 | ) 42 | } 43 | -------------------------------------------------------------------------------- /bench/case/spider.ts: -------------------------------------------------------------------------------- 1 | import { Website } from '../../index.js' 2 | import { TEST_URL, iterations } from '../base' 3 | 4 | export async function bench(url = TEST_URL, size = 'SMALL') { 5 | const website = new Website(url) 6 | 7 | let duration = 0 8 | 9 | const run = async () => { 10 | const startTime = performance.now() 11 | await website.crawl() 12 | duration += performance.now() - startTime 13 | } 14 | 15 | const bm = async (cb: () => Promise, i = 0) => { 16 | await 
cb() 17 | if (i < iterations) { 18 | await bm(cb, i + 1) 19 | } 20 | } 21 | 22 | await bm(run) 23 | 24 | console.log( 25 | JSON.stringify([ 26 | { 27 | name: `@spider-rs/spider-rs - OPS/S [${size}:PAGE]`, 28 | unit: 'OPS/S', 29 | value: 1000 / (duration / iterations), 30 | }, 31 | ]), 32 | ) 33 | } 34 | -------------------------------------------------------------------------------- /bench/compare.ts: -------------------------------------------------------------------------------- 1 | import { TEST_URL_MEDIUM, TEST_URL_LARGE, BenchSizes } from './base' 2 | import { bench } from './case/spider' 3 | 4 | // small 5 | bench() 6 | // small/medium 7 | bench(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 8 | // large 150k pages plus 9 | if (process.env.BENCH_LARGE) { 10 | bench(TEST_URL_LARGE, BenchSizes.LARGE) 11 | } 12 | -------------------------------------------------------------------------------- /bench/crawlee.ts: -------------------------------------------------------------------------------- 1 | import { TEST_URL_MEDIUM, TEST_URL_LARGE, BenchSizes } from './base' 2 | import { bench } from './case/crawlee' 3 | 4 | // small 5 | bench() 6 | // small/medium 7 | bench(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 8 | // large 150k pages plus 9 | if (process.env.BENCH_LARGE) { 10 | bench(TEST_URL_LARGE, BenchSizes.LARGE) 11 | } 12 | -------------------------------------------------------------------------------- /bench/oss.ts: -------------------------------------------------------------------------------- 1 | import { bench } from './case/spider' 2 | import { bench as benchCrawlee } from './case/crawlee' 3 | import { TEST_URL_MEDIUM, BenchSizes } from './base' 4 | ;(async () => { 5 | await bench() 6 | await bench(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 7 | await benchCrawlee() 8 | await benchCrawlee(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 9 | })() 10 | -------------------------------------------------------------------------------- /bench/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-bench", 3 | "version": "1.0.0", 4 | "author": "Jeff Mendez ", 5 | "publish": false, 6 | "devDependencies": { 7 | "@napi-rs/cli": "^2.16.5", 8 | "@types/node": "^20.10.0", 9 | "crawlee": "^3.6.2", 10 | "typescript": "^5.3.2" 11 | }, 12 | "scripts": { 13 | "bench": "tsc && NODE_ENV=production node ./compare.js", 14 | "bench:oss": "tsc && NODE_ENV=production CRAWLEE_LOG_LEVEL=off node ./oss.js" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /book/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Jeff Mendez"] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "spider-rs" 7 | 8 | [output.html] 9 | git-repository-url = "https://github.com/spider-rs/spider-nodejs/tree/main/book" 10 | edit-url-template = "https://github.com/spider-rs/spider-nodejs/edit/main/book/{path}" 11 | -------------------------------------------------------------------------------- /book/src/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `Spider-RS` is the fastest web crawler and indexer written in Rust ported to Node.js. 
4 | 5 | - Concurrent 6 | - Streaming 7 | - Decentralization 8 | - Headless Chrome [Rendering](https://github.com/mattsse/chromiumoxide) 9 | - HTTP Proxies 10 | - Cron Jobs 11 | - Subscriptions 12 | - Blacklisting and Budgeting Depth 13 | - Written in [Rust](https://www.rust-lang.org/) for speed, safety, and simplicity 14 | 15 | Spider powers some big tools and helps bring the crawling aspect to almost no downtime with the correct setup, view the [spider](https://github.com/spider-rs/spider) project to learn more. 16 | 17 | ```ts 18 | import { Website } from '@spider-rs/spider-rs' 19 | 20 | const website = new Website('https://choosealicense.com') 21 | 22 | await website.crawl() 23 | 24 | console.log(website.getLinks()) 25 | ``` 26 | -------------------------------------------------------------------------------- /book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [Introduction](./README.md) 4 | 5 | # User Guide 6 | 7 | - [Getting started](./getting-started.md) 8 | - [A simple example](./simple.md) 9 | 10 | # Configuration 11 | 12 | - [Website](./website.md) 13 | - [Page](./page.md) 14 | - [Environment](./env.md) 15 | 16 | # Usage 17 | 18 | - [Crawl](./crawl.md) 19 | - [Scrape](./scrape.md) 20 | - [Cron Job](./cron-job.md) 21 | - [Storing Data](./storing-data.md) 22 | 23 | # Benchmarks 24 | 25 | - [Compare](./benchmarks.md) 26 | -------------------------------------------------------------------------------- /book/src/benchmarks.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | Test url: `https://espn.com` 4 | Mac M1 64gb 10-core CPU 5 | 6 | | `libraries` | `pages` | `speed` | 7 | | :--------------------------- | :-------- | :------ | 8 | | **`spider(rust): crawl`** | `150,387` | `1m` | 9 | | **`spider(nodejs): crawl`** | `150,387` | `153s` | 10 | | **`spider(python): crawl`** | `150,387` | `186s` | 11 | | **`scrapy(python): crawl`** | `49,598` | `1h` | 12 | | **`crawlee(nodejs): crawl`** | `18,779` | `30m` | 13 | 14 | View the latest runs on [github](https://github.com/spider-rs/spider-nodejs/actions/workflows/bench.yml). 
15 | 16 | ```sh 17 | ----------------------- 18 | Linux 19 | 2-core CPU 20 | 7 GB of RAM memory 21 | ----------------------- 22 | ``` 23 | 24 | Test url: `https://choosealicense.com` (small) 25 | 32 pages 26 | 27 | | `libraries` | `speed` | 28 | | :-------------------------------- | :------ | 29 | | **`spider-rs: crawl 10 samples`** | `76ms` | 30 | | **`crawlee: crawl 10 samples`** | `1s` | 31 | 32 | Test url: `https://rsseau.fr` (medium) 33 | 211 pages 34 | 35 | | `libraries` | `speed` | 36 | | :-------------------------------- | :------ | 37 | | **`spider-rs: crawl 10 samples`** | `0.5s` | 38 | | **`crawlee: crawl 10 samples`** | `72s` | 39 | 40 | ```sh 41 | ---------------------- 42 | mac Apple M1 Max 43 | 10-core CPU 44 | 64 GB of RAM memory 45 | ----------------------- 46 | ``` 47 | 48 | Test url: `https://choosealicense.com` (small) 49 | 32 pages 50 | 51 | | `libraries` | `speed` | 52 | | :-------------------------------- | :------ | 53 | | **`spider-rs: crawl 10 samples`** | `286ms` | 54 | | **`crawlee: crawl 10 samples`** | `1.7s` | 55 | 56 | Test url: `https://rsseau.fr` (medium) 57 | 211 pages 58 | 59 | | `libraries` | `speed` | 60 | | :-------------------------------- | :------ | 61 | | **`spider-rs: crawl 10 samples`** | `2.5s` | 62 | | **`crawlee: crawl 10 samples`** | `75s` | 63 | 64 | The performance scales the larger the website and if throttling is needed. Linux benchmarks are about 10x faster than macOS for spider-rs. 65 | -------------------------------------------------------------------------------- /book/src/crawl.md: -------------------------------------------------------------------------------- 1 | # Crawl 2 | 3 | Crawl a website concurrently. 4 | 5 | ```ts 6 | import { Website } from '@spider-rs/spider-rs' 7 | 8 | // pass in the website url 9 | const website = new Website('https://rsseau.fr') 10 | 11 | await website.crawl() 12 | 13 | // [ "https://rsseau.fr/blog", ...] 14 | console.log(website.getLinks()) 15 | ``` 16 | 17 | ## Async Event 18 | 19 | You can pass in a async function as the first param to the crawl function for realtime updates streamed. 20 | 21 | ```ts 22 | import { Website } from '@spider-rs/spider-rs' 23 | 24 | const website = new Website('https://rsseau.fr') 25 | 26 | const onPageEvent = (err, value) => { 27 | console.log(value) 28 | } 29 | 30 | await website.crawl(onPageEvent) 31 | ``` 32 | 33 | ## Background 34 | 35 | You can run the request in the background and receive events with the second param set to `true`. 36 | 37 | ```ts 38 | import { Website } from '@spider-rs/spider-rs' 39 | 40 | const website = new Website('https://rsseau.fr') 41 | 42 | const onPageEvent = (err, value) => { 43 | console.log(value) 44 | } 45 | 46 | await website.crawl(onPageEvent, true) 47 | // this will run instantly as the crawl is in the background 48 | ``` 49 | 50 | ## Subscriptions 51 | 52 | You can setup many subscriptions to run events when a crawl happens. 53 | 54 | ```ts 55 | import { Website } from '@spider-rs/spider-rs' 56 | 57 | const website = new Website('https://rsseau.fr') 58 | 59 | const onPageEvent = (err, value) => { 60 | console.log(value) 61 | } 62 | 63 | const subscriptionID = website.subscribe(onPageEvent) 64 | 65 | await website.crawl() 66 | 67 | website.unsubscribe(subscriptionID) 68 | // this will run instantly as the crawl is in the background 69 | ``` 70 | 71 | ## Headless Chrome 72 | 73 | Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `true`. 
74 | If the `CHROME_URL` env variable is set, it will attempt to connect to Chrome running remotely and fall back to launching Chrome locally. Using a remote connection with `CHROME_URL` can 75 | drastically speed up runs. 76 | 77 | ```ts 78 | import { Website } from '@spider-rs/spider-rs' 79 | 80 | const website = new Website('https://rsseau.fr') 81 | 82 | const onPageEvent = (err, value) => { 83 | console.log(value) 84 | } 85 | 86 | // all params are optional. The third param determines headless rendering. 87 | await website.crawl(onPageEvent, false, true) 88 | // make sure to call unsubscribe when finished, or else the instance is kept alive while events are set up. 89 | website.unsubscribe() 90 | ``` 91 | -------------------------------------------------------------------------------- /book/src/cron-job.md: -------------------------------------------------------------------------------- 1 | # Cron Jobs 2 | 3 | Use a cron job that can run at any time of day to gather website data. 4 | 5 | ```ts 6 | import { Website } from '@spider-rs/spider-rs' 7 | 8 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *').build() 9 | 10 | // stream the pages of the website as the cron runs. 11 | const onPageEvent = (err, value) => { 12 | console.log(value) 13 | } 14 | 15 | const handle = await website.runCron(onPageEvent) 16 | ``` 17 | -------------------------------------------------------------------------------- /book/src/env.md: -------------------------------------------------------------------------------- 1 | # Environment 2 | 3 | Environment variables that adjust the project. 4 | 5 | ## CHROME_URL 6 | 7 | You can set the Chrome URL to connect to a remote instance. 8 | 9 | ```sh 10 | CHROME_URL=http://localhost:9222 11 | ``` 12 | -------------------------------------------------------------------------------- /book/src/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Make sure to have [node](https://nodejs.org/en/download) v10 or higher installed. 4 | 5 | Install the package with your favorite package manager. 6 | 7 | ```sh 8 | yarn add @spider-rs/spider-rs 9 | # or 10 | npm install @spider-rs/spider-rs 11 | ``` 12 | -------------------------------------------------------------------------------- /book/src/page.md: -------------------------------------------------------------------------------- 1 | # Page 2 | 3 | A single page on a website, useful if you need just the root url. 4 | 5 | ## New Page 6 | 7 | Get a new page with content. 8 | 9 | The first param is the url, the second is whether subdomains should be included, and the third is whether to include TLDs in links. 10 | 11 | Calling `page.fetch` is needed to get the content. 12 | 13 | ```ts 14 | import { Page } from '@spider-rs/spider-rs' 15 | 16 | const page = new Page('https://choosealicense.com', false, false) 17 | await page.fetch() 18 | ``` 19 | 20 | ## Page Links 21 | 22 | Get all the links related to a page. 23 | 24 | ```ts 25 | const page = new Page('https://choosealicense.com', false, false) 26 | await page.fetch() 27 | const links = await page.getLinks() 28 | console.log(links) 29 | ``` 30 | 31 | ## Page Html 32 | 33 | Get the HTML markup for the page. 34 | 35 | ```ts 36 | const page = new Page('https://choosealicense.com', false, false) 37 | await page.fetch() 38 | const html = page.getHtml() 39 | console.log(html) 40 | ``` 41 | 42 | ## Page Bytes 43 | 44 | Get the raw bytes of a page to store the files in a database.
45 | 46 | ```ts 47 | const page = new Page('https://choosealicense.com', false, false) 48 | await page.fetch() 49 | const bytes = page.getBytes() 50 | console.log(bytes) 51 | ``` 52 | -------------------------------------------------------------------------------- /book/src/scrape.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | Scrape a website and collect the resource data. 4 | 5 | ```ts 6 | import { Website } from '@spider-rs/spider-rs' 7 | 8 | // pass in the website url 9 | const website = new Website('https://rsseau.fr') 10 | 11 | await website.scrape() 12 | 13 | // [ { url: "https://rsseau.fr/blog", html: "..."}, ...] 14 | console.log(website.getPages()) 15 | ``` 16 | 17 | ## Headless Chrome 18 | 19 | Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `true`. 20 | If the `CHROME_URL` env variable is set, it will attempt to connect to Chrome running remotely and fall back to launching Chrome locally. Using a remote connection with `CHROME_URL` can 21 | drastically speed up runs. 22 | 23 | ```ts 24 | import { Website } from '@spider-rs/spider-rs' 25 | 26 | const website = new Website('https://rsseau.fr') 27 | 28 | const onPageEvent = (err, value) => { 29 | console.log(value) 30 | } 31 | 32 | // all params are optional. The third param determines headless rendering. 33 | await website.scrape(onPageEvent, false, true) 34 | ``` 35 | -------------------------------------------------------------------------------- /book/src/simple.md: -------------------------------------------------------------------------------- 1 | # A simple example 2 | 3 | We use a node addon built with napi to bring the Rust project to Node.js. 4 | 5 | There are some performance drawbacks from the addon; even so, the crawls are lightning fast and efficient. 6 | 7 | ## Usage 8 | 9 | The examples below can help you get started with spider. 10 | 11 | ### Basic 12 | 13 | A basic example. 14 | 15 | ```ts 16 | import { Website } from '@spider-rs/spider-rs' 17 | 18 | const website = new Website('https://choosealicense.com') 19 | 20 | await website.crawl() 21 | console.log(website.getLinks()) 22 | ``` 23 | 24 | ### Events 25 | 26 | You can pass a function (optionally async) as the first param to `crawl` and `scrape`. 27 | 28 | ```ts 29 | import { Website, type NPage } from '@spider-rs/spider-rs' 30 | 31 | const website = new Website('https://choosealicense.com') 32 | 33 | const links: NPage[] = [] 34 | 35 | const onPageEvent = async (err: Error | null, page: NPage) => { 36 | links.push(page) 37 | } 38 | 39 | await website.crawl(onPageEvent) 40 | console.log(website.getLinks()) 41 | ``` 42 | 43 | ### Selector 44 | 45 | The `pageTitle` helper allows you to extract the title of the page. 46 | 47 | ```ts 48 | import { Website, pageTitle } from '@spider-rs/spider-rs' 49 | 50 | const website = new Website('https://choosealicense.com') 51 | 52 | const links = [] 53 | 54 | const onPageEvent = async (err, page) => { 55 | links.push({ title: pageTitle(page), url: page.url }) 56 | } 57 | 58 | // params in order: event, background, and headless chrome 59 | await website.crawl(onPageEvent) 60 | ``` 61 | 62 | ## Shortcut 63 | 64 | You can use the `crawl` shortcut method to collect contents quickly without configuration.
65 | 66 | ```ts 67 | import { crawl } from '@spider-rs/spider-rs' 68 | 69 | const { links, pages } = await crawl('https://choosealicense.com') 70 | 71 | console.log([links, pages]) 72 | ``` 73 | -------------------------------------------------------------------------------- /book/src/storing-data.md: -------------------------------------------------------------------------------- 1 | # Storing Data 2 | 3 | Storing data can be done to collect the raw content for a website. 4 | 5 | This allows you to upload and download the content without UTF-8 conversion. The `rawContent` property only appears when 6 | the second param of the `Website` class constructor is set to `true`. 7 | 8 | ```ts 9 | import { Website, type NPage } from '@spider-rs/spider-rs' 10 | 11 | const rawContent = true 12 | const website = new Website('https://choosealicense.com', rawContent) 13 | 14 | const links: Buffer[] = [] 15 | 16 | const onPageEvent = (_err: Error | null, page: NPage) => { 17 | if (page.rawContent) { 18 | // we can download or store the content to disk now. 19 | links.push(page.rawContent) 20 | } 21 | } 22 | await website.crawl(onPageEvent) 23 | ``` 24 | -------------------------------------------------------------------------------- /book/src/website.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | The Website class is the foundation of the spider. 4 | 5 | ## Builder pattern 6 | 7 | We use the builder pattern to configure the website for crawling. 8 | 9 | \*note: Replace `https://choosealicense.com` in the examples below with your target website URL. 10 | 11 | ```ts 12 | import { Website } from '@spider-rs/spider-rs' 13 | 14 | const website = new Website('https://choosealicense.com') 15 | ``` 16 | 17 | ### Return Page Links 18 | 19 | Return links found on the page resource. 20 | 21 | ```ts 22 | const website = new Website('https://choosealicense.com') 23 | .withReturnPageLinks(true) 24 | .build() 25 | ``` 26 | 27 | ### Custom Headers 28 | 29 | Add custom HTTP headers to use when crawling/scraping. 30 | 31 | ```ts 32 | const website = new Website('https://choosealicense.com') 33 | .withHeaders({ 34 | authorization: 'somerandomjwt', 35 | }) 36 | .build() 37 | ``` 38 | 39 | ### Blacklist 40 | 41 | Prevent crawling a set path, url, or pattern with Regex. 42 | 43 | ```ts 44 | const website = new Website('https://choosealicense.com') 45 | .withBlacklistUrl(['/blog', new RegExp('/books').source, '/resume']) 46 | .build() 47 | ``` 48 | 49 | ### Whitelist 50 | 51 | Only crawl set paths, urls, or patterns with Regex. 52 | 53 | ```ts 54 | const website = new Website('https://choosealicense.com') 55 | .withWhitelistUrl(['/blog', new RegExp('/books').source, '/resume']) 56 | .build() 57 | ``` 58 | 59 | ### Crons 60 | 61 | Set up a cron job that can run at any time in the background using cron-syntax. 62 | 63 | ```ts 64 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *').build() 65 | ``` 66 | 67 | View the [cron](./cron-job.md) section for details on how to use the cron. 68 | 69 | ### Budget 70 | 71 | Add a crawl budget that limits the crawl to `x` amount of pages. 72 | 73 | ```ts 74 | const website = new Website('https://choosealicense.com') 75 | .withBudget({ 76 | '*': 1, 77 | }) 78 | .build() 79 | ``` 80 | 81 | ### Subdomains 82 | 83 | Include subdomains in request. 84 | 85 | ```ts 86 | const website = new Website('https://choosealicense.com').withSubdomains(true).build() 87 | ``` 88 | 89 | ### TLD 90 | 91 | Include TLDs in request.
92 | 93 | ```ts 94 | const website = new Website('https://choosealicense.com').withTld(true).build() 95 | ``` 96 | 97 | ### External Domains 98 | 99 | Add external domains to include with the website. 100 | 101 | ```ts 102 | const website = new Website('https://choosealicense.com').withExternalDomains(['https://www.myotherdomain.com']).build() 103 | ``` 104 | 105 | ### Proxy 106 | 107 | Use a proxy to crawl a website. 108 | 109 | ```ts 110 | const website = new Website('https://choosealicense.com').withProxies(['https://www.myproxy.com']).build() 111 | ``` 112 | 113 | ### Delays 114 | 115 | Add a delay between pages in milliseconds. Defaults to none. 116 | 117 | ```ts 118 | const website = new Website('https://choosealicense.com').withDelay(200).build() 119 | ``` 120 | 121 | ### Wait_For_Delay 122 | 123 | Wait for a delay on the page. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled. 124 | The first param is the seconds to delay and the second is the nanoseconds to delay by. 125 | 126 | ```ts 127 | // a delay of 2 seconds and 500 nanos 128 | const website = new Website('https://choosealicense.com').withWaitForDelay(2, 500).build() 129 | ``` 130 | 131 | ### Wait_For_Selector 132 | 133 | Wait for a CSS selector on the page with a max timeout. This method does nothing if the `chrome` feature is not enabled. 134 | 135 | ```ts 136 | // wait for the selector with a max timeout of 2 seconds and 500 nanos 137 | const website = new Website('https://choosealicense.com').withWaitForSelector('.news-feed', 2, 500).build() 138 | ``` 139 | 140 | ### Wait_For_Idle_Network 141 | 142 | Wait for idle network requests. This method does nothing if the `chrome` feature is not enabled. 143 | 144 | ```ts 145 | // wait for an idle network with a max timeout of 2 seconds and 500 nanos 146 | const website = new Website('https://choosealicense.com').withWaitForIdleNetwork(2, 500).build() 147 | ``` 148 | 149 | ### User-Agent 150 | 151 | Use a custom User-Agent. 152 | 153 | ```ts 154 | const website = new Website('https://choosealicense.com').withUserAgent('mybot/v1').build() 155 | ``` 156 | 157 | ### Chrome Remote Connection 158 | 159 | Add a Chrome remote connection url. This can be a JSON endpoint or a direct WebSocket (ws) connection. 160 | 161 | ```ts 162 | const website = new Website('https://choosealicense.com').withChromeConnection('http://localhost:9222/json/version').build() 163 | ``` 164 | 165 | 166 | ### OpenAI 167 | 168 | Use OpenAI to generate dynamic scripts to use with headless Chrome. Make sure to set the `OPENAI_API_KEY` env variable. 169 | 170 | ```ts 171 | const website = new Website('https://google.com') 172 | .withOpenai({ 173 | model: 'gpt-3.5-turbo', 174 | prompt: 'Search for movies', 175 | maxTokens: 300, 176 | }) 177 | .build() 178 | 179 | // make sure to crawl or scrape with the headless param set to true. 180 | ``` 181 | 182 | ### Screenshots 183 | 184 | Take a screenshot of the pages on crawl when using headless Chrome. 185 | 186 | ```ts 187 | const website = new Website('https://google.com') 188 | .withScreenshot({ 189 | params: { 190 | cdp_params: null, 191 | full_page: true, 192 | omit_background: false, 193 | }, 194 | bytes: false, 195 | save: true, 196 | output_dir: null, 197 | }) 198 | .build() 199 | 200 | // make sure to crawl or scrape with the headless param set to true. 201 | ``` 202 | 203 | ### Request Timeout 204 | 205 | Add a request timeout per page in milliseconds. The example shows 30 seconds.
206 | 207 | ```ts 208 | const website = new Website('https://choosealicense.com').withRequestTimeout(30000).build() 209 | ``` 210 | 211 | ### Respect Robots 212 | 213 | Respect the robots.txt file. 214 | 215 | ```ts 216 | const website = new Website('https://choosealicense.com').withRespectRobotsTxt(true).build() 217 | ``` 218 | 219 | ### Http2 Prior Knowledge 220 | 221 | Use HTTP/2 to connect if you know the website's server supports it. 222 | 223 | ```ts 224 | const website = new Website('https://choosealicense.com').withHttp2PriorKnowledge(true).build() 225 | ``` 226 | 227 | ### Chrome Network Interception 228 | 229 | Enable network interception when using Chrome to speed up requests. 230 | 231 | ```ts 232 | const website = new Website('https://choosealicense.com').withChromeIntercept(true, true).build() 233 | ``` 234 | 235 | ### Redirect Limit 236 | 237 | Set the redirect limit for requests. 238 | 239 | ```ts 240 | const website = new Website('https://choosealicense.com').withRedirectLimit(2).build() 241 | ``` 242 | 243 | ### Depth Limit 244 | 245 | Set the depth limit for the number of forward pages. 246 | 247 | ```ts 248 | const website = new Website('https://choosealicense.com').withDepth(3).build() 249 | ``` 250 | 251 | ### Cache 252 | 253 | Enable HTTP caching; this is useful when using the spider on a server. 254 | 255 | ```ts 256 | const website = new Website('https://choosealicense.com').withCaching(true).build() 257 | ``` 258 | 259 | ### Redirect Policy 260 | 261 | Set the redirect policy for requests, either strict or loose (default). Strict only allows redirects that match the domain. 262 | 263 | ```ts 264 | const website = new Website('https://choosealicense.com').withRedirectPolicy(true).build() 265 | ``` 266 | 267 | ## Chaining 268 | 269 | You can chain all of the configs together for simple configuration. 270 | 271 | ```ts 272 | const website = new Website('https://choosealicense.com') 273 | .withSubdomains(true) 274 | .withTld(true) 275 | .withUserAgent('mybot/v1') 276 | .withRespectRobotsTxt(true) 277 | .build() 278 | ``` 279 | 280 | ## Raw Content 281 | 282 | Set the second param of the website constructor to `true` to return content without UTF-8 conversion. 283 | This will populate `rawContent` and leave `content` empty when using subscriptions or the Page object. 284 | 285 | ```ts 286 | const rawContent = true 287 | const website = new Website('https://choosealicense.com', rawContent) 288 | await website.scrape() 289 | ``` 290 | 291 | ## Clearing Crawl Data 292 | 293 | Use `website.clear` to remove the links visited and page data, or `website.drainLinks` to drain the links visited. 294 | 295 | ```ts 296 | const website = new Website('https://choosealicense.com') 297 | await website.crawl() 298 | // links found ["https://...", "..."] 299 | console.log(website.getLinks()) 300 | website.clear() 301 | // links will be empty 302 | console.log(website.getLinks()) 303 | ``` 304 | 305 | ## Storing and Exporting Data 306 | 307 | Collecting data to store can be done with `website.pushData()` and `website.exportJsonlData()`. 308 | 309 | ```ts 310 | const website = new Website('https://choosealicense.com') 311 | 312 | const onPageEvent = (_err, page) => { 313 | website.pushData(page) 314 | } 315 | 316 | await website.crawl(onPageEvent) 317 | 318 | // uncomment to read the data. 319 | // console.log(website.readData()); 320 | 321 | // we only have one export method atm. Optional file path.
All data goes to ./storage by default. 322 | await website.exportJsonlData('./storage/test.jsonl') 323 | ``` 324 | 325 | ## Stop crawl 326 | 327 | To stop a crawl you can use `website.stop(id)`: pass in the crawl ID to stop a specific run, or leave it empty to stop all crawls. 328 | 329 | ```ts 330 | const website = new Website('https://choosealicense.com') 331 | 332 | const onPageEvent = (_err, page) => { 333 | console.log(page) 334 | // stop the concurrent crawl when 8 pages are found. 335 | if (website.size >= 8) { 336 | website.stop() 337 | } 338 | } 339 | 340 | await website.crawl(onPageEvent) 341 | ``` 342 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | extern crate napi_build; 2 | 3 | fn main() { 4 | napi_build::setup(); 5 | } 6 | -------------------------------------------------------------------------------- /examples/basic.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node basic.mjs 3 | import { Website } from '../index.js' 4 | 5 | const url = process.argv[2] || 'https://choosealicense.com' 6 | 7 | const website = new Website(url).withBudget({ '*': 300, licenses: 0 }) 8 | 9 | const onPageEvent = (_err, value) => { 10 | console.log(`Found: ${value.url}`) 11 | } 12 | 13 | const startTime = performance.now() 14 | 15 | await website.crawl(onPageEvent) 16 | 17 | const duration = performance.now() - startTime 18 | 19 | console.log('Finished', url, 'pages found ' + website.getLinks().length, 'elapsed duration ' + duration + 'ms') 20 | -------------------------------------------------------------------------------- /examples/cron.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node cron.mjs 3 | import { Website } from '@spider-rs/spider-rs' 4 | 5 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *').build() 6 | 7 | // stream the pages of the website as the cron runs.
8 | const onPageEvent = (_err, value) => { 9 | console.log(value) 10 | } 11 | 12 | const handle = await website.runCron(onPageEvent) 13 | console.log('Starting the Runner for 40 seconds') 14 | 15 | setTimeout(async () => { 16 | await handle.stop() 17 | }, 40000) 18 | -------------------------------------------------------------------------------- /examples/openai.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node openai.mjs 3 | import { Website } from '../index.js' 4 | 5 | const url = process.argv[2] || 'https://google.com' 6 | const headless = true 7 | 8 | const website = new Website(url) 9 | .withBudget({ '*': 1 }) 10 | .withScreenshot({ 11 | params: { 12 | cdp_params: null, 13 | full_page: true, 14 | omit_background: false, 15 | }, 16 | bytes: false, 17 | save: true, 18 | output_dir: null, 19 | }) 20 | .withOpenai({ 21 | model: 'gpt-4-1106-preview', 22 | prompt: 'Search for movies', 23 | max_tokens: 100, 24 | }) 25 | 26 | const onPageEvent = (_err, value) => { 27 | console.log(`Found: ${value.url}\nHTML: ${value.content}`) 28 | } 29 | 30 | const startTime = performance.now() 31 | 32 | await website.crawl(onPageEvent, false, headless) 33 | 34 | const duration = performance.now() - startTime 35 | 36 | console.log('Finished', url, 'pages found ' + website.getLinks().length, 'elasped duration ' + duration + 'ms') 37 | -------------------------------------------------------------------------------- /examples/subscription.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node subscription.mjs 3 | import { Website } from '@spider-rs/spider-rs' 4 | 5 | const website = new Website('https://choosealicense.com') 6 | 7 | const onPageEvent = (_err, value) => { 8 | console.log(value) 9 | console.log(`Links found: ${website.size}`) 10 | } 11 | 12 | const id = website.subscribe(onPageEvent) 13 | await website.crawl() 14 | website.unsubscribe(id) 15 | -------------------------------------------------------------------------------- /index.d.ts: -------------------------------------------------------------------------------- 1 | /* tslint:disable */ 2 | /* eslint-disable */ 3 | 4 | /* auto-generated by NAPI-RS */ 5 | 6 | /** a simple page object */ 7 | export interface NPage { 8 | /** The url found. */ 9 | url: string 10 | /** The content of the page found. */ 11 | content: string 12 | /** The HTTP status code. */ 13 | statusCode: number 14 | /** The Raw content if the resource needs to be sent as binary. */ 15 | rawContent?: Buffer 16 | /** The HTTP headers. */ 17 | headers?: Record 18 | /** The links found on the page. Requires the website.builder method website.with_subscription_return_page_links to be set to true. */ 19 | links?: Array 20 | } 21 | /** get the page title. */ 22 | export declare function pageTitle(page: NPage): string 23 | /** crawl a website using HTTP gathering all links and html. */ 24 | export declare function crawl(url: string, rawContent?: boolean | undefined | null): Promise 25 | export interface PageEvent { 26 | page: NPage 27 | } 28 | /** website main data from rust to node. */ 29 | export class NWebsite { 30 | /** all of the website links. */ 31 | links: Array 32 | /** the pages found. */ 33 | pages: Array 34 | } 35 | /** a simple page object */ 36 | export class Page { 37 | /** The url for the page. */ 38 | url: string 39 | /** The website crawling subdomain pages? 
*/ 40 | subdomains?: boolean 41 | /** The website crawling TLD pages? */ 42 | tld?: boolean 43 | /** The HTTP status code. */ 44 | statusCode: number 45 | /** a new page */ 46 | constructor(url: string, subdomains?: boolean | undefined | null, tld?: boolean | undefined | null) 47 | /** get the page content */ 48 | fetch(): Promise 49 | /** all links on the page */ 50 | getLinks(): Promise> 51 | /** get the html for the page */ 52 | getHtml(): string 53 | /** get the bytes for the page */ 54 | getBytes(): Uint8Array 55 | } 56 | /** a website holding the inner spider::website::Website from Rust fit for nodejs. */ 57 | export class Website { 58 | /** a new website. */ 59 | constructor(url: string, rawContent?: boolean | undefined | null) 60 | /** Get the crawl status. */ 61 | get status(): string 62 | /** Store data to heap memory. The data must be an object. Use `website.export_jsonl_data` to store to disk. When using this method test occordingly since only certain primitives are supported. */ 63 | pushData(obj: unknown): void 64 | /** Clear the collected data from heap memory. This only handles the data from `website.pushData`. */ 65 | clearData(): void 66 | /** read the data from the heap memory. */ 67 | readData(): any 68 | /** store data to memory for disk storing. This will create the path if not exist and defaults to ./storage. */ 69 | exportJsonlData(exportPath?: string | undefined | null): Promise 70 | /** subscribe and add an event listener. */ 71 | subscribe(onPageEvent: (err: Error | null, arg: NPage) => any): number 72 | /** remove a subscription listener. */ 73 | unsubscribe(id?: number | undefined | null): boolean 74 | /** stop a crawl */ 75 | stop(id?: number | undefined | null): Promise 76 | /** crawl a website */ 77 | crawl(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise 78 | /** Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. */ 79 | crawlSmart(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null, background?: boolean | undefined | null): Promise 80 | /** scrape a website */ 81 | scrape(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise 82 | /** run a cron job */ 83 | runCron(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null): Promise 84 | /** get all the links of a website */ 85 | getLinks(): Array 86 | /** get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update. */ 87 | get size(): number 88 | /** get all the pages of a website - requires calling website.scrape */ 89 | getPages(): Array 90 | /** drain all links from storing */ 91 | drainLinks(): Array 92 | /** clear all links and page data */ 93 | clear(): void 94 | /** Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). */ 95 | withHeaders(headers?: object | undefined | null): this 96 | /** Add user agent to request. */ 97 | withUserAgent(userAgent?: string | undefined | null): this 98 | /** Respect robots.txt file. */ 99 | withRespectRobotsTxt(respectRobotsTxt: boolean): this 100 | /** Determine whether to collect all the resources found on pages. 
*/ 101 | withFullResources(fullResources: boolean): this 102 | /** Use network interception for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. */ 103 | withChromeIntercept(chromeIntercept: boolean, blockImages: boolean): this 104 | /** Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled. */ 105 | withChromeConnection(chromeConnection: string): this 106 | /** Preserve the HOST header. */ 107 | withPreserveHostHeader(preserveHost: boolean): this 108 | /** Include subdomains detection. */ 109 | withSubdomains(subdomains: boolean): this 110 | /** Include tld detection. */ 111 | withTld(tld: boolean): this 112 | /** Only use HTTP/2. */ 113 | withHttp2PriorKnowledge(http2PriorKnowledge: boolean): this 114 | /** Max time to wait for request duration to milliseconds. */ 115 | withRequestTimeout(requestTimeout?: number | undefined | null): this 116 | /** add external domains */ 117 | withExternalDomains(externalDomains?: Array | undefined | null): this 118 | /** Use stealth mode for the request. This does nothing without chrome. */ 119 | withStealth(stealthMode?: boolean | undefined | null): this 120 | /** Dangerously accept invalid certificates - this should be used as a last resort. */ 121 | withDangerAcceptInvalidCerts(acceptInvalidCerts?: boolean | undefined | null): this 122 | /** Set the crawling budget */ 123 | withBudget(budget?: Record | undefined | null): this 124 | /** Set the max redirects allowed for request. */ 125 | withRedirectLimit(redirectLimit: number): this 126 | /** Set the redirect policy to use, either Strict or Loose by default. */ 127 | withRedirectPolicy(strict: boolean): this 128 | /** Regex blacklist urls from the crawl */ 129 | withBlacklistUrl(blacklistUrl?: Array | undefined | null): this 130 | /** Regex whitelist urls from the crawl */ 131 | withWhitelistUrl(whitelistUrl?: Array | undefined | null): this 132 | /** Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled. */ 133 | withWaitForDelay(seconds?: number | undefined | null, nanos?: number | undefined | null): this 134 | /** Wait for a CSS query selector. This method does nothing if the `chrome` feature is not enabled. */ 135 | withWaitForSelector(selector?: string | undefined | null, seconds?: number | undefined | null, nanos?: number | undefined | null): this 136 | /** Wait for idle network request. This method does nothing if the `chrome` feature is not enabled. */ 137 | withWaitForIdleNetwork(seconds?: number | undefined | null, nanos?: number | undefined | null): this 138 | /** Setup cron jobs to run */ 139 | withCron(cronStr: string, cronType?: string | undefined | null): this 140 | /** Use OpenAI to generate dynamic javascript snippets. Make sure to set the `OPENAI_API_KEY` env variable. */ 141 | withOpenai(openaiConfigs?: object | undefined | null): this 142 | /** Take screenshots of web pages using chrome. */ 143 | withScreenshot(screenshotConfigs?: { 144 | /** The screenshot params. */ 145 | params: { 146 | /** Chrome DevTools Protocol screenshot options. */ 147 | cdp_params: { 148 | /** Image compression format (defaults to png). */ 149 | format: 'jpeg' | 'png' | 'webp' 150 | /** Compression quality from range [0..100] (jpeg only). */ 151 | quality: number 152 | /** Capture the screenshot of a given region only. 
*/ 153 | clip: { 154 | x: number 155 | y: number 156 | height: number 157 | width: number 158 | scale: number 159 | } 160 | /** Capture the screenshot from the surface, rather than the view. Defaults to true.*/ 161 | from_surface: boolean 162 | /** Capture the screenshot beyond the viewport. Defaults to false. */ 163 | capture_beyond_viewport: boolean 164 | } 165 | /** Take full page screenshot */ 166 | full_page: boolean 167 | /** Make the background transparent (png only). */ 168 | omit_background: boolean 169 | } 170 | /** Return the bytes of the screenshot on the Page. */ 171 | bytes: boolean 172 | /** Store the screenshot to disk. This can be used with output_dir. If disabled will not store the file to the output directory. */ 173 | save: boolean 174 | /** The output directory to store the file. Parent folders may be created inside the directory. */ 175 | output_dir: string | null 176 | }): this 177 | /** Delay between request as ms. */ 178 | withDelay(delay: number): this 179 | /** Set a crawl depth limit. If the value is 0 there is no limit. */ 180 | withDepth(depth: number): this 181 | /** Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` is enabled. */ 182 | withReturnPageLinks(returnPageLinks: boolean): this 183 | /** Cache the page following HTTP rules. */ 184 | withCaching(cache: boolean): this 185 | /** Set the sitemap url. */ 186 | withSitemap(sitemap?: string | undefined | null): this 187 | /** Use proxies for request. */ 188 | withProxies(proxies?: Array | undefined | null): this 189 | /** build the inner website - not required for all builder_steps */ 190 | build(): this 191 | } 192 | /** a runner for handling crons */ 193 | export class Cron { 194 | /** stop the cron instance */ 195 | stop(): Promise 196 | } 197 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /* tslint:disable */ 2 | /* eslint-disable */ 3 | /* prettier-ignore */ 4 | 5 | /* auto-generated by NAPI-RS */ 6 | 7 | const { existsSync, readFileSync } = require('fs') 8 | const { join } = require('path') 9 | 10 | const { platform, arch } = process 11 | 12 | let nativeBinding = null 13 | let localFileExisted = false 14 | let loadError = null 15 | 16 | function isMusl() { 17 | // For Node 10 18 | if (!process.report || typeof process.report.getReport !== 'function') { 19 | try { 20 | const lddPath = require('child_process').execSync('which ldd').toString().trim() 21 | return readFileSync(lddPath, 'utf8').includes('musl') 22 | } catch (e) { 23 | return true 24 | } 25 | } else { 26 | const { glibcVersionRuntime } = process.report.getReport().header 27 | return !glibcVersionRuntime 28 | } 29 | } 30 | 31 | switch (platform) { 32 | case 'android': 33 | switch (arch) { 34 | case 'arm64': 35 | localFileExisted = existsSync(join(__dirname, 'spider-rs.android-arm64.node')) 36 | try { 37 | if (localFileExisted) { 38 | nativeBinding = require('./spider-rs.android-arm64.node') 39 | } else { 40 | nativeBinding = require('@spider-rs/spider-rs-android-arm64') 41 | } 42 | } catch (e) { 43 | loadError = e 44 | } 45 | break 46 | case 'arm': 47 | localFileExisted = existsSync(join(__dirname, 'spider-rs.android-arm-eabi.node')) 48 | try { 49 | if (localFileExisted) { 50 | nativeBinding = require('./spider-rs.android-arm-eabi.node') 51 | } else { 52 | nativeBinding = require('@spider-rs/spider-rs-android-arm-eabi') 53 | } 54 | } catch (e) { 55 | 
loadError = e 56 | } 57 | break 58 | default: 59 | throw new Error(`Unsupported architecture on Android ${arch}`) 60 | } 61 | break 62 | case 'win32': 63 | switch (arch) { 64 | case 'x64': 65 | localFileExisted = existsSync( 66 | join(__dirname, 'spider-rs.win32-x64-msvc.node') 67 | ) 68 | try { 69 | if (localFileExisted) { 70 | nativeBinding = require('./spider-rs.win32-x64-msvc.node') 71 | } else { 72 | nativeBinding = require('@spider-rs/spider-rs-win32-x64-msvc') 73 | } 74 | } catch (e) { 75 | loadError = e 76 | } 77 | break 78 | case 'ia32': 79 | localFileExisted = existsSync( 80 | join(__dirname, 'spider-rs.win32-ia32-msvc.node') 81 | ) 82 | try { 83 | if (localFileExisted) { 84 | nativeBinding = require('./spider-rs.win32-ia32-msvc.node') 85 | } else { 86 | nativeBinding = require('@spider-rs/spider-rs-win32-ia32-msvc') 87 | } 88 | } catch (e) { 89 | loadError = e 90 | } 91 | break 92 | case 'arm64': 93 | localFileExisted = existsSync( 94 | join(__dirname, 'spider-rs.win32-arm64-msvc.node') 95 | ) 96 | try { 97 | if (localFileExisted) { 98 | nativeBinding = require('./spider-rs.win32-arm64-msvc.node') 99 | } else { 100 | nativeBinding = require('@spider-rs/spider-rs-win32-arm64-msvc') 101 | } 102 | } catch (e) { 103 | loadError = e 104 | } 105 | break 106 | default: 107 | throw new Error(`Unsupported architecture on Windows: ${arch}`) 108 | } 109 | break 110 | case 'darwin': 111 | localFileExisted = existsSync(join(__dirname, 'spider-rs.darwin-universal.node')) 112 | try { 113 | if (localFileExisted) { 114 | nativeBinding = require('./spider-rs.darwin-universal.node') 115 | } else { 116 | nativeBinding = require('@spider-rs/spider-rs-darwin-universal') 117 | } 118 | break 119 | } catch {} 120 | switch (arch) { 121 | case 'x64': 122 | localFileExisted = existsSync(join(__dirname, 'spider-rs.darwin-x64.node')) 123 | try { 124 | if (localFileExisted) { 125 | nativeBinding = require('./spider-rs.darwin-x64.node') 126 | } else { 127 | nativeBinding = require('@spider-rs/spider-rs-darwin-x64') 128 | } 129 | } catch (e) { 130 | loadError = e 131 | } 132 | break 133 | case 'arm64': 134 | localFileExisted = existsSync( 135 | join(__dirname, 'spider-rs.darwin-arm64.node') 136 | ) 137 | try { 138 | if (localFileExisted) { 139 | nativeBinding = require('./spider-rs.darwin-arm64.node') 140 | } else { 141 | nativeBinding = require('@spider-rs/spider-rs-darwin-arm64') 142 | } 143 | } catch (e) { 144 | loadError = e 145 | } 146 | break 147 | default: 148 | throw new Error(`Unsupported architecture on macOS: ${arch}`) 149 | } 150 | break 151 | case 'freebsd': 152 | if (arch !== 'x64') { 153 | throw new Error(`Unsupported architecture on FreeBSD: ${arch}`) 154 | } 155 | localFileExisted = existsSync(join(__dirname, 'spider-rs.freebsd-x64.node')) 156 | try { 157 | if (localFileExisted) { 158 | nativeBinding = require('./spider-rs.freebsd-x64.node') 159 | } else { 160 | nativeBinding = require('@spider-rs/spider-rs-freebsd-x64') 161 | } 162 | } catch (e) { 163 | loadError = e 164 | } 165 | break 166 | case 'linux': 167 | switch (arch) { 168 | case 'x64': 169 | if (isMusl()) { 170 | localFileExisted = existsSync( 171 | join(__dirname, 'spider-rs.linux-x64-musl.node') 172 | ) 173 | try { 174 | if (localFileExisted) { 175 | nativeBinding = require('./spider-rs.linux-x64-musl.node') 176 | } else { 177 | nativeBinding = require('@spider-rs/spider-rs-linux-x64-musl') 178 | } 179 | } catch (e) { 180 | loadError = e 181 | } 182 | } else { 183 | localFileExisted = existsSync( 184 | join(__dirname, 
'spider-rs.linux-x64-gnu.node') 185 | ) 186 | try { 187 | if (localFileExisted) { 188 | nativeBinding = require('./spider-rs.linux-x64-gnu.node') 189 | } else { 190 | nativeBinding = require('@spider-rs/spider-rs-linux-x64-gnu') 191 | } 192 | } catch (e) { 193 | loadError = e 194 | } 195 | } 196 | break 197 | case 'arm64': 198 | if (isMusl()) { 199 | localFileExisted = existsSync( 200 | join(__dirname, 'spider-rs.linux-arm64-musl.node') 201 | ) 202 | try { 203 | if (localFileExisted) { 204 | nativeBinding = require('./spider-rs.linux-arm64-musl.node') 205 | } else { 206 | nativeBinding = require('@spider-rs/spider-rs-linux-arm64-musl') 207 | } 208 | } catch (e) { 209 | loadError = e 210 | } 211 | } else { 212 | localFileExisted = existsSync( 213 | join(__dirname, 'spider-rs.linux-arm64-gnu.node') 214 | ) 215 | try { 216 | if (localFileExisted) { 217 | nativeBinding = require('./spider-rs.linux-arm64-gnu.node') 218 | } else { 219 | nativeBinding = require('@spider-rs/spider-rs-linux-arm64-gnu') 220 | } 221 | } catch (e) { 222 | loadError = e 223 | } 224 | } 225 | break 226 | case 'arm': 227 | if (isMusl()) { 228 | localFileExisted = existsSync( 229 | join(__dirname, 'spider-rs.linux-arm-musleabihf.node') 230 | ) 231 | try { 232 | if (localFileExisted) { 233 | nativeBinding = require('./spider-rs.linux-arm-musleabihf.node') 234 | } else { 235 | nativeBinding = require('@spider-rs/spider-rs-linux-arm-musleabihf') 236 | } 237 | } catch (e) { 238 | loadError = e 239 | } 240 | } else { 241 | localFileExisted = existsSync( 242 | join(__dirname, 'spider-rs.linux-arm-gnueabihf.node') 243 | ) 244 | try { 245 | if (localFileExisted) { 246 | nativeBinding = require('./spider-rs.linux-arm-gnueabihf.node') 247 | } else { 248 | nativeBinding = require('@spider-rs/spider-rs-linux-arm-gnueabihf') 249 | } 250 | } catch (e) { 251 | loadError = e 252 | } 253 | } 254 | break 255 | case 'riscv64': 256 | if (isMusl()) { 257 | localFileExisted = existsSync( 258 | join(__dirname, 'spider-rs.linux-riscv64-musl.node') 259 | ) 260 | try { 261 | if (localFileExisted) { 262 | nativeBinding = require('./spider-rs.linux-riscv64-musl.node') 263 | } else { 264 | nativeBinding = require('@spider-rs/spider-rs-linux-riscv64-musl') 265 | } 266 | } catch (e) { 267 | loadError = e 268 | } 269 | } else { 270 | localFileExisted = existsSync( 271 | join(__dirname, 'spider-rs.linux-riscv64-gnu.node') 272 | ) 273 | try { 274 | if (localFileExisted) { 275 | nativeBinding = require('./spider-rs.linux-riscv64-gnu.node') 276 | } else { 277 | nativeBinding = require('@spider-rs/spider-rs-linux-riscv64-gnu') 278 | } 279 | } catch (e) { 280 | loadError = e 281 | } 282 | } 283 | break 284 | case 's390x': 285 | localFileExisted = existsSync( 286 | join(__dirname, 'spider-rs.linux-s390x-gnu.node') 287 | ) 288 | try { 289 | if (localFileExisted) { 290 | nativeBinding = require('./spider-rs.linux-s390x-gnu.node') 291 | } else { 292 | nativeBinding = require('@spider-rs/spider-rs-linux-s390x-gnu') 293 | } 294 | } catch (e) { 295 | loadError = e 296 | } 297 | break 298 | default: 299 | throw new Error(`Unsupported architecture on Linux: ${arch}`) 300 | } 301 | break 302 | default: 303 | throw new Error(`Unsupported OS: ${platform}, architecture: ${arch}`) 304 | } 305 | 306 | if (!nativeBinding) { 307 | if (loadError) { 308 | throw loadError 309 | } 310 | throw new Error(`Failed to load native binding`) 311 | } 312 | 313 | const { pageTitle, NWebsite, Page, crawl, Website, Cron } = nativeBinding 314 | 315 | module.exports.pageTitle = pageTitle 316 
| module.exports.NWebsite = NWebsite 317 | module.exports.Page = Page 318 | module.exports.crawl = crawl 319 | module.exports.Website = Website 320 | module.exports.Cron = Cron 321 | -------------------------------------------------------------------------------- /npm/android-arm-eabi/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-android-arm-eabi` 2 | 3 | This is the **armv7-linux-androideabi** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/android-arm-eabi/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-android-arm-eabi", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "android" 7 | ], 8 | "cpu": [ 9 | "arm" 10 | ], 11 | "main": "spider-rs.android-arm-eabi.node", 12 | "files": [ 13 | "spider-rs.android-arm-eabi.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/android-arm64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-android-arm64` 2 | 3 | This is the **aarch64-linux-android** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/android-arm64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-android-arm64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "android" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.android-arm64.node", 12 | "files": [ 13 | "spider-rs.android-arm64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/darwin-arm64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-darwin-arm64` 2 | 3 | This is the **aarch64-apple-darwin** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/darwin-arm64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-darwin-arm64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "darwin" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.darwin-arm64.node", 12 | "files": [ 13 | "spider-rs.darwin-arm64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/darwin-universal/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-darwin-universal` 2 | 3 | This is the **universal-apple-darwin** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/darwin-universal/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-darwin-universal", 3 | "version": "0.0.162", 4 | "repository": 
"https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "darwin" 7 | ], 8 | "main": "spider-rs.darwin-universal.node", 9 | "files": [ 10 | "spider-rs.darwin-universal.node" 11 | ], 12 | "license": "MIT", 13 | "engines": { 14 | "node": ">= 10" 15 | } 16 | } -------------------------------------------------------------------------------- /npm/darwin-x64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-darwin-x64` 2 | 3 | This is the **x86_64-apple-darwin** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/darwin-x64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-darwin-x64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "darwin" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.darwin-x64.node", 12 | "files": [ 13 | "spider-rs.darwin-x64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/freebsd-x64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-freebsd-x64` 2 | 3 | This is the **x86_64-unknown-freebsd** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/freebsd-x64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-freebsd-x64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "freebsd" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.freebsd-x64.node", 12 | "files": [ 13 | "spider-rs.freebsd-x64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/linux-arm-gnueabihf/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-arm-gnueabihf` 2 | 3 | This is the **armv7-unknown-linux-gnueabihf** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-arm-gnueabihf/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-arm-gnueabihf", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "arm" 10 | ], 11 | "main": "spider-rs.linux-arm-gnueabihf.node", 12 | "files": [ 13 | "spider-rs.linux-arm-gnueabihf.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/linux-arm64-gnu/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-arm64-gnu` 2 | 3 | This is the **aarch64-unknown-linux-gnu** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-arm64-gnu/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"@spider-rs/spider-rs-linux-arm64-gnu", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.linux-arm64-gnu.node", 12 | "files": [ 13 | "spider-rs.linux-arm64-gnu.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "glibc" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/linux-arm64-musl/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-arm64-musl` 2 | 3 | This is the **aarch64-unknown-linux-musl** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-arm64-musl/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-arm64-musl", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.linux-arm64-musl.node", 12 | "files": [ 13 | "spider-rs.linux-arm64-musl.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "musl" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/linux-x64-gnu/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-x64-gnu` 2 | 3 | This is the **x86_64-unknown-linux-gnu** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-x64-gnu/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-x64-gnu", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.linux-x64-gnu.node", 12 | "files": [ 13 | "spider-rs.linux-x64-gnu.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "glibc" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/linux-x64-musl/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-x64-musl` 2 | 3 | This is the **x86_64-unknown-linux-musl** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-x64-musl/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-x64-musl", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.linux-x64-musl.node", 12 | "files": [ 13 | "spider-rs.linux-x64-musl.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "musl" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/win32-arm64-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-win32-arm64-msvc` 2 | 3 | This is the 
**aarch64-pc-windows-msvc** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/win32-arm64-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-win32-arm64-msvc", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "win32" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.win32-arm64-msvc.node", 12 | "files": [ 13 | "spider-rs.win32-arm64-msvc.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/win32-ia32-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-win32-ia32-msvc` 2 | 3 | This is the **i686-pc-windows-msvc** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/win32-ia32-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-win32-ia32-msvc", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "win32" 7 | ], 8 | "cpu": [ 9 | "ia32" 10 | ], 11 | "main": "spider-rs.win32-ia32-msvc.node", 12 | "files": [ 13 | "spider-rs.win32-ia32-msvc.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/win32-x64-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-win32-x64-msvc` 2 | 3 | This is the **x86_64-pc-windows-msvc** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/win32-x64-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-win32-x64-msvc", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "win32" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.win32-x64-msvc.node", 12 | "files": [ 13 | "spider-rs.win32-x64-msvc.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs", 3 | "version": "0.0.162", 4 | "main": "index.js", 5 | "types": "index.d.ts", 6 | "napi": { 7 | "name": "spider-rs", 8 | "triples": { 9 | "additional": [ 10 | "aarch64-apple-darwin", 11 | "aarch64-linux-android", 12 | "aarch64-unknown-linux-gnu", 13 | "aarch64-unknown-linux-musl", 14 | "aarch64-pc-windows-msvc", 15 | "armv7-unknown-linux-gnueabihf", 16 | "x86_64-unknown-linux-musl", 17 | "x86_64-unknown-freebsd", 18 | "i686-pc-windows-msvc", 19 | "armv7-linux-androideabi", 20 | "universal-apple-darwin" 21 | ] 22 | } 23 | }, 24 | "license": "MIT", 25 | "keywords": [ 26 | "spider", 27 | "crawler" 28 | ], 29 | "repository": "https://github.com/spider-rs/spider-nodejs", 30 | "devDependencies": { 31 | "@napi-rs/cli": "^2.18.4", 32 | "@swc-node/register": "^1.10.9", 33 | "@swc/core": "^1.7.0", 34 | "@types/node": "^20.14.5", 35 | "ava": 
"^6.1.3", 36 | "prettier": "^3.3.3", 37 | "typescript": "^5.4.5" 38 | }, 39 | "ava": { 40 | "require": [ 41 | "@swc-node/register" 42 | ], 43 | "extensions": [ 44 | "ts" 45 | ], 46 | "timeout": "5m", 47 | "workerThreads": false, 48 | "environmentVariables": { 49 | "TS_NODE_PROJECT": "./tsconfig.json" 50 | } 51 | }, 52 | "engines": { 53 | "node": ">= 10" 54 | }, 55 | "scripts": { 56 | "artifacts": "napi artifacts", 57 | "bench": "cd bench && npm run bench", 58 | "bench:oss": "cd bench && npm run bench:oss", 59 | "build": "napi build --platform --release --pipe \"prettier -w\"", 60 | "build:debug": "napi build --platform --pipe \"prettier -w\"", 61 | "format": "run-p format:prettier format:rs format:toml", 62 | "format:prettier": "prettier . -w", 63 | "format:toml": "taplo format", 64 | "format:rs": "cargo fmt", 65 | "lint": "eslint . -c ./.eslintrc.yml", 66 | "prepublishOnly": "napi prepublish -t npm", 67 | "test": "ava", 68 | "version": "napi version" 69 | }, 70 | "prettier": { 71 | "printWidth": 120, 72 | "semi": false, 73 | "trailingComma": "all", 74 | "singleQuote": true, 75 | "arrowParens": "always" 76 | }, 77 | "packageManager": "yarn@3.6.4" 78 | } 79 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | tab_spaces = 2 2 | -------------------------------------------------------------------------------- /src/conversions.rs: -------------------------------------------------------------------------------- 1 | use napi::bindgen_prelude::{Buffer, Null, Object, Undefined}; 2 | use serde_json::Value; 3 | 4 | /// the object to handle conversions 5 | pub enum ObjectConvert { 6 | /// napi object 7 | Obj(Object), 8 | /// serde value 9 | Val(Value), 10 | } 11 | 12 | /// convert a napi object to json with trailing comma support for quick reading and writing 13 | pub fn object_to_u8(obj: ObjectConvert) -> Result, napi::Error> { 14 | let mut ss = vec![]; 15 | 16 | match obj { 17 | ObjectConvert::Val(deserialized) => { 18 | ss.extend(deserialized.to_string().as_bytes()); 19 | } 20 | ObjectConvert::Obj(obj) => { 21 | let o = Object::keys(&obj)?; 22 | let o_size = o.len(); 23 | 24 | ss.push(b'{'); 25 | 26 | // we are missing map, null, and vector 27 | for (i, key) in o.iter().enumerate() { 28 | let mut fp = || { 29 | ss.push(b'"'); 30 | ss.extend(key.as_bytes()); 31 | ss.push(b'"'); 32 | ss.push(b':'); 33 | }; 34 | 35 | let mut block = false; 36 | 37 | // todo: method to go through all napi values to get types instead of long chain map 38 | match obj.get::<&str, String>(&key) { 39 | Ok(s) => { 40 | fp(); 41 | ss.push(b'"'); 42 | ss.extend(s.unwrap_or_default().as_bytes()); 43 | ss.push(b'"'); 44 | } 45 | _ => match obj.get::<&str, u32>(&key) { 46 | Ok(s) => { 47 | fp(); 48 | ss.push(b'"'); 49 | ss.extend(s.unwrap_or_default().to_string().as_bytes()); 50 | ss.push(b'"'); 51 | } 52 | _ => match obj.get::<&str, i32>(&key) { 53 | Ok(s) => { 54 | fp(); 55 | ss.push(b'"'); 56 | ss.extend(s.unwrap_or_default().to_string().as_bytes()); 57 | ss.push(b'"'); 58 | } 59 | _ => match obj.get::<&str, Buffer>(&key) { 60 | Ok(s) => { 61 | fp(); 62 | let d = serde_json::to_string( 63 | &String::from_utf8(s.unwrap_or_default().as_ref().into()).unwrap_or_default(), 64 | )?; 65 | ss.extend(d.as_bytes()); 66 | } 67 | _ => match obj.get::<&str, Null>(&key) { 68 | Ok(_) => { 69 | fp(); 70 | ss.extend(b"null"); 71 | } 72 | _ => match obj.get::<&str, Undefined>(&key) { 73 | Ok(_) => { 74 | block = 
true; 75 | } 76 | _ => (), 77 | }, 78 | }, 79 | }, 80 | }, 81 | }, 82 | } 83 | 84 | if !block && i != o_size - 1 { 85 | ss.push(b','); 86 | } 87 | } 88 | 89 | ss.push(b'}'); 90 | } 91 | } 92 | 93 | Ok(ss) 94 | } 95 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![deny(clippy::all)] 2 | 3 | #[macro_use] 4 | extern crate napi_derive; 5 | use spider::lazy_static::lazy_static; 6 | 7 | lazy_static! { 8 | pub static ref BUFFER: usize = (num_cpus::get() * 20).max(88); 9 | } 10 | 11 | pub mod npage; 12 | pub mod nwebsite; 13 | pub mod page; 14 | pub mod shortcut; 15 | pub mod website; 16 | 17 | pub use npage::{page_title, NPage}; 18 | pub use nwebsite::NWebsite; 19 | pub use page::Page; 20 | pub use shortcut::crawl; 21 | pub use website::Website; 22 | /// convert types to different types 23 | mod conversions; 24 | -------------------------------------------------------------------------------- /src/npage.rs: -------------------------------------------------------------------------------- 1 | use napi::bindgen_prelude::Buffer; 2 | use spider::{lazy_static::lazy_static, reqwest::header::HeaderMap}; 3 | use std::collections::HashMap; 4 | 5 | lazy_static! { 6 | static ref TITLE_SELECTOR: scraper::Selector = scraper::Selector::parse("title").unwrap(); 7 | } 8 | 9 | /// a simple page object 10 | #[derive(Default, Clone)] 11 | #[napi(object)] 12 | pub struct NPage { 13 | /// The url found. 14 | pub url: String, 15 | /// The content of the page found. 16 | pub content: String, 17 | /// The HTTP status code. 18 | pub status_code: u16, 19 | /// The Raw content if the resource needs to be sent as binary. 20 | pub raw_content: Option, 21 | /// The HTTP headers. 22 | pub headers: Option>, 23 | /// The links found on the page. Requires the website.builder method website.with_subscription_return_page_links to be set to true. 24 | pub links: Option>, 25 | } 26 | 27 | #[napi] 28 | /// get the page title. 29 | pub fn page_title(page: NPage) -> String { 30 | page.title() 31 | } 32 | 33 | #[napi] 34 | impl NPage { 35 | /// establish a new page 36 | pub fn new(res: &spider::page::Page, raw: bool) -> NPage { 37 | NPage { 38 | url: res.get_url().into(), 39 | status_code: res.status_code.as_u16(), 40 | content: if raw { 41 | Default::default() 42 | } else { 43 | res.get_html() 44 | }, 45 | raw_content: if raw { 46 | Some(res.get_html_bytes_u8().into()) 47 | } else { 48 | None 49 | }, 50 | headers: match res.headers { 51 | Some(ref headers) => Some(header_map_to_hash_map(headers)), 52 | _ => None, 53 | }, 54 | links: match res.page_links { 55 | Some(ref links) => Some( 56 | links 57 | .iter() 58 | .map(|link| link.as_ref().to_string()) 59 | .collect::>(), 60 | ), 61 | _ => None, 62 | }, 63 | } 64 | } 65 | 66 | #[napi] 67 | /// the html page title. 
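/// Parses the cached `content` with the shared `TITLE_SELECTOR` and returns the inner
/// html of the first `<title>` element, or an empty string when no title exists or when
/// the page was built with `raw_content` enabled (which leaves `content` empty).
/// From Node this logic is likely reached through the free function `pageTitle(page)`
/// (napi-rs camelCases `page_title`), since `NPage` crosses the boundary as a plain object.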
68 | pub fn title(&self) -> String { 69 | let fragment: scraper::Html = scraper::Html::parse_document(&self.content); 70 | match fragment.select(&TITLE_SELECTOR).next() { 71 | Some(title) => title.inner_html(), 72 | _ => Default::default(), 73 | } 74 | } 75 | } 76 | 77 | /// convert a headermap to hashmap 78 | pub fn header_map_to_hash_map(header_map: &HeaderMap) -> HashMap { 79 | let mut hash_map = HashMap::new(); 80 | 81 | for (key, value) in header_map.iter() { 82 | let key = key.as_str().to_string(); 83 | 84 | if let Ok(value_str) = value.to_str() { 85 | hash_map.insert(key, value_str.to_string()); 86 | } 87 | } 88 | 89 | hash_map 90 | } 91 | -------------------------------------------------------------------------------- /src/nwebsite.rs: -------------------------------------------------------------------------------- 1 | use crate::NPage; 2 | 3 | #[napi] 4 | /// website main data from rust to node. 5 | pub struct NWebsite { 6 | /// all of the website links. 7 | pub links: Vec, 8 | /// the pages found. 9 | pub pages: Vec, 10 | } 11 | -------------------------------------------------------------------------------- /src/page.rs: -------------------------------------------------------------------------------- 1 | use napi; 2 | 3 | /// a simple page object 4 | #[napi] 5 | #[derive(Default)] 6 | pub struct Page { 7 | /// the page object from spider 8 | inner: Option, 9 | /// selectors 10 | selectors: Option, 11 | /// The url for the page. 12 | pub url: String, 13 | /// The website crawling subdomain pages? 14 | pub subdomains: Option, 15 | /// The website crawling TLD pages? 16 | pub tld: Option, 17 | /// The HTTP status code. 18 | pub status_code: u16, 19 | } 20 | 21 | #[napi] 22 | impl Page { 23 | #[napi(constructor)] 24 | /// a new page 25 | pub fn new(url: String, subdomains: Option, tld: Option) -> Self { 26 | Page { 27 | url, 28 | subdomains, 29 | tld, 30 | ..Default::default() 31 | } 32 | } 33 | 34 | #[napi] 35 | /// get the page content 36 | pub async unsafe fn fetch(&mut self) -> &Self { 37 | use spider::{ 38 | lazy_static::lazy_static, reqwest::Client, reqwest_middleware::ClientWithMiddleware, 39 | ClientBuilder, 40 | }; 41 | lazy_static! { 42 | /// top level single page client to re-use. 
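/// Built once via `lazy_static` and shared by every `Page::fetch` call so the underlying
/// reqwest connection pool is re-used instead of being rebuilt per page. No middleware is
/// attached here; `ClientBuilder::new(..).build()` simply wraps the plain reqwest client.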
43 | pub static ref PAGE_CLIENT: ClientWithMiddleware = { 44 | let reqwest_client = Client::builder().build().unwrap_or_default(); 45 | let client = ClientBuilder::new(reqwest_client).build(); 46 | 47 | client 48 | }; 49 | } 50 | let page = spider::page::Page::new_page(&self.url, &PAGE_CLIENT).await; 51 | self.status_code = page.status_code.into(); 52 | self.inner = Some(page); 53 | self.selectors = Some(spider::page::get_page_selectors( 54 | &self.url, 55 | self.subdomains.unwrap_or_default(), 56 | self.tld.unwrap_or_default(), 57 | )); 58 | self 59 | } 60 | 61 | #[napi] 62 | /// all links on the page 63 | pub async fn get_links(&self) -> Vec { 64 | match &self.selectors { 65 | Some(selectors) => match &self.inner { 66 | Some(inner) => { 67 | let links = inner.clone().links(&selectors, &None).await; 68 | links 69 | .into_iter() 70 | .map(|i| i.as_ref().to_string()) 71 | .collect::>() 72 | } 73 | _ => Default::default(), 74 | }, 75 | _ => Default::default(), 76 | } 77 | } 78 | 79 | #[napi] 80 | /// get the html for the page 81 | pub fn get_html(&self) -> String { 82 | match &self.inner { 83 | Some(inner) => inner.get_html(), 84 | _ => Default::default(), 85 | } 86 | } 87 | 88 | #[napi] 89 | /// get the bytes for the page 90 | pub fn get_bytes(&self) -> &[u8] { 91 | match &self.inner { 92 | Some(inner) => inner.get_html_bytes_u8(), 93 | _ => Default::default(), 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/shortcut.rs: -------------------------------------------------------------------------------- 1 | use crate::NPage; 2 | use crate::NWebsite; 3 | use crate::BUFFER; 4 | 5 | #[napi] 6 | /// crawl a website using HTTP gathering all links and html. 7 | pub async fn crawl(url: String, raw_content: Option) -> NWebsite { 8 | let mut website = spider::website::Website::new(&url); 9 | let mut rx2 = website 10 | .subscribe(*BUFFER / 2) 11 | .expect("sync feature should be enabled"); 12 | let (tx, mut rx) = spider::tokio::sync::mpsc::channel(*BUFFER); 13 | let raw_content = raw_content.unwrap_or_default(); 14 | 15 | spider::tokio::spawn(async move { 16 | while let Ok(res) = rx2.recv().await { 17 | if let Err(_) = tx.send(NPage::new(&res, raw_content)).await { 18 | println!("receiver dropped"); 19 | return; 20 | } 21 | } 22 | }); 23 | 24 | spider::tokio::spawn(async move { 25 | website.crawl_raw().await; 26 | }); 27 | 28 | let mut pages = Vec::new(); 29 | 30 | while let Some(i) = rx.recv().await { 31 | pages.push(i) 32 | } 33 | 34 | let links = pages.iter().map(|x| x.url.clone()).collect::>(); 35 | 36 | NWebsite { links, pages } 37 | } 38 | -------------------------------------------------------------------------------- /src/website.rs: -------------------------------------------------------------------------------- 1 | use crate::conversions::{object_to_u8, ObjectConvert}; 2 | use crate::{NPage, BUFFER}; 3 | use indexmap::IndexMap; 4 | use napi::{bindgen_prelude::Object, tokio::task::JoinHandle}; 5 | use napi::{Env, JsUnknown}; 6 | use spider::compact_str::CompactString; 7 | use spider::configuration::{WaitForDelay, WaitForIdleNetwork, WaitForSelector}; 8 | use spider::{configuration::RedirectPolicy, utils::shutdown}; 9 | use std::time::Duration; 10 | 11 | #[napi] 12 | /// a website holding the inner spider::website::Website from Rust fit for nodejs. 13 | pub struct Website { 14 | /// the website from spider. 15 | inner: spider::website::Website, 16 | /// spawned subscription handles. 
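/// Keyed by the id handed back from `subscribe`; `IndexMap` keeps insertion order so
/// `last()` can derive the next id and `shift_remove_entry` can drop a single listener
/// without disturbing the others.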
17 | subscription_handles: IndexMap>, 18 | /// spawned crawl handles. 19 | crawl_handles: IndexMap>, 20 | /// do not convert content to UT8. 21 | raw_content: bool, 22 | /// the data collected. 23 | collected_data: Box>>, 24 | /// is the crawl running in the background. 25 | running_in_background: bool, // /// the file handle for storing data 26 | // file_handle: Option, 27 | } 28 | 29 | #[napi(object)] 30 | struct PageEvent { 31 | pub page: NPage, 32 | } 33 | 34 | #[napi] 35 | impl Website { 36 | #[napi(constructor)] 37 | /// a new website. 38 | pub fn new(url: String, raw_content: Option) -> Self { 39 | Website { 40 | inner: spider::website::Website::new(&url), 41 | subscription_handles: IndexMap::new(), 42 | crawl_handles: IndexMap::new(), 43 | raw_content: raw_content.unwrap_or_default(), 44 | collected_data: Box::new(Vec::new()), 45 | running_in_background: false, // file_handle: None, 46 | } 47 | } 48 | 49 | /// Get the crawl status. 50 | #[napi(getter)] 51 | pub fn status(&self) -> String { 52 | use std::string::ToString; 53 | self.inner.get_status().to_string() 54 | } 55 | 56 | #[napi] 57 | /// Store data to heap memory. The data must be an object. Use `website.export_jsonl_data` to store to disk. When using this method test occordingly since only certain primitives are supported. 58 | pub fn push_data(&mut self, env: Env, obj: JsUnknown) -> napi::Result<()> { 59 | match env.from_js_value::(&obj) { 60 | Ok(deserialized) => { 61 | self 62 | .collected_data 63 | .push(object_to_u8(ObjectConvert::Val(deserialized))?); 64 | } 65 | _ => match obj.coerce_to_object() { 66 | Ok(obj) => { 67 | self 68 | .collected_data 69 | .push(object_to_u8(ObjectConvert::Obj(obj))?); 70 | } 71 | _ => (), 72 | }, 73 | } 74 | 75 | Ok(()) 76 | } 77 | 78 | #[napi] 79 | /// Clear the collected data from heap memory. This only handles the data from `website.pushData`. 80 | pub fn clear_data(&mut self) -> napi::Result<()> { 81 | self.collected_data.clear(); 82 | Ok(()) 83 | } 84 | 85 | #[napi] 86 | /// read the data from the heap memory. 87 | pub fn read_data(&mut self) -> serde_json::Value { 88 | self 89 | .collected_data 90 | .iter() 91 | .map(|d| serde_json::from_slice::(d).unwrap_or_default()) 92 | .collect() 93 | } 94 | 95 | #[napi] 96 | /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage. 97 | pub async fn export_jsonl_data(&self, export_path: Option) -> napi::Result<()> { 98 | use napi::tokio::io::AsyncWriteExt; 99 | let file = match export_path { 100 | Some(p) => { 101 | let base_dir = p 102 | .split("/") 103 | .into_iter() 104 | .map(|f| { 105 | if f.contains(".") { 106 | "".to_string() 107 | } else { 108 | f.to_string() 109 | } 110 | }) 111 | .collect::(); 112 | 113 | spider::tokio::fs::create_dir_all(&base_dir).await?; 114 | 115 | if !p.contains(".") { 116 | p + ".jsonl" 117 | } else { 118 | p 119 | } 120 | } 121 | _ => { 122 | spider::tokio::fs::create_dir_all("./storage").await?; 123 | "./storage/".to_owned() 124 | + &self 125 | .inner 126 | .get_url() 127 | .inner() 128 | .replace("http://", "") 129 | .replace("https://", "") 130 | + "jsonl" 131 | } 132 | }; 133 | let mut file = spider::tokio::fs::File::create(file).await?; 134 | 135 | for (index, data) in self.collected_data.iter().enumerate() { 136 | if index > 0 { 137 | file.write_all(b"\n").await?; 138 | } 139 | // transform data step needed to auto convert type .. 
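// entries pushed via `push_data` are already stored as JSON bytes, so writing one entry
// per line (separated by the `\n` above) produces valid JSONL output.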
140 | file.write_all(&data).await?; 141 | } 142 | 143 | Ok(()) 144 | } 145 | 146 | #[napi] 147 | /// subscribe and add an event listener. 148 | pub fn subscribe( 149 | &mut self, 150 | on_page_event: napi::threadsafe_function::ThreadsafeFunction, 151 | ) -> u32 { 152 | let mut rx2 = self 153 | .inner 154 | .subscribe(*BUFFER / 2) 155 | .expect("sync feature should be enabled"); 156 | let raw_content = self.raw_content; 157 | 158 | let handle = spider::tokio::spawn(async move { 159 | while let Ok(res) = rx2.recv().await { 160 | on_page_event.call( 161 | Ok(NPage::new(&res, raw_content)), 162 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 163 | ); 164 | } 165 | }); 166 | 167 | // always return the highest value as the next id. 168 | let id = match self.subscription_handles.last() { 169 | Some(handle) => handle.0 + 1, 170 | _ => 0, 171 | }; 172 | 173 | self.subscription_handles.insert(id, handle); 174 | 175 | id 176 | } 177 | 178 | #[napi] 179 | /// remove a subscription listener. 180 | pub fn unsubscribe(&mut self, id: Option) -> bool { 181 | match id { 182 | Some(id) => { 183 | let handle = self.subscription_handles.get(&id); 184 | 185 | match handle { 186 | Some(h) => { 187 | h.abort(); 188 | self.subscription_handles.shift_remove_entry(&id); 189 | true 190 | } 191 | _ => false, 192 | } 193 | } 194 | // we may want to get all subs and remove them 195 | _ => { 196 | let keys = self.subscription_handles.len(); 197 | for k in self.subscription_handles.drain(..) { 198 | k.1.abort(); 199 | } 200 | keys > 0 201 | } 202 | } 203 | } 204 | 205 | #[napi] 206 | /// stop a crawl 207 | pub async unsafe fn stop(&mut self, id: Option) -> bool { 208 | self.inner.stop(); 209 | 210 | // prevent the last background run 211 | if self.running_in_background { 212 | // we may want ID's to be used as an option along with urls for complete shutdowns. 213 | shutdown(self.inner.get_url().inner()).await; 214 | self.running_in_background = false; 215 | } 216 | 217 | match id { 218 | Some(id) => { 219 | let handle = self.crawl_handles.get(&id); 220 | 221 | match handle { 222 | Some(h) => { 223 | h.abort(); 224 | self.crawl_handles.shift_remove_entry(&id); 225 | true 226 | } 227 | _ => false, 228 | } 229 | } 230 | _ => { 231 | let keys = self.crawl_handles.len(); 232 | for k in self.crawl_handles.drain(..) { 233 | k.1.abort(); 234 | } 235 | keys > 0 236 | } 237 | } 238 | } 239 | 240 | #[napi] 241 | /// crawl a website 242 | pub async unsafe fn crawl( 243 | &mut self, 244 | on_page_event: Option>, 245 | background: Option, 246 | headless: Option, 247 | ) { 248 | // only run in background if on_page_event is handled for streaming. 
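// `background.is_some() && background.unwrap_or_default()` treats only an explicit
// `Some(true)` as a request to run in the background; `None` and `Some(false)` both
// fall through to the blocking path.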
249 | let background = background.is_some() && background.unwrap_or_default(); 250 | let headless = headless.is_some() && headless.unwrap_or_default(); 251 | let raw_content = self.raw_content; 252 | 253 | if background { 254 | self.running_in_background = background; 255 | } 256 | 257 | match on_page_event { 258 | Some(callback) => { 259 | if background { 260 | let mut website = self.inner.clone(); 261 | let mut rx2 = website 262 | .subscribe(*BUFFER / 2) 263 | .expect("sync feature should be enabled"); 264 | 265 | let handle = spider::tokio::spawn(async move { 266 | while let Ok(res) = rx2.recv().await { 267 | callback.call( 268 | Ok(NPage::new(&res, raw_content)), 269 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 270 | ); 271 | } 272 | }); 273 | 274 | let crawl_id = match self.crawl_handles.last() { 275 | Some(handle) => handle.0 + 1, 276 | _ => 0, 277 | }; 278 | 279 | let crawl_handle = spider::tokio::spawn(async move { 280 | if headless { 281 | website.crawl().await; 282 | } else { 283 | website.crawl_raw().await; 284 | } 285 | }); 286 | 287 | let id = match self.subscription_handles.last() { 288 | Some(handle) => handle.0 + 1, 289 | _ => 0, 290 | }; 291 | 292 | self.crawl_handles.insert(crawl_id, crawl_handle); 293 | self.subscription_handles.insert(id, handle); 294 | } else { 295 | let mut rx2 = self 296 | .inner 297 | .subscribe(*BUFFER / 2) 298 | .expect("sync feature should be enabled"); 299 | 300 | let handle = spider::tokio::spawn(async move { 301 | while let Ok(res) = rx2.recv().await { 302 | callback.call( 303 | Ok(NPage::new(&res, raw_content)), 304 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 305 | ); 306 | } 307 | }); 308 | 309 | if headless { 310 | self.inner.crawl().await; 311 | } else { 312 | self.inner.crawl_raw().await; 313 | } 314 | 315 | let id = match self.subscription_handles.last() { 316 | Some(handle) => handle.0 + 1, 317 | _ => 0, 318 | }; 319 | 320 | self.subscription_handles.insert(id, handle); 321 | } 322 | } 323 | _ => { 324 | if background { 325 | let mut website = self.inner.clone(); 326 | 327 | let crawl_id = match self.crawl_handles.last() { 328 | Some(handle) => handle.0 + 1, 329 | _ => 0, 330 | }; 331 | 332 | let crawl_handle = spider::tokio::spawn(async move { 333 | if headless { 334 | website.crawl().await; 335 | } else { 336 | website.crawl_raw().await; 337 | } 338 | }); 339 | 340 | self.crawl_handles.insert(crawl_id, crawl_handle); 341 | } else { 342 | if headless { 343 | self.inner.crawl().await; 344 | } else { 345 | self.inner.crawl_raw().await; 346 | } 347 | } 348 | } 349 | } 350 | } 351 | 352 | #[napi] 353 | /// Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. 354 | pub async unsafe fn crawl_smart( 355 | &mut self, 356 | on_page_event: Option>, 357 | background: Option, 358 | ) { 359 | // only run in background if on_page_event is handled for streaming. 
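// same `Some(true)` check as in `crawl`; `crawl_smart` decides between plain HTTP and
// JavaScript rendering on its own, so no `headless` flag is accepted here.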
360 | let background = background.is_some() && background.unwrap_or_default(); 361 | let raw_content = self.raw_content; 362 | 363 | if background { 364 | self.running_in_background = background; 365 | } 366 | 367 | match on_page_event { 368 | Some(callback) => { 369 | if background { 370 | let mut website = self.inner.clone(); 371 | let mut rx2 = website 372 | .subscribe(*BUFFER / 2) 373 | .expect("sync feature should be enabled"); 374 | 375 | let handle = spider::tokio::spawn(async move { 376 | while let Ok(res) = rx2.recv().await { 377 | callback.call( 378 | Ok(NPage::new(&res, raw_content)), 379 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 380 | ); 381 | } 382 | }); 383 | 384 | let crawl_id = match self.crawl_handles.last() { 385 | Some(handle) => handle.0 + 1, 386 | _ => 0, 387 | }; 388 | 389 | let crawl_handle = spider::tokio::spawn(async move { 390 | website.crawl_smart().await; 391 | }); 392 | 393 | let id = match self.subscription_handles.last() { 394 | Some(handle) => handle.0 + 1, 395 | _ => 0, 396 | }; 397 | 398 | self.crawl_handles.insert(crawl_id, crawl_handle); 399 | self.subscription_handles.insert(id, handle); 400 | } else { 401 | let mut rx2 = self 402 | .inner 403 | .subscribe(*BUFFER / 2) 404 | .expect("sync feature should be enabled"); 405 | 406 | let handle = spider::tokio::spawn(async move { 407 | while let Ok(res) = rx2.recv().await { 408 | callback.call( 409 | Ok(NPage::new(&res, raw_content)), 410 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 411 | ); 412 | } 413 | }); 414 | 415 | self.inner.crawl_smart().await; 416 | let _ = handle.await; 417 | } 418 | } 419 | _ => { 420 | if background { 421 | let mut website = self.inner.clone(); 422 | 423 | let crawl_id = match self.crawl_handles.last() { 424 | Some(handle) => handle.0 + 1, 425 | _ => 0, 426 | }; 427 | 428 | let crawl_handle = spider::tokio::spawn(async move { 429 | website.crawl_smart().await; 430 | }); 431 | 432 | self.crawl_handles.insert(crawl_id, crawl_handle); 433 | } else { 434 | self.inner.crawl_smart().await; 435 | } 436 | } 437 | } 438 | } 439 | 440 | #[napi] 441 | /// scrape a website 442 | pub async unsafe fn scrape( 443 | &mut self, 444 | on_page_event: Option>, 445 | background: Option, 446 | headless: Option, 447 | ) { 448 | let headless = headless.is_some() && headless.unwrap_or_default(); 449 | let raw_content = self.raw_content; 450 | let background = background.is_some() && background.unwrap_or_default(); 451 | 452 | if background { 453 | self.running_in_background = background; 454 | } 455 | 456 | match on_page_event { 457 | Some(callback) => { 458 | if background { 459 | let mut website = self.inner.clone(); 460 | let mut rx2 = website 461 | .subscribe(*BUFFER / 2) 462 | .expect("sync feature should be enabled"); 463 | 464 | let handle = spider::tokio::spawn(async move { 465 | while let Ok(res) = rx2.recv().await { 466 | callback.call( 467 | Ok(NPage::new(&res, raw_content)), 468 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 469 | ); 470 | } 471 | }); 472 | 473 | let crawl_id = match self.crawl_handles.last() { 474 | Some(handle) => handle.0 + 1, 475 | _ => 0, 476 | }; 477 | 478 | let crawl_handle = spider::tokio::spawn(async move { 479 | if headless { 480 | website.scrape().await; 481 | } else { 482 | website.scrape_raw().await; 483 | } 484 | }); 485 | 486 | let id = match self.subscription_handles.last() { 487 | Some(handle) => handle.0 + 1, 488 | _ => 0, 489 | }; 490 | 491 | self.crawl_handles.insert(crawl_id, 
crawl_handle); 492 | self.subscription_handles.insert(id, handle); 493 | } else { 494 | let mut rx2 = self 495 | .inner 496 | .subscribe(*BUFFER / 2) 497 | .expect("sync feature should be enabled"); 498 | 499 | let handle = spider::tokio::spawn(async move { 500 | while let Ok(res) = rx2.recv().await { 501 | callback.call( 502 | Ok(NPage::new(&res, raw_content)), 503 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 504 | ); 505 | } 506 | }); 507 | 508 | if headless { 509 | self.inner.scrape().await; 510 | } else { 511 | self.inner.scrape_raw().await; 512 | } 513 | 514 | let _ = handle.await; 515 | } 516 | } 517 | _ => { 518 | if background { 519 | let mut website = self.inner.clone(); 520 | 521 | let crawl_id = match self.crawl_handles.last() { 522 | Some(handle) => handle.0 + 1, 523 | _ => 0, 524 | }; 525 | 526 | let crawl_handle = spider::tokio::spawn(async move { 527 | if headless { 528 | website.scrape().await; 529 | } else { 530 | website.scrape_raw().await; 531 | } 532 | }); 533 | 534 | self.crawl_handles.insert(crawl_id, crawl_handle); 535 | } else { 536 | if headless { 537 | self.inner.scrape().await; 538 | } else { 539 | self.inner.scrape_raw().await; 540 | } 541 | } 542 | } 543 | } 544 | } 545 | 546 | /// run a cron job 547 | #[napi] 548 | pub async unsafe fn run_cron( 549 | &mut self, 550 | on_page_event: Option>, 551 | ) -> Cron { 552 | let cron_handle = match on_page_event { 553 | Some(callback) => { 554 | let mut rx2 = self 555 | .inner 556 | .subscribe(*BUFFER / 2) 557 | .expect("sync feature should be enabled"); 558 | let raw_content = self.raw_content; 559 | 560 | let handler = spider::tokio::spawn(async move { 561 | while let Ok(res) = rx2.recv().await { 562 | callback.call( 563 | Ok(NPage::new(&res, raw_content)), 564 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 565 | ); 566 | } 567 | }); 568 | 569 | Some(handler) 570 | } 571 | _ => None, 572 | }; 573 | 574 | let inner = self.inner.run_cron().await; 575 | 576 | Cron { inner, cron_handle } 577 | } 578 | 579 | #[napi] 580 | /// get all the links of a website 581 | pub fn get_links(&self) -> Vec { 582 | let links = self 583 | .inner 584 | .get_links() 585 | .iter() 586 | .map(|x| x.as_ref().to_string()) 587 | .collect::>(); 588 | links 589 | } 590 | 591 | #[napi(getter)] 592 | /// get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update. 593 | pub fn size(&mut self) -> u32 { 594 | self.inner.size() as u32 595 | } 596 | 597 | /// get all the pages of a website - requires calling website.scrape 598 | #[napi] 599 | pub fn get_pages(&self) -> Vec { 600 | let mut pages: Vec = Vec::new(); 601 | let raw_content = self.raw_content; 602 | 603 | match self.inner.get_pages() { 604 | Some(p) => { 605 | for page in p.iter() { 606 | pages.push(NPage::new(page, raw_content)); 607 | } 608 | } 609 | _ => (), 610 | } 611 | 612 | pages 613 | } 614 | 615 | #[napi] 616 | /// drain all links from storing 617 | pub fn drain_links(&mut self) -> Vec { 618 | let links = self 619 | .inner 620 | .get_links() 621 | .iter() 622 | .map(|x| x.as_ref().to_string()) 623 | .collect::>(); 624 | self.inner.drain_links(); 625 | links 626 | } 627 | 628 | #[napi] 629 | /// clear all links and page data 630 | pub fn clear(&mut self) { 631 | self.inner.clear(); 632 | } 633 | 634 | #[napi] 635 | /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). 
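/// Header names that fail to parse are skipped, values are read as strings (anything
/// that cannot be read as a string falls back to an empty value), and passing nothing
/// clears the headers. Illustrative call from Node, name assuming napi-rs's camelCase
/// mapping: `website.withHeaders({ authorization: 'somevalue' })`.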
636 | pub fn with_headers(&mut self, headers: Option) -> &Self { 637 | use std::str::FromStr; 638 | 639 | match headers { 640 | Some(obj) => { 641 | let mut h = spider::reqwest::header::HeaderMap::new(); 642 | let keys = Object::keys(&obj).unwrap_or_default(); 643 | 644 | for key in keys.into_iter() { 645 | let header_key = spider::reqwest::header::HeaderName::from_str(&key); 646 | 647 | match header_key { 648 | Ok(hn) => { 649 | let header_value = obj 650 | .get::(key) 651 | .unwrap_or_default() 652 | .unwrap_or_default(); 653 | 654 | match spider::reqwest::header::HeaderValue::from_str(&header_value) { 655 | Ok(hk) => { 656 | h.append(hn, hk); 657 | } 658 | _ => (), 659 | } 660 | } 661 | _ => (), 662 | } 663 | } 664 | self.inner.with_headers(Some(h)); 665 | } 666 | _ => { 667 | self.inner.with_headers(None); 668 | } 669 | }; 670 | 671 | self 672 | } 673 | 674 | /// Add user agent to request. 675 | #[napi] 676 | pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &Self { 677 | self.inner.configuration.with_user_agent(user_agent); 678 | self 679 | } 680 | 681 | /// Respect robots.txt file. 682 | #[napi] 683 | pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &Self { 684 | self 685 | .inner 686 | .configuration 687 | .with_respect_robots_txt(respect_robots_txt); 688 | self 689 | } 690 | 691 | /// Determine whether to collect all the resources found on pages. 692 | #[napi] 693 | pub fn with_full_resources(&mut self, full_resources: bool) -> &Self { 694 | self.inner.configuration.with_full_resources(full_resources); 695 | self 696 | } 697 | 698 | /// Use network interception for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. 699 | #[napi] 700 | pub fn with_chrome_intercept(&mut self, chrome_intercept: bool, block_images: bool) -> &Self { 701 | let mut intercept_config = 702 | spider::features::chrome_common::RequestInterceptConfiguration::new(chrome_intercept); 703 | 704 | intercept_config.block_visuals = block_images; 705 | 706 | self.inner.with_chrome_intercept(intercept_config); 707 | self 708 | } 709 | 710 | /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled. 711 | #[napi] 712 | pub fn with_chrome_connection(&mut self, chrome_connection: String) -> &Self { 713 | self 714 | .inner 715 | .with_chrome_connection(if chrome_connection.is_empty() { 716 | None 717 | } else { 718 | Some(chrome_connection) 719 | }); 720 | self 721 | } 722 | 723 | /// Preserve the HOST header. 724 | #[napi] 725 | pub fn with_preserve_host_header(&mut self, preserve_host: bool) -> &Self { 726 | self.inner.with_preserve_host_header(preserve_host); 727 | self 728 | } 729 | 730 | /// Include subdomains detection. 731 | #[napi] 732 | pub fn with_subdomains(&mut self, subdomains: bool) -> &Self { 733 | self.inner.configuration.with_subdomains(subdomains); 734 | self 735 | } 736 | 737 | /// Include tld detection. 738 | #[napi] 739 | pub fn with_tld(&mut self, tld: bool) -> &Self { 740 | self.inner.configuration.with_tld(tld); 741 | self 742 | } 743 | 744 | /// Only use HTTP/2. 745 | #[napi] 746 | pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &Self { 747 | self 748 | .inner 749 | .configuration 750 | .with_http2_prior_knowledge(http2_prior_knowledge); 751 | self 752 | } 753 | 754 | /// Max time to wait for request duration to milliseconds. 
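/// The value is interpreted as milliseconds and converted to a `Duration`; omitting it
/// clears any previously configured timeout. Illustrative: a value of 30000 caps each
/// request at roughly 30 seconds.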
755 | #[napi] 756 | pub fn with_request_timeout(&mut self, request_timeout: Option) -> &Self { 757 | self 758 | .inner 759 | .configuration 760 | .with_request_timeout(match request_timeout { 761 | Some(d) => Some(Duration::from_millis(d.into())), 762 | _ => None, 763 | }); 764 | self 765 | } 766 | 767 | /// add external domains 768 | #[napi] 769 | pub fn with_external_domains(&mut self, external_domains: Option>) -> &Self { 770 | self.inner.with_external_domains(match external_domains { 771 | Some(ext) => Some(ext.into_iter()), 772 | _ => None, 773 | }); 774 | self 775 | } 776 | 777 | /// Use stealth mode for the request. This does nothing without chrome. 778 | #[napi] 779 | pub fn with_stealth(&mut self, stealth_mode: Option) -> &Self { 780 | self.inner.with_stealth(match stealth_mode { 781 | Some(ext) => ext, 782 | _ => false, 783 | }); 784 | self 785 | } 786 | 787 | /// Dangerously accept invalid certificates - this should be used as a last resort. 788 | #[napi] 789 | pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: Option) -> &Self { 790 | self 791 | .inner 792 | .with_danger_accept_invalid_certs(match accept_invalid_certs { 793 | Some(ext) => ext, 794 | _ => false, 795 | }); 796 | self 797 | } 798 | 799 | #[napi] 800 | /// Set the crawling budget 801 | pub fn with_budget(&mut self, budget: Option>) -> &Self { 802 | use spider::hashbrown::hash_map::HashMap; 803 | 804 | match budget { 805 | Some(d) => { 806 | self.inner.with_budget(Some( 807 | d.iter() 808 | .map(|(k, v)| (k.as_str(), *v)) 809 | .collect::>(), 810 | )); 811 | } 812 | _ => (), 813 | } 814 | 815 | self 816 | } 817 | 818 | /// Set the max redirects allowed for request. 819 | #[napi] 820 | pub fn with_redirect_limit(&mut self, redirect_limit: u32) -> &Self { 821 | self.inner.with_redirect_limit(redirect_limit as usize); 822 | self 823 | } 824 | 825 | /// Set the redirect policy to use, either Strict or Loose by default. 826 | #[napi] 827 | pub fn with_redirect_policy(&mut self, strict: bool) -> &Self { 828 | self.inner.with_redirect_policy(if strict { 829 | RedirectPolicy::Strict 830 | } else { 831 | RedirectPolicy::Loose 832 | }); 833 | self 834 | } 835 | 836 | #[napi] 837 | /// Regex blacklist urls from the crawl 838 | pub fn with_blacklist_url(&mut self, blacklist_url: Option>) -> &Self { 839 | self 840 | .inner 841 | .configuration 842 | .with_blacklist_url(match blacklist_url { 843 | Some(v) => { 844 | let mut blacklist: Vec = Vec::new(); 845 | for item in v { 846 | blacklist.push(CompactString::new(item)); 847 | } 848 | Some(blacklist) 849 | } 850 | _ => None, 851 | }); 852 | 853 | self 854 | } 855 | 856 | #[napi] 857 | /// Regex whitelist urls from the crawl 858 | pub fn with_whitelist_url(&mut self, whitelist_url: Option>) -> &Self { 859 | self 860 | .inner 861 | .configuration 862 | .with_whitelist_url(match whitelist_url { 863 | Some(v) => { 864 | let mut whitelist: Vec = Vec::new(); 865 | for item in v { 866 | whitelist.push(CompactString::new(item)); 867 | } 868 | Some(whitelist) 869 | } 870 | _ => None, 871 | }); 872 | 873 | self 874 | } 875 | 876 | #[napi] 877 | /// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled. 
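/// `seconds` and `nanos` are combined into a single `Duration`; leaving both unset
/// removes the wait entirely rather than waiting for zero time.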
878 | pub fn with_wait_for_delay(&mut self, seconds: Option, nanos: Option) -> &Self { 879 | self 880 | .inner 881 | .configuration 882 | .with_wait_for_delay(if seconds.is_some() || nanos.is_some() { 883 | let duration = Duration::new( 884 | seconds.unwrap_or_default() as u64, 885 | nanos.unwrap_or_default(), 886 | ); 887 | Some(WaitForDelay::new(Some(duration))) 888 | } else { 889 | None 890 | }); 891 | 892 | self 893 | } 894 | 895 | #[napi] 896 | /// Wait for a CSS query selector. This method does nothing if the `chrome` feature is not enabled. 897 | pub fn with_wait_for_selector( 898 | &mut self, 899 | selector: Option<&str>, 900 | seconds: Option, 901 | nanos: Option, 902 | ) -> &Self { 903 | self 904 | .inner 905 | .configuration 906 | .with_wait_for_selector(if seconds.is_some() || nanos.is_some() { 907 | let duration = Duration::new( 908 | seconds.unwrap_or_default() as u64, 909 | nanos.unwrap_or_default(), 910 | ); 911 | Some(WaitForSelector::new( 912 | Some(duration), 913 | selector.unwrap_or_default().to_string(), 914 | )) 915 | } else { 916 | None 917 | }); 918 | 919 | self 920 | } 921 | 922 | #[napi] 923 | /// Wait for idle network request. This method does nothing if the `chrome` feature is not enabled. 924 | pub fn with_wait_for_idle_network(&mut self, seconds: Option, nanos: Option) -> &Self { 925 | self 926 | .inner 927 | .configuration 928 | .with_wait_for_idle_network(if seconds.is_some() || nanos.is_some() { 929 | let duration = Duration::new( 930 | seconds.unwrap_or_default() as u64, 931 | nanos.unwrap_or_default(), 932 | ); 933 | Some(WaitForIdleNetwork::new(Some(duration))) 934 | } else { 935 | None 936 | }); 937 | 938 | self 939 | } 940 | 941 | /// Setup cron jobs to run 942 | #[napi] 943 | pub fn with_cron(&mut self, cron_str: String, cron_type: Option) -> &Self { 944 | self.inner.with_cron( 945 | cron_str.as_str(), 946 | if cron_type.unwrap_or_default() == "scrape" { 947 | spider::website::CronType::Scrape 948 | } else { 949 | spider::website::CronType::Crawl 950 | }, 951 | ); 952 | self 953 | } 954 | 955 | /// Use OpenAI to generate dynamic javascript snippets. Make sure to set the `OPENAI_API_KEY` env variable. 956 | #[napi] 957 | pub fn with_openai(&mut self, env: Env, openai_configs: Option) -> &Self { 958 | use serde_json::Value; 959 | use spider::configuration::GPTConfigs; 960 | let openai_configs: Option = match openai_configs { 961 | Some(obj) => match env.from_js_value(obj) { 962 | Ok(e) => Some(e), 963 | _ => None, 964 | }, 965 | None => None, 966 | }; 967 | 968 | if let Some(configs) = openai_configs { 969 | let configs: GPTConfigs = 970 | serde_json::from_value(configs).unwrap_or_else(|_| GPTConfigs::default()); 971 | 972 | if !configs.model.is_empty() || configs.prompt_url_map.is_some() { 973 | self.inner.with_openai(Some(configs)); 974 | } 975 | } 976 | 977 | self 978 | } 979 | 980 | /// Take screenshots of web pages using chrome. 981 | #[napi] 982 | pub fn with_screenshot( 983 | &mut self, 984 | env: Env, 985 | 986 | #[napi(ts_arg_type = r#"{ 987 | /** The screenshot params. */ 988 | params: { 989 | /** Chrome DevTools Protocol screenshot options. */ 990 | cdp_params: { 991 | /** Image compression format (defaults to png). */ 992 | format: 'jpeg' | 'png' | 'webp' 993 | /** Compression quality from range [0..100] (jpeg only). */ 994 | quality: number 995 | /** Capture the screenshot of a given region only. 
*/ 996 | clip: { 997 | x: number 998 | y: number 999 | height: number 1000 | width: number 1001 | scale: number 1002 | } 1003 | /** Capture the screenshot from the surface, rather than the view. Defaults to true.*/ 1004 | from_surface: boolean 1005 | /** Capture the screenshot beyond the viewport. Defaults to false. */ 1006 | capture_beyond_viewport: boolean 1007 | } 1008 | /** Take full page screenshot */ 1009 | full_page: boolean 1010 | /** Make the background transparent (png only). */ 1011 | omit_background: boolean 1012 | } 1013 | /** Return the bytes of the screenshot on the Page. */ 1014 | bytes: boolean 1015 | /** Store the screenshot to disk. This can be used with output_dir. If disabled will not store the file to the output directory. */ 1016 | save: boolean 1017 | /** The output directory to store the file. Parent folders may be created inside the directory. */ 1018 | output_dir: string | null 1019 | }"#)] 1020 | screenshot_configs: Option, 1021 | ) -> &Self { 1022 | use serde_json::Value; 1023 | use spider::configuration::ScreenShotConfig; 1024 | let screenshot_configs: Option = match screenshot_configs { 1025 | Some(obj) => match env.from_js_value(obj) { 1026 | Ok(e) => Some(e), 1027 | _ => None, 1028 | }, 1029 | None => None, 1030 | }; 1031 | 1032 | if let Some(configs) = screenshot_configs { 1033 | let configs: ScreenShotConfig = 1034 | serde_json::from_value(configs).unwrap_or_else(|_| ScreenShotConfig::default()); 1035 | 1036 | self.inner.with_screenshot(Some(configs)); 1037 | } 1038 | 1039 | self 1040 | } 1041 | 1042 | /// Delay between request as ms. 1043 | #[napi] 1044 | pub fn with_delay(&mut self, delay: u32) -> &Self { 1045 | self.inner.configuration.with_delay(delay.into()); 1046 | self 1047 | } 1048 | 1049 | /// Set a crawl depth limit. If the value is 0 there is no limit. 1050 | #[napi] 1051 | pub fn with_depth(&mut self, depth: u32) -> &Self { 1052 | self.inner.configuration.with_depth(depth as usize); 1053 | self 1054 | } 1055 | 1056 | /// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` is enabled. 1057 | #[napi] 1058 | pub fn with_return_page_links(&mut self, return_page_links: bool) -> &Self { 1059 | self 1060 | .inner 1061 | .configuration 1062 | .with_return_page_links(return_page_links); 1063 | self 1064 | } 1065 | 1066 | /// Cache the page following HTTP rules. 1067 | #[napi] 1068 | pub fn with_caching(&mut self, cache: bool) -> &Self { 1069 | self.inner.configuration.with_caching(cache); 1070 | self 1071 | } 1072 | 1073 | /// Set the sitemap url. 1074 | #[napi] 1075 | pub fn with_sitemap(&mut self, sitemap: Option<&str>) -> &Self { 1076 | self.inner.configuration.with_sitemap(sitemap); 1077 | self 1078 | } 1079 | 1080 | /// Use proxies for request. 
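/// Accepts a list of proxy URLs that is forwarded unchanged to the spider configuration.
/// Illustrative only, with a placeholder address and napi-rs's camelCase naming assumed:
/// `website.withProxies(['http://localhost:8118'])`.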
1081 | #[napi] 1082 | pub fn with_proxies(&mut self, proxies: Option>) -> &Self { 1083 | self.inner.configuration.with_proxies(proxies); 1084 | self 1085 | } 1086 | 1087 | #[napi] 1088 | /// build the inner website - not required for all builder_steps 1089 | pub fn build(&mut self) -> &Self { 1090 | match self.inner.build() { 1091 | Ok(w) => self.inner = w, 1092 | _ => (), 1093 | } 1094 | self 1095 | } 1096 | } 1097 | 1098 | /// a runner for handling crons 1099 | #[napi] 1100 | pub struct Cron { 1101 | /// the runner task 1102 | inner: spider::async_job::Runner, 1103 | /// inner cron handle 1104 | cron_handle: Option>, 1105 | } 1106 | 1107 | #[napi] 1108 | impl Cron { 1109 | /// stop the cron instance 1110 | #[napi] 1111 | pub async unsafe fn stop(&mut self) { 1112 | self.inner.stop().await; 1113 | match &self.cron_handle { 1114 | Some(h) => h.abort(), 1115 | _ => (), 1116 | } 1117 | } 1118 | } 1119 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "strict": true, 5 | "lib": ["es2016", "dom"], 6 | "types": ["node"], 7 | "skipLibCheck": true 8 | } 9 | } 10 | --------------------------------------------------------------------------------