├── .cargo └── config.toml ├── .editorconfig ├── .eslintrc.yml ├── .github └── workflows │ ├── CI.yml │ ├── bench.yml │ └── book.yml ├── .gitignore ├── .npmignore ├── .prettierignore ├── .taplo.toml ├── .vscode └── settings.json ├── .yarn └── releases │ └── yarn-3.6.4.cjs ├── .yarnrc.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── __test__ └── index.spec.ts ├── bench ├── README.md ├── base.ts ├── case │ ├── crawlee.ts │ └── spider.ts ├── compare.ts ├── crawlee.ts ├── oss.ts ├── package-lock.json └── package.json ├── book ├── .gitignore ├── book.toml └── src │ ├── README.md │ ├── SUMMARY.md │ ├── benchmarks.md │ ├── crawl.md │ ├── cron-job.md │ ├── env.md │ ├── getting-started.md │ ├── page.md │ ├── scrape.md │ ├── simple.md │ ├── storing-data.md │ └── website.md ├── build.rs ├── examples ├── basic.mjs ├── cron.mjs ├── openai.mjs └── subscription.mjs ├── index.d.ts ├── index.js ├── npm ├── android-arm-eabi │ ├── README.md │ └── package.json ├── android-arm64 │ ├── README.md │ └── package.json ├── darwin-arm64 │ ├── README.md │ └── package.json ├── darwin-universal │ ├── README.md │ └── package.json ├── darwin-x64 │ ├── README.md │ └── package.json ├── freebsd-x64 │ ├── README.md │ └── package.json ├── linux-arm-gnueabihf │ ├── README.md │ └── package.json ├── linux-arm64-gnu │ ├── README.md │ └── package.json ├── linux-arm64-musl │ ├── README.md │ └── package.json ├── linux-x64-gnu │ ├── README.md │ └── package.json ├── linux-x64-musl │ ├── README.md │ └── package.json ├── win32-arm64-msvc │ ├── README.md │ └── package.json ├── win32-ia32-msvc │ ├── README.md │ └── package.json └── win32-x64-msvc │ ├── README.md │ └── package.json ├── package.json ├── rustfmt.toml ├── src ├── conversions.rs ├── lib.rs ├── npage.rs ├── nwebsite.rs ├── page.rs ├── shortcut.rs └── website.rs ├── tsconfig.json └── yarn.lock /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.aarch64-unknown-linux-musl] 2 | linker = "aarch64-linux-musl-gcc" 3 | rustflags = ["-C", "target-feature=-crt-static"] -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain consistent 2 | # coding styles between different editors or IDEs 3 | # http://editorconfig.org 4 | root = true 5 | 6 | [*] 7 | indent_style = space 8 | indent_size = 2 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.md] 15 | trim_trailing_whitespace = false -------------------------------------------------------------------------------- /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | parser: '@typescript-eslint/parser' 2 | 3 | parserOptions: 4 | ecmaFeatures: 5 | jsx: true 6 | ecmaVersion: latest 7 | sourceType: module 8 | project: ./tsconfig.json 9 | 10 | env: 11 | browser: true 12 | es6: true 13 | node: true 14 | jest: true 15 | 16 | ignorePatterns: ['index.js'] 17 | 18 | plugins: 19 | - import 20 | - '@typescript-eslint' 21 | 22 | extends: 23 | - eslint:recommended 24 | - plugin:prettier/recommended 25 | 26 | rules: 27 | # 0 = off, 1 = warn, 2 = error 28 | 'space-before-function-paren': 0 29 | 'no-useless-constructor': 0 30 | 'no-undef': 2 31 | 'no-console': [2, { allow: ['error', 'warn', 'info', 'assert'] }] 32 | 'comma-dangle': ['error', 'only-multiline'] 33 | 'no-unused-vars': 0 34 | 'no-var': 2 35 
| 'one-var-declaration-per-line': 2 36 | 'prefer-const': 2 37 | 'no-const-assign': 2 38 | 'no-duplicate-imports': 2 39 | 'no-use-before-define': [2, { 'functions': false, 'classes': false }] 40 | 'eqeqeq': [2, 'always', { 'null': 'ignore' }] 41 | 'no-case-declarations': 0 42 | 'no-restricted-syntax': 43 | [ 44 | 2, 45 | { 46 | 'selector': 'BinaryExpression[operator=/(==|===|!=|!==)/][left.raw=true], BinaryExpression[operator=/(==|===|!=|!==)/][right.raw=true]', 47 | 'message': Don't compare for equality against boolean literals, 48 | }, 49 | ] 50 | 51 | # https://github.com/benmosher/eslint-plugin-import/pull/334 52 | 'import/no-duplicates': 2 53 | 'import/first': 2 54 | 'import/newline-after-import': 2 55 | 'import/order': 56 | [ 57 | 2, 58 | { 59 | 'newlines-between': 'always', 60 | 'alphabetize': { 'order': 'asc' }, 61 | 'groups': ['builtin', 'external', 'internal', 'parent', 'sibling', 'index'], 62 | }, 63 | ] 64 | 65 | overrides: 66 | - files: 67 | - ./**/*{.ts,.tsx} 68 | rules: 69 | 'no-unused-vars': [2, { varsIgnorePattern: '^_', argsIgnorePattern: '^_', ignoreRestSiblings: true }] 70 | 'no-undef': 0 71 | # TypeScript declare merge 72 | 'no-redeclare': 0 73 | 'no-useless-constructor': 0 74 | 'no-dupe-class-members': 0 75 | 'no-case-declarations': 0 76 | 'no-duplicate-imports': 0 77 | # TypeScript Interface and Type 78 | 'no-use-before-define': 0 79 | 80 | '@typescript-eslint/adjacent-overload-signatures': 2 81 | '@typescript-eslint/await-thenable': 2 82 | '@typescript-eslint/consistent-type-assertions': 2 83 | '@typescript-eslint/ban-types': 84 | [ 85 | 'error', 86 | { 87 | 'types': 88 | { 89 | 'String': { 'message': 'Use string instead', 'fixWith': 'string' }, 90 | 'Number': { 'message': 'Use number instead', 'fixWith': 'number' }, 91 | 'Boolean': { 'message': 'Use boolean instead', 'fixWith': 'boolean' }, 92 | 'Function': { 'message': 'Use explicit type instead' }, 93 | }, 94 | }, 95 | ] 96 | '@typescript-eslint/explicit-member-accessibility': 97 | [ 98 | 'error', 99 | { 100 | accessibility: 'explicit', 101 | overrides: 102 | { 103 | accessors: 'no-public', 104 | constructors: 'no-public', 105 | methods: 'no-public', 106 | properties: 'no-public', 107 | parameterProperties: 'explicit', 108 | }, 109 | }, 110 | ] 111 | '@typescript-eslint/method-signature-style': 2 112 | '@typescript-eslint/no-floating-promises': 2 113 | '@typescript-eslint/no-implied-eval': 2 114 | '@typescript-eslint/no-for-in-array': 2 115 | '@typescript-eslint/no-inferrable-types': 2 116 | '@typescript-eslint/no-invalid-void-type': 2 117 | '@typescript-eslint/no-misused-new': 2 118 | '@typescript-eslint/no-misused-promises': 2 119 | '@typescript-eslint/no-namespace': 2 120 | '@typescript-eslint/no-non-null-asserted-optional-chain': 2 121 | '@typescript-eslint/no-throw-literal': 2 122 | '@typescript-eslint/no-unnecessary-boolean-literal-compare': 2 123 | '@typescript-eslint/prefer-for-of': 2 124 | '@typescript-eslint/prefer-nullish-coalescing': 2 125 | '@typescript-eslint/switch-exhaustiveness-check': 2 126 | '@typescript-eslint/prefer-optional-chain': 2 127 | '@typescript-eslint/prefer-readonly': 2 128 | '@typescript-eslint/prefer-string-starts-ends-with': 0 129 | '@typescript-eslint/no-array-constructor': 2 130 | '@typescript-eslint/require-await': 2 131 | '@typescript-eslint/return-await': 2 132 | '@typescript-eslint/ban-ts-comment': 133 | [2, { 'ts-expect-error': false, 'ts-ignore': true, 'ts-nocheck': true, 'ts-check': false }] 134 | '@typescript-eslint/naming-convention': 135 | [ 136 | 2, 137 | { 138 | 
selector: 'memberLike', 139 | format: ['camelCase', 'PascalCase'], 140 | modifiers: ['private'], 141 | leadingUnderscore: 'forbid', 142 | }, 143 | ] 144 | '@typescript-eslint/no-unused-vars': 145 | [2, { varsIgnorePattern: '^_', argsIgnorePattern: '^_', ignoreRestSiblings: true }] 146 | '@typescript-eslint/member-ordering': 147 | [ 148 | 2, 149 | { 150 | default: 151 | [ 152 | 'public-static-field', 153 | 'protected-static-field', 154 | 'private-static-field', 155 | 'public-static-method', 156 | 'protected-static-method', 157 | 'private-static-method', 158 | 'public-instance-field', 159 | 'protected-instance-field', 160 | 'private-instance-field', 161 | 'public-constructor', 162 | 'protected-constructor', 163 | 'private-constructor', 164 | 'public-instance-method', 165 | 'protected-instance-method', 166 | 'private-instance-method', 167 | ], 168 | }, 169 | ] 170 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | env: 3 | DEBUG: napi:* 4 | APP_NAME: spider-rs 5 | MACOSX_DEPLOYMENT_TARGET: '10.13' 6 | permissions: 7 | contents: write 8 | id-token: write 9 | on: 10 | push: 11 | branches: 12 | - main 13 | tags-ignore: 14 | - '**' 15 | paths-ignore: 16 | - '**/*.md' 17 | - LICENSE 18 | - '**/*.gitignore' 19 | - .editorconfig 20 | - docs/** 21 | pull_request: null 22 | concurrency: 23 | group: ${{ github.workflow }}-${{ github.ref }} 24 | cancel-in-progress: true 25 | 26 | jobs: 27 | build: 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | settings: 32 | - host: macos-latest 33 | target: x86_64-apple-darwin 34 | build: yarn build --target x86_64-apple-darwin 35 | - host: windows-latest 36 | target: x86_64-pc-windows-msvc 37 | build: yarn build --target x86_64-pc-windows-msvc 38 | - host: windows-latest 39 | target: i686-pc-windows-msvc 40 | build: | 41 | choco install openssl.light 42 | set OPENSSL_LIB_DIR=C:\Program Files\OpenSSL\lib 43 | set OPENSSL_INCLUDE_DIR=C:\Program Files\OpenSSL\include 44 | yarn build --target i686-pc-windows-msvc 45 | # timeout issue - signals not working with swc core 46 | # yarn test 47 | - host: ubuntu-latest 48 | target: x86_64-unknown-linux-gnu 49 | setup: | 50 | sudo apt-get update 51 | sudo apt-get install -y gcc build-essential cmake openssl libssl-dev ca-certificates libc6 perl 52 | build: yarn build --target x86_64-unknown-linux-gnu 53 | # - host: ubuntu-latest 54 | # target: x86_64-unknown-linux-musl 55 | # setup: | 56 | # sudo apt-get update && sudo apt-get install -y build-essential pkg-config cmake musl-tools musl-dev openssl libssl-dev ca-certificates gcc g++ libc6 57 | # export CC=musl-gcc 58 | # docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine 59 | # build: yarn build --target x86_64-unknown-linux-musl 60 | # env: 61 | # CXXFLAGS: '--stdlib=libc++ -L/usr/lib/llvm-18/lib -static' 62 | - host: macos-latest 63 | target: aarch64-apple-darwin 64 | build: yarn build --target aarch64-apple-darwin 65 | - host: ubuntu-latest 66 | target: aarch64-unknown-linux-gnu 67 | docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-debian-aarch64 68 | setup: | 69 | sudo apt-get update 70 | sudo apt-get install -y gcc-aarch64-linux-gnu build-essential cmake openssl libssl-dev ca-certificates gcc libc6 perl pkg-config 71 | build: yarn build --target aarch64-unknown-linux-gnu 72 | # - host: ubuntu-latest 73 | # target: armv7-unknown-linux-gnueabihf 74 | # setup: | 75 | # sudo apt-get update 76 | # sudo apt-get 
install build-essential pkg-config perl gcc cmake libc6 ca-certificates openssl libssl-dev gcc-arm-linux-gnueabihf -y 77 | # build: yarn build --target armv7-unknown-linux-gnueabihf 78 | # env: 79 | # CXXFLAGS: '--stdlib=libc++ -L/usr/lib/llvm-18/lib -static' 80 | - host: ubuntu-latest 81 | target: aarch64-linux-android 82 | setup: | 83 | sudo apt-get update 84 | sudo apt-get install -y build-essential cmake openssl libssl-dev openssl 85 | build: yarn build --target aarch64-linux-android 86 | - host: ubuntu-latest 87 | target: armv7-linux-androideabi 88 | setup: | 89 | sudo apt-get update 90 | sudo apt-get install -y build-essential cmake openssl libssl-dev perl libc6 gcc ca-certificates 91 | build: yarn build --target armv7-linux-androideabi 92 | # - host: ubuntu-latest 93 | # target: aarch64-unknown-linux-musl 94 | # setup: | 95 | # sudo apt-get update && sudo apt-get install -y pkg-config clang perl-utils build-essential musl-tools musl-dev ca-certificates gcc g++ libc6 perl openssl libssl-dev 96 | # export CC=musl-gcc 97 | # docker: ghcr.io/napi-rs/napi-rs/nodejs-rust:lts-alpine 98 | # build: | 99 | # set -e && 100 | # rustup target add aarch64-unknown-linux-musl && 101 | # yarn build --target aarch64-unknown-linux-musl 102 | # env: 103 | # CXXFLAGS: '--stdlib=libc++ -L/usr/lib/llvm-18/lib -static' 104 | - host: windows-latest 105 | target: aarch64-pc-windows-msvc 106 | build: | 107 | choco install openssl.light 108 | set OPENSSL_LIB_DIR=C:\Program Files\OpenSSL-Win64\lib 109 | set OPENSSL_INCLUDE_DIR=C:\Program Files\OpenSSL-Win64\include 110 | yarn build --target aarch64-pc-windows-msvc 111 | name: stable - ${{ matrix.settings.target }} - node@20 112 | runs-on: ${{ matrix.settings.host }} 113 | steps: 114 | - uses: actions/checkout@v4 115 | - name: Setup node 116 | uses: actions/setup-node@v4 117 | if: ${{ !matrix.settings.docker }} 118 | with: 119 | node-version: 20 120 | cache: yarn 121 | - name: Install 122 | uses: dtolnay/rust-toolchain@stable 123 | if: ${{ !matrix.settings.docker }} 124 | with: 125 | toolchain: stable 126 | targets: ${{ matrix.settings.target }} 127 | - name: Cache cargo 128 | uses: actions/cache@v4 129 | with: 130 | path: | 131 | ~/.cargo/registry/index/ 132 | ~/.cargo/registry/cache/ 133 | ~/.cargo/git/db/ 134 | .cargo-cache 135 | target/ 136 | key: ${{ matrix.settings.target }}-cargo-${{ matrix.settings.host }} 137 | - uses: goto-bus-stop/setup-zig@v2 138 | if: ${{ matrix.settings.target == 'armv7-unknown-linux-gnueabihf' }} 139 | with: 140 | version: 0.12.0 141 | - name: Setup toolchain 142 | run: ${{ matrix.settings.setup }} 143 | if: ${{ matrix.settings.setup }} 144 | shell: bash 145 | - name: Install dependencies 146 | run: yarn --no-immutable 147 | - name: Setup node x86 148 | uses: actions/setup-node@v4 149 | if: matrix.settings.target == 'i686-pc-windows-msvc' 150 | with: 151 | node-version: 20 152 | cache: yarn 153 | architecture: x86 154 | - name: Build in docker 155 | uses: addnab/docker-run-action@v3 156 | if: ${{ matrix.settings.docker }} 157 | with: 158 | image: ${{ matrix.settings.docker }} 159 | options: '--user 0:0 -v ${{ github.workspace }}/.cargo-cache/git/db:/usr/local/cargo/git/db -v ${{ github.workspace }}/.cargo/registry/cache:/usr/local/cargo/registry/cache -v ${{ github.workspace }}/.cargo/registry/index:/usr/local/cargo/registry/index -v ${{ github.workspace }}:/build -w /build' 160 | run: ${{ matrix.settings.build }} 161 | - name: Build 162 | run: ${{ matrix.settings.build }} 163 | if: ${{ !matrix.settings.docker }} 164 | shell: bash 
165 | - name: Upload artifact 166 | uses: actions/upload-artifact@v4 167 | with: 168 | name: bindings-${{ matrix.settings.target }} 169 | path: ${{ env.APP_NAME }}.*.node 170 | if-no-files-found: error 171 | 172 | # build-freebsd: 173 | # runs-on: macos-13 174 | # name: Build FreeBSD 175 | # steps: 176 | # - uses: actions/checkout@v4 177 | # - name: Build 178 | # id: build 179 | # uses: cross-platform-actions/action@v0.25.0 180 | # env: 181 | # DEBUG: napi:* 182 | # RUSTUP_IO_THREADS: 1 183 | # with: 184 | # operating_system: freebsd 185 | # version: '13.2' 186 | # memory: 13G 187 | # cpu_count: 3 188 | # environment_variables: DEBUG RUSTUP_IO_THREADS 189 | # shell: bash 190 | # run: | 191 | # sudo pkg install -y -f curl node libnghttp2 npm openssl 192 | # sudo npm install -g yarn --ignore-scripts 193 | # curl https://sh.rustup.rs -sSf --output rustup.sh 194 | # sh rustup.sh -y --profile minimal --default-toolchain stable 195 | # source "$HOME/.cargo/env" 196 | # echo "~~~~ rustc --version ~~~~" 197 | # rustc --version 198 | # echo "~~~~ node -v ~~~~" 199 | # node -v 200 | # echo "~~~~ yarn --version ~~~~" 201 | # yarn --version 202 | # pwd 203 | # ls -lah 204 | # whoami 205 | # env 206 | # freebsd-version 207 | # yarn install 208 | # yarn build 209 | # strip -x *.node 210 | # yarn test 211 | # rm -rf node_modules 212 | # rm -rf target 213 | # rm -rf .yarn/cache 214 | # - name: Upload artifact 215 | # uses: actions/upload-artifact@v3 216 | # with: 217 | # name: bindings-freebsd 218 | # path: ${{ env.APP_NAME }}.*.node 219 | # if-no-files-found: error 220 | 221 | test-macOS-windows-binding: 222 | name: Test bindings on ${{ matrix.settings.target }} - node@${{ matrix.node }} 223 | needs: 224 | - build 225 | strategy: 226 | fail-fast: false 227 | matrix: 228 | settings: 229 | - host: windows-latest 230 | target: x86_64-pc-windows-msvc 231 | architecture: x64 232 | - host: macos-latest 233 | target: aarch64-apple-darwin 234 | architecture: arm64 235 | - host: macos-latest 236 | target: x86_64-apple-darwin 237 | architecture: x64 238 | node: 239 | - '18' 240 | - '20' 241 | runs-on: ${{ matrix.settings.host }} 242 | steps: 243 | - uses: actions/checkout@v4 244 | - name: Setup node 245 | uses: actions/setup-node@v4 246 | with: 247 | node-version: ${{ matrix.node }} 248 | cache: yarn 249 | architecture: ${{ matrix.settings.architecture }} 250 | - name: Install dependencies 251 | run: yarn --no-immutable 252 | - name: Download artifacts 253 | uses: actions/download-artifact@v4 254 | with: 255 | name: bindings-${{ matrix.settings.target }} 256 | path: . 257 | - name: List packages 258 | run: ls -R . 259 | shell: bash 260 | 261 | test-linux-x64-gnu-binding: 262 | name: Test bindings on Linux-x64-gnu - node@${{ matrix.node }} 263 | needs: 264 | - build 265 | strategy: 266 | fail-fast: false 267 | matrix: 268 | node: 269 | - '18' 270 | - '20' 271 | runs-on: ubuntu-latest 272 | steps: 273 | - uses: actions/checkout@v4 274 | - name: Setup node 275 | uses: actions/setup-node@v4 276 | with: 277 | node-version: ${{ matrix.node }} 278 | cache: yarn 279 | - name: Install dependencies 280 | run: yarn --no-immutable 281 | - name: Download artifacts 282 | uses: actions/download-artifact@v4 283 | with: 284 | name: bindings-x86_64-unknown-linux-gnu 285 | path: . 286 | - name: List packages 287 | run: ls -R . 
288 | shell: bash 289 | 290 | test-linux-x64-musl-binding: 291 | name: Test bindings on x86_64-unknown-linux-musl - node@${{ matrix.node }} 292 | needs: 293 | - build 294 | strategy: 295 | fail-fast: false 296 | matrix: 297 | node: 298 | - '18' 299 | - '20' 300 | runs-on: ubuntu-latest 301 | steps: 302 | - uses: actions/checkout@v4 303 | - name: Setup node 304 | uses: actions/setup-node@v4 305 | with: 306 | node-version: ${{ matrix.node }} 307 | cache: yarn 308 | - name: Install dependencies 309 | run: | 310 | yarn config set supportedArchitectures.libc "musl" 311 | yarn --no-immutable 312 | - name: Download artifacts 313 | uses: actions/download-artifact@v4 314 | with: 315 | name: bindings-x86_64-unknown-linux-musl 316 | path: . 317 | - name: List packages 318 | run: ls -R . 319 | shell: bash 320 | 321 | test-linux-aarch64-gnu-binding: 322 | name: Test bindings on aarch64-unknown-linux-gnu - node@${{ matrix.node }} 323 | needs: 324 | - build 325 | strategy: 326 | fail-fast: false 327 | matrix: 328 | node: 329 | - '18' 330 | - '20' 331 | runs-on: ubuntu-latest 332 | steps: 333 | - uses: actions/checkout@v4 334 | - name: Download artifacts 335 | uses: actions/download-artifact@v4 336 | with: 337 | name: bindings-aarch64-unknown-linux-gnu 338 | path: . 339 | - name: List packages 340 | run: ls -R . 341 | shell: bash 342 | - name: Install dependencies 343 | run: | 344 | yarn config set supportedArchitectures.cpu "arm64" 345 | yarn config set supportedArchitectures.libc "glibc" 346 | yarn --no-immutable 347 | - name: Set up QEMU 348 | uses: docker/setup-qemu-action@v3 349 | with: 350 | platforms: arm64 351 | - run: docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 352 | - name: Setup and run tests 353 | uses: addnab/docker-run-action@v3 354 | with: 355 | image: node:${{ matrix.node }}-slim 356 | options: '--platform linux/arm64 -v ${{ github.workspace }}:/build -w /build' 357 | run: | 358 | set -e 359 | yarn test 360 | ls -la 361 | 362 | test-linux-aarch64-musl-binding: 363 | name: Test bindings on aarch64-unknown-linux-musl - node@lts 364 | needs: 365 | - build 366 | runs-on: ubuntu-latest 367 | steps: 368 | - uses: actions/checkout@v4 369 | - name: Download artifacts 370 | uses: actions/download-artifact@v4 371 | with: 372 | name: bindings-aarch64-unknown-linux-musl 373 | path: . 374 | - name: List packages 375 | run: ls -R . 376 | shell: bash 377 | - name: Install dependencies 378 | run: | 379 | yarn config set supportedArchitectures.cpu "arm64" 380 | yarn config set supportedArchitectures.libc "musl" 381 | yarn --no-immutable 382 | - name: Set up QEMU 383 | uses: docker/setup-qemu-action@v3 384 | with: 385 | platforms: arm64 386 | - run: docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 387 | - name: Setup and run tests 388 | uses: addnab/docker-run-action@v3 389 | with: 390 | image: node:lts-alpine 391 | options: '--platform linux/arm64 -v ${{ github.workspace }}:/build -w /build' 392 | run: | 393 | set -e 394 | yarn test 395 | 396 | test-linux-arm-gnueabihf-binding: 397 | name: Test bindings on armv7-unknown-linux-gnueabihf - node@${{ matrix.node }} 398 | needs: 399 | - build 400 | strategy: 401 | fail-fast: false 402 | matrix: 403 | node: 404 | - '18' 405 | - '20' 406 | runs-on: ubuntu-latest 407 | steps: 408 | - uses: actions/checkout@v4 409 | - name: Download artifacts 410 | uses: actions/download-artifact@v4 411 | with: 412 | name: bindings-armv7-unknown-linux-gnueabihf 413 | path: . 414 | - name: List packages 415 | run: ls -R . 
416 | shell: bash 417 | - name: Install dependencies 418 | run: | 419 | yarn config set supportedArchitectures.cpu "arm" 420 | yarn --no-immutable 421 | - name: Set up QEMU 422 | uses: docker/setup-qemu-action@v3 423 | with: 424 | platforms: arm 425 | - run: docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 426 | - name: Setup and run tests 427 | uses: addnab/docker-run-action@v3 428 | with: 429 | image: node:${{ matrix.node }}-bullseye-slim 430 | options: '--platform linux/arm/v7 -v ${{ github.workspace }}:/build -w /build' 431 | run: | 432 | set -e 433 | yarn test 434 | ls -la 435 | 436 | publish: 437 | name: Publish 438 | runs-on: ubuntu-latest 439 | needs: 440 | - test-macOS-windows-binding 441 | - test-linux-x64-gnu-binding 442 | # - build-freebsd 443 | # - test-linux-x64-musl-binding 444 | # - test-linux-aarch64-gnu-binding 445 | # - test-linux-aarch64-musl-binding 446 | # - test-linux-arm-gnueabihf-binding 447 | steps: 448 | - uses: actions/checkout@v4 449 | - name: Setup node 450 | uses: actions/setup-node@v4 451 | with: 452 | node-version: 20 453 | cache: yarn 454 | - name: Install dependencies 455 | run: yarn --no-immutable 456 | - name: Download all artifacts 457 | uses: actions/download-artifact@v4 458 | with: 459 | path: artifacts 460 | - name: Move artifacts 461 | run: yarn artifacts 462 | - name: List packages 463 | run: ls -R ./npm 464 | shell: bash 465 | - name: Publish 466 | run: | 467 | npm config set provenance true 468 | if git log -1 --pretty=%B | grep "^[0-9]\+\.[0-9]\+\.[0-9]\+$"; 469 | then 470 | echo "//registry.npmjs.org/:_authToken=$NPM_TOKEN" >> ~/.npmrc 471 | npm publish --access public 472 | elif git log -1 --pretty=%B | grep "^[0-9]\+\.[0-9]\+\.[0-9]\+"; 473 | then 474 | echo "//registry.npmjs.org/:_authToken=$NPM_TOKEN" >> ~/.npmrc 475 | npm publish --tag next --access public 476 | else 477 | echo "Not a release, skipping publish" 478 | fi 479 | env: 480 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 481 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 482 | -------------------------------------------------------------------------------- /.github/workflows/bench.yml: -------------------------------------------------------------------------------- 1 | name: Bench Compare 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | checkout_and_test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | include: 17 | - node-version: 18.x 18 | - node-version: latest 19 | 20 | steps: 21 | - name: Checkout code from ${{ github.repository }} 22 | uses: actions/checkout@v4 23 | 24 | - name: Install OpenSSL 25 | run: sudo apt-get update && sudo apt-get install -y openssl 26 | 27 | - name: Install 28 | run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 29 | 30 | - name: Setup node 31 | uses: actions/setup-node@v4 32 | with: 33 | node-version: ${{ matrix.node-version }} 34 | cache: 'yarn' 35 | 36 | - name: Install yarn 37 | run: corepack enable && corepack prepare yarn@stable --activate 38 | 39 | - name: Install Deps 40 | run: yarn --no-immutable && yarn build && cd bench && npm i 41 | 42 | - name: Run Bench @spider-rs/spider-rs 43 | run: yarn bench 44 | 45 | - name: Run Bench OSS 46 | run: yarn bench:oss 47 | -------------------------------------------------------------------------------- /.github/workflows/book.yml: -------------------------------------------------------------------------------- 1 | name: github pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 
pull_request: 8 | 9 | jobs: 10 | deploy: 11 | runs-on: ubuntu-20.04 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref }} 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Setup mdBook 18 | uses: peaceiris/actions-mdbook@v1 19 | with: 20 | mdbook-version: 'latest' 21 | 22 | - run: cd book && mdbook build 23 | 24 | - name: Deploy 25 | uses: peaceiris/actions-gh-pages@v3 26 | if: ${{ github.ref == 'refs/heads/main' }} 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | publish_dir: ./book/book 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/node 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 3 | 4 | ### Node ### 5 | # Logs 6 | logs 7 | *.log 8 | npm-debug.log* 9 | yarn-debug.log* 10 | yarn-error.log* 11 | lerna-debug.log* 12 | 13 | # Diagnostic reports (https://nodejs.org/api/report.html) 14 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 15 | 16 | # Runtime data 17 | pids 18 | *.pid 19 | *.seed 20 | *.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | lib-cov 24 | 25 | # Coverage directory used by tools like istanbul 26 | coverage 27 | *.lcov 28 | 29 | # nyc test coverage 30 | .nyc_output 31 | 32 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 33 | .grunt 34 | 35 | # Bower dependency directory (https://bower.io/) 36 | bower_components 37 | 38 | # node-waf configuration 39 | .lock-wscript 40 | 41 | # Compiled binary addons (https://nodejs.org/api/addons.html) 42 | build/Release 43 | 44 | # Dependency directories 45 | node_modules/ 46 | jspm_packages/ 47 | 48 | # TypeScript v1 declaration files 49 | typings/ 50 | 51 | # TypeScript cache 52 | *.tsbuildinfo 53 | 54 | # Optional npm cache directory 55 | .npm 56 | 57 | # Optional eslint cache 58 | .eslintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variables file 76 | .env 77 | .env.test 78 | 79 | # parcel-bundler cache (https://parceljs.org/) 80 | .cache 81 | 82 | # Next.js build output 83 | .next 84 | 85 | # Nuxt.js build / generate output 86 | .nuxt 87 | dist 88 | 89 | # Gatsby files 90 | .cache/ 91 | # Comment in the public line in if your project uses Gatsby and not Next.js 92 | # https://nextjs.org/blog/next-9-1#public-directory-support 93 | # public 94 | 95 | # vuepress build output 96 | .vuepress/dist 97 | 98 | # Serverless directories 99 | .serverless/ 100 | 101 | # FuseBox cache 102 | .fusebox/ 103 | 104 | # DynamoDB Local files 105 | .dynamodb/ 106 | 107 | # TernJS port file 108 | .tern-port 109 | 110 | # Stores VSCode versions used for testing VSCode extensions 111 | .vscode-test 112 | 113 | # End of https://www.toptal.com/developers/gitignore/api/node 114 | 115 | # Created by https://www.toptal.com/developers/gitignore/api/macos 116 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos 117 | 118 | ### macOS ### 119 | # General 120 | .DS_Store 121 | .AppleDouble 122 | .LSOverride 123 | 124 | # Icon must end with two 125 | Icon 126 | 127 | 128 | # Thumbnails 129 | ._* 130 | 131 | # Files that might appear in the root of a volume 
132 | .DocumentRevisions-V100 133 | .fseventsd 134 | .Spotlight-V100 135 | .TemporaryItems 136 | .Trashes 137 | .VolumeIcon.icns 138 | .com.apple.timemachine.donotpresent 139 | 140 | # Directories potentially created on remote AFP share 141 | .AppleDB 142 | .AppleDesktop 143 | Network Trash Folder 144 | Temporary Items 145 | .apdisk 146 | 147 | ### macOS Patch ### 148 | # iCloud generated files 149 | *.icloud 150 | 151 | # End of https://www.toptal.com/developers/gitignore/api/macos 152 | 153 | # Created by https://www.toptal.com/developers/gitignore/api/windows 154 | # Edit at https://www.toptal.com/developers/gitignore?templates=windows 155 | 156 | ### Windows ### 157 | # Windows thumbnail cache files 158 | Thumbs.db 159 | Thumbs.db:encryptable 160 | ehthumbs.db 161 | ehthumbs_vista.db 162 | 163 | # Dump file 164 | *.stackdump 165 | 166 | # Folder config file 167 | [Dd]esktop.ini 168 | 169 | # Recycle Bin used on file shares 170 | $RECYCLE.BIN/ 171 | 172 | # Windows Installer files 173 | *.cab 174 | *.msi 175 | *.msix 176 | *.msm 177 | *.msp 178 | 179 | # Windows shortcuts 180 | *.lnk 181 | 182 | # End of https://www.toptal.com/developers/gitignore/api/windows 183 | 184 | #Added by cargo 185 | 186 | /target 187 | Cargo.lock 188 | 189 | .pnp.* 190 | .yarn/* 191 | !.yarn/patches 192 | !.yarn/plugins 193 | !.yarn/releases 194 | !.yarn/sdks 195 | !.yarn/versions 196 | 197 | *.node 198 | 199 | # index.d.ts 200 | # index.js 201 | __test__/*.js 202 | 203 | /storage 204 | /bench/*.js 205 | /bench/case/**.js 206 | /bench/storage/ -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | .cargo 4 | .github 5 | npm 6 | .eslintrc 7 | .prettierignore 8 | rustfmt.toml 9 | yarn.lock 10 | *.node 11 | .yarn 12 | __test__ 13 | renovate.json 14 | book 15 | examples 16 | build.rs 17 | src 18 | Cargo.toml 19 | .vscode 20 | tsconfig.json 21 | bench -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | target 2 | .yarn -------------------------------------------------------------------------------- /.taplo.toml: -------------------------------------------------------------------------------- 1 | exclude = ["node_modules/**/*.toml"] 2 | 3 | # https://taplo.tamasfe.dev/configuration/formatter-options.html 4 | [formatting] 5 | align_entries = true 6 | indent_tables = true 7 | reorder_keys = true -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "rust-analyzer.procMacro.ignored": { "napi-derive": ["napi"] } 3 | } 4 | -------------------------------------------------------------------------------- /.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | 3 | npmAuditRegistry: https://registry.npmjs.org 4 | 5 | yarnPath: .yarn/releases/yarn-3.6.4.cjs 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2021" 3 | name = "spider-rs_spider-rs" 4 | version = "0.0.0" 5 | description = "The fastest web crawler written in Rust ported to nodejs." 
6 | repository = "https://github.com/spider-rs/spider-nodejs" 7 | authors = ["j-mendez "] 8 | 9 | [lib] 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | indexmap = "2" 14 | napi = { version = "2", default-features = false, features = ["napi4", "async", "tokio_rt", "serde-json"] } 15 | napi-derive = "2" 16 | num_cpus = "1" 17 | serde = "1" 18 | serde_json = "1" 19 | spider = { version = "2", default-features = false, features = [ 20 | "cron", 21 | "regex", 22 | "cookies", 23 | "socks", 24 | "chrome", 25 | "control", 26 | "chrome_intercept", 27 | "cache", 28 | "openai", 29 | "serde", 30 | "real_browser", 31 | "headers", 32 | "reqwest_rustls_tls", 33 | "io_uring", 34 | "sync", 35 | "disk", 36 | "cookies", 37 | "ua_generator", 38 | "encoding", 39 | "string_interner_buffer_backend", 40 | "balance" 41 | ] } 42 | spider_scraper = "0.1" 43 | 44 | [build-dependencies] 45 | napi-build = "2" 46 | 47 | [profile.release] 48 | lto = true 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Spider Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spider-rs 2 | 3 | The [spider](https://github.com/spider-rs/spider) project ported to Node.js 4 | 5 | ## Getting Started 6 | 7 | 1. 
`npm i @spider-rs/spider-rs --save` 8 | 9 | ```ts 10 | import { Website, pageTitle } from '@spider-rs/spider-rs' 11 | 12 | const website = new Website('https://rsseau.fr') 13 | .withHeaders({ 14 | authorization: 'somerandomjwt', 15 | }) 16 | .withBudget({ 17 | '*': 20, // limit max request 20 pages for the website 18 | '/docs': 10, // limit only 10 pages on the `/docs` paths 19 | }) 20 | .withBlacklistUrl(['/resume']) // regex or pattern matching to ignore paths 21 | .build() 22 | 23 | // optional: page event handler 24 | const onPageEvent = (_err, page) => { 25 | const title = pageTitle(page) // comment out to increase performance if title not needed 26 | console.info(`Title of ${page.url} is '${title}'`) 27 | website.pushData({ 28 | status: page.statusCode, 29 | html: page.content, 30 | url: page.url, 31 | title, 32 | }) 33 | } 34 | 35 | await website.crawl(onPageEvent) 36 | await website.exportJsonlData('./storage/rsseau.jsonl') 37 | console.log(website.getLinks()) 38 | ``` 39 | 40 | Collect the resources for a website. 41 | 42 | ```ts 43 | import { Website } from '@spider-rs/spider-rs' 44 | 45 | const website = new Website('https://rsseau.fr') 46 | .withBudget({ 47 | '*': 20, 48 | '/docs': 10, 49 | }) 50 | // you can use regex or string matches to ignore paths 51 | .withBlacklistUrl(['/resume']) 52 | .build() 53 | 54 | await website.scrape() 55 | console.log(website.getPages()) 56 | ``` 57 | 58 | Run the crawls in the background on another thread. 59 | 60 | ```ts 61 | import { Website } from '@spider-rs/spider-rs' 62 | 63 | const website = new Website('https://rsseau.fr') 64 | 65 | const onPageEvent = (_err, page) => { 66 | console.log(page) 67 | } 68 | 69 | await website.crawl(onPageEvent, true) 70 | // runs immediately 71 | ``` 72 | 73 | Use headless Chrome rendering for crawls. 74 | 75 | ```ts 76 | import { Website } from '@spider-rs/spider-rs' 77 | 78 | const website = new Website('https://rsseau.fr').withChromeIntercept(true, true) 79 | 80 | const onPageEvent = (_err, page) => { 81 | console.log(page) 82 | } 83 | 84 | // the third param determines headless chrome usage. 85 | await website.crawl(onPageEvent, false, true) 86 | console.log(website.getLinks()) 87 | ``` 88 | 89 | Cron jobs can be done with the following. 90 | 91 | ```ts 92 | import { Website } from '@spider-rs/spider-rs' 93 | 94 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *') 95 | // sleep function to test cron 96 | const stopCron = (time: number, handle) => { 97 | return new Promise((resolve) => { 98 | setTimeout(() => { 99 | resolve(handle.stop()) 100 | }, time) 101 | }) 102 | } 103 | 104 | const links = [] 105 | 106 | const onPageEvent = (err, value) => { 107 | links.push(value) 108 | } 109 | 110 | const handle = await website.runCron(onPageEvent) 111 | 112 | // stop the cron in 4 seconds 113 | await stopCron(4000, handle) 114 | ``` 115 | 116 | Use the crawl shortcut to get the page content and url. 117 | 118 | ```ts 119 | import { crawl } from '@spider-rs/spider-rs' 120 | 121 | const { links, pages } = await crawl('https://rsseau.fr') 122 | console.log(pages) 123 | ``` 124 | 125 | ## Benchmarks 126 | 127 | View the [benchmarks](./bench/README.md) to see a breakdown between libs and platforms. 
128 | 129 | Test url: `https://espn.com` 130 | 131 | | `libraries` | `pages` | `speed` | 132 | | :--------------------------- | :-------- | :------ | 133 | | **`spider(rust): crawl`** | `150,387` | `1m` | 134 | | **`spider(nodejs): crawl`** | `150,387` | `153s` | 135 | | **`spider(python): crawl`** | `150,387` | `186s` | 136 | | **`scrapy(python): crawl`** | `49,598` | `1h` | 137 | | **`crawlee(nodejs): crawl`** | `18,779` | `30m` | 138 | 139 | The benches above were ran on a mac m1, spider on linux arm machines performs about 2-10x faster. 140 | 141 | ## Development 142 | 143 | Install the napi cli `npm i @napi-rs/cli --global`. 144 | 145 | 1. `yarn build:test` 146 | -------------------------------------------------------------------------------- /__test__/index.spec.ts: -------------------------------------------------------------------------------- 1 | import test from 'ava' 2 | import { crawl, Website, Page, type NPage, Cron, pageTitle } from '../index.js' 3 | 4 | const TEST_URL = 'https://choosealicense.com' 5 | 6 | test('shortcut crawl native', async (t) => { 7 | const { links, pages } = await crawl(TEST_URL) 8 | 9 | t.assert(links.length > 1, 'should be more than one link') 10 | t.assert(pages.length > 1, 'should be more than one page') 11 | }) 12 | 13 | test('new website native', async (t) => { 14 | const website = new Website(TEST_URL) 15 | await website.crawl() 16 | 17 | t.assert(website.getLinks().length > 1, 'should be more than one link') 18 | }) 19 | 20 | test('new website scrape native', async (t) => { 21 | const website = new Website(TEST_URL) 22 | await website.scrape() 23 | 24 | t.assert(website.getPages().length > 1, 'should be more than one page') 25 | }) 26 | 27 | test('new website native with custom config', async (t) => { 28 | const website = new Website(TEST_URL) 29 | .withHeaders({ 30 | authorization: 'somerandomjwt', 31 | }) 32 | .build() 33 | 34 | await website.crawl() 35 | 36 | t.assert(website.getLinks().length > 1, 'should be more than one page') 37 | }) 38 | 39 | test('new website native budget one page', async (t) => { 40 | const website = new Website(TEST_URL) 41 | .withBudget({ 42 | '*': 1, 43 | }) 44 | .build() 45 | 46 | await website.crawl() 47 | 48 | t.assert(website.getLinks().length === 1, 'should be one link') 49 | }) 50 | 51 | test('new website native blacklist pages', async (t) => { 52 | const website = new Website(TEST_URL).withBlacklistUrl(['/blog', new RegExp('/books').source, '/resume']).build() 53 | 54 | await website.crawl() 55 | 56 | const links = website.getLinks() 57 | 58 | // should be valid unless new pages and routes are created. 59 | t.assert(links.length > 1 && !links.includes(`${TEST_URL}/blog`), 'should be more than one page') 60 | }) 61 | 62 | test('new website native onPageEvent', async (t) => { 63 | const website = new Website(TEST_URL) 64 | 65 | const links: NPage[] = [] 66 | 67 | const onPageEvent = (err: Error | null, value: NPage) => { 68 | links.push(value) 69 | } 70 | 71 | // running in background can be done with a sleep timer for test. 72 | const backgroundStream = false 73 | 74 | await website.crawl(onPageEvent, backgroundStream) 75 | 76 | // should be valid unless new pages and routes are created. 
77 | t.assert(links.length > 1, 'should be more than one page') 78 | }) 79 | 80 | test('new website native with title selector', async (t) => { 81 | const website = new Website(TEST_URL) 82 | 83 | const links: { url: string; title: string }[] = [] 84 | 85 | const onPageEvent = async (_err: Error | null, page: NPage) => { 86 | const title = pageTitle(page) 87 | links.push({ title, url: page.url }) 88 | } 89 | 90 | await website.crawl(onPageEvent) 91 | 92 | // should be valid unless new pages and routes are created. 93 | t.assert(links.length > 1, 'should be more than one page') 94 | }) 95 | 96 | // experimental - does not work on all platforms most likely due to time differences. 97 | test.skip('new website native cron', async (t) => { 98 | const website = new Website(TEST_URL).withCron('1/5 * * * * *') 99 | // sleep function to test cron 100 | const sleep = (time: number, handle: Cron) => { 101 | return new Promise((resolve) => { 102 | setTimeout(() => { 103 | resolve(handle.stop()) 104 | }, time) 105 | }) 106 | } 107 | 108 | const links: NPage[] = [] 109 | 110 | const onPageEvent = (err: Error | null, value: NPage) => { 111 | links.push(value) 112 | } 113 | 114 | const handle = await website.runCron(onPageEvent) 115 | 116 | await sleep(4000, handle) 117 | 118 | // should be valid unless new pages and routes are created. 119 | t.assert(links.length > 1, 'should be more than one page') 120 | }) 121 | 122 | test('new website native with subscriptions', async (t) => { 123 | const website = new Website(TEST_URL) 124 | 125 | const links: NPage[] = [] 126 | 127 | const onPageEvent = (_err: Error | null, value: NPage) => { 128 | links.push(value) 129 | } 130 | 131 | const id = website.subscribe(onPageEvent) 132 | 133 | await website.crawl() 134 | 135 | website.unsubscribe(id) 136 | 137 | // should be valid unless new pages and routes are created. 138 | t.assert(links.length > 1, 'should be more than one page') 139 | }) 140 | 141 | test('new single page', async (t) => { 142 | const page = new Page(TEST_URL) 143 | await page.fetch() 144 | const links = await page.getLinks() 145 | 146 | // should be valid unless new pages and routes are created. 
147 | t.assert(links.length > 1, 'should be more than one link') 148 | t.assert(page.getHtml().length >= 100, 'should be valid html') 149 | t.assert(page.getBytes().length >= 100, 'should be valid bytes') 150 | }) 151 | 152 | test.skip('new website native headless', async (t) => { 153 | const website = new Website(TEST_URL) 154 | await website.crawl(undefined, false, true) 155 | 156 | t.assert(website.getLinks().length > 1, 'should be more than one link') 157 | }) 158 | 159 | test.skip('new website native smart mode', async (t) => { 160 | const website = new Website(TEST_URL) 161 | await website.crawlSmart(undefined, false) 162 | 163 | t.assert(website.getLinks().length > 1, 'should be more than one link') 164 | }) 165 | 166 | test.skip('new website native headless request interception', async (t) => { 167 | const website = new Website(TEST_URL).withChromeIntercept(true, true) 168 | await website.crawl(undefined, false, true) 169 | 170 | t.assert(website.getLinks().length > 1, 'should be more than one link') 171 | }) 172 | 173 | test('new website native raw content', async (t) => { 174 | const website = new Website(TEST_URL, true) 175 | 176 | const links: Buffer[] = [] 177 | 178 | const onPageEvent = (_err: Error | null, page: NPage) => page.rawContent && links.push(page.rawContent) 179 | 180 | await website.crawl(onPageEvent) 181 | 182 | t.assert(links.length > 1, 'should be more than one page') 183 | }) 184 | 185 | test('new website data store and export', async (t) => { 186 | const { promises } = await import('node:fs') 187 | const readFile = promises.readFile 188 | 189 | const website = new Website(TEST_URL, true) 190 | const outputFile = './storage/test.jsonl' 191 | 192 | const onPageEvent = (_err: Error | null, page: NPage) => website.pushData(page) 193 | 194 | await website.crawl(onPageEvent) 195 | await website.exportJsonlData(outputFile) 196 | 197 | const data = await readFile(outputFile) 198 | 199 | t.assert(!!data, 'should contain valid json file') 200 | }) 201 | 202 | test('new website stop', async (t) => { 203 | const website = new Website(TEST_URL) 204 | 205 | const onPageEvent = async (_err: Error | null, page: NPage) => { 206 | if (website.size >= 2) { 207 | await website.stop() 208 | } 209 | } 210 | 211 | await website.crawl(onPageEvent) 212 | 213 | t.assert(website.size < 30, 'should only have crawled a couple pages concurrently') 214 | }) 215 | 216 | test('new website stop background', async (t) => { 217 | const sleep = (time: number) => { 218 | return new Promise((resolve) => { 219 | setTimeout(() => { 220 | resolve(true) 221 | }, time) 222 | }) 223 | } 224 | 225 | const website = new Website(TEST_URL) 226 | let count = 0 227 | 228 | const onPageEvent = async (_err: Error | null, page: NPage) => { 229 | if (count) { 230 | await website.stop() 231 | } 232 | count++ 233 | } 234 | 235 | // lets wait for all other test since background shutsdown all crawls matching the url 236 | await sleep(2000) 237 | await website.crawl(onPageEvent, true) 238 | await sleep(2000) 239 | 240 | t.assert(count < 15, 'should only have crawled a couple pages concurrently in the background') 241 | }) 242 | -------------------------------------------------------------------------------- /bench/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | ```sh 4 | Linux 5 | 8-core CPU 6 | 32 GB of RAM memory 7 | ----------------------- 8 | ``` 9 | 10 | Test url: `https://choosealicense.com` (small) 11 | 32 pages 12 | 13 | | `libraries` | 
`speed` | 14 | | :-------------------------------- | :------ | 15 | | **`spider-rs: crawl 10 samples`** | `76ms` | 16 | | **`crawlee: crawl 10 samples`** | `1s` | 17 | 18 | Test url: `https://rsseau.fr` (medium) 19 | 211 pages 20 | 21 | | `libraries` | `speed` | 22 | | :-------------------------------- | :------ | 23 | | **`spider-rs: crawl 10 samples`** | `0.5s` | 24 | | **`crawlee: crawl 10 samples`** | `72s` | 25 | 26 | ```sh 27 | ---------------------- 28 | mac Apple M1 Max 29 | 10-core CPU 30 | 64 GB of RAM memory 31 | ----------------------- 32 | ``` 33 | 34 | Test url: `https://choosealicense.com` (small) 35 | 32 pages 36 | 37 | | `libraries` | `speed` | 38 | | :-------------------------------- | :------ | 39 | | **`spider-rs: crawl 10 samples`** | `286ms` | 40 | | **`crawlee: crawl 10 samples`** | `1.7s` | 41 | 42 | Test url: `https://rsseau.fr` (medium) 43 | 211 pages 44 | 45 | | `libraries` | `speed` | 46 | | :-------------------------------- | :------ | 47 | | **`spider-rs: crawl 10 samples`** | `2.5s` | 48 | | **`crawlee: crawl 10 samples`** | `75s` | 49 | 50 | The performance scales the larger the website and if throttling is needed. Linux benchmarks are about 10x faster than macOS for spider-rs. 51 | -------------------------------------------------------------------------------- /bench/base.ts: -------------------------------------------------------------------------------- 1 | export const iterations = process.env.BENCH_COUNT ? parseInt(process.env.BENCH_COUNT, 10) : 20 2 | 3 | export const TEST_URL = 'https://choosealicense.com' 4 | export const TEST_URL_MEDIUM = 'https://rsseau.fr' 5 | export const TEST_URL_LARGE = 'https://espn.com' 6 | 7 | export enum BenchSizes { 8 | SMALL = 'SMALL', 9 | MEDIUM = 'MEDIUM', 10 | LARGE = 'LARGE', 11 | } 12 | -------------------------------------------------------------------------------- /bench/case/crawlee.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | import { CheerioCrawler } from 'crawlee' 3 | import { TEST_URL, iterations } from '../base' 4 | 5 | export async function bench(url = TEST_URL, size = 'SMALL') { 6 | // @ts-ignore 7 | const crawler = new CheerioCrawler({ 8 | // @ts-ignore 9 | async requestHandler({ enqueueLinks }) { 10 | // @ts-ignore 11 | await enqueueLinks() 12 | }, 13 | }) 14 | 15 | let duration = 0 16 | 17 | const run = async () => { 18 | const startTime = performance.now() 19 | // @ts-ignore 20 | await crawler.run([url]) 21 | duration += performance.now() - startTime 22 | } 23 | 24 | const bm = async (cb: () => Promise, i = 0) => { 25 | await cb() 26 | if (i < iterations) { 27 | await bm(cb, i + 1) 28 | } 29 | } 30 | 31 | await bm(run) 32 | 33 | console.log( 34 | JSON.stringify([ 35 | { 36 | name: `crawlee - OPS/S [${size}:PAGE]`, 37 | unit: 'OPS/S', 38 | value: 1000 / (duration / iterations), 39 | }, 40 | ]), 41 | ) 42 | } 43 | -------------------------------------------------------------------------------- /bench/case/spider.ts: -------------------------------------------------------------------------------- 1 | import { Website } from '../../index.js' 2 | import { TEST_URL, iterations } from '../base' 3 | 4 | export async function bench(url = TEST_URL, size = 'SMALL') { 5 | const website = new Website(url) 6 | 7 | let duration = 0 8 | 9 | const run = async () => { 10 | const startTime = performance.now() 11 | await website.crawl() 12 | duration += performance.now() - startTime 13 | } 14 | 15 | const bm = async (cb: () => Promise, i = 0) => { 16 | await 
cb() 17 | if (i < iterations) { 18 | await bm(cb, i + 1) 19 | } 20 | } 21 | 22 | await bm(run) 23 | 24 | console.log( 25 | JSON.stringify([ 26 | { 27 | name: `@spider-rs/spider-rs - OPS/S [${size}:PAGE]`, 28 | unit: 'OPS/S', 29 | value: 1000 / (duration / iterations), 30 | }, 31 | ]), 32 | ) 33 | } 34 | -------------------------------------------------------------------------------- /bench/compare.ts: -------------------------------------------------------------------------------- 1 | import { TEST_URL_MEDIUM, TEST_URL_LARGE, BenchSizes } from './base' 2 | import { bench } from './case/spider' 3 | 4 | // small 5 | bench() 6 | // small/medium 7 | bench(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 8 | // large 150k pages plus 9 | if (process.env.BENCH_LARGE) { 10 | bench(TEST_URL_LARGE, BenchSizes.LARGE) 11 | } 12 | -------------------------------------------------------------------------------- /bench/crawlee.ts: -------------------------------------------------------------------------------- 1 | import { TEST_URL_MEDIUM, TEST_URL_LARGE, BenchSizes } from './base' 2 | import { bench } from './case/crawlee' 3 | 4 | // small 5 | bench() 6 | // small/medium 7 | bench(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 8 | // large 150k pages plus 9 | if (process.env.BENCH_LARGE) { 10 | bench(TEST_URL_LARGE, BenchSizes.LARGE) 11 | } 12 | -------------------------------------------------------------------------------- /bench/oss.ts: -------------------------------------------------------------------------------- 1 | import { bench } from './case/spider' 2 | import { bench as benchCrawlee } from './case/crawlee' 3 | import { TEST_URL_MEDIUM, BenchSizes } from './base' 4 | ;(async () => { 5 | await bench() 6 | await bench(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 7 | await benchCrawlee() 8 | await benchCrawlee(TEST_URL_MEDIUM, BenchSizes.MEDIUM) 9 | })() 10 | -------------------------------------------------------------------------------- /bench/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-bench", 3 | "version": "1.0.0", 4 | "author": "Jeff Mendez ", 5 | "publish": false, 6 | "devDependencies": { 7 | "@napi-rs/cli": "^2.16.5", 8 | "@types/node": "^20.10.0", 9 | "crawlee": "^3.6.2", 10 | "typescript": "^5.3.2" 11 | }, 12 | "scripts": { 13 | "bench": "tsc && NODE_ENV=production node ./compare.js", 14 | "bench:oss": "tsc && NODE_ENV=production CRAWLEE_LOG_LEVEL=off node ./oss.js" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /book/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Jeff Mendez"] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "spider-rs" 7 | 8 | [output.html] 9 | git-repository-url = "https://github.com/spider-rs/spider-nodejs/tree/main/book" 10 | edit-url-template = "https://github.com/spider-rs/spider-nodejs/edit/main/book/{path}" 11 | -------------------------------------------------------------------------------- /book/src/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | `Spider-RS` is the fastest web crawler and indexer written in Rust ported to Node.js. 
4 | 5 | - Concurrent 6 | - Streaming 7 | - Decentralization 8 | - Headless Chrome [Rendering](https://github.com/mattsse/chromiumoxide) 9 | - HTTP Proxies 10 | - Cron Jobs 11 | - Subscriptions 12 | - Blacklisting and Budgeting Depth 13 | - Written in [Rust](https://www.rust-lang.org/) for speed, safety, and simplicity 14 | 15 | Spider powers some big tools and helps bring the crawling aspect to almost no downtime with the correct setup, view the [spider](https://github.com/spider-rs/spider) project to learn more. 16 | 17 | ```ts 18 | import { Website } from '@spider-rs/spider-rs' 19 | 20 | const website = new Website('https://choosealicense.com') 21 | 22 | await website.crawl() 23 | 24 | console.log(website.getLinks()) 25 | ``` 26 | -------------------------------------------------------------------------------- /book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [Introduction](./README.md) 4 | 5 | # User Guide 6 | 7 | - [Getting started](./getting-started.md) 8 | - [A simple example](./simple.md) 9 | 10 | # Configuration 11 | 12 | - [Website](./website.md) 13 | - [Page](./page.md) 14 | - [Environment](./env.md) 15 | 16 | # Usage 17 | 18 | - [Crawl](./crawl.md) 19 | - [Scrape](./scrape.md) 20 | - [Cron Job](./cron-job.md) 21 | - [Storing Data](./storing-data.md) 22 | 23 | # Benchmarks 24 | 25 | - [Compare](./benchmarks.md) 26 | -------------------------------------------------------------------------------- /book/src/benchmarks.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | Test url: `https://espn.com` 4 | Mac M1 64gb 10-core CPU 5 | 6 | | `libraries` | `pages` | `speed` | 7 | | :--------------------------- | :-------- | :------ | 8 | | **`spider(rust): crawl`** | `150,387` | `1m` | 9 | | **`spider(nodejs): crawl`** | `150,387` | `153s` | 10 | | **`spider(python): crawl`** | `150,387` | `186s` | 11 | | **`scrapy(python): crawl`** | `49,598` | `1h` | 12 | | **`crawlee(nodejs): crawl`** | `18,779` | `30m` | 13 | 14 | View the latest runs on [github](https://github.com/spider-rs/spider-nodejs/actions/workflows/bench.yml). 
15 | 16 | ```sh 17 | ----------------------- 18 | Linux 19 | 2-core CPU 20 | 7 GB of RAM memory 21 | ----------------------- 22 | ``` 23 | 24 | Test url: `https://choosealicense.com` (small) 25 | 32 pages 26 | 27 | | `libraries` | `speed` | 28 | | :-------------------------------- | :------ | 29 | | **`spider-rs: crawl 10 samples`** | `76ms` | 30 | | **`crawlee: crawl 10 samples`** | `1s` | 31 | 32 | Test url: `https://rsseau.fr` (medium) 33 | 211 pages 34 | 35 | | `libraries` | `speed` | 36 | | :-------------------------------- | :------ | 37 | | **`spider-rs: crawl 10 samples`** | `0.5s` | 38 | | **`crawlee: crawl 10 samples`** | `72s` | 39 | 40 | ```sh 41 | ---------------------- 42 | mac Apple M1 Max 43 | 10-core CPU 44 | 64 GB of RAM memory 45 | ----------------------- 46 | ``` 47 | 48 | Test url: `https://choosealicense.com` (small) 49 | 32 pages 50 | 51 | | `libraries` | `speed` | 52 | | :-------------------------------- | :------ | 53 | | **`spider-rs: crawl 10 samples`** | `286ms` | 54 | | **`crawlee: crawl 10 samples`** | `1.7s` | 55 | 56 | Test url: `https://rsseau.fr` (medium) 57 | 211 pages 58 | 59 | | `libraries` | `speed` | 60 | | :-------------------------------- | :------ | 61 | | **`spider-rs: crawl 10 samples`** | `2.5s` | 62 | | **`crawlee: crawl 10 samples`** | `75s` | 63 | 64 | The performance scales the larger the website and if throttling is needed. Linux benchmarks are about 10x faster than macOS for spider-rs. 65 | -------------------------------------------------------------------------------- /book/src/crawl.md: -------------------------------------------------------------------------------- 1 | # Crawl 2 | 3 | Crawl a website concurrently. 4 | 5 | ```ts 6 | import { Website } from '@spider-rs/spider-rs' 7 | 8 | // pass in the website url 9 | const website = new Website('https://rsseau.fr') 10 | 11 | await website.crawl() 12 | 13 | // [ "https://rsseau.fr/blog", ...] 14 | console.log(website.getLinks()) 15 | ``` 16 | 17 | ## Async Event 18 | 19 | You can pass in a async function as the first param to the crawl function for realtime updates streamed. 20 | 21 | ```ts 22 | import { Website } from '@spider-rs/spider-rs' 23 | 24 | const website = new Website('https://rsseau.fr') 25 | 26 | const onPageEvent = (err, value) => { 27 | console.log(value) 28 | } 29 | 30 | await website.crawl(onPageEvent) 31 | ``` 32 | 33 | ## Background 34 | 35 | You can run the request in the background and receive events with the second param set to `true`. 36 | 37 | ```ts 38 | import { Website } from '@spider-rs/spider-rs' 39 | 40 | const website = new Website('https://rsseau.fr') 41 | 42 | const onPageEvent = (err, value) => { 43 | console.log(value) 44 | } 45 | 46 | await website.crawl(onPageEvent, true) 47 | // this will run instantly as the crawl is in the background 48 | ``` 49 | 50 | ## Subscriptions 51 | 52 | You can setup many subscriptions to run events when a crawl happens. 53 | 54 | ```ts 55 | import { Website } from '@spider-rs/spider-rs' 56 | 57 | const website = new Website('https://rsseau.fr') 58 | 59 | const onPageEvent = (err, value) => { 60 | console.log(value) 61 | } 62 | 63 | const subscriptionID = website.subscribe(onPageEvent) 64 | 65 | await website.crawl() 66 | 67 | website.unsubscribe(subscriptionID) 68 | // this will run instantly as the crawl is in the background 69 | ``` 70 | 71 | ## Headless Chrome 72 | 73 | Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `true`. 
74 | If the `CHROME_URL` env variable is set, it will attempt to connect to Chrome running remotely and fall back to launching Chrome locally. Using a remote connection with `CHROME_URL` can 75 | drastically speed up runs. 76 | 77 | ```ts 78 | import { Website } from '@spider-rs/spider-rs' 79 | 80 | const website = new Website('https://rsseau.fr') 81 | 82 | const onPageEvent = (err, value) => { 83 | console.log(value) 84 | } 85 | 86 | // all params are optional. The third param determines headless rendering. 87 | await website.crawl(onPageEvent, false, true) 88 | // make sure to call unsubscribe when finished, or else the instance is kept alive while events are set up. 89 | website.unsubscribe() 90 | ``` 91 | -------------------------------------------------------------------------------- /book/src/cron-job.md: -------------------------------------------------------------------------------- 1 | # Cron Jobs 2 | 3 | Use a cron job that can run at any time of day to gather website data. 4 | 5 | ```ts 6 | import { Website } from '@spider-rs/spider-rs' 7 | 8 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *').build() 9 | 10 | // stream the pages of the website as the cron runs. 11 | const onPageEvent = (err, value) => { 12 | console.log(value) 13 | } 14 | 15 | const handle = await website.runCron(onPageEvent) 16 | ``` 17 | -------------------------------------------------------------------------------- /book/src/env.md: -------------------------------------------------------------------------------- 1 | # Environment 2 | 3 | Environment variables that adjust the project. 4 | 5 | ## CHROME_URL 6 | 7 | You can set the Chrome URL to connect to a remote instance. 8 | 9 | ```sh 10 | CHROME_URL=http://localhost:9222 11 | ``` 12 | -------------------------------------------------------------------------------- /book/src/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | Make sure to have [node](https://nodejs.org/en/download) v10 or higher installed. 4 | 5 | Install the package with your favorite package manager. 6 | 7 | ```sh 8 | yarn add @spider-rs/spider-rs 9 | # or 10 | npm install @spider-rs/spider-rs 11 | ``` 12 | -------------------------------------------------------------------------------- /book/src/page.md: -------------------------------------------------------------------------------- 1 | # Page 2 | 3 | A single page on a website, useful if you need just the root url. 4 | 5 | ## New Page 6 | 7 | Get a new page with content. 8 | 9 | The first param is the url, the second is whether subdomains should be included, and the third is whether to include TLDs in links. 10 | 11 | Calling `page.fetch` is needed to get the content. 12 | 13 | ```ts 14 | import { Page } from '@spider-rs/spider-rs' 15 | 16 | const page = new Page('https://choosealicense.com', false, false) 17 | await page.fetch() 18 | ``` 19 | 20 | ## Page Links 21 | 22 | Get all the links related to a page. 23 | 24 | ```ts 25 | const page = new Page('https://choosealicense.com', false, false) 26 | await page.fetch() 27 | const links = await page.getLinks() 28 | console.log(links) 29 | ``` 30 | 31 | ## Page Html 32 | 33 | Get the HTML markup for the page. 34 | 35 | ```ts 36 | const page = new Page('https://choosealicense.com', false, false) 37 | await page.fetch() 38 | const html = page.getHtml() 39 | console.log(html) 40 | ``` 41 | 42 | ## Page Bytes 43 | 44 | Get the raw bytes of a page to store the files in a database.
45 | 46 | ```ts 47 | const page = new Page('https://choosealicense.com', false, false) 48 | await page.fetch() 49 | const bytes = page.getBytes() 50 | console.log(bytes) 51 | ``` 52 | -------------------------------------------------------------------------------- /book/src/scrape.md: -------------------------------------------------------------------------------- 1 | # Scrape 2 | 3 | Scrape a website and collect the resource data. 4 | 5 | ```ts 6 | import { Website } from '@spider-rs/spider-rs' 7 | 8 | // pass in the website url 9 | const website = new Website('https://rsseau.fr') 10 | 11 | await website.scrape() 12 | 13 | // [ { url: "https://rsseau.fr/blog", html: "..."}, ...] 14 | console.log(website.getPages()) 15 | ``` 16 | 17 | ## Headless Chrome 18 | 19 | Headless Chrome rendering can be done by setting the third param in `crawl` or `scrape` to `true`. 20 | If the `CHROME_URL` env variable is set, it will attempt to connect to Chrome running remotely and fall back to launching Chrome locally. Using a remote connection with `CHROME_URL` can 21 | drastically speed up runs. 22 | 23 | ```ts 24 | import { Website } from '@spider-rs/spider-rs' 25 | 26 | const website = new Website('https://rsseau.fr') 27 | 28 | const onPageEvent = (err, value) => { 29 | console.log(value) 30 | } 31 | 32 | // all params are optional. The third param determines headless rendering. 33 | await website.scrape(onPageEvent, false, true) 34 | ``` 35 | -------------------------------------------------------------------------------- /book/src/simple.md: -------------------------------------------------------------------------------- 1 | # A simple example 2 | 3 | We use a node addon built with napi to bring the Rust project to Node.js. 4 | 5 | There are some performance drawbacks from the addon; even so, the crawls are lightning fast and efficient. 6 | 7 | ## Usage 8 | 9 | The examples below can help you get started with spider. 10 | 11 | ### Basic 12 | 13 | A basic example. 14 | 15 | ```ts 16 | import { Website } from '@spider-rs/spider-rs' 17 | 18 | const website = new Website('https://choosealicense.com') 19 | 20 | await website.crawl() 21 | console.log(website.getLinks()) 22 | ``` 23 | 24 | ### Events 25 | 26 | You can pass a function (optionally async) as the first param to `crawl` and `scrape`. 27 | 28 | ```ts 29 | import { Website, type NPage } from '@spider-rs/spider-rs' 30 | 31 | const website = new Website('https://choosealicense.com') 32 | 33 | const links: NPage[] = [] 34 | 35 | const onPageEvent = async (err: Error | null, page: NPage) => { 36 | links.push(page) 37 | } 38 | 39 | await website.crawl(onPageEvent) 40 | console.log(website.getLinks()) 41 | ``` 42 | 43 | ### Selector 44 | 45 | The `pageTitle` helper allows you to extract the title of the page. 46 | 47 | ```ts 48 | import { Website, pageTitle } from '@spider-rs/spider-rs' 49 | 50 | const website = new Website('https://choosealicense.com') 51 | 52 | const links = [] 53 | 54 | const onPageEvent = async (err, page) => { 55 | links.push({ title: pageTitle(page), url: page.url }) 56 | } 57 | 58 | // params in order: event, background, and headless chrome 59 | await website.crawl(onPageEvent) 60 | ``` 61 | 62 | ## Shortcut 63 | 64 | You can use the `crawl` shortcut method to collect contents quickly without configuration.
65 | 66 | ```ts 67 | import { crawl } from '@spider-rs/spider-rs' 68 | 69 | const { links, pages } = await crawl('https://choosealicense.com') 70 | 71 | console.log([links, pages]) 72 | ``` 73 | -------------------------------------------------------------------------------- /book/src/storing-data.md: -------------------------------------------------------------------------------- 1 | # Storing Data 2 | 3 | Storing data can be done to collect the raw content for a website. 4 | 5 | This allows you to upload and download the content without UTF-8 conversion. The `rawContent` property only appears when 6 | the second param of the `Website` class constructor is set to `true`. 7 | 8 | ```ts 9 | import { Website, type NPage } from '@spider-rs/spider-rs' 10 | 11 | const rawContent = true 12 | const website = new Website('https://choosealicense.com', rawContent) 13 | 14 | const links: Buffer[] = [] 15 | 16 | const onPageEvent = (_err: Error | null, page: NPage) => { 17 | if (page.rawContent) { 18 | // we can download or store the content to disk now. 19 | links.push(page.rawContent) 20 | } 21 | } 22 | await website.crawl(onPageEvent) 23 | ``` 24 | -------------------------------------------------------------------------------- /book/src/website.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | The Website class is the foundation of the spider. 4 | 5 | ## Builder pattern 6 | 7 | We use the builder pattern to configure the website for crawling. 8 | 9 | \*note: Replace `https://choosealicense.com` in the examples below with your target website URL. 10 | 11 | ```ts 12 | import { Website } from '@spider-rs/spider-rs' 13 | 14 | const website = new Website('https://choosealicense.com') 15 | ``` 16 | 17 | ### Return Page Links 18 | 19 | Return links found on the page resource. 20 | 21 | ```ts 22 | const website = new Website('https://choosealicense.com') 23 | .withReturnPageLinks(true) 24 | .build() 25 | ``` 26 | 27 | ### Custom Headers 28 | 29 | Add custom HTTP headers to use when crawling/scraping. 30 | 31 | ```ts 32 | const website = new Website('https://choosealicense.com') 33 | .withHeaders({ 34 | authorization: 'somerandomjwt', 35 | }) 36 | .build() 37 | ``` 38 | 39 | ### Blacklist 40 | 41 | Prevent crawling a set path, url, or pattern with Regex. 42 | 43 | ```ts 44 | const website = new Website('https://choosealicense.com') 45 | .withBlacklistUrl(['/blog', new RegExp('/books').source, '/resume']) 46 | .build() 47 | ``` 48 | 49 | ### Whitelist 50 | 51 | Only crawl set paths, urls, or patterns with Regex. 52 | 53 | ```ts 54 | const website = new Website('https://choosealicense.com') 55 | .withWhitelistUrl(['/blog', new RegExp('/books').source, '/resume']) 56 | .build() 57 | ``` 58 | 59 | ### Crons 60 | 61 | Set up a cron job that can run at any time in the background using cron-syntax. 62 | 63 | ```ts 64 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *').build() 65 | ``` 66 | 67 | View the [cron](./cron-job.md) section for details on how to use the cron. 68 | 69 | ### Budget 70 | 71 | Add a crawl budget that limits the crawl to `x` amount of pages. 72 | 73 | ```ts 74 | const website = new Website('https://choosealicense.com') 75 | .withBudget({ 76 | '*': 1, 77 | }) 78 | .build() 79 | ``` 80 | 81 | ### Subdomains 82 | 83 | Include subdomains in request. 84 | 85 | ```ts 86 | const website = new Website('https://choosealicense.com').withSubdomains(true).build() 87 | ``` 88 | 89 | ### TLD 90 | 91 | Include TLDs in request.
92 | 93 | ```ts 94 | const website = new Website('https://choosealicense.com').withTld(true).build() 95 | ``` 96 | 97 | ### External Domains 98 | 99 | Add external domains to include with the website. 100 | 101 | ```ts 102 | const website = new Website('https://choosealicense.com').withExternalDomains(['https://www.myotherdomain.com']).build() 103 | ``` 104 | 105 | ### Proxy 106 | 107 | Use a proxy to crawl a website. 108 | 109 | ```ts 110 | const website = new Website('https://choosealicense.com').withProxies(['https://www.myproxy.com']).build() 111 | ``` 112 | 113 | ### Delays 114 | 115 | Add a delay between pages in milliseconds. Defaults to none. 116 | 117 | ```ts 118 | const website = new Website('https://choosealicense.com').withDelay(200).build() 119 | ``` 120 | 121 | ### Wait_For_Delay 122 | 123 | Wait for a delay on the page. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled. 124 | The first param is the seconds to delay and the second is the nanoseconds to delay by. 125 | 126 | ```ts 127 | // a delay of 2 seconds and 500 nanos 128 | const website = new Website('https://choosealicense.com').withWaitForDelay(2, 500).build() 129 | ``` 130 | 131 | ### Wait_For_Selector 132 | 133 | Wait for a CSS selector on the page with a max timeout. This method does nothing if the `chrome` feature is not enabled. 134 | 135 | ```ts 136 | // wait for the selector with a max timeout of 2 seconds and 500 nanos 137 | const website = new Website('https://choosealicense.com').withWaitForSelector('.news-feed', 2, 500).build() 138 | ``` 139 | 140 | ### Wait_For_Idle_Network 141 | 142 | Wait for idle network requests. This method does nothing if the `chrome` feature is not enabled. 143 | 144 | ```ts 145 | // wait for an idle network with a max timeout of 2 seconds and 500 nanos 146 | const website = new Website('https://choosealicense.com').withWaitForIdleNetwork(2, 500).build() 147 | ``` 148 | 149 | ### User-Agent 150 | 151 | Use a custom User-Agent. 152 | 153 | ```ts 154 | const website = new Website('https://choosealicense.com').withUserAgent('mybot/v1').build() 155 | ``` 156 | 157 | ### Chrome Remote Connection 158 | 159 | Add a Chrome remote connection url. This can be a JSON endpoint or a direct WebSocket (ws) connection. 160 | 161 | ```ts 162 | const website = new Website('https://choosealicense.com').withChromeConnection('http://localhost:9222/json/version').build() 163 | ``` 164 | 165 | 166 | ### OpenAI 167 | 168 | Use OpenAI to generate dynamic scripts to use with headless Chrome. Make sure to set the `OPENAI_API_KEY` env variable. 169 | 170 | ```ts 171 | const website = new Website('https://google.com') 172 | .withOpenai({ 173 | model: 'gpt-3.5-turbo', 174 | prompt: 'Search for movies', 175 | maxTokens: 300, 176 | }) 177 | .build() 178 | 179 | // make sure to crawl or scrape with the headless param set to true. 180 | ``` 181 | 182 | ### Screenshots 183 | 184 | Take a screenshot of the pages on crawl when using headless Chrome. 185 | 186 | ```ts 187 | const website = new Website('https://google.com') 188 | .withScreenshot({ 189 | params: { 190 | cdp_params: null, 191 | full_page: true, 192 | omit_background: false, 193 | }, 194 | bytes: false, 195 | save: true, 196 | output_dir: null, 197 | }) 198 | .build() 199 | 200 | // make sure to crawl or scrape with the headless param set to true. 201 | ``` 202 | 203 | ### Request Timeout 204 | 205 | Add a request timeout per page in milliseconds. The example shows 30 seconds.
206 | 207 | ```ts 208 | const website = new Website('https://choosealicense.com').withRequestTimeout(30000).build() 209 | ``` 210 | 211 | ### Respect Robots 212 | 213 | Respect the robots.txt file. 214 | 215 | ```ts 216 | const website = new Website('https://choosealicense.com').withRespectRobotsTxt(true).build() 217 | ``` 218 | 219 | ### Http2 Prior Knowledge 220 | 221 | Use HTTP/2 to connect if you know the website's server supports it. 222 | 223 | ```ts 224 | const website = new Website('https://choosealicense.com').withHttp2PriorKnowledge(true).build() 225 | ``` 226 | 227 | ### Chrome Network Interception 228 | 229 | Enable network interception when using Chrome to speed up requests. 230 | 231 | ```ts 232 | const website = new Website('https://choosealicense.com').withChromeIntercept(true, true).build() 233 | ``` 234 | 235 | ### Redirect Limit 236 | 237 | Set the redirect limit for requests. 238 | 239 | ```ts 240 | const website = new Website('https://choosealicense.com').withRedirectLimit(2).build() 241 | ``` 242 | 243 | ### Depth Limit 244 | 245 | Set the depth limit for the number of forward pages. 246 | 247 | ```ts 248 | const website = new Website('https://choosealicense.com').withDepth(3).build() 249 | ``` 250 | 251 | ### Cache 252 | 253 | Enable HTTP caching; this is useful when using the spider on a server. 254 | 255 | ```ts 256 | const website = new Website('https://choosealicense.com').withCaching(true).build() 257 | ``` 258 | 259 | ### Redirect Policy 260 | 261 | Set the redirect policy for requests, either strict or loose (default). Strict only allows redirects that match the domain. 262 | 263 | ```ts 264 | const website = new Website('https://choosealicense.com').withRedirectPolicy(true).build() 265 | ``` 266 | 267 | ## Chaining 268 | 269 | You can chain all of the configs together for simple configuration. 270 | 271 | ```ts 272 | const website = new Website('https://choosealicense.com') 273 | .withSubdomains(true) 274 | .withTld(true) 275 | .withUserAgent('mybot/v1') 276 | .withRespectRobotsTxt(true) 277 | .build() 278 | ``` 279 | 280 | ## Raw Content 281 | 282 | Set the second param of the website constructor to `true` to return content without UTF-8 conversion. 283 | This will populate `rawContent` and leave `content` empty when using subscriptions or the Page object. 284 | 285 | ```ts 286 | const rawContent = true 287 | const website = new Website('https://choosealicense.com', rawContent) 288 | await website.scrape() 289 | ``` 290 | 291 | ## Clearing Crawl Data 292 | 293 | Use `website.clear` to remove the links visited and page data, or `website.drainLinks` to drain the links visited. 294 | 295 | ```ts 296 | const website = new Website('https://choosealicense.com') 297 | await website.crawl() 298 | // links found ["https://...", "..."] 299 | console.log(website.getLinks()) 300 | website.clear() 301 | // links will be empty 302 | console.log(website.getLinks()) 303 | ``` 304 | 305 | ## Storing and Exporting Data 306 | 307 | Collecting data to store can be done with `website.pushData()` and `website.exportJsonlData()`. 308 | 309 | ```ts 310 | const website = new Website('https://choosealicense.com') 311 | 312 | const onPageEvent = (_err, page) => { 313 | website.pushData(page) 314 | } 315 | 316 | await website.crawl(onPageEvent) 317 | 318 | // uncomment to read the data. 319 | // console.log(website.readData()); 320 | 321 | // we only have one export method atm. Optional file path.
All data goes to ./storage by default. 322 | await website.exportJsonlData('./storage/test.jsonl') 323 | ``` 324 | 325 | ## Stop crawl 326 | 327 | To stop a crawl you can use `website.stop(id)`: pass in the crawl ID to stop a specific run, or leave it empty to stop all crawls. 328 | 329 | ```ts 330 | const website = new Website('https://choosealicense.com') 331 | 332 | const onPageEvent = (_err, page) => { 333 | console.log(page) 334 | // stop the concurrent crawl when 8 pages are found. 335 | if (website.size >= 8) { 336 | website.stop() 337 | } 338 | } 339 | 340 | await website.crawl(onPageEvent) 341 | ``` 342 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | extern crate napi_build; 2 | 3 | fn main() { 4 | napi_build::setup(); 5 | } 6 | -------------------------------------------------------------------------------- /examples/basic.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node basic.mjs 3 | import { Website } from '../index.js' 4 | 5 | const url = process.argv[2] || 'https://choosealicense.com' 6 | 7 | const website = new Website(url).withBudget({ '*': 300, licenses: 0 }) 8 | 9 | const onPageEvent = (_err, value) => { 10 | console.log(`Found: ${value.url}`) 11 | } 12 | 13 | const startTime = performance.now() 14 | 15 | await website.crawl(onPageEvent) 16 | 17 | const duration = performance.now() - startTime 18 | 19 | console.log('Finished', url, 'pages found ' + website.getLinks().length, 'elapsed duration ' + duration + 'ms') 20 | -------------------------------------------------------------------------------- /examples/cron.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node cron.mjs 3 | import { Website } from '@spider-rs/spider-rs' 4 | 5 | const website = new Website('https://choosealicense.com').withCron('1/5 * * * * *').build() 6 | 7 | // stream the pages of the website as the cron runs.
8 | const onPageEvent = (_err, value) => { 9 | console.log(value) 10 | } 11 | 12 | const handle = await website.runCron(onPageEvent) 13 | console.log('Starting the Runner for 40 seconds') 14 | 15 | setTimeout(async () => { 16 | await handle.stop() 17 | }, 40000) 18 | -------------------------------------------------------------------------------- /examples/openai.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node openai.mjs 3 | import { Website } from '../index.js' 4 | 5 | const url = process.argv[2] || 'https://google.com' 6 | const headless = true 7 | 8 | const website = new Website(url) 9 | .withBudget({ '*': 1 }) 10 | .withScreenshot({ 11 | params: { 12 | cdp_params: null, 13 | full_page: true, 14 | omit_background: false, 15 | }, 16 | bytes: false, 17 | save: true, 18 | output_dir: null, 19 | }) 20 | .withOpenai({ 21 | model: 'gpt-4-1106-preview', 22 | prompt: 'Search for movies', 23 | max_tokens: 100, 24 | }) 25 | 26 | const onPageEvent = (_err, value) => { 27 | console.log(`Found: ${value.url}\nHTML: ${value.content}`) 28 | } 29 | 30 | const startTime = performance.now() 31 | 32 | await website.crawl(onPageEvent, false, headless) 33 | 34 | const duration = performance.now() - startTime 35 | 36 | console.log('Finished', url, 'pages found ' + website.getLinks().length, 'elasped duration ' + duration + 'ms') 37 | -------------------------------------------------------------------------------- /examples/subscription.mjs: -------------------------------------------------------------------------------- 1 | // npm i @spider-rs/spider-rs 2 | // node subscription.mjs 3 | import { Website } from '@spider-rs/spider-rs' 4 | 5 | const website = new Website('https://choosealicense.com') 6 | 7 | const onPageEvent = (_err, value) => { 8 | console.log(value) 9 | console.log(`Links found: ${website.size}`) 10 | } 11 | 12 | const id = website.subscribe(onPageEvent) 13 | await website.crawl() 14 | website.unsubscribe(id) 15 | -------------------------------------------------------------------------------- /index.d.ts: -------------------------------------------------------------------------------- 1 | /* tslint:disable */ 2 | /* eslint-disable */ 3 | 4 | /* auto-generated by NAPI-RS */ 5 | 6 | /** a simple page object */ 7 | export interface NPage { 8 | /** The url found. */ 9 | url: string 10 | /** The content of the page found. */ 11 | content: string 12 | /** The HTTP status code. */ 13 | statusCode: number 14 | /** The Raw content if the resource needs to be sent as binary. */ 15 | rawContent?: Buffer 16 | /** The HTTP headers. */ 17 | headers?: Record 18 | /** The links found on the page. Requires the website.builder method website.with_subscription_return_page_links to be set to true. */ 19 | links?: Array 20 | } 21 | /** get the page title. */ 22 | export declare function pageTitle(page: NPage): string 23 | /** crawl a website using HTTP gathering all links and html. */ 24 | export declare function crawl(url: string, rawContent?: boolean | undefined | null): Promise 25 | export interface PageEvent { 26 | page: NPage 27 | } 28 | /** website main data from rust to node. */ 29 | export class NWebsite { 30 | /** all of the website links. */ 31 | links: Array 32 | /** the pages found. */ 33 | pages: Array 34 | } 35 | /** a simple page object */ 36 | export class Page { 37 | /** The url for the page. */ 38 | url: string 39 | /** The website crawling subdomain pages? 
*/ 40 | subdomains?: boolean 41 | /** The website crawling TLD pages? */ 42 | tld?: boolean 43 | /** The HTTP status code. */ 44 | statusCode: number 45 | /** a new page */ 46 | constructor(url: string, subdomains?: boolean | undefined | null, tld?: boolean | undefined | null) 47 | /** get the page content */ 48 | fetch(): Promise 49 | /** all links on the page */ 50 | getLinks(): Promise> 51 | /** get the html for the page */ 52 | getHtml(): string 53 | /** get the bytes for the page */ 54 | getBytes(): Uint8Array 55 | } 56 | /** a website holding the inner spider::website::Website from Rust fit for nodejs. */ 57 | export class Website { 58 | /** a new website. */ 59 | constructor(url: string, rawContent?: boolean | undefined | null) 60 | /** Get the crawl status. */ 61 | get status(): string 62 | /** Store data to heap memory. The data must be an object. Use `website.export_jsonl_data` to store to disk. When using this method test occordingly since only certain primitives are supported. */ 63 | pushData(obj: unknown): void 64 | /** Clear the collected data from heap memory. This only handles the data from `website.pushData`. */ 65 | clearData(): void 66 | /** read the data from the heap memory. */ 67 | readData(): any 68 | /** store data to memory for disk storing. This will create the path if not exist and defaults to ./storage. */ 69 | exportJsonlData(exportPath?: string | undefined | null): Promise 70 | /** subscribe and add an event listener. */ 71 | subscribe(onPageEvent: (err: Error | null, arg: NPage) => any): number 72 | /** remove a subscription listener. */ 73 | unsubscribe(id?: number | undefined | null): boolean 74 | /** stop a crawl */ 75 | stop(id?: number | undefined | null): Promise 76 | /** crawl a website */ 77 | crawl(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise 78 | /** Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. */ 79 | crawlSmart(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null, background?: boolean | undefined | null): Promise 80 | /** scrape a website */ 81 | scrape(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null, background?: boolean | undefined | null, headless?: boolean | undefined | null): Promise 82 | /** run a cron job */ 83 | runCron(onPageEvent?: (err: Error | null, arg: NPage) => any | undefined | null): Promise 84 | /** get all the links of a website */ 85 | getLinks(): Array 86 | /** get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update. */ 87 | get size(): number 88 | /** get all the pages of a website - requires calling website.scrape */ 89 | getPages(): Array 90 | /** drain all links from storing */ 91 | drainLinks(): Array 92 | /** clear all links and page data */ 93 | clear(): void 94 | /** Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). */ 95 | withHeaders(headers?: object | undefined | null): this 96 | /** Add user agent to request. */ 97 | withUserAgent(userAgent?: string | undefined | null): this 98 | /** Respect robots.txt file. */ 99 | withRespectRobotsTxt(respectRobotsTxt: boolean): this 100 | /** Determine whether to collect all the resources found on pages. 
*/ 101 | withFullResources(fullResources: boolean): this 102 | /** Use network interception for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. */ 103 | withChromeIntercept(chromeIntercept: boolean, blockImages: boolean): this 104 | /** Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled. */ 105 | withChromeConnection(chromeConnection: string): this 106 | /** Preserve the HOST header. */ 107 | withPreserveHostHeader(preserveHost: boolean): this 108 | /** Include subdomains detection. */ 109 | withSubdomains(subdomains: boolean): this 110 | /** Include tld detection. */ 111 | withTld(tld: boolean): this 112 | /** Only use HTTP/2. */ 113 | withHttp2PriorKnowledge(http2PriorKnowledge: boolean): this 114 | /** Max time to wait for request duration to milliseconds. */ 115 | withRequestTimeout(requestTimeout?: number | undefined | null): this 116 | /** add external domains */ 117 | withExternalDomains(externalDomains?: Array | undefined | null): this 118 | /** Use stealth mode for the request. This does nothing without chrome. */ 119 | withStealth(stealthMode?: boolean | undefined | null): this 120 | /** Dangerously accept invalid certificates - this should be used as a last resort. */ 121 | withDangerAcceptInvalidCerts(acceptInvalidCerts?: boolean | undefined | null): this 122 | /** Set the crawling budget */ 123 | withBudget(budget?: Record | undefined | null): this 124 | /** Set the max redirects allowed for request. */ 125 | withRedirectLimit(redirectLimit: number): this 126 | /** Set the redirect policy to use, either Strict or Loose by default. */ 127 | withRedirectPolicy(strict: boolean): this 128 | /** Regex blacklist urls from the crawl */ 129 | withBlacklistUrl(blacklistUrl?: Array | undefined | null): this 130 | /** Regex whitelist urls from the crawl */ 131 | withWhitelistUrl(whitelistUrl?: Array | undefined | null): this 132 | /** Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled. */ 133 | withWaitForDelay(seconds?: number | undefined | null, nanos?: number | undefined | null): this 134 | /** Wait for a CSS query selector. This method does nothing if the `chrome` feature is not enabled. */ 135 | withWaitForSelector(selector?: string | undefined | null, seconds?: number | undefined | null, nanos?: number | undefined | null): this 136 | /** Wait for idle network request. This method does nothing if the `chrome` feature is not enabled. */ 137 | withWaitForIdleNetwork(seconds?: number | undefined | null, nanos?: number | undefined | null): this 138 | /** Setup cron jobs to run */ 139 | withCron(cronStr: string, cronType?: string | undefined | null): this 140 | /** Use OpenAI to generate dynamic javascript snippets. Make sure to set the `OPENAI_API_KEY` env variable. */ 141 | withOpenai(openaiConfigs?: object | undefined | null): this 142 | /** Take screenshots of web pages using chrome. */ 143 | withScreenshot(screenshotConfigs?: { 144 | /** The screenshot params. */ 145 | params: { 146 | /** Chrome DevTools Protocol screenshot options. */ 147 | cdp_params: { 148 | /** Image compression format (defaults to png). */ 149 | format: 'jpeg' | 'png' | 'webp' 150 | /** Compression quality from range [0..100] (jpeg only). */ 151 | quality: number 152 | /** Capture the screenshot of a given region only. 
*/ 153 | clip: { 154 | x: number 155 | y: number 156 | height: number 157 | width: number 158 | scale: number 159 | } 160 | /** Capture the screenshot from the surface, rather than the view. Defaults to true.*/ 161 | from_surface: boolean 162 | /** Capture the screenshot beyond the viewport. Defaults to false. */ 163 | capture_beyond_viewport: boolean 164 | } 165 | /** Take full page screenshot */ 166 | full_page: boolean 167 | /** Make the background transparent (png only). */ 168 | omit_background: boolean 169 | } 170 | /** Return the bytes of the screenshot on the Page. */ 171 | bytes: boolean 172 | /** Store the screenshot to disk. This can be used with output_dir. If disabled will not store the file to the output directory. */ 173 | save: boolean 174 | /** The output directory to store the file. Parent folders may be created inside the directory. */ 175 | output_dir: string | null 176 | }): this 177 | /** Delay between request as ms. */ 178 | withDelay(delay: number): this 179 | /** Set a crawl depth limit. If the value is 0 there is no limit. */ 180 | withDepth(depth: number): this 181 | /** Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` is enabled. */ 182 | withReturnPageLinks(returnPageLinks: boolean): this 183 | /** Cache the page following HTTP rules. */ 184 | withCaching(cache: boolean): this 185 | /** Set the sitemap url. */ 186 | withSitemap(sitemap?: string | undefined | null): this 187 | /** Use proxies for request. */ 188 | withProxies(proxies?: Array | undefined | null): this 189 | /** build the inner website - not required for all builder_steps */ 190 | build(): this 191 | } 192 | /** a runner for handling crons */ 193 | export class Cron { 194 | /** stop the cron instance */ 195 | stop(): Promise 196 | } 197 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /* tslint:disable */ 2 | /* eslint-disable */ 3 | /* prettier-ignore */ 4 | 5 | /* auto-generated by NAPI-RS */ 6 | 7 | const { existsSync, readFileSync } = require('fs') 8 | const { join } = require('path') 9 | 10 | const { platform, arch } = process 11 | 12 | let nativeBinding = null 13 | let localFileExisted = false 14 | let loadError = null 15 | 16 | function isMusl() { 17 | // For Node 10 18 | if (!process.report || typeof process.report.getReport !== 'function') { 19 | try { 20 | const lddPath = require('child_process').execSync('which ldd').toString().trim() 21 | return readFileSync(lddPath, 'utf8').includes('musl') 22 | } catch (e) { 23 | return true 24 | } 25 | } else { 26 | const { glibcVersionRuntime } = process.report.getReport().header 27 | return !glibcVersionRuntime 28 | } 29 | } 30 | 31 | switch (platform) { 32 | case 'android': 33 | switch (arch) { 34 | case 'arm64': 35 | localFileExisted = existsSync(join(__dirname, 'spider-rs.android-arm64.node')) 36 | try { 37 | if (localFileExisted) { 38 | nativeBinding = require('./spider-rs.android-arm64.node') 39 | } else { 40 | nativeBinding = require('@spider-rs/spider-rs-android-arm64') 41 | } 42 | } catch (e) { 43 | loadError = e 44 | } 45 | break 46 | case 'arm': 47 | localFileExisted = existsSync(join(__dirname, 'spider-rs.android-arm-eabi.node')) 48 | try { 49 | if (localFileExisted) { 50 | nativeBinding = require('./spider-rs.android-arm-eabi.node') 51 | } else { 52 | nativeBinding = require('@spider-rs/spider-rs-android-arm-eabi') 53 | } 54 | } catch (e) { 55 | 
loadError = e 56 | } 57 | break 58 | default: 59 | throw new Error(`Unsupported architecture on Android ${arch}`) 60 | } 61 | break 62 | case 'win32': 63 | switch (arch) { 64 | case 'x64': 65 | localFileExisted = existsSync( 66 | join(__dirname, 'spider-rs.win32-x64-msvc.node') 67 | ) 68 | try { 69 | if (localFileExisted) { 70 | nativeBinding = require('./spider-rs.win32-x64-msvc.node') 71 | } else { 72 | nativeBinding = require('@spider-rs/spider-rs-win32-x64-msvc') 73 | } 74 | } catch (e) { 75 | loadError = e 76 | } 77 | break 78 | case 'ia32': 79 | localFileExisted = existsSync( 80 | join(__dirname, 'spider-rs.win32-ia32-msvc.node') 81 | ) 82 | try { 83 | if (localFileExisted) { 84 | nativeBinding = require('./spider-rs.win32-ia32-msvc.node') 85 | } else { 86 | nativeBinding = require('@spider-rs/spider-rs-win32-ia32-msvc') 87 | } 88 | } catch (e) { 89 | loadError = e 90 | } 91 | break 92 | case 'arm64': 93 | localFileExisted = existsSync( 94 | join(__dirname, 'spider-rs.win32-arm64-msvc.node') 95 | ) 96 | try { 97 | if (localFileExisted) { 98 | nativeBinding = require('./spider-rs.win32-arm64-msvc.node') 99 | } else { 100 | nativeBinding = require('@spider-rs/spider-rs-win32-arm64-msvc') 101 | } 102 | } catch (e) { 103 | loadError = e 104 | } 105 | break 106 | default: 107 | throw new Error(`Unsupported architecture on Windows: ${arch}`) 108 | } 109 | break 110 | case 'darwin': 111 | localFileExisted = existsSync(join(__dirname, 'spider-rs.darwin-universal.node')) 112 | try { 113 | if (localFileExisted) { 114 | nativeBinding = require('./spider-rs.darwin-universal.node') 115 | } else { 116 | nativeBinding = require('@spider-rs/spider-rs-darwin-universal') 117 | } 118 | break 119 | } catch {} 120 | switch (arch) { 121 | case 'x64': 122 | localFileExisted = existsSync(join(__dirname, 'spider-rs.darwin-x64.node')) 123 | try { 124 | if (localFileExisted) { 125 | nativeBinding = require('./spider-rs.darwin-x64.node') 126 | } else { 127 | nativeBinding = require('@spider-rs/spider-rs-darwin-x64') 128 | } 129 | } catch (e) { 130 | loadError = e 131 | } 132 | break 133 | case 'arm64': 134 | localFileExisted = existsSync( 135 | join(__dirname, 'spider-rs.darwin-arm64.node') 136 | ) 137 | try { 138 | if (localFileExisted) { 139 | nativeBinding = require('./spider-rs.darwin-arm64.node') 140 | } else { 141 | nativeBinding = require('@spider-rs/spider-rs-darwin-arm64') 142 | } 143 | } catch (e) { 144 | loadError = e 145 | } 146 | break 147 | default: 148 | throw new Error(`Unsupported architecture on macOS: ${arch}`) 149 | } 150 | break 151 | case 'freebsd': 152 | if (arch !== 'x64') { 153 | throw new Error(`Unsupported architecture on FreeBSD: ${arch}`) 154 | } 155 | localFileExisted = existsSync(join(__dirname, 'spider-rs.freebsd-x64.node')) 156 | try { 157 | if (localFileExisted) { 158 | nativeBinding = require('./spider-rs.freebsd-x64.node') 159 | } else { 160 | nativeBinding = require('@spider-rs/spider-rs-freebsd-x64') 161 | } 162 | } catch (e) { 163 | loadError = e 164 | } 165 | break 166 | case 'linux': 167 | switch (arch) { 168 | case 'x64': 169 | if (isMusl()) { 170 | localFileExisted = existsSync( 171 | join(__dirname, 'spider-rs.linux-x64-musl.node') 172 | ) 173 | try { 174 | if (localFileExisted) { 175 | nativeBinding = require('./spider-rs.linux-x64-musl.node') 176 | } else { 177 | nativeBinding = require('@spider-rs/spider-rs-linux-x64-musl') 178 | } 179 | } catch (e) { 180 | loadError = e 181 | } 182 | } else { 183 | localFileExisted = existsSync( 184 | join(__dirname, 
'spider-rs.linux-x64-gnu.node') 185 | ) 186 | try { 187 | if (localFileExisted) { 188 | nativeBinding = require('./spider-rs.linux-x64-gnu.node') 189 | } else { 190 | nativeBinding = require('@spider-rs/spider-rs-linux-x64-gnu') 191 | } 192 | } catch (e) { 193 | loadError = e 194 | } 195 | } 196 | break 197 | case 'arm64': 198 | if (isMusl()) { 199 | localFileExisted = existsSync( 200 | join(__dirname, 'spider-rs.linux-arm64-musl.node') 201 | ) 202 | try { 203 | if (localFileExisted) { 204 | nativeBinding = require('./spider-rs.linux-arm64-musl.node') 205 | } else { 206 | nativeBinding = require('@spider-rs/spider-rs-linux-arm64-musl') 207 | } 208 | } catch (e) { 209 | loadError = e 210 | } 211 | } else { 212 | localFileExisted = existsSync( 213 | join(__dirname, 'spider-rs.linux-arm64-gnu.node') 214 | ) 215 | try { 216 | if (localFileExisted) { 217 | nativeBinding = require('./spider-rs.linux-arm64-gnu.node') 218 | } else { 219 | nativeBinding = require('@spider-rs/spider-rs-linux-arm64-gnu') 220 | } 221 | } catch (e) { 222 | loadError = e 223 | } 224 | } 225 | break 226 | case 'arm': 227 | if (isMusl()) { 228 | localFileExisted = existsSync( 229 | join(__dirname, 'spider-rs.linux-arm-musleabihf.node') 230 | ) 231 | try { 232 | if (localFileExisted) { 233 | nativeBinding = require('./spider-rs.linux-arm-musleabihf.node') 234 | } else { 235 | nativeBinding = require('@spider-rs/spider-rs-linux-arm-musleabihf') 236 | } 237 | } catch (e) { 238 | loadError = e 239 | } 240 | } else { 241 | localFileExisted = existsSync( 242 | join(__dirname, 'spider-rs.linux-arm-gnueabihf.node') 243 | ) 244 | try { 245 | if (localFileExisted) { 246 | nativeBinding = require('./spider-rs.linux-arm-gnueabihf.node') 247 | } else { 248 | nativeBinding = require('@spider-rs/spider-rs-linux-arm-gnueabihf') 249 | } 250 | } catch (e) { 251 | loadError = e 252 | } 253 | } 254 | break 255 | case 'riscv64': 256 | if (isMusl()) { 257 | localFileExisted = existsSync( 258 | join(__dirname, 'spider-rs.linux-riscv64-musl.node') 259 | ) 260 | try { 261 | if (localFileExisted) { 262 | nativeBinding = require('./spider-rs.linux-riscv64-musl.node') 263 | } else { 264 | nativeBinding = require('@spider-rs/spider-rs-linux-riscv64-musl') 265 | } 266 | } catch (e) { 267 | loadError = e 268 | } 269 | } else { 270 | localFileExisted = existsSync( 271 | join(__dirname, 'spider-rs.linux-riscv64-gnu.node') 272 | ) 273 | try { 274 | if (localFileExisted) { 275 | nativeBinding = require('./spider-rs.linux-riscv64-gnu.node') 276 | } else { 277 | nativeBinding = require('@spider-rs/spider-rs-linux-riscv64-gnu') 278 | } 279 | } catch (e) { 280 | loadError = e 281 | } 282 | } 283 | break 284 | case 's390x': 285 | localFileExisted = existsSync( 286 | join(__dirname, 'spider-rs.linux-s390x-gnu.node') 287 | ) 288 | try { 289 | if (localFileExisted) { 290 | nativeBinding = require('./spider-rs.linux-s390x-gnu.node') 291 | } else { 292 | nativeBinding = require('@spider-rs/spider-rs-linux-s390x-gnu') 293 | } 294 | } catch (e) { 295 | loadError = e 296 | } 297 | break 298 | default: 299 | throw new Error(`Unsupported architecture on Linux: ${arch}`) 300 | } 301 | break 302 | default: 303 | throw new Error(`Unsupported OS: ${platform}, architecture: ${arch}`) 304 | } 305 | 306 | if (!nativeBinding) { 307 | if (loadError) { 308 | throw loadError 309 | } 310 | throw new Error(`Failed to load native binding`) 311 | } 312 | 313 | const { pageTitle, NWebsite, Page, crawl, Website, Cron } = nativeBinding 314 | 315 | module.exports.pageTitle = pageTitle 316 
| module.exports.NWebsite = NWebsite 317 | module.exports.Page = Page 318 | module.exports.crawl = crawl 319 | module.exports.Website = Website 320 | module.exports.Cron = Cron 321 | -------------------------------------------------------------------------------- /npm/android-arm-eabi/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-android-arm-eabi` 2 | 3 | This is the **armv7-linux-androideabi** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/android-arm-eabi/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-android-arm-eabi", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "android" 7 | ], 8 | "cpu": [ 9 | "arm" 10 | ], 11 | "main": "spider-rs.android-arm-eabi.node", 12 | "files": [ 13 | "spider-rs.android-arm-eabi.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/android-arm64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-android-arm64` 2 | 3 | This is the **aarch64-linux-android** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/android-arm64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-android-arm64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "android" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.android-arm64.node", 12 | "files": [ 13 | "spider-rs.android-arm64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/darwin-arm64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-darwin-arm64` 2 | 3 | This is the **aarch64-apple-darwin** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/darwin-arm64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-darwin-arm64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "darwin" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.darwin-arm64.node", 12 | "files": [ 13 | "spider-rs.darwin-arm64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/darwin-universal/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-darwin-universal` 2 | 3 | This is the **universal-apple-darwin** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/darwin-universal/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-darwin-universal", 3 | "version": "0.0.162", 4 | "repository": 
"https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "darwin" 7 | ], 8 | "main": "spider-rs.darwin-universal.node", 9 | "files": [ 10 | "spider-rs.darwin-universal.node" 11 | ], 12 | "license": "MIT", 13 | "engines": { 14 | "node": ">= 10" 15 | } 16 | } -------------------------------------------------------------------------------- /npm/darwin-x64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-darwin-x64` 2 | 3 | This is the **x86_64-apple-darwin** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/darwin-x64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-darwin-x64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "darwin" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.darwin-x64.node", 12 | "files": [ 13 | "spider-rs.darwin-x64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/freebsd-x64/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-freebsd-x64` 2 | 3 | This is the **x86_64-unknown-freebsd** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/freebsd-x64/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-freebsd-x64", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "freebsd" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.freebsd-x64.node", 12 | "files": [ 13 | "spider-rs.freebsd-x64.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/linux-arm-gnueabihf/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-arm-gnueabihf` 2 | 3 | This is the **armv7-unknown-linux-gnueabihf** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-arm-gnueabihf/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-arm-gnueabihf", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "arm" 10 | ], 11 | "main": "spider-rs.linux-arm-gnueabihf.node", 12 | "files": [ 13 | "spider-rs.linux-arm-gnueabihf.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/linux-arm64-gnu/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-arm64-gnu` 2 | 3 | This is the **aarch64-unknown-linux-gnu** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-arm64-gnu/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"@spider-rs/spider-rs-linux-arm64-gnu", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.linux-arm64-gnu.node", 12 | "files": [ 13 | "spider-rs.linux-arm64-gnu.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "glibc" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/linux-arm64-musl/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-arm64-musl` 2 | 3 | This is the **aarch64-unknown-linux-musl** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-arm64-musl/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-arm64-musl", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.linux-arm64-musl.node", 12 | "files": [ 13 | "spider-rs.linux-arm64-musl.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "musl" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/linux-x64-gnu/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-x64-gnu` 2 | 3 | This is the **x86_64-unknown-linux-gnu** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-x64-gnu/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-x64-gnu", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.linux-x64-gnu.node", 12 | "files": [ 13 | "spider-rs.linux-x64-gnu.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "glibc" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/linux-x64-musl/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-linux-x64-musl` 2 | 3 | This is the **x86_64-unknown-linux-musl** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/linux-x64-musl/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-linux-x64-musl", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "linux" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.linux-x64-musl.node", 12 | "files": [ 13 | "spider-rs.linux-x64-musl.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | }, 19 | "libc": [ 20 | "musl" 21 | ] 22 | } -------------------------------------------------------------------------------- /npm/win32-arm64-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-win32-arm64-msvc` 2 | 3 | This is the 
**aarch64-pc-windows-msvc** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/win32-arm64-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-win32-arm64-msvc", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "win32" 7 | ], 8 | "cpu": [ 9 | "arm64" 10 | ], 11 | "main": "spider-rs.win32-arm64-msvc.node", 12 | "files": [ 13 | "spider-rs.win32-arm64-msvc.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/win32-ia32-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-win32-ia32-msvc` 2 | 3 | This is the **i686-pc-windows-msvc** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/win32-ia32-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-win32-ia32-msvc", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "win32" 7 | ], 8 | "cpu": [ 9 | "ia32" 10 | ], 11 | "main": "spider-rs.win32-ia32-msvc.node", 12 | "files": [ 13 | "spider-rs.win32-ia32-msvc.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /npm/win32-x64-msvc/README.md: -------------------------------------------------------------------------------- 1 | # `@spider-rs/spider-rs-win32-x64-msvc` 2 | 3 | This is the **x86_64-pc-windows-msvc** binary for `@spider-rs/spider-rs` 4 | -------------------------------------------------------------------------------- /npm/win32-x64-msvc/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs-win32-x64-msvc", 3 | "version": "0.0.162", 4 | "repository": "https://github.com/spider-rs/spider-nodejs", 5 | "os": [ 6 | "win32" 7 | ], 8 | "cpu": [ 9 | "x64" 10 | ], 11 | "main": "spider-rs.win32-x64-msvc.node", 12 | "files": [ 13 | "spider-rs.win32-x64-msvc.node" 14 | ], 15 | "license": "MIT", 16 | "engines": { 17 | "node": ">= 10" 18 | } 19 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@spider-rs/spider-rs", 3 | "version": "0.0.162", 4 | "main": "index.js", 5 | "types": "index.d.ts", 6 | "napi": { 7 | "name": "spider-rs", 8 | "triples": { 9 | "additional": [ 10 | "aarch64-apple-darwin", 11 | "aarch64-linux-android", 12 | "aarch64-unknown-linux-gnu", 13 | "aarch64-unknown-linux-musl", 14 | "aarch64-pc-windows-msvc", 15 | "armv7-unknown-linux-gnueabihf", 16 | "x86_64-unknown-linux-musl", 17 | "x86_64-unknown-freebsd", 18 | "i686-pc-windows-msvc", 19 | "armv7-linux-androideabi", 20 | "universal-apple-darwin" 21 | ] 22 | } 23 | }, 24 | "license": "MIT", 25 | "keywords": [ 26 | "spider", 27 | "crawler" 28 | ], 29 | "repository": "https://github.com/spider-rs/spider-nodejs", 30 | "devDependencies": { 31 | "@napi-rs/cli": "^2.18.4", 32 | "@swc-node/register": "^1.10.9", 33 | "@swc/core": "^1.7.0", 34 | "@types/node": "^20.14.5", 35 | "ava": 
"^6.1.3", 36 | "prettier": "^3.3.3", 37 | "typescript": "^5.4.5" 38 | }, 39 | "ava": { 40 | "require": [ 41 | "@swc-node/register" 42 | ], 43 | "extensions": [ 44 | "ts" 45 | ], 46 | "timeout": "5m", 47 | "workerThreads": false, 48 | "environmentVariables": { 49 | "TS_NODE_PROJECT": "./tsconfig.json" 50 | } 51 | }, 52 | "engines": { 53 | "node": ">= 10" 54 | }, 55 | "scripts": { 56 | "artifacts": "napi artifacts", 57 | "bench": "cd bench && npm run bench", 58 | "bench:oss": "cd bench && npm run bench:oss", 59 | "build": "napi build --platform --release --pipe \"prettier -w\"", 60 | "build:debug": "napi build --platform --pipe \"prettier -w\"", 61 | "format": "run-p format:prettier format:rs format:toml", 62 | "format:prettier": "prettier . -w", 63 | "format:toml": "taplo format", 64 | "format:rs": "cargo fmt", 65 | "lint": "eslint . -c ./.eslintrc.yml", 66 | "prepublishOnly": "napi prepublish -t npm", 67 | "test": "ava", 68 | "version": "napi version" 69 | }, 70 | "prettier": { 71 | "printWidth": 120, 72 | "semi": false, 73 | "trailingComma": "all", 74 | "singleQuote": true, 75 | "arrowParens": "always" 76 | }, 77 | "packageManager": "yarn@3.6.4" 78 | } 79 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | tab_spaces = 2 2 | -------------------------------------------------------------------------------- /src/conversions.rs: -------------------------------------------------------------------------------- 1 | use napi::bindgen_prelude::{Buffer, Null, Object, Undefined}; 2 | use serde_json::Value; 3 | 4 | /// the object to handle conversions 5 | pub enum ObjectConvert { 6 | /// napi object 7 | Obj(Object), 8 | /// serde value 9 | Val(Value), 10 | } 11 | 12 | /// convert a napi object to json with trailing comma support for quick reading and writing 13 | pub fn object_to_u8(obj: ObjectConvert) -> Result, napi::Error> { 14 | let mut ss = vec![]; 15 | 16 | match obj { 17 | ObjectConvert::Val(deserialized) => { 18 | ss.extend(deserialized.to_string().as_bytes()); 19 | } 20 | ObjectConvert::Obj(obj) => { 21 | let o = Object::keys(&obj)?; 22 | let o_size = o.len(); 23 | 24 | ss.push(b'{'); 25 | 26 | // we are missing map, null, and vector 27 | for (i, key) in o.iter().enumerate() { 28 | let mut fp = || { 29 | ss.push(b'"'); 30 | ss.extend(key.as_bytes()); 31 | ss.push(b'"'); 32 | ss.push(b':'); 33 | }; 34 | 35 | let mut block = false; 36 | 37 | // todo: method to go through all napi values to get types instead of long chain map 38 | match obj.get::<&str, String>(&key) { 39 | Ok(s) => { 40 | fp(); 41 | ss.push(b'"'); 42 | ss.extend(s.unwrap_or_default().as_bytes()); 43 | ss.push(b'"'); 44 | } 45 | _ => match obj.get::<&str, u32>(&key) { 46 | Ok(s) => { 47 | fp(); 48 | ss.push(b'"'); 49 | ss.extend(s.unwrap_or_default().to_string().as_bytes()); 50 | ss.push(b'"'); 51 | } 52 | _ => match obj.get::<&str, i32>(&key) { 53 | Ok(s) => { 54 | fp(); 55 | ss.push(b'"'); 56 | ss.extend(s.unwrap_or_default().to_string().as_bytes()); 57 | ss.push(b'"'); 58 | } 59 | _ => match obj.get::<&str, Buffer>(&key) { 60 | Ok(s) => { 61 | fp(); 62 | let d = serde_json::to_string( 63 | &String::from_utf8(s.unwrap_or_default().as_ref().into()).unwrap_or_default(), 64 | )?; 65 | ss.extend(d.as_bytes()); 66 | } 67 | _ => match obj.get::<&str, Null>(&key) { 68 | Ok(_) => { 69 | fp(); 70 | ss.extend(b"null"); 71 | } 72 | _ => match obj.get::<&str, Undefined>(&key) { 73 | Ok(_) => { 74 | block = 
true; 75 | } 76 | _ => (), 77 | }, 78 | }, 79 | }, 80 | }, 81 | }, 82 | } 83 | 84 | if !block && i != o_size - 1 { 85 | ss.push(b','); 86 | } 87 | } 88 | 89 | ss.push(b'}'); 90 | } 91 | } 92 | 93 | Ok(ss) 94 | } 95 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![deny(clippy::all)] 2 | 3 | #[macro_use] 4 | extern crate napi_derive; 5 | use spider::lazy_static::lazy_static; 6 | 7 | lazy_static! { 8 | pub static ref BUFFER: usize = (num_cpus::get() * 20).max(88); 9 | } 10 | 11 | pub mod npage; 12 | pub mod nwebsite; 13 | pub mod page; 14 | pub mod shortcut; 15 | pub mod website; 16 | 17 | pub use npage::{page_title, NPage}; 18 | pub use nwebsite::NWebsite; 19 | pub use page::Page; 20 | pub use shortcut::crawl; 21 | pub use website::Website; 22 | /// convert types to different types 23 | mod conversions; 24 | -------------------------------------------------------------------------------- /src/npage.rs: -------------------------------------------------------------------------------- 1 | use napi::bindgen_prelude::Buffer; 2 | use spider::{lazy_static::lazy_static, reqwest::header::HeaderMap}; 3 | use std::collections::HashMap; 4 | 5 | lazy_static! { 6 | static ref TITLE_SELECTOR: scraper::Selector = scraper::Selector::parse("title").unwrap(); 7 | } 8 | 9 | /// a simple page object 10 | #[derive(Default, Clone)] 11 | #[napi(object)] 12 | pub struct NPage { 13 | /// The url found. 14 | pub url: String, 15 | /// The content of the page found. 16 | pub content: String, 17 | /// The HTTP status code. 18 | pub status_code: u16, 19 | /// The Raw content if the resource needs to be sent as binary. 20 | pub raw_content: Option, 21 | /// The HTTP headers. 22 | pub headers: Option>, 23 | /// The links found on the page. Requires the website.builder method website.with_subscription_return_page_links to be set to true. 24 | pub links: Option>, 25 | } 26 | 27 | #[napi] 28 | /// get the page title. 29 | pub fn page_title(page: NPage) -> String { 30 | page.title() 31 | } 32 | 33 | #[napi] 34 | impl NPage { 35 | /// establish a new page 36 | pub fn new(res: &spider::page::Page, raw: bool) -> NPage { 37 | NPage { 38 | url: res.get_url().into(), 39 | status_code: res.status_code.as_u16(), 40 | content: if raw { 41 | Default::default() 42 | } else { 43 | res.get_html() 44 | }, 45 | raw_content: if raw { 46 | Some(res.get_html_bytes_u8().into()) 47 | } else { 48 | None 49 | }, 50 | headers: match res.headers { 51 | Some(ref headers) => Some(header_map_to_hash_map(headers)), 52 | _ => None, 53 | }, 54 | links: match res.page_links { 55 | Some(ref links) => Some( 56 | links 57 | .iter() 58 | .map(|link| link.as_ref().to_string()) 59 | .collect::>(), 60 | ), 61 | _ => None, 62 | }, 63 | } 64 | } 65 | 66 | #[napi] 67 | /// the html page title. 
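/// Parses the cached `content` with the shared `TITLE_SELECTOR` and returns the inner
/// html of the first `<title>` element, or an empty string when no title exists or when
/// the page was built with `raw_content` enabled (which leaves `content` empty).
/// From Node this logic is likely reached through the free function `pageTitle(page)`
/// (napi-rs camelCases `page_title`), since `NPage` crosses the boundary as a plain object.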
68 | pub fn title(&self) -> String { 69 | let fragment: scraper::Html = scraper::Html::parse_document(&self.content); 70 | match fragment.select(&TITLE_SELECTOR).next() { 71 | Some(title) => title.inner_html(), 72 | _ => Default::default(), 73 | } 74 | } 75 | } 76 | 77 | /// convert a headermap to hashmap 78 | pub fn header_map_to_hash_map(header_map: &HeaderMap) -> HashMap { 79 | let mut hash_map = HashMap::new(); 80 | 81 | for (key, value) in header_map.iter() { 82 | let key = key.as_str().to_string(); 83 | 84 | if let Ok(value_str) = value.to_str() { 85 | hash_map.insert(key, value_str.to_string()); 86 | } 87 | } 88 | 89 | hash_map 90 | } 91 | -------------------------------------------------------------------------------- /src/nwebsite.rs: -------------------------------------------------------------------------------- 1 | use crate::NPage; 2 | 3 | #[napi] 4 | /// website main data from rust to node. 5 | pub struct NWebsite { 6 | /// all of the website links. 7 | pub links: Vec, 8 | /// the pages found. 9 | pub pages: Vec, 10 | } 11 | -------------------------------------------------------------------------------- /src/page.rs: -------------------------------------------------------------------------------- 1 | use napi; 2 | 3 | /// a simple page object 4 | #[napi] 5 | #[derive(Default)] 6 | pub struct Page { 7 | /// the page object from spider 8 | inner: Option, 9 | /// selectors 10 | selectors: Option, 11 | /// The url for the page. 12 | pub url: String, 13 | /// The website crawling subdomain pages? 14 | pub subdomains: Option, 15 | /// The website crawling TLD pages? 16 | pub tld: Option, 17 | /// The HTTP status code. 18 | pub status_code: u16, 19 | } 20 | 21 | #[napi] 22 | impl Page { 23 | #[napi(constructor)] 24 | /// a new page 25 | pub fn new(url: String, subdomains: Option, tld: Option) -> Self { 26 | Page { 27 | url, 28 | subdomains, 29 | tld, 30 | ..Default::default() 31 | } 32 | } 33 | 34 | #[napi] 35 | /// get the page content 36 | pub async unsafe fn fetch(&mut self) -> &Self { 37 | use spider::{ 38 | lazy_static::lazy_static, reqwest::Client, reqwest_middleware::ClientWithMiddleware, 39 | ClientBuilder, 40 | }; 41 | lazy_static! { 42 | /// top level single page client to re-use. 
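/// Built once via `lazy_static` and shared by every `Page::fetch` call so the underlying
/// reqwest connection pool is re-used instead of being rebuilt per page. No middleware is
/// attached here; `ClientBuilder::new(..).build()` simply wraps the plain reqwest client.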
43 | pub static ref PAGE_CLIENT: ClientWithMiddleware = { 44 | let reqwest_client = Client::builder().build().unwrap_or_default(); 45 | let client = ClientBuilder::new(reqwest_client).build(); 46 | 47 | client 48 | }; 49 | } 50 | let page = spider::page::Page::new_page(&self.url, &PAGE_CLIENT).await; 51 | self.status_code = page.status_code.into(); 52 | self.inner = Some(page); 53 | self.selectors = Some(spider::page::get_page_selectors( 54 | &self.url, 55 | self.subdomains.unwrap_or_default(), 56 | self.tld.unwrap_or_default(), 57 | )); 58 | self 59 | } 60 | 61 | #[napi] 62 | /// all links on the page 63 | pub async fn get_links(&self) -> Vec { 64 | match &self.selectors { 65 | Some(selectors) => match &self.inner { 66 | Some(inner) => { 67 | let links = inner.clone().links(&selectors, &None).await; 68 | links 69 | .into_iter() 70 | .map(|i| i.as_ref().to_string()) 71 | .collect::>() 72 | } 73 | _ => Default::default(), 74 | }, 75 | _ => Default::default(), 76 | } 77 | } 78 | 79 | #[napi] 80 | /// get the html for the page 81 | pub fn get_html(&self) -> String { 82 | match &self.inner { 83 | Some(inner) => inner.get_html(), 84 | _ => Default::default(), 85 | } 86 | } 87 | 88 | #[napi] 89 | /// get the bytes for the page 90 | pub fn get_bytes(&self) -> &[u8] { 91 | match &self.inner { 92 | Some(inner) => inner.get_html_bytes_u8(), 93 | _ => Default::default(), 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/shortcut.rs: -------------------------------------------------------------------------------- 1 | use crate::NPage; 2 | use crate::NWebsite; 3 | use crate::BUFFER; 4 | 5 | #[napi] 6 | /// crawl a website using HTTP gathering all links and html. 7 | pub async fn crawl(url: String, raw_content: Option) -> NWebsite { 8 | let mut website = spider::website::Website::new(&url); 9 | let mut rx2 = website 10 | .subscribe(*BUFFER / 2) 11 | .expect("sync feature should be enabled"); 12 | let (tx, mut rx) = spider::tokio::sync::mpsc::channel(*BUFFER); 13 | let raw_content = raw_content.unwrap_or_default(); 14 | 15 | spider::tokio::spawn(async move { 16 | while let Ok(res) = rx2.recv().await { 17 | if let Err(_) = tx.send(NPage::new(&res, raw_content)).await { 18 | println!("receiver dropped"); 19 | return; 20 | } 21 | } 22 | }); 23 | 24 | spider::tokio::spawn(async move { 25 | website.crawl_raw().await; 26 | }); 27 | 28 | let mut pages = Vec::new(); 29 | 30 | while let Some(i) = rx.recv().await { 31 | pages.push(i) 32 | } 33 | 34 | let links = pages.iter().map(|x| x.url.clone()).collect::>(); 35 | 36 | NWebsite { links, pages } 37 | } 38 | -------------------------------------------------------------------------------- /src/website.rs: -------------------------------------------------------------------------------- 1 | use crate::conversions::{object_to_u8, ObjectConvert}; 2 | use crate::{NPage, BUFFER}; 3 | use indexmap::IndexMap; 4 | use napi::{bindgen_prelude::Object, tokio::task::JoinHandle}; 5 | use napi::{Env, JsUnknown}; 6 | use spider::compact_str::CompactString; 7 | use spider::configuration::{WaitForDelay, WaitForIdleNetwork, WaitForSelector}; 8 | use spider::{configuration::RedirectPolicy, utils::shutdown}; 9 | use std::time::Duration; 10 | 11 | #[napi] 12 | /// a website holding the inner spider::website::Website from Rust fit for nodejs. 13 | pub struct Website { 14 | /// the website from spider. 15 | inner: spider::website::Website, 16 | /// spawned subscription handles. 
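/// Keyed by the id handed back from `subscribe`; `IndexMap` keeps insertion order so
/// `last()` can derive the next id and `shift_remove_entry` can drop a single listener
/// without disturbing the others.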
17 | subscription_handles: IndexMap>, 18 | /// spawned crawl handles. 19 | crawl_handles: IndexMap>, 20 | /// do not convert content to UT8. 21 | raw_content: bool, 22 | /// the data collected. 23 | collected_data: Box>>, 24 | /// is the crawl running in the background. 25 | running_in_background: bool, // /// the file handle for storing data 26 | // file_handle: Option, 27 | } 28 | 29 | #[napi(object)] 30 | struct PageEvent { 31 | pub page: NPage, 32 | } 33 | 34 | #[napi] 35 | impl Website { 36 | #[napi(constructor)] 37 | /// a new website. 38 | pub fn new(url: String, raw_content: Option) -> Self { 39 | Website { 40 | inner: spider::website::Website::new(&url), 41 | subscription_handles: IndexMap::new(), 42 | crawl_handles: IndexMap::new(), 43 | raw_content: raw_content.unwrap_or_default(), 44 | collected_data: Box::new(Vec::new()), 45 | running_in_background: false, // file_handle: None, 46 | } 47 | } 48 | 49 | /// Get the crawl status. 50 | #[napi(getter)] 51 | pub fn status(&self) -> String { 52 | use std::string::ToString; 53 | self.inner.get_status().to_string() 54 | } 55 | 56 | #[napi] 57 | /// Store data to heap memory. The data must be an object. Use `website.export_jsonl_data` to store to disk. When using this method test occordingly since only certain primitives are supported. 58 | pub fn push_data(&mut self, env: Env, obj: JsUnknown) -> napi::Result<()> { 59 | match env.from_js_value::(&obj) { 60 | Ok(deserialized) => { 61 | self 62 | .collected_data 63 | .push(object_to_u8(ObjectConvert::Val(deserialized))?); 64 | } 65 | _ => match obj.coerce_to_object() { 66 | Ok(obj) => { 67 | self 68 | .collected_data 69 | .push(object_to_u8(ObjectConvert::Obj(obj))?); 70 | } 71 | _ => (), 72 | }, 73 | } 74 | 75 | Ok(()) 76 | } 77 | 78 | #[napi] 79 | /// Clear the collected data from heap memory. This only handles the data from `website.pushData`. 80 | pub fn clear_data(&mut self) -> napi::Result<()> { 81 | self.collected_data.clear(); 82 | Ok(()) 83 | } 84 | 85 | #[napi] 86 | /// read the data from the heap memory. 87 | pub fn read_data(&mut self) -> serde_json::Value { 88 | self 89 | .collected_data 90 | .iter() 91 | .map(|d| serde_json::from_slice::(d).unwrap_or_default()) 92 | .collect() 93 | } 94 | 95 | #[napi] 96 | /// store data to memory for disk storing. This will create the path if not exist and defaults to ./storage. 97 | pub async fn export_jsonl_data(&self, export_path: Option) -> napi::Result<()> { 98 | use napi::tokio::io::AsyncWriteExt; 99 | let file = match export_path { 100 | Some(p) => { 101 | let base_dir = p 102 | .split("/") 103 | .into_iter() 104 | .map(|f| { 105 | if f.contains(".") { 106 | "".to_string() 107 | } else { 108 | f.to_string() 109 | } 110 | }) 111 | .collect::(); 112 | 113 | spider::tokio::fs::create_dir_all(&base_dir).await?; 114 | 115 | if !p.contains(".") { 116 | p + ".jsonl" 117 | } else { 118 | p 119 | } 120 | } 121 | _ => { 122 | spider::tokio::fs::create_dir_all("./storage").await?; 123 | "./storage/".to_owned() 124 | + &self 125 | .inner 126 | .get_url() 127 | .inner() 128 | .replace("http://", "") 129 | .replace("https://", "") 130 | + "jsonl" 131 | } 132 | }; 133 | let mut file = spider::tokio::fs::File::create(file).await?; 134 | 135 | for (index, data) in self.collected_data.iter().enumerate() { 136 | if index > 0 { 137 | file.write_all(b"\n").await?; 138 | } 139 | // transform data step needed to auto convert type .. 
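// entries pushed via `push_data` are already stored as JSON bytes, so writing one entry
// per line (separated by the `\n` above) produces valid JSONL output.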
140 | file.write_all(&data).await?; 141 | } 142 | 143 | Ok(()) 144 | } 145 | 146 | #[napi] 147 | /// subscribe and add an event listener. 148 | pub fn subscribe( 149 | &mut self, 150 | on_page_event: napi::threadsafe_function::ThreadsafeFunction, 151 | ) -> u32 { 152 | let mut rx2 = self 153 | .inner 154 | .subscribe(*BUFFER / 2) 155 | .expect("sync feature should be enabled"); 156 | let raw_content = self.raw_content; 157 | 158 | let handle = spider::tokio::spawn(async move { 159 | while let Ok(res) = rx2.recv().await { 160 | on_page_event.call( 161 | Ok(NPage::new(&res, raw_content)), 162 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 163 | ); 164 | } 165 | }); 166 | 167 | // always return the highest value as the next id. 168 | let id = match self.subscription_handles.last() { 169 | Some(handle) => handle.0 + 1, 170 | _ => 0, 171 | }; 172 | 173 | self.subscription_handles.insert(id, handle); 174 | 175 | id 176 | } 177 | 178 | #[napi] 179 | /// remove a subscription listener. 180 | pub fn unsubscribe(&mut self, id: Option) -> bool { 181 | match id { 182 | Some(id) => { 183 | let handle = self.subscription_handles.get(&id); 184 | 185 | match handle { 186 | Some(h) => { 187 | h.abort(); 188 | self.subscription_handles.shift_remove_entry(&id); 189 | true 190 | } 191 | _ => false, 192 | } 193 | } 194 | // we may want to get all subs and remove them 195 | _ => { 196 | let keys = self.subscription_handles.len(); 197 | for k in self.subscription_handles.drain(..) { 198 | k.1.abort(); 199 | } 200 | keys > 0 201 | } 202 | } 203 | } 204 | 205 | #[napi] 206 | /// stop a crawl 207 | pub async unsafe fn stop(&mut self, id: Option) -> bool { 208 | self.inner.stop(); 209 | 210 | // prevent the last background run 211 | if self.running_in_background { 212 | // we may want ID's to be used as an option along with urls for complete shutdowns. 213 | shutdown(self.inner.get_url().inner()).await; 214 | self.running_in_background = false; 215 | } 216 | 217 | match id { 218 | Some(id) => { 219 | let handle = self.crawl_handles.get(&id); 220 | 221 | match handle { 222 | Some(h) => { 223 | h.abort(); 224 | self.crawl_handles.shift_remove_entry(&id); 225 | true 226 | } 227 | _ => false, 228 | } 229 | } 230 | _ => { 231 | let keys = self.crawl_handles.len(); 232 | for k in self.crawl_handles.drain(..) { 233 | k.1.abort(); 234 | } 235 | keys > 0 236 | } 237 | } 238 | } 239 | 240 | #[napi] 241 | /// crawl a website 242 | pub async unsafe fn crawl( 243 | &mut self, 244 | on_page_event: Option>, 245 | background: Option, 246 | headless: Option, 247 | ) { 248 | // only run in background if on_page_event is handled for streaming. 
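// `background.is_some() && background.unwrap_or_default()` treats only an explicit
// `Some(true)` as a request to run in the background; `None` and `Some(false)` both
// fall through to the blocking path.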
249 | let background = background.is_some() && background.unwrap_or_default(); 250 | let headless = headless.is_some() && headless.unwrap_or_default(); 251 | let raw_content = self.raw_content; 252 | 253 | if background { 254 | self.running_in_background = background; 255 | } 256 | 257 | match on_page_event { 258 | Some(callback) => { 259 | if background { 260 | let mut website = self.inner.clone(); 261 | let mut rx2 = website 262 | .subscribe(*BUFFER / 2) 263 | .expect("sync feature should be enabled"); 264 | 265 | let handle = spider::tokio::spawn(async move { 266 | while let Ok(res) = rx2.recv().await { 267 | callback.call( 268 | Ok(NPage::new(&res, raw_content)), 269 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 270 | ); 271 | } 272 | }); 273 | 274 | let crawl_id = match self.crawl_handles.last() { 275 | Some(handle) => handle.0 + 1, 276 | _ => 0, 277 | }; 278 | 279 | let crawl_handle = spider::tokio::spawn(async move { 280 | if headless { 281 | website.crawl().await; 282 | } else { 283 | website.crawl_raw().await; 284 | } 285 | }); 286 | 287 | let id = match self.subscription_handles.last() { 288 | Some(handle) => handle.0 + 1, 289 | _ => 0, 290 | }; 291 | 292 | self.crawl_handles.insert(crawl_id, crawl_handle); 293 | self.subscription_handles.insert(id, handle); 294 | } else { 295 | let mut rx2 = self 296 | .inner 297 | .subscribe(*BUFFER / 2) 298 | .expect("sync feature should be enabled"); 299 | 300 | let handle = spider::tokio::spawn(async move { 301 | while let Ok(res) = rx2.recv().await { 302 | callback.call( 303 | Ok(NPage::new(&res, raw_content)), 304 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 305 | ); 306 | } 307 | }); 308 | 309 | if headless { 310 | self.inner.crawl().await; 311 | } else { 312 | self.inner.crawl_raw().await; 313 | } 314 | 315 | let id = match self.subscription_handles.last() { 316 | Some(handle) => handle.0 + 1, 317 | _ => 0, 318 | }; 319 | 320 | self.subscription_handles.insert(id, handle); 321 | } 322 | } 323 | _ => { 324 | if background { 325 | let mut website = self.inner.clone(); 326 | 327 | let crawl_id = match self.crawl_handles.last() { 328 | Some(handle) => handle.0 + 1, 329 | _ => 0, 330 | }; 331 | 332 | let crawl_handle = spider::tokio::spawn(async move { 333 | if headless { 334 | website.crawl().await; 335 | } else { 336 | website.crawl_raw().await; 337 | } 338 | }); 339 | 340 | self.crawl_handles.insert(crawl_id, crawl_handle); 341 | } else { 342 | if headless { 343 | self.inner.crawl().await; 344 | } else { 345 | self.inner.crawl_raw().await; 346 | } 347 | } 348 | } 349 | } 350 | } 351 | 352 | #[napi] 353 | /// Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. 354 | pub async unsafe fn crawl_smart( 355 | &mut self, 356 | on_page_event: Option>, 357 | background: Option, 358 | ) { 359 | // only run in background if on_page_event is handled for streaming. 
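// same `Some(true)` check as in `crawl`; `crawl_smart` decides between plain HTTP and
// JavaScript rendering on its own, so no `headless` flag is accepted here.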
360 | let background = background.is_some() && background.unwrap_or_default(); 361 | let raw_content = self.raw_content; 362 | 363 | if background { 364 | self.running_in_background = background; 365 | } 366 | 367 | match on_page_event { 368 | Some(callback) => { 369 | if background { 370 | let mut website = self.inner.clone(); 371 | let mut rx2 = website 372 | .subscribe(*BUFFER / 2) 373 | .expect("sync feature should be enabled"); 374 | 375 | let handle = spider::tokio::spawn(async move { 376 | while let Ok(res) = rx2.recv().await { 377 | callback.call( 378 | Ok(NPage::new(&res, raw_content)), 379 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 380 | ); 381 | } 382 | }); 383 | 384 | let crawl_id = match self.crawl_handles.last() { 385 | Some(handle) => handle.0 + 1, 386 | _ => 0, 387 | }; 388 | 389 | let crawl_handle = spider::tokio::spawn(async move { 390 | website.crawl_smart().await; 391 | }); 392 | 393 | let id = match self.subscription_handles.last() { 394 | Some(handle) => handle.0 + 1, 395 | _ => 0, 396 | }; 397 | 398 | self.crawl_handles.insert(crawl_id, crawl_handle); 399 | self.subscription_handles.insert(id, handle); 400 | } else { 401 | let mut rx2 = self 402 | .inner 403 | .subscribe(*BUFFER / 2) 404 | .expect("sync feature should be enabled"); 405 | 406 | let handle = spider::tokio::spawn(async move { 407 | while let Ok(res) = rx2.recv().await { 408 | callback.call( 409 | Ok(NPage::new(&res, raw_content)), 410 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 411 | ); 412 | } 413 | }); 414 | 415 | self.inner.crawl_smart().await; 416 | let _ = handle.await; 417 | } 418 | } 419 | _ => { 420 | if background { 421 | let mut website = self.inner.clone(); 422 | 423 | let crawl_id = match self.crawl_handles.last() { 424 | Some(handle) => handle.0 + 1, 425 | _ => 0, 426 | }; 427 | 428 | let crawl_handle = spider::tokio::spawn(async move { 429 | website.crawl_smart().await; 430 | }); 431 | 432 | self.crawl_handles.insert(crawl_id, crawl_handle); 433 | } else { 434 | self.inner.crawl_smart().await; 435 | } 436 | } 437 | } 438 | } 439 | 440 | #[napi] 441 | /// scrape a website 442 | pub async unsafe fn scrape( 443 | &mut self, 444 | on_page_event: Option>, 445 | background: Option, 446 | headless: Option, 447 | ) { 448 | let headless = headless.is_some() && headless.unwrap_or_default(); 449 | let raw_content = self.raw_content; 450 | let background = background.is_some() && background.unwrap_or_default(); 451 | 452 | if background { 453 | self.running_in_background = background; 454 | } 455 | 456 | match on_page_event { 457 | Some(callback) => { 458 | if background { 459 | let mut website = self.inner.clone(); 460 | let mut rx2 = website 461 | .subscribe(*BUFFER / 2) 462 | .expect("sync feature should be enabled"); 463 | 464 | let handle = spider::tokio::spawn(async move { 465 | while let Ok(res) = rx2.recv().await { 466 | callback.call( 467 | Ok(NPage::new(&res, raw_content)), 468 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 469 | ); 470 | } 471 | }); 472 | 473 | let crawl_id = match self.crawl_handles.last() { 474 | Some(handle) => handle.0 + 1, 475 | _ => 0, 476 | }; 477 | 478 | let crawl_handle = spider::tokio::spawn(async move { 479 | if headless { 480 | website.scrape().await; 481 | } else { 482 | website.scrape_raw().await; 483 | } 484 | }); 485 | 486 | let id = match self.subscription_handles.last() { 487 | Some(handle) => handle.0 + 1, 488 | _ => 0, 489 | }; 490 | 491 | self.crawl_handles.insert(crawl_id, 
crawl_handle); 492 | self.subscription_handles.insert(id, handle); 493 | } else { 494 | let mut rx2 = self 495 | .inner 496 | .subscribe(*BUFFER / 2) 497 | .expect("sync feature should be enabled"); 498 | 499 | let handle = spider::tokio::spawn(async move { 500 | while let Ok(res) = rx2.recv().await { 501 | callback.call( 502 | Ok(NPage::new(&res, raw_content)), 503 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 504 | ); 505 | } 506 | }); 507 | 508 | if headless { 509 | self.inner.scrape().await; 510 | } else { 511 | self.inner.scrape_raw().await; 512 | } 513 | 514 | let _ = handle.await; 515 | } 516 | } 517 | _ => { 518 | if background { 519 | let mut website = self.inner.clone(); 520 | 521 | let crawl_id = match self.crawl_handles.last() { 522 | Some(handle) => handle.0 + 1, 523 | _ => 0, 524 | }; 525 | 526 | let crawl_handle = spider::tokio::spawn(async move { 527 | if headless { 528 | website.scrape().await; 529 | } else { 530 | website.scrape_raw().await; 531 | } 532 | }); 533 | 534 | self.crawl_handles.insert(crawl_id, crawl_handle); 535 | } else { 536 | if headless { 537 | self.inner.scrape().await; 538 | } else { 539 | self.inner.scrape_raw().await; 540 | } 541 | } 542 | } 543 | } 544 | } 545 | 546 | /// run a cron job 547 | #[napi] 548 | pub async unsafe fn run_cron( 549 | &mut self, 550 | on_page_event: Option>, 551 | ) -> Cron { 552 | let cron_handle = match on_page_event { 553 | Some(callback) => { 554 | let mut rx2 = self 555 | .inner 556 | .subscribe(*BUFFER / 2) 557 | .expect("sync feature should be enabled"); 558 | let raw_content = self.raw_content; 559 | 560 | let handler = spider::tokio::spawn(async move { 561 | while let Ok(res) = rx2.recv().await { 562 | callback.call( 563 | Ok(NPage::new(&res, raw_content)), 564 | napi::threadsafe_function::ThreadsafeFunctionCallMode::NonBlocking, 565 | ); 566 | } 567 | }); 568 | 569 | Some(handler) 570 | } 571 | _ => None, 572 | }; 573 | 574 | let inner = self.inner.run_cron().await; 575 | 576 | Cron { inner, cron_handle } 577 | } 578 | 579 | #[napi] 580 | /// get all the links of a website 581 | pub fn get_links(&self) -> Vec { 582 | let links = self 583 | .inner 584 | .get_links() 585 | .iter() 586 | .map(|x| x.as_ref().to_string()) 587 | .collect::>(); 588 | links 589 | } 590 | 591 | #[napi(getter)] 592 | /// get the size of the website in amount of pages crawled. If you ran the page in the background, this value will not update. 593 | pub fn size(&mut self) -> u32 { 594 | self.inner.size() as u32 595 | } 596 | 597 | /// get all the pages of a website - requires calling website.scrape 598 | #[napi] 599 | pub fn get_pages(&self) -> Vec { 600 | let mut pages: Vec = Vec::new(); 601 | let raw_content = self.raw_content; 602 | 603 | match self.inner.get_pages() { 604 | Some(p) => { 605 | for page in p.iter() { 606 | pages.push(NPage::new(page, raw_content)); 607 | } 608 | } 609 | _ => (), 610 | } 611 | 612 | pages 613 | } 614 | 615 | #[napi] 616 | /// drain all links from storing 617 | pub fn drain_links(&mut self) -> Vec { 618 | let links = self 619 | .inner 620 | .get_links() 621 | .iter() 622 | .map(|x| x.as_ref().to_string()) 623 | .collect::>(); 624 | self.inner.drain_links(); 625 | links 626 | } 627 | 628 | #[napi] 629 | /// clear all links and page data 630 | pub fn clear(&mut self) { 631 | self.inner.clear(); 632 | } 633 | 634 | #[napi] 635 | /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html). 
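/// Header names that fail to parse are skipped, values are read as strings (anything
/// that cannot be read as a string falls back to an empty value), and passing nothing
/// clears the headers. Illustrative call from Node, name assuming napi-rs's camelCase
/// mapping: `website.withHeaders({ authorization: 'somevalue' })`.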
636 | pub fn with_headers(&mut self, headers: Option) -> &Self { 637 | use std::str::FromStr; 638 | 639 | match headers { 640 | Some(obj) => { 641 | let mut h = spider::reqwest::header::HeaderMap::new(); 642 | let keys = Object::keys(&obj).unwrap_or_default(); 643 | 644 | for key in keys.into_iter() { 645 | let header_key = spider::reqwest::header::HeaderName::from_str(&key); 646 | 647 | match header_key { 648 | Ok(hn) => { 649 | let header_value = obj 650 | .get::(key) 651 | .unwrap_or_default() 652 | .unwrap_or_default(); 653 | 654 | match spider::reqwest::header::HeaderValue::from_str(&header_value) { 655 | Ok(hk) => { 656 | h.append(hn, hk); 657 | } 658 | _ => (), 659 | } 660 | } 661 | _ => (), 662 | } 663 | } 664 | self.inner.with_headers(Some(h)); 665 | } 666 | _ => { 667 | self.inner.with_headers(None); 668 | } 669 | }; 670 | 671 | self 672 | } 673 | 674 | /// Add user agent to request. 675 | #[napi] 676 | pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &Self { 677 | self.inner.configuration.with_user_agent(user_agent); 678 | self 679 | } 680 | 681 | /// Respect robots.txt file. 682 | #[napi] 683 | pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &Self { 684 | self 685 | .inner 686 | .configuration 687 | .with_respect_robots_txt(respect_robots_txt); 688 | self 689 | } 690 | 691 | /// Determine whether to collect all the resources found on pages. 692 | #[napi] 693 | pub fn with_full_resources(&mut self, full_resources: bool) -> &Self { 694 | self.inner.configuration.with_full_resources(full_resources); 695 | self 696 | } 697 | 698 | /// Use network interception for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. 699 | #[napi] 700 | pub fn with_chrome_intercept(&mut self, chrome_intercept: bool, block_images: bool) -> &Self { 701 | let mut intercept_config = 702 | spider::features::chrome_common::RequestInterceptConfiguration::new(chrome_intercept); 703 | 704 | intercept_config.block_visuals = block_images; 705 | 706 | self.inner.with_chrome_intercept(intercept_config); 707 | self 708 | } 709 | 710 | /// Set the connection url for the chrome instance. This method does nothing if the `chrome` is not enabled. 711 | #[napi] 712 | pub fn with_chrome_connection(&mut self, chrome_connection: String) -> &Self { 713 | self 714 | .inner 715 | .with_chrome_connection(if chrome_connection.is_empty() { 716 | None 717 | } else { 718 | Some(chrome_connection) 719 | }); 720 | self 721 | } 722 | 723 | /// Preserve the HOST header. 724 | #[napi] 725 | pub fn with_preserve_host_header(&mut self, preserve_host: bool) -> &Self { 726 | self.inner.with_preserve_host_header(preserve_host); 727 | self 728 | } 729 | 730 | /// Include subdomains detection. 731 | #[napi] 732 | pub fn with_subdomains(&mut self, subdomains: bool) -> &Self { 733 | self.inner.configuration.with_subdomains(subdomains); 734 | self 735 | } 736 | 737 | /// Include tld detection. 738 | #[napi] 739 | pub fn with_tld(&mut self, tld: bool) -> &Self { 740 | self.inner.configuration.with_tld(tld); 741 | self 742 | } 743 | 744 | /// Only use HTTP/2. 745 | #[napi] 746 | pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &Self { 747 | self 748 | .inner 749 | .configuration 750 | .with_http2_prior_knowledge(http2_prior_knowledge); 751 | self 752 | } 753 | 754 | /// Max time to wait for request duration to milliseconds. 
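/// The value is interpreted as milliseconds and converted to a `Duration`; omitting it
/// clears any previously configured timeout. Illustrative: a value of 30000 caps each
/// request at roughly 30 seconds.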
755 | #[napi] 756 | pub fn with_request_timeout(&mut self, request_timeout: Option) -> &Self { 757 | self 758 | .inner 759 | .configuration 760 | .with_request_timeout(match request_timeout { 761 | Some(d) => Some(Duration::from_millis(d.into())), 762 | _ => None, 763 | }); 764 | self 765 | } 766 | 767 | /// add external domains 768 | #[napi] 769 | pub fn with_external_domains(&mut self, external_domains: Option>) -> &Self { 770 | self.inner.with_external_domains(match external_domains { 771 | Some(ext) => Some(ext.into_iter()), 772 | _ => None, 773 | }); 774 | self 775 | } 776 | 777 | /// Use stealth mode for the request. This does nothing without chrome. 778 | #[napi] 779 | pub fn with_stealth(&mut self, stealth_mode: Option) -> &Self { 780 | self.inner.with_stealth(match stealth_mode { 781 | Some(ext) => ext, 782 | _ => false, 783 | }); 784 | self 785 | } 786 | 787 | /// Dangerously accept invalid certificates - this should be used as a last resort. 788 | #[napi] 789 | pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: Option) -> &Self { 790 | self 791 | .inner 792 | .with_danger_accept_invalid_certs(match accept_invalid_certs { 793 | Some(ext) => ext, 794 | _ => false, 795 | }); 796 | self 797 | } 798 | 799 | #[napi] 800 | /// Set the crawling budget 801 | pub fn with_budget(&mut self, budget: Option>) -> &Self { 802 | use spider::hashbrown::hash_map::HashMap; 803 | 804 | match budget { 805 | Some(d) => { 806 | self.inner.with_budget(Some( 807 | d.iter() 808 | .map(|(k, v)| (k.as_str(), *v)) 809 | .collect::>(), 810 | )); 811 | } 812 | _ => (), 813 | } 814 | 815 | self 816 | } 817 | 818 | /// Set the max redirects allowed for request. 819 | #[napi] 820 | pub fn with_redirect_limit(&mut self, redirect_limit: u32) -> &Self { 821 | self.inner.with_redirect_limit(redirect_limit as usize); 822 | self 823 | } 824 | 825 | /// Set the redirect policy to use, either Strict or Loose by default. 826 | #[napi] 827 | pub fn with_redirect_policy(&mut self, strict: bool) -> &Self { 828 | self.inner.with_redirect_policy(if strict { 829 | RedirectPolicy::Strict 830 | } else { 831 | RedirectPolicy::Loose 832 | }); 833 | self 834 | } 835 | 836 | #[napi] 837 | /// Regex blacklist urls from the crawl 838 | pub fn with_blacklist_url(&mut self, blacklist_url: Option>) -> &Self { 839 | self 840 | .inner 841 | .configuration 842 | .with_blacklist_url(match blacklist_url { 843 | Some(v) => { 844 | let mut blacklist: Vec = Vec::new(); 845 | for item in v { 846 | blacklist.push(CompactString::new(item)); 847 | } 848 | Some(blacklist) 849 | } 850 | _ => None, 851 | }); 852 | 853 | self 854 | } 855 | 856 | #[napi] 857 | /// Regex whitelist urls from the crawl 858 | pub fn with_whitelist_url(&mut self, whitelist_url: Option>) -> &Self { 859 | self 860 | .inner 861 | .configuration 862 | .with_whitelist_url(match whitelist_url { 863 | Some(v) => { 864 | let mut whitelist: Vec = Vec::new(); 865 | for item in v { 866 | whitelist.push(CompactString::new(item)); 867 | } 868 | Some(whitelist) 869 | } 870 | _ => None, 871 | }); 872 | 873 | self 874 | } 875 | 876 | #[napi] 877 | /// Wait for a delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled. 
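/// `seconds` and `nanos` are combined into a single `Duration`; leaving both unset
/// removes the wait entirely rather than waiting for zero time.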
878 | pub fn with_wait_for_delay(&mut self, seconds: Option, nanos: Option) -> &Self { 879 | self 880 | .inner 881 | .configuration 882 | .with_wait_for_delay(if seconds.is_some() || nanos.is_some() { 883 | let duration = Duration::new( 884 | seconds.unwrap_or_default() as u64, 885 | nanos.unwrap_or_default(), 886 | ); 887 | Some(WaitForDelay::new(Some(duration))) 888 | } else { 889 | None 890 | }); 891 | 892 | self 893 | } 894 | 895 | #[napi] 896 | /// Wait for a CSS query selector. This method does nothing if the `chrome` feature is not enabled. 897 | pub fn with_wait_for_selector( 898 | &mut self, 899 | selector: Option<&str>, 900 | seconds: Option, 901 | nanos: Option, 902 | ) -> &Self { 903 | self 904 | .inner 905 | .configuration 906 | .with_wait_for_selector(if seconds.is_some() || nanos.is_some() { 907 | let duration = Duration::new( 908 | seconds.unwrap_or_default() as u64, 909 | nanos.unwrap_or_default(), 910 | ); 911 | Some(WaitForSelector::new( 912 | Some(duration), 913 | selector.unwrap_or_default().to_string(), 914 | )) 915 | } else { 916 | None 917 | }); 918 | 919 | self 920 | } 921 | 922 | #[napi] 923 | /// Wait for idle network request. This method does nothing if the `chrome` feature is not enabled. 924 | pub fn with_wait_for_idle_network(&mut self, seconds: Option, nanos: Option) -> &Self { 925 | self 926 | .inner 927 | .configuration 928 | .with_wait_for_idle_network(if seconds.is_some() || nanos.is_some() { 929 | let duration = Duration::new( 930 | seconds.unwrap_or_default() as u64, 931 | nanos.unwrap_or_default(), 932 | ); 933 | Some(WaitForIdleNetwork::new(Some(duration))) 934 | } else { 935 | None 936 | }); 937 | 938 | self 939 | } 940 | 941 | /// Setup cron jobs to run 942 | #[napi] 943 | pub fn with_cron(&mut self, cron_str: String, cron_type: Option) -> &Self { 944 | self.inner.with_cron( 945 | cron_str.as_str(), 946 | if cron_type.unwrap_or_default() == "scrape" { 947 | spider::website::CronType::Scrape 948 | } else { 949 | spider::website::CronType::Crawl 950 | }, 951 | ); 952 | self 953 | } 954 | 955 | /// Use OpenAI to generate dynamic javascript snippets. Make sure to set the `OPENAI_API_KEY` env variable. 956 | #[napi] 957 | pub fn with_openai(&mut self, env: Env, openai_configs: Option) -> &Self { 958 | use serde_json::Value; 959 | use spider::configuration::GPTConfigs; 960 | let openai_configs: Option = match openai_configs { 961 | Some(obj) => match env.from_js_value(obj) { 962 | Ok(e) => Some(e), 963 | _ => None, 964 | }, 965 | None => None, 966 | }; 967 | 968 | if let Some(configs) = openai_configs { 969 | let configs: GPTConfigs = 970 | serde_json::from_value(configs).unwrap_or_else(|_| GPTConfigs::default()); 971 | 972 | if !configs.model.is_empty() || configs.prompt_url_map.is_some() { 973 | self.inner.with_openai(Some(configs)); 974 | } 975 | } 976 | 977 | self 978 | } 979 | 980 | /// Take screenshots of web pages using chrome. 981 | #[napi] 982 | pub fn with_screenshot( 983 | &mut self, 984 | env: Env, 985 | 986 | #[napi(ts_arg_type = r#"{ 987 | /** The screenshot params. */ 988 | params: { 989 | /** Chrome DevTools Protocol screenshot options. */ 990 | cdp_params: { 991 | /** Image compression format (defaults to png). */ 992 | format: 'jpeg' | 'png' | 'webp' 993 | /** Compression quality from range [0..100] (jpeg only). */ 994 | quality: number 995 | /** Capture the screenshot of a given region only. 
*/ 996 | clip: { 997 | x: number 998 | y: number 999 | height: number 1000 | width: number 1001 | scale: number 1002 | } 1003 | /** Capture the screenshot from the surface, rather than the view. Defaults to true.*/ 1004 | from_surface: boolean 1005 | /** Capture the screenshot beyond the viewport. Defaults to false. */ 1006 | capture_beyond_viewport: boolean 1007 | } 1008 | /** Take full page screenshot */ 1009 | full_page: boolean 1010 | /** Make the background transparent (png only). */ 1011 | omit_background: boolean 1012 | } 1013 | /** Return the bytes of the screenshot on the Page. */ 1014 | bytes: boolean 1015 | /** Store the screenshot to disk. This can be used with output_dir. If disabled will not store the file to the output directory. */ 1016 | save: boolean 1017 | /** The output directory to store the file. Parent folders may be created inside the directory. */ 1018 | output_dir: string | null 1019 | }"#)] 1020 | screenshot_configs: Option, 1021 | ) -> &Self { 1022 | use serde_json::Value; 1023 | use spider::configuration::ScreenShotConfig; 1024 | let screenshot_configs: Option = match screenshot_configs { 1025 | Some(obj) => match env.from_js_value(obj) { 1026 | Ok(e) => Some(e), 1027 | _ => None, 1028 | }, 1029 | None => None, 1030 | }; 1031 | 1032 | if let Some(configs) = screenshot_configs { 1033 | let configs: ScreenShotConfig = 1034 | serde_json::from_value(configs).unwrap_or_else(|_| ScreenShotConfig::default()); 1035 | 1036 | self.inner.with_screenshot(Some(configs)); 1037 | } 1038 | 1039 | self 1040 | } 1041 | 1042 | /// Delay between request as ms. 1043 | #[napi] 1044 | pub fn with_delay(&mut self, delay: u32) -> &Self { 1045 | self.inner.configuration.with_delay(delay.into()); 1046 | self 1047 | } 1048 | 1049 | /// Set a crawl depth limit. If the value is 0 there is no limit. 1050 | #[napi] 1051 | pub fn with_depth(&mut self, depth: u32) -> &Self { 1052 | self.inner.configuration.with_depth(depth as usize); 1053 | self 1054 | } 1055 | 1056 | /// Return the links found on the page in the channel subscriptions. This method does nothing if the `decentralized` is enabled. 1057 | #[napi] 1058 | pub fn with_return_page_links(&mut self, return_page_links: bool) -> &Self { 1059 | self 1060 | .inner 1061 | .configuration 1062 | .with_return_page_links(return_page_links); 1063 | self 1064 | } 1065 | 1066 | /// Cache the page following HTTP rules. 1067 | #[napi] 1068 | pub fn with_caching(&mut self, cache: bool) -> &Self { 1069 | self.inner.configuration.with_caching(cache); 1070 | self 1071 | } 1072 | 1073 | /// Set the sitemap url. 1074 | #[napi] 1075 | pub fn with_sitemap(&mut self, sitemap: Option<&str>) -> &Self { 1076 | self.inner.configuration.with_sitemap(sitemap); 1077 | self 1078 | } 1079 | 1080 | /// Use proxies for request. 
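/// Accepts a list of proxy URLs that is forwarded unchanged to the spider configuration.
/// Illustrative only, with a placeholder address and napi-rs's camelCase naming assumed:
/// `website.withProxies(['http://localhost:8118'])`.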
1081 | #[napi] 1082 | pub fn with_proxies(&mut self, proxies: Option>) -> &Self { 1083 | self.inner.configuration.with_proxies(proxies); 1084 | self 1085 | } 1086 | 1087 | #[napi] 1088 | /// build the inner website - not required for all builder_steps 1089 | pub fn build(&mut self) -> &Self { 1090 | match self.inner.build() { 1091 | Ok(w) => self.inner = w, 1092 | _ => (), 1093 | } 1094 | self 1095 | } 1096 | } 1097 | 1098 | /// a runner for handling crons 1099 | #[napi] 1100 | pub struct Cron { 1101 | /// the runner task 1102 | inner: spider::async_job::Runner, 1103 | /// inner cron handle 1104 | cron_handle: Option>, 1105 | } 1106 | 1107 | #[napi] 1108 | impl Cron { 1109 | /// stop the cron instance 1110 | #[napi] 1111 | pub async unsafe fn stop(&mut self) { 1112 | self.inner.stop().await; 1113 | match &self.cron_handle { 1114 | Some(h) => h.abort(), 1115 | _ => (), 1116 | } 1117 | } 1118 | } 1119 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "strict": true, 5 | "lib": ["es2016", "dom"], 6 | "types": ["node"], 7 | "skipLibCheck": true 8 | } 9 | } 10 | --------------------------------------------------------------------------------