├── .github └── workflows │ ├── .keep │ └── cd.yml ├── .gitignore ├── .gitmodules ├── .vscode ├── exensions.json ├── launch.json ├── settings.json └── tasks.json ├── Dockerfile ├── LICENSE ├── README.md ├── integrity-check.cjs ├── package-lock.json ├── package.json ├── public ├── favicon.ico └── robots.txt ├── src ├── api │ ├── crawler.ts │ ├── searcher.ts │ └── serp.ts ├── cloud-functions │ ├── adaptive-crawler.ts │ └── data-crunching.ts ├── db │ ├── adaptive-crawl-task.ts │ ├── crawled.ts │ ├── domain-blockade.ts │ ├── domain-profile.ts │ ├── img-alt.ts │ ├── pdf.ts │ └── searched.ts ├── dto │ ├── adaptive-crawler-options.ts │ ├── crawler-options.ts │ ├── jina-embeddings-auth.ts │ └── turndown-tweakable-options.ts ├── fetch.d.ts ├── lib │ └── transform-server-event-stream.ts ├── services │ ├── alt-text.ts │ ├── async-context.ts │ ├── blackhole-detector.ts │ ├── brave-search.ts │ ├── canvas.ts │ ├── cf-browser-rendering.ts │ ├── curl.ts │ ├── errors.ts │ ├── finalizer.ts │ ├── geoip.ts │ ├── jsdom.ts │ ├── lm.ts │ ├── logger.ts │ ├── minimal-stealth.js │ ├── misc.ts │ ├── pdf-extract.ts │ ├── pseudo-transfer.ts │ ├── puppeteer.ts │ ├── registry.ts │ ├── robots-text.ts │ ├── serp │ │ ├── compat.ts │ │ ├── google.ts │ │ ├── internal.ts │ │ ├── puppeteer.ts │ │ └── serper.ts │ ├── serper-search.ts │ ├── snapshot-formatter.ts │ ├── temp-file.ts │ └── threaded.ts ├── shared ├── stand-alone │ ├── crawl.ts │ ├── search.ts │ └── serp.ts ├── types.d.ts └── utils │ ├── encoding.ts │ ├── get-function-url.ts │ ├── ip.ts │ ├── markdown.ts │ ├── misc.ts │ └── tailwind-classes.ts └── tsconfig.json /.github/workflows/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/reader/5f07900eabe07b1dd0e8d09e6c8ea022e6b2c176/.github/workflows/.keep -------------------------------------------------------------------------------- /.github/workflows/cd.yml: 
-------------------------------------------------------------------------------- 1 | run-name: Build push and deploy (CD) 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - ci-debug 7 | - dev 8 | tags: 9 | - '*' 10 | 11 | jobs: 12 | build-and-push-to-gcr: 13 | runs-on: ubuntu-latest 14 | concurrency: 15 | group: ${{ github.ref_type == 'branch' && github.ref }} 16 | cancel-in-progress: true 17 | permissions: 18 | contents: read 19 | steps: 20 | - uses: actions/checkout@v4 21 | with: 22 | lfs: true 23 | submodules: true 24 | token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }} 25 | - uses: 'google-github-actions/auth@v2' 26 | with: 27 | credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}' 28 | - name: 'Set up Cloud SDK' 29 | uses: 'google-github-actions/setup-gcloud@v2' 30 | with: 31 | install_components: beta 32 | - name: "Docker auth" 33 | run: |- 34 | gcloud auth configure-docker us-docker.pkg.dev --quiet 35 | - name: Set controller release version 36 | run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 37 | - name: Set up Node.js 38 | uses: actions/setup-node@v4 39 | with: 40 | node-version: 22.12.0 41 | cache: npm 42 | 43 | - name: npm install 44 | run: npm ci 45 | - name: get maxmind mmdb 46 | run: mkdir -p licensed && curl -o licensed/GeoLite2-City.mmdb https://raw.githubusercontent.com/P3TERX/GeoLite.mmdb/download/GeoLite2-City.mmdb 47 | - name: get source han sans font 48 | run: curl -o licensed/SourceHanSansSC-Regular.otf https://raw.githubusercontent.com/adobe-fonts/source-han-sans/refs/heads/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf 49 | - name: build application 50 | run: npm run build 51 | - name: Set package version 52 | run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }} 53 | if: github.ref_type == 'tag' 54 | - name: Docker meta 55 | id: meta 56 | uses: docker/metadata-action@v5 57 | with: 58 | images: | 59 | us-docker.pkg.dev/reader-6b7dc/jina-reader/reader 60 | - name: Set up QEMU 61 | uses: 
docker/setup-qemu-action@v3 62 | - name: Set up Docker Buildx 63 | uses: docker/setup-buildx-action@v3 64 | - name: Build and push 65 | id: container 66 | uses: docker/build-push-action@v6 67 | with: 68 | context: . 69 | push: true 70 | tags: ${{ steps.meta.outputs.tags }} 71 | labels: ${{ steps.meta.outputs.labels }} 72 | - name: Deploy CRAWL with Tag 73 | run: | 74 | gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 75 | - name: Deploy SEARCH with Tag 76 | run: | 77 | gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 78 | - name: Deploy SERP with Tag 79 | run: | 80 | gcloud beta run deploy serp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 81 | - name: Deploy CRAWL-EU with Tag 82 | run: | 83 | gcloud beta run deploy crawl-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2 84 | - name: Deploy SEARCH-EU with Tag 85 | run: | 86 | gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 
--deploy-health-check --use-http2 87 | - name: Deploy SERP-HK with Tag 88 | run: | 89 | gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | firebase-debug.log* 8 | firebase-debug.*.log* 9 | 10 | # Firebase cache 11 | .firebase/ 12 | 13 | # Firebase config 14 | 15 | # Uncomment this if you'd like others to create their own Firebase project. 16 | # For a team working on the same Firebase project(s), it is recommended to leave 17 | # it commented so all members can deploy to the same project(s) in .firebaserc. 
18 | # .firebaserc 19 | 20 | # Runtime data 21 | pids 22 | *.pid 23 | *.seed 24 | *.pid.lock 25 | 26 | # Directory for instrumented libs generated by jscoverage/JSCover 27 | lib-cov 28 | 29 | # Coverage directory used by tools like istanbul 30 | coverage 31 | 32 | # nyc test coverage 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 36 | .grunt 37 | 38 | # Bower dependency directory (https://bower.io/) 39 | bower_components 40 | 41 | # node-waf configuration 42 | .lock-wscript 43 | 44 | # Compiled binary addons (http://nodejs.org/api/addons.html) 45 | build/Release 46 | 47 | # Dependency directories 48 | node_modules/ 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | .node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | .secret.local 68 | 69 | toy*.ts 70 | 71 | .DS_Store 72 | build/ 73 | .firebase-emu/ 74 | *.log 75 | .DS_Store 76 | 77 | *.local 78 | .secret.* 79 | licensed/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thinapps-shared"] 2 | path = thinapps-shared 3 | url = git@github.com:jina-ai/thinapps-shared.git 4 | -------------------------------------------------------------------------------- /.vscode/exensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "editorconfig.editorconfig", 4 | "octref.vetur", 5 | "redhat.vscode-yaml", 6 | "dbaeumer.vscode-eslint", 7 | "esbenp.prettier-vscode", 8 | "streetsidesoftware.code-spell-checker" 9 | ] 10 | } -------------------------------------------------------------------------------- /.vscode/launch.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach", 6 | "port": 9229, 7 | "request": "attach", 8 | "skipFiles": [ 9 | "/**" 10 | ], 11 | "type": "node" 12 | }, 13 | { 14 | "name": "Attach by Process ID", 15 | "processId": "${command:PickProcess}", 16 | "request": "attach", 17 | "skipFiles": [ 18 | "/**" 19 | ], 20 | "type": "node" 21 | }, 22 | { 23 | "name": "Debug Stand Alone Crawl", 24 | "request": "launch", 25 | "runtimeArgs": [ 26 | "--env-file=.secret.local", 27 | ], 28 | "env": { 29 | "GCLOUD_PROJECT": "reader-6b7dc", 30 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 31 | }, 32 | "cwd": "${workspaceFolder}", 33 | "program": "build/stand-alone/crawl.js", 34 | "skipFiles": [ 35 | "/**" 36 | ], 37 | "type": "node", 38 | "outputCapture": "std", 39 | "preLaunchTask": "Backend:build:watch", 40 | "killBehavior": "forceful" 41 | }, 42 | { 43 | "name": "Debug Stand Alone Crawl + Browser", 44 | "request": "launch", 45 | "runtimeArgs": [ 46 | "--env-file=.secret.local", 47 | ], 48 | "env": { 49 | "GCLOUD_PROJECT": "reader-6b7dc", 50 | "DEBUG_BROWSER": "true", 51 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 52 | }, 53 | "cwd": "${workspaceFolder}", 54 | "program": "build/stand-alone/crawl.js", 55 | "skipFiles": [ 56 | "/**" 57 | ], 58 | "type": "node", 59 | "outputCapture": "std", 60 | "preLaunchTask": "Backend:build:watch", 61 | "killBehavior": "forceful" 62 | }, 63 | { 64 | "name": "Debug Stand Alone Crawl - EU", 65 | "request": "launch", 66 | "runtimeArgs": [ 67 | "--env-file=.secret.local", 68 | ], 69 | "env": { 70 | "GCLOUD_PROJECT": "reader-6b7dc", 71 | "FIRESTORE_DATABASE": "reader-eu", 72 | "GCP_STORAGE_BUCKET": "reader-eu", 73 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 74 | }, 75 | "cwd": "${workspaceFolder}", 76 | "program": "build/stand-alone/crawl.js", 77 | "skipFiles": [ 78 | "/**" 79 | 
], 80 | "type": "node", 81 | "outputCapture": "std", 82 | "preLaunchTask": "Backend:build:watch", 83 | "killBehavior": "forceful" 84 | }, 85 | { 86 | "name": "Debug Stand Alone Search", 87 | "request": "launch", 88 | "runtimeArgs": [ 89 | "--env-file=.secret.local", 90 | ], 91 | "env": { 92 | "GCLOUD_PROJECT": "reader-6b7dc", 93 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 94 | }, 95 | "cwd": "${workspaceFolder}", 96 | "program": "build/stand-alone/search.js", 97 | "skipFiles": [ 98 | "/**" 99 | ], 100 | "type": "node", 101 | "outputCapture": "std", 102 | "preLaunchTask": "Backend:build:watch", 103 | "killBehavior": "forceful" 104 | }, 105 | { 106 | "name": "Debug Stand Alone SERP", 107 | "request": "launch", 108 | "runtimeArgs": [ 109 | "--env-file=.secret.local", 110 | ], 111 | "env": { 112 | "GCLOUD_PROJECT": "reader-6b7dc", 113 | "PREFERRED_PROXY_COUNTRY": "hk", 114 | "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk", 115 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 116 | }, 117 | "cwd": "${workspaceFolder}", 118 | "program": "build/stand-alone/serp.js", 119 | "skipFiles": [ 120 | "/**" 121 | ], 122 | "type": "node", 123 | "outputCapture": "std", 124 | "preLaunchTask": "Backend:build:watch", 125 | "killBehavior": "forceful" 126 | }, 127 | ] 128 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.wordWrap": "on", 3 | "editor.wordWrapColumn": 120, 4 | "files.trimTrailingWhitespace": true, 5 | "files.trimFinalNewlines": true, 6 | "[javascript]": { 7 | "editor.defaultFormatter": "vscode.typescript-language-features" 8 | }, 9 | "[jsonc]": { 10 | "editor.defaultFormatter": "vscode.json-language-features" 11 | }, 12 | "[typescript]": { 13 | "editor.defaultFormatter": "vscode.typescript-language-features" 14 | }, 15 | "[json]": { 16 | "editor.defaultFormatter": 
"vscode.json-language-features" 17 | }, 18 | "[yaml]": { 19 | "editor.defaultFormatter": "redhat.vscode-yaml" 20 | }, 21 | "[markdown]": { 22 | "files.trimTrailingWhitespace": false 23 | }, 24 | "typescript.tsdk": "node_modules/typescript/lib", 25 | "typescript.preferences.quoteStyle": "single", 26 | "typescript.format.semicolons": "insert", 27 | "typescript.preferences.importModuleSpecifier": "project-relative", 28 | "typescript.locale": "en", 29 | "cSpell.enabled": true, 30 | "cSpell.words": [ 31 | ], 32 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "npm", 6 | "script": "build", 7 | "group": "build", 8 | "options": { 9 | "cwd": "${workspaceFolder}" 10 | }, 11 | "problemMatcher": [], 12 | "label": "Backend:rebuild", 13 | "detail": "Backend:rebuild" 14 | }, 15 | { 16 | "type": "typescript", 17 | "options": { 18 | "cwd": "${workspaceFolder}" 19 | }, 20 | "tsconfig": "tsconfig.json", 21 | "option": "watch", 22 | "isBackground": true, 23 | "problemMatcher": [ 24 | "$tsc-watch" 25 | ], 26 | "group": "build", 27 | "label": "Backend:build:watch" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye 3 | 4 | FROM node:22 5 | 6 | RUN apt-get update \ 7 | && apt-get install -y wget gnupg \ 8 | && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ 9 | && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ 10 | && apt-get update \ 11 | && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst 
fonts-freefont-ttf libxss1 zstd \ 12 | --no-install-recommends \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | COPY --from=0 /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so 16 | 17 | RUN groupadd -r jina 18 | RUN useradd -g jina -G audio,video -m jina 19 | USER jina 20 | 21 | WORKDIR /app 22 | 23 | COPY package.json package-lock.json ./ 24 | RUN npm ci 25 | 26 | COPY build ./build 27 | COPY public ./public 28 | COPY licensed ./licensed 29 | 30 | RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium 31 | 32 | RUN NODE_COMPILE_CACHE=node_modules npm run dry-run 33 | 34 | ENV OVERRIDE_CHROME_EXECUTABLE_PATH=/usr/bin/google-chrome-stable 35 | ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no 36 | ENV NODE_COMPILE_CACHE=node_modules 37 | ENV PORT=8080 38 | 39 | EXPOSE 3000 3001 8080 8081 40 | ENTRYPOINT ["node"] 41 | CMD [ "build/stand-alone/crawl.js" ] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020-2024 Jina AI Limited. All rights reserved. 2 | 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. 
For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 
50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. 
Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. 
You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 
125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. 
In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | 181 | Copyright 2020-2021 Jina AI Limited 182 | 183 | Licensed under the Apache License, Version 2.0 (the "License"); 184 | you may not use this file except in compliance with the License. 
185 | You may obtain a copy of the License at 186 | 187 | http://www.apache.org/licenses/LICENSE-2.0 188 | 189 | Unless required by applicable law or agreed to in writing, software 190 | distributed under the License is distributed on an "AS IS" BASIS, 191 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 192 | See the License for the specific language governing permissions and 193 | limitations under the License. 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reader 2 | 3 | Your LLMs deserve better input. 4 | 5 | Reader does two things: 6 | - **Read**: It converts any URL to an **LLM-friendly** input with `https://r.jina.ai/https://your.url`. Get improved output for your agent and RAG systems at no cost. 7 | - **Search**: It searches the web for a given query with `https://s.jina.ai/your+query`. This allows your LLMs to access the latest world knowledge from the web. 8 | 9 | Check out [the live demo](https://jina.ai/reader#demo) 10 | 11 | Or just visit these URLs (**Read**) https://r.jina.ai/https://github.com/jina-ai/reader, (**Search**) https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F and see yourself. 12 | 13 | > Feel free to use Reader API in production. It is free, stable and scalable. We are maintaining it actively as one of the core products of Jina AI. [Check out rate limit](https://jina.ai/reader#pricing) 14 | 15 | image 16 | image 17 | 18 | 19 | ## Updates 20 | 21 | - **2024-07-15**: To restrict the results of `s.jina.ai` to certain domain/website, you can set e.g. `site=jina.ai` in the query parameters, which enables in-site search. For more options, [try our updated live-demo](https://jina.ai/reader/#apiform). 22 | - **2024-05-30**: Reader can now read arbitrary PDF from any URL!
Check out [this PDF result from NASA.gov](https://r.jina.ai/https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf) vs [the original](https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf). 23 | - **2024-05-15**: We introduced a new endpoint `s.jina.ai` that searches on the web and return top-5 results, each in a LLM-friendly format. [Read more about this new feature here](https://jina.ai/news/jina-reader-for-search-grounding-to-improve-factuality-of-llms). 24 | - **2024-05-08**: Image caption is off by default for better latency. To turn it on, set `x-with-generated-alt: true` in the request header. 25 | - **2024-04-24**: You now have more fine-grained control over Reader API [using headers](#using-request-headers), e.g. forwarding cookies, using HTTP proxy. 26 | - **2024-04-15**: Reader now supports image reading! It captions all images at the specified URL and adds `Image [idx]: [caption]` as an alt tag (if they initially lack one). This enables downstream LLMs to interact with the images in reasoning, summarizing etc. [See example here](https://x.com/JinaAI_/status/1780094402071023926). 27 | 28 | ## Usage 29 | 30 | ### Using `r.jina.ai` for single URL fetching 31 | Simply prepend `https://r.jina.ai/` to any URL. For example, to convert the URL `https://en.wikipedia.org/wiki/Artificial_intelligence` to an LLM-friendly input, use the following URL: 32 | 33 | [https://r.jina.ai/https://en.wikipedia.org/wiki/Artificial_intelligence](https://r.jina.ai/https://en.wikipedia.org/wiki/Artificial_intelligence) 34 | 35 | ### [Using `r.jina.ai` for a full website fetching (Google Colab)](https://colab.research.google.com/drive/1uoBy6_7BhxqpFQ45vuhgDDDGwstaCt4P#scrollTo=5LQjzJiT9ewT) 36 | 37 | ### Using `s.jina.ai` for web search 38 | Simply prepend `https://s.jina.ai/` to your search query. Note that if you are using this in the code, make sure to encode your search query first, e.g. 
if your query is `Who will win 2024 US presidential election?` then your url should look like: 39 | 40 | [https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F](https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F) 41 | 42 | Behind the scenes, Reader searches the web, fetches the top 5 results, visits each URL, and applies `r.jina.ai` to it. This is different from many `web search function-calling` in agent/RAG frameworks, which often return only the title, URL, and description provided by the search engine API. If you want to read one result more deeply, you have to fetch the content yourself from that URL. With Reader, `http://s.jina.ai` automatically fetches the content from the top 5 search result URLs for you (reusing the tech stack behind `http://r.jina.ai`). This means you don't have to handle browser rendering, blocking, or any issues related to JavaScript and CSS yourself. 43 | 44 | ### Using `s.jina.ai` for in-site search 45 | Simply specify `site` in the query parameters such as: 46 | 47 | ```bash 48 | curl 'https://s.jina.ai/When%20was%20Jina%20AI%20founded%3F?site=jina.ai&site=github.com' 49 | ``` 50 | 51 | ### [Interactive Code Snippet Builder](https://jina.ai/reader#apiform) 52 | 53 | We highly recommend using the code builder to explore different parameter combinations of the Reader API. 54 | 55 | image 56 | 57 | 58 | ### Using request headers 59 | 60 | As you have already seen above, one can control the behavior of the Reader API using request headers. Here is a complete list of supported headers. 61 | 62 | - You can enable the image caption feature via the `x-with-generated-alt: true` header. 63 | - You can ask the Reader API to forward cookies settings via the `x-set-cookie` header. 64 | - Note that requests with cookies will not be cached. 
65 | - You can bypass `readability` filtering via the `x-respond-with` header, specifically: 66 | - `x-respond-with: markdown` returns markdown *without* going through `reability` 67 | - `x-respond-with: html` returns `documentElement.outerHTML` 68 | - `x-respond-with: text` returns `document.body.innerText` 69 | - `x-respond-with: screenshot` returns the URL of the webpage's screenshot 70 | - You can specify a proxy server via the `x-proxy-url` header. 71 | - You can customize cache tolerance via the `x-cache-tolerance` header (integer in seconds). 72 | - You can bypass the cached page (lifetime 3600s) via the `x-no-cache: true` header (equivalent of `x-cache-tolerance: 0`). 73 | - If you already know the HTML structure of your target page, you may specify `x-target-selector` or `x-wait-for-selector` to direct the Reader API to focus on a specific part of the page. 74 | - By setting `x-target-selector` header to a CSS selector, the Reader API return the content within the matched element, instead of the full HTML. Setting this header is useful when the automatic content extraction fails to capture the desired content and you can manually select the correct target. 75 | - By setting `x-wait-for-selector` header to a CSS selector, the Reader API will wait until the matched element is rendered before returning the content. If you already specified `x-wait-for-selector`, this header can be omitted if you plan to wait for the same element. 76 | 77 | ### Using `r.jina.ai` for single page application (SPA) fetching 78 | Many websites nowadays rely on JavaScript frameworks and client-side rendering. Usually known as Single Page Application (SPA). Thanks to [Puppeteer](https://github.com/puppeteer/puppeteer) and headless Chrome browser, Reader natively supports fetching these websites. However, due to specific approach some SPA are developed, there may be some extra precautions to take. 
79 | 80 | #### SPAs with hash-based routing 81 | By definition of the web standards, content come after `#` in a URL is not sent to the server. To mitigate this issue, use `POST` method with `url` parameter in body. 82 | 83 | ```bash 84 | curl -X POST 'https://r.jina.ai/' -d 'url=https://example.com/#/route' 85 | ``` 86 | 87 | #### SPAs with preloading contents 88 | Some SPAs, or even some websites that are not strictly SPAs, may show preload contents before later loading the main content dynamically. In this case, Reader may be capturing the preload content instead of the main content. To mitigate this issue, here are some possible solutions: 89 | 90 | ##### Specifying `x-timeout` 91 | When timeout is explicitly specified, Reader will not attempt to return early and will wait for network idle until the timeout is reached. This is useful when the target website will eventually come to a network idle. 92 | 93 | ```bash 94 | curl 'https://example.com/' -H 'x-timeout: 30' 95 | ``` 96 | 97 | ##### Specifying `x-wait-for-selector` 98 | When wait-for-selector is explicitly specified, Reader will wait for the appearance of the specified CSS selector until timeout is reached. This is useful when you know exactly what element to wait for. 99 | 100 | ```bash 101 | curl 'https://example.com/' -H 'x-wait-for-selector: #content' 102 | ``` 103 | 104 | ### Streaming mode 105 | 106 | Streaming mode is useful when you find that the standard mode provides an incomplete result. This is because the Reader will wait a bit longer until the page is *stablely* rendered. Use the accept-header to toggle the streaming mode: 107 | 108 | ```bash 109 | curl -H "Accept: text/event-stream" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page 110 | ``` 111 | 112 | The data comes in a stream; each subsequent chunk contains more complete information. 
**The last chunk should provide the most complete and final result.** If you come from LLMs, please note that it is a different behavior than the LLMs' text-generation streaming. 113 | 114 | For example, compare these two curl commands below. You can see streaming one gives you complete information at last, whereas standard mode does not. This is because the content loading on this particular site is triggered by some js *after* the page is fully loaded, and standard mode returns the page "too soon". 115 | ```bash 116 | curl -H 'x-no-cache: true' https://access.redhat.com/security/cve/CVE-2023-45853 117 | curl -H "Accept: text/event-stream" -H 'x-no-cache: true' https://r.jina.ai/https://access.redhat.com/security/cve/CVE-2023-45853 118 | ``` 119 | 120 | > Note: `-H 'x-no-cache: true'` is used only for demonstration purposes to bypass the cache. 121 | 122 | Streaming mode is also useful if your downstream LLM/agent system requires immediate content delivery or needs to process data in chunks to interleave I/O and LLM processing times. This allows for quicker access and more efficient data handling: 123 | 124 | ```text 125 | Reader API: streamContent1 ----> streamContent2 ----> streamContent3 ---> ... 126 | | | | 127 | v | | 128 | Your LLM: LLM(streamContent1) | | 129 | v | 130 | LLM(streamContent2) | 131 | v 132 | LLM(streamContent3) 133 | ``` 134 | 135 | Note that in terms of completeness: `... > streamContent3 > streamContent2 > streamContent1`, each subsequent chunk contains more complete information. 136 | 137 | ### JSON mode 138 | 139 | This is still very early and the result is not really a "useful" JSON. It contains three fields `url`, `title` and `content` only. Nonetheless, you can use accept-header to control the output format: 140 | ```bash 141 | curl -H "Accept: application/json" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page 142 | ``` 143 | 144 | JSON mode is probably more useful in `s.jina.ai` than `r.jina.ai`. 
For `s.jina.ai` with JSON mode, it returns 5 results in a list, each in the structure of `{'title', 'content', 'url'}`. 145 | 146 | ### Generated alt 147 | 148 | All images in that page that lack `alt` tag can be auto-captioned by a VLM (vision langauge model) and formatted as `!(Image [idx]: [VLM_caption])[img_URL]`. This should give your downstream text-only LLM *just enough* hints to include those images into reasoning, selecting, and summarization. Use the x-with-generated-alt header to toggle the streaming mode: 149 | 150 | ```bash 151 | curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page 152 | ``` 153 | 154 | ## How it works 155 | [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/jina-ai/reader) 156 | 157 | ## What is `thinapps-shared` submodule? 158 | 159 | You might notice a reference to `thinapps-shared` submodule, an internal package we use to share code across our products. While it’s not open-sourced and isn't integral to the Reader's functions, it mainly helps with decorators, logging, secrets management, etc. Feel free to ignore it for now. 160 | 161 | That said, this is *the single codebase* behind `https://r.jina.ai`, so everytime we commit here, we will deploy the new version to the `https://r.jina.ai`. 162 | 163 | ## Having trouble on some websites? 164 | Please raise an issue with the URL you are having trouble with. We will look into it and try to fix it. 165 | 166 | ## License 167 | Reader is backed by [Jina AI](https://jina.ai) and licensed under [Apache-2.0](./LICENSE). 
168 | -------------------------------------------------------------------------------- /integrity-check.cjs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs'); 4 | const path = require('path'); 5 | 6 | const file = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb'); 7 | 8 | if (!fs.existsSync(file)) { 9 | console.error(`Integrity check failed: ${file} does not exist.`); 10 | process.exit(1); 11 | } 12 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reader", 3 | "scripts": { 4 | "lint": "eslint --ext .js,.ts .", 5 | "build": "node ./integrity-check.cjs && tsc -p .", 6 | "build:watch": "tsc --watch", 7 | "build:clean": "rm -rf ./build", 8 | "serve": "npm run build && npm run start", 9 | "debug": "npm run build && npm run dev", 10 | "start": "node ./build/stand-alone/crawl.js", 11 | "dry-run": "NODE_ENV=dry-run node ./build/stand-alone/search.js" 12 | }, 13 | "engines": { 14 | "node": ">=18" 15 | }, 16 | "main": "build/index.js", 17 | "dependencies": { 18 | "@esm2cjs/normalize-url": "^8.0.0", 19 | "@google-cloud/translate": "^8.2.0", 20 | "@koa/bodyparser": "^5.1.1", 21 | "@mozilla/readability": "^0.6.0", 22 | "@napi-rs/canvas": "^0.1.68", 23 | "@types/turndown": "^5.0.4", 24 | "@xmldom/xmldom": "^0.9.3", 25 | "archiver": "^6.0.1", 26 | "axios": "^1.3.3", 27 | "bcrypt": "^5.1.0", 28 | "busboy": "^1.6.0", 29 | "civkit": "^0.9.0-2570394", 30 | "cors": "^2.8.5", 31 | "dayjs": "^1.11.9", 32 | "express": "^4.19.2", 33 | "firebase-admin": "^12.1.0", 34 | "firebase-functions": "^6.1.1", 35 | "htmlparser2": "^9.0.0", 36 | "jose": "^5.1.0", 37 | "koa": "^2.16.0", 38 | "koa-compress": "^5.1.1", 39 | "langdetect": "^0.2.1", 40 | "linkedom": "^0.18.4", 41 | "lru-cache": "^11.0.2", 42 | "maxmind": "^4.3.18", 43 | "minio": "^7.1.3", 44 | 
"node-libcurl": "^4.1.0", 45 | "openai": "^4.20.0", 46 | "pdfjs-dist": "^4.10.38", 47 | "puppeteer": "^23.3.0", 48 | "puppeteer-extra": "^3.3.6", 49 | "puppeteer-extra-plugin-block-resources": "^2.4.3", 50 | "robots-parser": "^3.0.1", 51 | "set-cookie-parser": "^2.6.0", 52 | "simple-zstd": "^1.4.2", 53 | "stripe": "^11.11.0", 54 | "svg2png-wasm": "^1.4.1", 55 | "tiktoken": "^1.0.16", 56 | "tld-extract": "^2.1.0", 57 | "turndown": "^7.1.3", 58 | "turndown-plugin-gfm": "^1.0.2", 59 | "undici": "^7.8.0" 60 | }, 61 | "devDependencies": { 62 | "@types/archiver": "^5.3.4", 63 | "@types/bcrypt": "^5.0.0", 64 | "@types/busboy": "^1.5.4", 65 | "@types/cors": "^2.8.17", 66 | "@types/koa": "^2.15.0", 67 | "@types/koa-compress": "^4.0.6", 68 | "@types/node": "^20.14.13", 69 | "@types/set-cookie-parser": "^2.4.7", 70 | "@types/xmldom": "^0.1.34", 71 | "@typescript-eslint/eslint-plugin": "^5.12.0", 72 | "@typescript-eslint/parser": "^5.12.0", 73 | "eslint": "^8.9.0", 74 | "eslint-config-google": "^0.14.0", 75 | "eslint-plugin-import": "^2.25.4", 76 | "firebase-functions-test": "^3.0.0", 77 | "pino-pretty": "^13.0.0", 78 | "replicate": "^0.16.1", 79 | "typescript": "^5.5.4" 80 | }, 81 | "private": true, 82 | "exports": { 83 | ".": "./build/index.js" 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/reader/5f07900eabe07b1dd0e8d09e6c8ea022e6b2c176/public/favicon.ico -------------------------------------------------------------------------------- /public/robots.txt: -------------------------------------------------------------------------------- 1 | User-Agent: * 2 | Disallow: / 3 | -------------------------------------------------------------------------------- /src/cloud-functions/data-crunching.ts: -------------------------------------------------------------------------------- 1 | import 
{ 2 | Defer, 3 | PromiseThrottle, 4 | RPCHost, 5 | } from 'civkit'; 6 | import { singleton } from 'tsyringe'; 7 | import { 8 | // CloudScheduleV2, CloudTaskV2, 9 | FirebaseStorageBucketControl, Logger, Param, TempFileManager 10 | } from '../shared'; 11 | import _ from 'lodash'; 12 | import { CrawlerHost } from '../api/crawler'; 13 | 14 | import { Crawled } from '../db/crawled'; 15 | import dayjs from 'dayjs'; 16 | import { createReadStream } from 'fs'; 17 | import { appendFile } from 'fs/promises'; 18 | import { createGzip } from 'zlib'; 19 | import { getFunctions } from 'firebase-admin/functions'; 20 | import { SnapshotFormatter } from '../services/snapshot-formatter'; 21 | import { getFunctionUrl } from '../utils/get-function-url'; 22 | 23 | dayjs.extend(require('dayjs/plugin/utc')); 24 | 25 | @singleton() 26 | export class DataCrunchingHost extends RPCHost { 27 | logger = this.globalLogger.child({ service: this.constructor.name }); 28 | 29 | pageCacheCrunchingPrefix = 'crunched-pages'; 30 | pageCacheCrunchingBatchSize = 5000; 31 | pageCacheCrunchingTMinus = 6 * 24 * 60 * 60 * 1000; 32 | rev = 7; 33 | 34 | constructor( 35 | protected globalLogger: Logger, 36 | 37 | protected crawler: CrawlerHost, 38 | protected snapshotFormatter: SnapshotFormatter, 39 | protected tempFileManager: TempFileManager, 40 | protected firebaseObjectStorage: FirebaseStorageBucketControl, 41 | ) { 42 | super(..._.without(arguments, crawler)); 43 | } 44 | 45 | override async init() { 46 | await this.dependencyReady(); 47 | 48 | this.emit('ready'); 49 | } 50 | 51 | // @CloudTaskV2({ 52 | // runtime: { 53 | // cpu: 2, 54 | // memory: '4GiB', 55 | // timeoutSeconds: 3600, 56 | // concurrency: 2, 57 | // maxInstances: 200, 58 | // retryConfig: { 59 | // maxAttempts: 3, 60 | // minBackoffSeconds: 60, 61 | // }, 62 | // rateLimits: { 63 | // maxConcurrentDispatches: 150, 64 | // maxDispatchesPerSecond: 2, 65 | // }, 66 | // }, 67 | // tags: ['DataCrunching'], 68 | // }) 69 | async 
crunchPageCacheWorker( 70 | @Param('date') date: string, 71 | @Param('offset', { default: 0 }) offset: number 72 | ) { 73 | this.logger.info(`Crunching page cache @${date}+${offset}...`); 74 | for await (const { fileName, records } of this.iterPageCacheRecords(date, offset)) { 75 | this.logger.info(`Crunching ${fileName}...`); 76 | const fileOnDrive = await this.crunchCacheRecords(records); 77 | const fstream = createReadStream(fileOnDrive.path); 78 | const gzipStream = createGzip(); 79 | fstream.pipe(gzipStream, { end: true }); 80 | await this.firebaseObjectStorage.bucket.file(fileName).save(gzipStream, { 81 | contentType: 'application/jsonl+gzip', 82 | }); 83 | } 84 | 85 | this.logger.info(`Crunching page cache @${date}+${offset} done.`); 86 | 87 | return true; 88 | } 89 | 90 | // @CloudScheduleV2('2 0 * * *', { 91 | // name: 'crunchPageCacheEveryday', 92 | // runtime: { 93 | // cpu: 2, 94 | // memory: '4GiB', 95 | // timeoutSeconds: 1800, 96 | // timeZone: 'UTC', 97 | // retryCount: 3, 98 | // minBackoffSeconds: 60, 99 | // }, 100 | // tags: ['DataCrunching'], 101 | // }) 102 | async dispatchPageCacheCrunching() { 103 | for await (const { fileName, date, offset } of this.iterPageCacheChunks()) { 104 | this.logger.info(`Dispatching ${fileName}...`); 105 | // sse.write({ data: `Dispatching ${fileName}...` }); 106 | 107 | await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, { 108 | dispatchDeadlineSeconds: 1800, 109 | uri: await getFunctionUrl('crunchPageCacheWorker'), 110 | }); 111 | } 112 | 113 | return true; 114 | } 115 | 116 | // @CloudHTTPv2({ 117 | // runtime: { 118 | // cpu: 2, 119 | // memory: '4GiB', 120 | // timeoutSeconds: 3600, 121 | // concurrency: 2, 122 | // maxInstances: 200, 123 | // }, 124 | // tags: ['DataCrunching'], 125 | // }) 126 | // async dispatchPageCacheCrunching( 127 | // @RPCReflect() rpcReflect: RPCReflection 128 | // ) { 129 | // const sse = new OutputServerEventStream({ highWaterMark: 4096 }); 130 | // 
rpcReflect.return(sse); 131 | // rpcReflect.catch((err) => { 132 | // sse.end({ data: `Error: ${err.message}` }); 133 | // }); 134 | // for await (const { fileName, date, offset } of this.iterPageCacheChunks()) { 135 | // this.logger.info(`Dispatching ${fileName}...`); 136 | // sse.write({ data: `Dispatching ${fileName}...` }); 137 | 138 | // await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, { 139 | // dispatchDeadlineSeconds: 1800, 140 | // uri: await getFunctionUrl('crunchPageCacheWorker'), 141 | // }); 142 | // } 143 | 144 | // sse.end({ data: 'done' }); 145 | 146 | // return true; 147 | // } 148 | 149 | async* iterPageCacheRecords(date?: string, inputOffset?: number | string) { 150 | const startOfToday = dayjs().utc().startOf('day'); 151 | const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day'); 152 | let theDay = startingPoint; 153 | 154 | if (date) { 155 | theDay = dayjs(date).utc().startOf('day'); 156 | } 157 | 158 | let counter = 0; 159 | if (inputOffset) { 160 | counter = parseInt(inputOffset as string, 10); 161 | } 162 | 163 | while (theDay.isBefore(startOfToday)) { 164 | const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`; 165 | const offset = counter; 166 | counter += this.pageCacheCrunchingBatchSize; 167 | const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0]; 168 | if (fileExists) { 169 | continue; 170 | } 171 | 172 | const records = await Crawled.fromFirestoreQuery(Crawled.COLLECTION 173 | .where('createdAt', '>=', theDay.toDate()) 174 | .where('createdAt', '<', theDay.add(1, 'day').toDate()) 175 | .orderBy('createdAt', 'asc') 176 | .offset(offset) 177 | .limit(this.pageCacheCrunchingBatchSize) 178 | ); 179 | 180 | this.logger.info(`Found ${records.length} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter }); 181 | 182 | if (!records.length) { 183 
| if (date) { 184 | break; 185 | } 186 | theDay = theDay.add(1, 'day'); 187 | counter = 0; 188 | continue; 189 | } 190 | 191 | yield { fileName, records }; 192 | 193 | if (offset) { 194 | break; 195 | } 196 | } 197 | } 198 | 199 | async* iterPageCacheChunks() { 200 | const startOfToday = dayjs().utc().startOf('day'); 201 | const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day'); 202 | let theDay = startingPoint; 203 | 204 | let counter = 0; 205 | 206 | while (theDay.isBefore(startOfToday)) { 207 | const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`; 208 | const offset = counter; 209 | counter += this.pageCacheCrunchingBatchSize; 210 | const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0]; 211 | if (fileExists) { 212 | continue; 213 | } 214 | 215 | const nRecords = (await Crawled.COLLECTION 216 | .where('createdAt', '>=', theDay.toDate()) 217 | .where('createdAt', '<', theDay.add(1, 'day').toDate()) 218 | .orderBy('createdAt', 'asc') 219 | .offset(offset) 220 | .limit(this.pageCacheCrunchingBatchSize) 221 | .count().get()).data().count; 222 | 223 | this.logger.info(`Found ${nRecords} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter }); 224 | if (nRecords < this.pageCacheCrunchingBatchSize) { 225 | theDay = theDay.add(1, 'day'); 226 | counter = 0; 227 | } 228 | if (nRecords) { 229 | yield { fileName, date: theDay.toISOString(), offset }; 230 | } 231 | } 232 | } 233 | 234 | async crunchCacheRecords(records: Crawled[]) { 235 | const throttle = new PromiseThrottle(30); 236 | const localFilePath = this.tempFileManager.alloc(); 237 | let nextDrainDeferred = Defer(); 238 | nextDrainDeferred.resolve(); 239 | 240 | for (const record of records) { 241 | await throttle.acquire(); 242 | this.firebaseObjectStorage.downloadFile(`snapshots/${record._id}`) 243 | .then(async (snapshotTxt) => { 244 | try { 245 | 
const snapshot = JSON.parse(snapshotTxt.toString('utf-8')); 246 | 247 | let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot); 248 | if (!formatted.content) { 249 | formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot); 250 | } 251 | 252 | await nextDrainDeferred.promise; 253 | await appendFile(localFilePath, JSON.stringify({ 254 | url: snapshot.href, 255 | title: snapshot.title || '', 256 | html: snapshot.html || '', 257 | text: snapshot.text || '', 258 | content: formatted.content || '', 259 | }) + '\n', { encoding: 'utf-8' }); 260 | 261 | } catch (err) { 262 | this.logger.warn(`Failed to parse snapshot for ${record._id}`, { err }); 263 | } 264 | }) 265 | .finally(() => { 266 | throttle.release(); 267 | }); 268 | } 269 | 270 | await throttle.nextDrain(); 271 | 272 | 273 | const ro = { 274 | path: localFilePath 275 | }; 276 | 277 | this.tempFileManager.bindPathTo(ro, localFilePath); 278 | 279 | return ro; 280 | } 281 | } 282 | -------------------------------------------------------------------------------- /src/db/adaptive-crawl-task.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop, parseJSONText } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | export enum AdaptiveCrawlTaskStatus { 6 | PENDING = 'pending', 7 | PROCESSING = 'processing', 8 | COMPLETED = 'completed', 9 | FAILED = 'failed', 10 | } 11 | 12 | @Also({ 13 | dictOf: Object 14 | }) 15 | export class AdaptiveCrawlTask extends FirestoreRecord { 16 | static override collectionName = 'adaptiveCrawlTasks'; 17 | 18 | override _id!: string; 19 | 20 | @Prop({ 21 | required: true 22 | }) 23 | status!: AdaptiveCrawlTaskStatus; 24 | 25 | @Prop({ 26 | required: true 27 | }) 28 | statusText!: string; 29 | 30 | @Prop() 31 | meta!: { 32 | useSitemap: boolean; 33 | maxPages: number; 34 | targetUrl: string; 35 | }; 36 | 37 | @Prop() 38 | urls!: 
string[]; 39 | 40 | @Prop() 41 | processed!: { 42 | [url: string]: string; 43 | }; 44 | 45 | @Prop() 46 | failed!: { 47 | [url: string]: any; 48 | }; 49 | 50 | @Prop() 51 | createdAt!: Date; 52 | 53 | @Prop() 54 | finishedAt?: Date; 55 | 56 | @Prop() 57 | duration?: number; 58 | 59 | static patchedFields = [ 60 | 'meta', 61 | ]; 62 | 63 | static override from(input: any) { 64 | for (const field of this.patchedFields) { 65 | if (typeof input[field] === 'string') { 66 | input[field] = parseJSONText(input[field]); 67 | } 68 | } 69 | 70 | return super.from(input) as AdaptiveCrawlTask; 71 | } 72 | 73 | override degradeForFireStore() { 74 | const copy: any = { ...this }; 75 | 76 | for (const field of (this.constructor as typeof AdaptiveCrawlTask).patchedFields) { 77 | if (typeof copy[field] === 'object') { 78 | copy[field] = JSON.stringify(copy[field]) as any; 79 | } 80 | } 81 | 82 | return copy; 83 | } 84 | 85 | [k: string]: any; 86 | } 87 | -------------------------------------------------------------------------------- /src/db/crawled.ts: -------------------------------------------------------------------------------- 1 | import { Also, parseJSONText, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | import type { PageSnapshot } from '../services/puppeteer'; 5 | 6 | @Also({ 7 | dictOf: Object 8 | }) 9 | export class Crawled extends FirestoreRecord { 10 | static override collectionName = 'crawled'; 11 | 12 | override _id!: string; 13 | 14 | @Prop({ 15 | required: true 16 | }) 17 | url!: string; 18 | 19 | @Prop({ 20 | required: true 21 | }) 22 | urlPathDigest!: string; 23 | 24 | @Prop() 25 | htmlSignificantlyModifiedByJs?: boolean; 26 | 27 | @Prop() 28 | snapshot?: PageSnapshot & { screenshot: never; pageshot: never; }; 29 | 30 | @Prop() 31 | screenshotAvailable?: boolean; 32 | 33 | @Prop() 34 | pageshotAvailable?: boolean; 35 | 36 | @Prop() 37 | snapshotAvailable?: boolean; 38 | 39 | @Prop() 40 | 
createdAt!: Date; 41 | 42 | @Prop() 43 | expireAt!: Date; 44 | 45 | static patchedFields = [ 46 | 'snapshot' 47 | ]; 48 | 49 | static override from(input: any) { 50 | for (const field of this.patchedFields) { 51 | if (typeof input[field] === 'string') { 52 | input[field] = parseJSONText(input[field]); 53 | } 54 | } 55 | 56 | return super.from(input) as Crawled; 57 | } 58 | 59 | override degradeForFireStore() { 60 | const copy: any = { ...this }; 61 | 62 | for (const field of (this.constructor as typeof Crawled).patchedFields) { 63 | if (typeof copy[field] === 'object') { 64 | copy[field] = JSON.stringify(copy[field]) as any; 65 | } 66 | } 67 | 68 | return copy; 69 | } 70 | 71 | [k: string]: any; 72 | } 73 | -------------------------------------------------------------------------------- /src/db/domain-blockade.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | 4 | @Also({ 5 | dictOf: Object 6 | }) 7 | export class DomainBlockade extends FirestoreRecord { 8 | static override collectionName = 'domainBlockades'; 9 | 10 | override _id!: string; 11 | 12 | @Prop({ 13 | required: true 14 | }) 15 | domain!: string; 16 | 17 | @Prop({ required: true }) 18 | triggerReason!: string; 19 | 20 | @Prop() 21 | triggerUrl?: string; 22 | 23 | @Prop() 24 | createdAt!: Date; 25 | 26 | @Prop() 27 | expireAt?: Date; 28 | 29 | [k: string]: any; 30 | } 31 | -------------------------------------------------------------------------------- /src/db/domain-profile.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import { ENGINE_TYPE } from '../dto/crawler-options'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class DomainProfile extends FirestoreRecord { 9 | static override collectionName = 
'domainProfiles'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | path!: string; 17 | 18 | @Prop() 19 | triggerUrl?: string; 20 | 21 | @Prop({ required: true, type: ENGINE_TYPE }) 22 | engine!: string; 23 | 24 | @Prop() 25 | createdAt!: Date; 26 | 27 | @Prop() 28 | expireAt?: Date; 29 | 30 | [k: string]: any; 31 | } 32 | -------------------------------------------------------------------------------- /src/db/img-alt.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class ImgAlt extends FirestoreRecord { 9 | static override collectionName = 'imgAlts'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | src!: string; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | urlDigest!: string; 22 | 23 | @Prop() 24 | width?: number; 25 | 26 | @Prop() 27 | height?: number; 28 | 29 | @Prop() 30 | generatedAlt?: string; 31 | 32 | @Prop() 33 | originalAlt?: string; 34 | 35 | @Prop() 36 | createdAt!: Date; 37 | 38 | @Prop() 39 | expireAt?: Date; 40 | 41 | [k: string]: any; 42 | } 43 | -------------------------------------------------------------------------------- /src/db/pdf.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop, parseJSONText } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class PDFContent extends FirestoreRecord { 9 | static override collectionName = 'pdfs'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | src!: string; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | urlDigest!: string; 22 | 23 | @Prop() 24 | meta?: { [k: string]: any; }; 25 | 26 | @Prop() 27 | text?: string; 
28 | 29 | @Prop() 30 | content?: string; 31 | 32 | @Prop() 33 | createdAt!: Date; 34 | 35 | @Prop() 36 | expireAt?: Date; 37 | 38 | static patchedFields = [ 39 | 'meta' 40 | ]; 41 | 42 | static override from(input: any) { 43 | for (const field of this.patchedFields) { 44 | if (typeof input[field] === 'string') { 45 | input[field] = parseJSONText(input[field]); 46 | } 47 | } 48 | 49 | return super.from(input) as PDFContent; 50 | } 51 | 52 | override degradeForFireStore() { 53 | const copy: any = { ...this }; 54 | 55 | for (const field of (this.constructor as typeof PDFContent).patchedFields) { 56 | if (typeof copy[field] === 'object') { 57 | copy[field] = JSON.stringify(copy[field]) as any; 58 | } 59 | } 60 | 61 | return copy; 62 | } 63 | 64 | [k: string]: any; 65 | } 66 | -------------------------------------------------------------------------------- /src/db/searched.ts: -------------------------------------------------------------------------------- 1 | import { Also, parseJSONText, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class SearchResult extends FirestoreRecord { 9 | static override collectionName = 'searchResults'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | query!: any; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | queryDigest!: string; 22 | 23 | @Prop() 24 | response?: any; 25 | 26 | @Prop() 27 | createdAt!: Date; 28 | 29 | @Prop() 30 | expireAt?: Date; 31 | 32 | [k: string]: any; 33 | 34 | static patchedFields = [ 35 | 'query', 36 | 'response', 37 | ]; 38 | 39 | static override from(input: any) { 40 | for (const field of this.patchedFields) { 41 | if (typeof input[field] === 'string') { 42 | input[field] = parseJSONText(input[field]); 43 | } 44 | } 45 | 46 | return super.from(input) as SearchResult; 47 | } 48 | 49 | override degradeForFireStore() { 50 | const copy: any = { ...this }; 51 | 
52 | for (const field of (this.constructor as typeof SearchResult).patchedFields) { 53 | if (typeof copy[field] === 'object') { 54 | copy[field] = JSON.stringify(copy[field]) as any; 55 | } 56 | } 57 | 58 | return copy; 59 | } 60 | } 61 | 62 | export class SerperSearchResult extends SearchResult { 63 | static override collectionName = 'serperSearchResults'; 64 | } 65 | 66 | export class SERPResult extends SearchResult { 67 | static override collectionName = 'SERPResults'; 68 | } -------------------------------------------------------------------------------- /src/dto/adaptive-crawler-options.ts: -------------------------------------------------------------------------------- 1 | import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; 2 | import type { Request, Response } from 'express'; 3 | 4 | 5 | @Also({ 6 | openapi: { 7 | operation: { 8 | parameters: { 9 | 'X-Use-Sitemap': { 10 | description: 'Use sitemap to crawl the website.', 11 | in: 'header', 12 | schema: { type: 'string' } 13 | }, 14 | 'X-Max-Depth': { 15 | description: 'Max deep level to crawl.', 16 | in: 'header', 17 | schema: { type: 'string' } 18 | }, 19 | 'X-Max-Pages': { 20 | description: 'Max number of pages to crawl.', 21 | in: 'header', 22 | schema: { type: 'string' } 23 | }, 24 | } 25 | } 26 | } 27 | }) 28 | export class AdaptiveCrawlerOptions extends AutoCastable { 29 | @Prop({ 30 | default: true, 31 | desc: 'Use sitemap to crawl the website.', 32 | }) 33 | useSitemap!: boolean; 34 | 35 | @Prop({ 36 | default: 10, 37 | desc: 'Max number of pages to crawl.', 38 | validate: (v: number) => v >= 1 && v <= 100, 39 | }) 40 | maxPages!: number; 41 | 42 | static override from(input: any) { 43 | const instance = super.from(input) as AdaptiveCrawlerOptions; 44 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { 45 | req: Request, 46 | res: Response, 47 | } | undefined; 48 | 49 | let maxPages = parseInt(ctx?.req.get('x-max-pages') || ''); 50 | if (!isNaN(maxPages) && maxPages > 0) 
import _ from 'lodash';
import {
    Also, AuthenticationFailedError, AuthenticationRequiredError,
    RPC_CALL_ENVIRONMENT,
    AutoCastable,
    DownstreamServiceError,
} from 'civkit/civ-rpc';
import { htmlEscape } from 'civkit/escape';
import { marshalErrorLike } from 'civkit/lang';

import type { Context } from 'koa';

import logger from '../services/logger';
import { InjectProperty } from '../services/registry';
import { AsyncLocalContext } from '../services/async-context';

import envConfig from '../shared/services/secrets';
import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { TierFeatureConstraintError } from '../services/errors';

const authDtoLogger = logger.child({ service: 'JinaAuthDTO' });

// Single shared dashboard HTTP client — one instance reused by every DTO object.
const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY);

@Also({
    openapi: {
        operation: {
            parameters: {
                'Authorization': {
                    description: htmlEscape`Jina Token for authentication.\n\n` +
                        htmlEscape`- Member of \n\n` +
                        `- Authorization: Bearer {YOUR_JINA_TOKEN}`
                    ,
                    in: 'header',
                    schema: {
                        anyOf: [
                            { type: 'string', format: 'token' }
                        ]
                    }
                }
            }
        }
    }
})
/**
 * Request-scoped authentication DTO for Jina API keys.
 *
 * Extracts the bearer token from the incoming request, resolves the token to a
 * `JinaEmbeddingsTokenAccount` (Firestore cache first, upstream dashboard as
 * source of truth / fallback), and exposes assertion helpers used by RPC
 * handlers (`assertUID`, `assertUser`, `assertTier`).
 */
export class JinaEmbeddingsAuthDTO extends AutoCastable {
    // Resolved Jina user id; populated by getBrief()/solveUID().
    uid?: string;
    // Raw API key taken from the Authorization header (or the `_token` input field).
    bearerToken?: string;
    // Account record cached on this DTO once authentication succeeded.
    user?: JinaEmbeddingsTokenAccount;

    @InjectProperty(AsyncLocalContext)
    ctxMgr!: AsyncLocalContext;

    jinaEmbeddingsDashboard = THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT;

    /**
     * Build the DTO from RPC input; pulls the bearer token out of the
     * `Authorization` header ("Bearer <token>" or a bare token), with
     * `input._token` as a fallback channel.
     */
    static override from(input: any) {
        const instance = super.from(input) as JinaEmbeddingsAuthDTO;

        const ctx = input[RPC_CALL_ENVIRONMENT] as Context;

        if (ctx) {
            const authorization = ctx.get('authorization');

            if (authorization) {
                // Accepts both "Bearer xxx" (takes the 2nd word) and a bare token string.
                const authToken = authorization.split(' ')[1] || authorization;
                instance.bearerToken = authToken;
            }

        }

        if (!instance.bearerToken && input._token) {
            instance.bearerToken = input._token;
        }

        return instance;
    }

    /**
     * Resolve the account behind `bearerToken`.
     *
     * Resolution order:
     *  1. Firestore cache, accepted when fresh (< ~3 min, jittered) and the
     *     wallet still has balance;
     *  2. if Firestore itself failed ("degradation"), authorize against the
     *     remote dashboard without writing the cache back;
     *  3. otherwise re-validate upstream and refresh the Firestore cache,
     *     falling back to the stale cached account if the upstream call fails
     *     with anything other than 401.
     *
     * Side effects: sets `this.user` / `this.uid` on success.
     *
     * @param ignoreCache skip the fresh-cache fast path when truthy
     * @throws AuthenticationRequiredError when no token was provided
     * @throws AuthenticationFailedError on upstream 401
     * @throws DownstreamServiceError on other upstream failures with no usable cache
     */
    async getBrief(ignoreCache?: boolean | string) {
        if (!this.bearerToken) {
            throw new AuthenticationRequiredError({
                message: 'Jina API key is required to authenticate. Please get one from https://jina.ai'
            });
        }

        let firestoreDegradation = false;
        let account;
        try {
            account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
        } catch (err) {
            // FireStore would not accept any string as input and may throw if not happy with it
            firestoreDegradation = true;
            // NOTE(review): uses the root `logger` while the rest of the class uses
            // `authDtoLogger` — presumably unintentional; confirm before unifying.
            logger.warn(`Firestore issue`, { err });
        }


        // Cache age; Infinity forces a refresh when the account was never synced.
        const age = account?.lastSyncedAt ? Date.now() - account.lastSyncedAt.valueOf() : Infinity;
        // Random 0-30s jitter so many concurrent requests don't all refresh at once.
        const jitter = Math.ceil(Math.random() * 30 * 1000);

        if (account && !ignoreCache) {
            // Fast path: cache is fresh and the wallet is not empty.
            if ((age < (180_000 - jitter)) && (account.wallet?.total_balance > 0)) {
                this.user = account;
                this.uid = this.user?.user_id;

                return account;
            }
        }

        if (firestoreDegradation) {
            // Firestore is unhealthy: authenticate upstream but do NOT write the cache back.
            logger.debug(`Using remote UC cached user`);
            let r;
            try {
                r = await this.jinaEmbeddingsDashboard.authorization(this.bearerToken);
            } catch (err: any) {
                if (err?.status === 401) {
                    throw new AuthenticationFailedError({
                        message: 'Invalid API key, please get a new one from https://jina.ai'
                    });
                }
                logger.warn(`Failed load remote cached user: ${err}`, { err });
                throw new DownstreamServiceError(`Failed to authenticate: ${err}`);
            }
            const brief = r?.data;
            const draftAccount = JinaEmbeddingsTokenAccount.from({
                ...account, ...brief, _id: this.bearerToken,
                lastSyncedAt: new Date()
            });
            this.user = draftAccount;
            this.uid = this.user?.user_id;

            return draftAccount;
        }

        try {
            // TODO: go back using validateToken after performance issue fixed
            // Accounts with a positive cached balance use the cheaper authorization
            // endpoint; zero/unknown balance goes through full token validation.
            const r = ((account?.wallet?.total_balance || 0) > 0) ?
                await this.jinaEmbeddingsDashboard.authorization(this.bearerToken) :
                await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
            const brief = r.data;
            const draftAccount = JinaEmbeddingsTokenAccount.from({
                ...account, ...brief, _id: this.bearerToken,
                lastSyncedAt: new Date()
            });
            // Refresh the Firestore cache (merge keeps fields not present in the brief).
            await JinaEmbeddingsTokenAccount.save(draftAccount.degradeForFireStore(), undefined, { merge: true });

            this.user = draftAccount;
            this.uid = this.user?.user_id;

            return draftAccount;
        } catch (err: any) {
            authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) });

            if (err?.status === 401) {
                throw new AuthenticationFailedError({
                    message: 'Invalid API key, please get a new one from https://jina.ai'
                });
            }

            // Upstream is down but we still hold a (possibly stale) cached account: serve it.
            if (account) {
                this.user = account;
                this.uid = this.user?.user_id;

                return account;
            }


            throw new DownstreamServiceError(`Failed to authenticate: ${err}`);
        }
    }

    /**
     * Report token consumption upstream and decrement the cached wallet balance.
     *
     * Best-effort: on upstream failure the local balance deduction is rolled
     * back and the error is only logged (resolves to undefined), so callers
     * are never failed by billing reporting.
     *
     * @param tokenCount number of tokens consumed
     * @param mdl model name to bill against
     * @param endpoint API endpoint label, defaults to '/encode'
     */
    async reportUsage(tokenCount: number, mdl: string, endpoint: string = '/encode') {
        const user = await this.assertUser();
        const uid = user.user_id;
        // Optimistic local deduction; rolled back in the .catch below on failure.
        user.wallet.total_balance -= tokenCount;

        return this.jinaEmbeddingsDashboard.reportUsage(this.bearerToken!, {
            model_name: mdl,
            api_endpoint: endpoint,
            consumer: {
                id: uid,
                user_id: uid,
            },
            usage: {
                total_tokens: tokenCount
            },
            labels: {
                model_name: mdl
            }
        }).then((r) => {
            // Fire-and-forget cache update; failure only logged.
            JinaEmbeddingsTokenAccount.COLLECTION.doc(this.bearerToken!)
                .update({ 'wallet.total_balance': JinaEmbeddingsTokenAccount.OPS.increment(-tokenCount) })
                .catch((err) => {
                    authDtoLogger.warn(`Failed to update cache for ${uid}: ${err}`, { err: marshalErrorLike(err) });
                });

            return r;
        }).catch((err) => {
            user.wallet.total_balance += tokenCount;
            authDtoLogger.warn(`Failed to report usage for ${uid}: ${err}`, { err: marshalErrorLike(err) });
        });
    }

    /**
     * Resolve the uid (authenticating lazily if needed) and record it in the
     * async-local context. Returns undefined when no token is present.
     */
    async solveUID() {
        if (this.uid) {
            this.ctxMgr.set('uid', this.uid);

            return this.uid;
        }

        if (this.bearerToken) {
            await this.getBrief();
            this.ctxMgr.set('uid', this.uid);

            return this.uid;
        }

        return undefined;
    }

    /**
     * Like solveUID() but mandatory.
     * @throws AuthenticationRequiredError when no uid could be resolved
     */
    async assertUID() {
        const uid = await this.solveUID();

        if (!uid) {
            throw new AuthenticationRequiredError('Authentication failed');
        }

        return uid;
    }

    /** Return the authenticated account, resolving it via getBrief() on first use. */
    async assertUser() {
        if (this.user) {
            return this.user;
        }

        await this.getBrief();

        return this.user!;
    }

    /**
     * Require the user's plan tier (metadata.speed_level) to be at least `n`.
     *
     * @param n minimum tier required
     * @param feature optional feature name, included in error messages
     * @throws AuthenticationRequiredError when unauthenticated
     * @throws TierFeatureConstraintError when the tier is missing or too low
     */
    async assertTier(n: number, feature?: string) {
        let user;
        try {
            user = await this.assertUser();
        } catch (err) {
            if (err instanceof AuthenticationRequiredError) {
                // Re-throw with a feature-specific message.
                throw new AuthenticationRequiredError({
                    message: `Authentication is required to use this feature${feature ? ` (${feature})` : ''}. Please provide a valid API key.`
                });
            }

            throw err;
        }

        const tier = parseInt(user.metadata?.speed_level);
        if (isNaN(tier) || tier < n) {
            throw new TierFeatureConstraintError({
                message: `Your current plan does not support this feature${feature ? ` (${feature})` : ''}. Please upgrade your plan.`
            });
        }

        return true;
    }

    /**
     * Collect the user's effective custom rate-limit descriptors for the given
     * tags; undefined when none apply (caller falls back to defaults).
     */
    getRateLimits(...tags: string[]) {
        const descs = tags.map((x) => this.user?.customRateLimits?.[x] || []).flat().filter((x) => x.isEffective());

        if (descs.length) {
            return descs;
        }

        return undefined;
    }
}
Object.entries(ctx.headers)) { 52 | if (k.startsWith(prefix)) { 53 | const prop = k.slice(prefix.length); 54 | const sk = _.camelCase(prop); 55 | draft[sk] = v as string; 56 | } 57 | } 58 | 59 | return this.from(draft); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/fetch.d.ts: -------------------------------------------------------------------------------- 1 | declare global { 2 | export const { 3 | fetch, 4 | FormData, 5 | Headers, 6 | Request, 7 | Response, 8 | File, 9 | }: typeof import('undici'); 10 | export type { FormData, Headers, Request, RequestInit, Response, RequestInit, File } from 'undici'; 11 | } 12 | 13 | export { }; 14 | -------------------------------------------------------------------------------- /src/lib/transform-server-event-stream.ts: -------------------------------------------------------------------------------- 1 | import { TPM, parseJSONText } from 'civkit'; 2 | import { Transform, TransformCallback, TransformOptions } from 'stream'; 3 | 4 | export class InputServerEventStream extends Transform { 5 | cache: string[] = []; 6 | 7 | constructor(options?: TransformOptions) { 8 | super({ 9 | ...options, 10 | readableObjectMode: true 11 | }); 12 | } 13 | 14 | decodeRoutine() { 15 | if (!this.cache.length) { 16 | return; 17 | } 18 | 19 | const vecs = this.cache.join('').split(/\r?\n\r?\n/); 20 | this.cache.length = 0; 21 | const lastVec = vecs.pop(); 22 | if (lastVec) { 23 | this.cache.push(lastVec); 24 | } 25 | 26 | for (const x of vecs) { 27 | const lines: string[] = x.split(/\r?\n/); 28 | 29 | const event: { 30 | event?: string; 31 | data?: string; 32 | id?: string; 33 | retry?: number; 34 | } = {}; 35 | 36 | for (const l of lines) { 37 | const columnPos = l.indexOf(':'); 38 | if (columnPos <= 0) { 39 | continue; 40 | } 41 | const key = l.substring(0, columnPos); 42 | const rawValue = l.substring(columnPos + 1); 43 | const value = rawValue.startsWith(' ') ? 
rawValue.slice(1) : rawValue; 44 | if (key === 'data') { 45 | if (event.data) { 46 | event.data += value || '\n'; 47 | } else if (event.data === '') { 48 | event.data += '\n'; 49 | event.data += value || '\n'; 50 | } else { 51 | event.data = value; 52 | } 53 | } else if (key === 'retry') { 54 | event.retry = parseInt(value, 10); 55 | } else { 56 | Reflect.set(event, key, value); 57 | } 58 | } 59 | 60 | if (event.data) { 61 | const parsed = parseJSONText(event.data); 62 | if (parsed && typeof parsed === 'object') { 63 | event.data = parsed; 64 | } 65 | } 66 | 67 | if (Object.keys(event).length) { 68 | this.push(event); 69 | } 70 | } 71 | } 72 | 73 | override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void { 74 | if (chunk === null) { 75 | this.push(null); 76 | } 77 | 78 | this.cache.push(chunk.toString()); 79 | this.decodeRoutine(); 80 | 81 | callback(); 82 | } 83 | 84 | override _final(callback: (error?: Error | null | undefined) => void): void { 85 | this.decodeRoutine(); 86 | callback(); 87 | } 88 | } 89 | 90 | @TPM({ 91 | contentType: 'text/event-stream', 92 | }) 93 | export class OutputServerEventStream extends Transform { 94 | n: number = 0; 95 | 96 | constructor(options?: TransformOptions) { 97 | super({ 98 | ...options, writableObjectMode: true, encoding: 'utf-8' 99 | }); 100 | } 101 | 102 | encodeRoutine(chunk: { 103 | event?: string; 104 | data?: any; 105 | id?: string; 106 | retry?: number; 107 | } | string) { 108 | if (typeof chunk === 'object') { 109 | const lines: string[] = []; 110 | 111 | if (chunk.event) { 112 | lines.push(`event: ${chunk.event}`); 113 | } 114 | if (chunk.data) { 115 | if (typeof chunk.data === 'string') { 116 | for (const x of chunk.data.split(/\r?\n/)) { 117 | lines.push(`data: ${x}`); 118 | } 119 | } else { 120 | lines.push(`data: ${JSON.stringify(chunk.data)}`); 121 | } 122 | } 123 | if (chunk.id) { 124 | lines.push(`id: ${chunk.id}`); 125 | } 126 | if (chunk.retry) { 127 | lines.push(`retry: 
${chunk.retry}`); 128 | } 129 | if (!lines.length) { 130 | lines.push(`data: ${JSON.stringify(chunk)}`); 131 | } 132 | this.push(lines.join('\n')); 133 | this.push('\n\n'); 134 | this.n++; 135 | 136 | return; 137 | } else if (typeof chunk === 'string') { 138 | const lines: string[] = []; 139 | for (const x of chunk.split(/\r?\n/)) { 140 | lines.push(`data: ${x}`); 141 | } 142 | 143 | this.push(lines.join('\n')); 144 | this.push('\n\n'); 145 | this.n++; 146 | } 147 | } 148 | 149 | override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void { 150 | if (chunk === null) { 151 | this.push(null); 152 | } 153 | 154 | this.encodeRoutine(chunk); 155 | 156 | callback(); 157 | } 158 | } 159 | 160 | export interface OutputServerEventStream extends Transform { 161 | write(chunk: string | { 162 | event?: string; 163 | data?: any; 164 | id?: string; 165 | retry?: number; 166 | }, callback?: (error: Error | null | undefined) => void): boolean; 167 | write(chunk: any, callback?: (error: Error | null | undefined) => void): boolean; 168 | write(chunk: any, encoding: BufferEncoding, callback?: (error: Error | null | undefined) => void): boolean; 169 | } 170 | -------------------------------------------------------------------------------- /src/services/alt-text.ts: -------------------------------------------------------------------------------- 1 | import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; 2 | import { singleton } from 'tsyringe'; 3 | import { GlobalLogger } from './logger'; 4 | import { CanvasService } from './canvas'; 5 | import { ImageInterrogationManager } from '../shared/services/common-iminterrogate'; 6 | import { ImgBrief } from './puppeteer'; 7 | import { ImgAlt } from '../db/img-alt'; 8 | import { AsyncLocalContext } from './async-context'; 9 | 10 | const md5Hasher = new HashManager('md5', 'hex'); 11 | 12 | @singleton() 13 | export class AltTextService extends AsyncService { 14 | 15 | altsToIgnore = 
'image,img,photo,picture,pic,alt,figure,fig'.split(','); 16 | logger = this.globalLogger.child({ service: this.constructor.name }); 17 | 18 | constructor( 19 | protected globalLogger: GlobalLogger, 20 | protected imageInterrogator: ImageInterrogationManager, 21 | protected canvasService: CanvasService, 22 | protected asyncLocalContext: AsyncLocalContext 23 | ) { 24 | super(...arguments); 25 | } 26 | 27 | override async init() { 28 | await this.dependencyReady(); 29 | this.emit('ready'); 30 | } 31 | 32 | async caption(url: string) { 33 | try { 34 | const img = await this.canvasService.loadImage(url); 35 | const contentTypeHint = Reflect.get(img, 'contentType'); 36 | if (Math.min(img.naturalHeight, img.naturalWidth) <= 1) { 37 | return `A ${img.naturalWidth}x${img.naturalHeight} image, likely be a tacker probe`; 38 | } 39 | if (Math.min(img.naturalHeight, img.naturalWidth) < 64) { 40 | return `A ${img.naturalWidth}x${img.naturalHeight} small image, likely a logo, icon or avatar`; 41 | } 42 | const resized = this.canvasService.fitImageToSquareBox(img, 1024); 43 | const exported = await this.canvasService.canvasToBuffer(resized, 'image/png'); 44 | 45 | const svgHint = contentTypeHint.includes('svg') ? `Beware this image is a SVG rendered on a gray background, the gray background is not part of the image.\n\n` : ''; 46 | const svgSystemHint = contentTypeHint.includes('svg') ? ` Sometimes the system renders SVG on a gray background. When this happens, you must not include the gray background in the description.` : ''; 47 | 48 | const r = await this.imageInterrogator.interrogate('vertex-gemini-2.0-flash', { 49 | image: exported, 50 | prompt: `${svgHint}Give a concise image caption descriptive sentence in third person. Start directly with the description.`, 51 | system: `You are BLIP2, an image caption model. You will generate Alt Text (in web pages) for any image for a11y purposes. 
You must not start with "This image is sth...", instead, start direly with "sth..."${svgSystemHint}`, 52 | }); 53 | 54 | return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim(); 55 | } catch (err) { 56 | throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err }); 57 | } 58 | } 59 | 60 | async getAltText(imgBrief: ImgBrief) { 61 | if (!imgBrief.src) { 62 | return undefined; 63 | } 64 | if (imgBrief.alt && !this.altsToIgnore.includes(imgBrief.alt.trim().toLowerCase())) { 65 | return imgBrief.alt; 66 | } 67 | const digest = md5Hasher.hash(imgBrief.src); 68 | const shortDigest = Buffer.from(digest, 'hex').toString('base64url'); 69 | let dims: number[] = []; 70 | do { 71 | if (imgBrief.loaded) { 72 | if (imgBrief.naturalWidth && imgBrief.naturalHeight) { 73 | if (Math.min(imgBrief.naturalWidth, imgBrief.naturalHeight) < 64) { 74 | dims = [imgBrief.naturalWidth, imgBrief.naturalHeight]; 75 | break; 76 | } 77 | } 78 | } 79 | 80 | if (imgBrief.width && imgBrief.height) { 81 | if (Math.min(imgBrief.width, imgBrief.height) < 64) { 82 | dims = [imgBrief.width, imgBrief.height]; 83 | break; 84 | } 85 | } 86 | 87 | } while (false); 88 | 89 | if (Math.min(...dims) <= 1) { 90 | return `A ${dims[0]}x${dims[1]} image, likely be a tacker probe`; 91 | } 92 | if (Math.min(...dims) < 64) { 93 | return `A ${dims[0]}x${dims[1]} small image, likely a logo, icon or avatar`; 94 | } 95 | 96 | const existing = await ImgAlt.fromFirestore(shortDigest); 97 | 98 | if (existing) { 99 | return existing.generatedAlt || existing.originalAlt || ''; 100 | } 101 | 102 | let generatedCaption = ''; 103 | 104 | try { 105 | generatedCaption = await this.caption(imgBrief.src); 106 | } catch (err) { 107 | this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err }); 108 | } 109 | 110 | if (this.asyncLocalContext.ctx.DNT) { 111 | // Don't cache alt text if DNT is set 112 | return generatedCaption; 113 | } 114 | 115 | // Don't try again until the next day 
116 | const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) }; 117 | 118 | await ImgAlt.COLLECTION.doc(shortDigest).set( 119 | { 120 | _id: shortDigest, 121 | src: imgBrief.src || '', 122 | width: imgBrief.naturalWidth || 0, 123 | height: imgBrief.naturalHeight || 0, 124 | urlDigest: digest, 125 | originalAlt: imgBrief.alt || '', 126 | generatedAlt: generatedCaption || '', 127 | createdAt: new Date(), 128 | ...expireMixin 129 | }, { merge: true } 130 | ); 131 | 132 | return generatedCaption; 133 | } 134 | }; 135 | -------------------------------------------------------------------------------- /src/services/async-context.ts: -------------------------------------------------------------------------------- 1 | import { GlobalAsyncContext } from 'civkit/async-context'; 2 | import { container, singleton } from 'tsyringe'; 3 | 4 | @singleton() 5 | export class AsyncLocalContext extends GlobalAsyncContext { } 6 | 7 | const instance = container.resolve(AsyncLocalContext); 8 | Reflect.set(process, 'asyncLocalContext', instance); 9 | 10 | export default instance; 11 | -------------------------------------------------------------------------------- /src/services/blackhole-detector.ts: -------------------------------------------------------------------------------- 1 | import { singleton } from 'tsyringe'; 2 | import { AsyncService } from 'civkit/async-service'; 3 | import { GlobalLogger } from './logger'; 4 | import { delay } from 'civkit/timeout'; 5 | 6 | 7 | @singleton() 8 | export class BlackHoleDetector extends AsyncService { 9 | 10 | logger = this.globalLogger.child({ service: this.constructor.name }); 11 | lastWorkedTs?: number; 12 | lastDoneRequestTs?: number; 13 | lastIncomingRequestTs?: number; 14 | 15 | maxDelay = 1000 * 30; 16 | concurrentRequests = 0; 17 | 18 | strikes = 0; 19 | 20 | constructor(protected globalLogger: GlobalLogger) { 21 | super(...arguments); 22 | 23 | if (process.env.NODE_ENV?.startsWith('prod')) { 24 | 
setInterval(() => { 25 | this.routine(); 26 | }, 1000 * 30).unref(); 27 | } 28 | } 29 | 30 | override async init() { 31 | await this.dependencyReady(); 32 | this.logger.debug('BlackHoleDetector started'); 33 | this.emit('ready'); 34 | } 35 | 36 | async routine() { 37 | // We give routine a 3s grace period for potentially paused CPU to spin up and process some requests 38 | await delay(3000); 39 | const now = Date.now(); 40 | const lastWorked = this.lastWorkedTs; 41 | if (!lastWorked) { 42 | return; 43 | } 44 | const dt = (now - lastWorked); 45 | if (this.concurrentRequests > 1 && 46 | this.lastIncomingRequestTs && lastWorked && 47 | this.lastIncomingRequestTs >= lastWorked && 48 | (dt > (this.maxDelay * (this.strikes + 1))) 49 | ) { 50 | this.logger.warn(`BlackHole detected, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`); 51 | this.strikes += 1; 52 | } 53 | 54 | if (this.strikes >= 3) { 55 | this.logger.error(`BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`); 56 | process.nextTick(() => { 57 | this.emit('error', new Error(`BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`)); 58 | // process.exit(1); 59 | }); 60 | } 61 | } 62 | 63 | incomingRequest() { 64 | this.lastIncomingRequestTs = Date.now(); 65 | this.lastWorkedTs ??= Date.now(); 66 | this.concurrentRequests++; 67 | } 68 | doneWithRequest() { 69 | this.concurrentRequests--; 70 | this.lastDoneRequestTs = Date.now(); 71 | } 72 | 73 | itWorked() { 74 | this.lastWorkedTs = Date.now(); 75 | this.strikes = 0; 76 | } 77 | 78 | }; 79 | -------------------------------------------------------------------------------- /src/services/brave-search.ts: -------------------------------------------------------------------------------- 1 | import { AsyncService, AutoCastable, DownstreamServiceFailureError, 
@singleton()
/**
 * Thin wrapper around the Brave Search HTTP API that enriches queries with the
 * caller's geo context (derived from the request IP) and retries on rate limits.
 */
export class BraveSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    // Initialized in init() once secrets are available.
    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: GlobalLogger,
        protected secretExposer: SecretExposer,
        protected geoipControl: GeoIPService,
        protected threadLocal: AsyncLocalContext,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');

        this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
    }

    /**
     * Run a Brave web search.
     *
     * Adds X-Loc-* headers resolved from the caller's IP (city, country,
     * timezone, coordinates, state) and forwards the caller's User-Agent, so
     * Brave can localize results. Retries on HTTP 429 with a short random
     * backoff, up to 11 attempts total.
     *
     * @param query Brave web-search query parameters
     * @returns the parsed Brave response body
     * @throws DownstreamServiceFailureError on any non-429 failure or when retries are exhausted
     */
    async webSearch(query: WebSearchQueryParams) {
        const ip = this.threadLocal.get('ip');
        const extraHeaders: WebSearchOptionalHeaderOptions = {};
        if (ip) {
            const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);

            if (geoip?.city) {
                extraHeaders['X-Loc-City'] = encodeURIComponent(geoip.city);
            }
            if (geoip?.country) {
                extraHeaders['X-Loc-Country'] = geoip.country.code;
            }
            if (geoip?.timezone) {
                extraHeaders['X-Loc-Timezone'] = geoip.timezone;
            }
            if (geoip?.coordinates) {
                extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
                extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
            }
            if (geoip?.subdivisions?.length) {
                extraHeaders['X-Loc-State'] = encodeURIComponent(`${geoip.subdivisions[0].code}`);
                extraHeaders['X-Loc-State-Name'] = encodeURIComponent(`${geoip.subdivisions[0].name}`);
            }
        }
        if (this.threadLocal.get('userAgent')) {
            extraHeaders['User-Agent'] = this.threadLocal.get('userAgent');
        }

        const encoded = { ...query };
        if (encoded.q) {
            // Pass pure-ASCII queries through untouched; percent-encode anything else
            // (the ascii round-trip only differs when non-ASCII bytes are present).
            encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
        }

        let maxTries = 11;

        while (maxTries--) {
            try {
                const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
                // Tell the watchdog real work completed.
                this.blackHoleDetector.itWorked();

                return r.parsed;
            } catch (err: any) {
                this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
                if (err?.status === 429) {
                    // Rate-limited: back off 0.5-1.5s and retry.
                    await delay(500 + 1000 * Math.random());
                    continue;
                }

                throw new DownstreamServiceFailureError({ message: `Search failed` });
            }
        }

        throw new DownstreamServiceFailureError({ message: `Search failed` });
    }

}
Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.` 104 | }) 105 | filetype?: string | string[]; 106 | 107 | @Prop({ 108 | arrayOf: String, 109 | desc: `Returns web pages containing the specified term in the body of the page. Example: to find information about the Nvidia GeForce GTX 1080 Ti, making sure the page contains the keywords “founders edition” in the body, type “nvidia 1080 ti inbody:“founders edition””.` 110 | }) 111 | inbody?: string | string[]; 112 | 113 | @Prop({ 114 | arrayOf: String, 115 | desc: `Returns webpages containing the specified term in the title of the page. Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.` 116 | }) 117 | intitle?: string | string[]; 118 | 119 | @Prop({ 120 | arrayOf: String, 121 | desc: `Returns webpages containing the specified term either in the title or in the body of the page. Example: to find pages about the 2024 Oscars containing the keywords “best costume design” in the page, type “oscars 2024 inpage:“best costume design””.` 122 | }) 123 | inpage?: string | string[]; 124 | 125 | @Prop({ 126 | arrayOf: String, 127 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 128 | }) 129 | lang?: string | string[]; 130 | 131 | @Prop({ 132 | arrayOf: String, 133 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 134 | }) 135 | loc?: string | string[]; 136 | 137 | @Prop({ 138 | arrayOf: String, 139 | desc: `Returns web pages coming only from a specific web site. 
Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.` 140 | }) 141 | site?: string | string[]; 142 | 143 | addTo(searchTerm: string) { 144 | const chunks = []; 145 | for (const [key, value] of Object.entries(this)) { 146 | if (value) { 147 | const values = Array.isArray(value) ? value : [value]; 148 | const textValue = values.map((v) => `${key}:${v}`).join(' OR '); 149 | if (textValue) { 150 | chunks.push(textValue); 151 | } 152 | } 153 | } 154 | const opPart = chunks.length > 1 ? chunks.map((x) => `(${x})`).join(' AND ') : chunks; 155 | 156 | if (opPart.length) { 157 | return [searchTerm, opPart].join(' '); 158 | } 159 | 160 | return searchTerm; 161 | } 162 | 163 | static override from(input: any) { 164 | const instance = super.from(input) as BraveSearchExplicitOperatorsDto; 165 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { 166 | req: Request, 167 | res: Response, 168 | } | undefined; 169 | 170 | const params = ['ext', 'filetype', 'inbody', 'intitle', 'inpage', 'lang', 'loc', 'site']; 171 | 172 | for (const p of params) { 173 | const customValue = ctx?.req.get(`x-${p}`) || ctx?.req.get(`${p}`); 174 | if (!customValue) { 175 | continue; 176 | } 177 | 178 | const filtered = customValue.split(', ').filter(Boolean); 179 | if (filtered.length) { 180 | Reflect.set(instance, p, filtered); 181 | } 182 | } 183 | 184 | return instance; 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/services/canvas.ts: -------------------------------------------------------------------------------- 1 | import { singleton, container } from 'tsyringe'; 2 | import { AsyncService, mimeOf, ParamValidationError, SubmittedDataMalformedError, /* downloadFile */ } from 'civkit'; 3 | import { readFile } from 'fs/promises'; 4 | 5 | import type canvas from '@napi-rs/canvas'; 6 | export type { Canvas, Image } from '@napi-rs/canvas'; 7 | 8 | import { GlobalLogger } from './logger'; 9 | 
import { TempFileManager } from './temp-file'; 10 | 11 | import { isMainThread } from 'worker_threads'; 12 | import type { svg2png } from 'svg2png-wasm' with { 'resolution-mode': 'import' }; 13 | import path from 'path'; 14 | import { Threaded } from './threaded'; 15 | 16 | const downloadFile = async (uri: string) => { 17 | const resp = await fetch(uri); 18 | if (!(resp.ok && resp.body)) { 19 | throw new Error(`Unexpected response ${resp.statusText}`); 20 | } 21 | const contentLength = parseInt(resp.headers.get('content-length') || '0'); 22 | if (contentLength > 1024 * 1024 * 100) { 23 | throw new Error('File too large'); 24 | } 25 | const buff = await resp.arrayBuffer(); 26 | 27 | return { buff, contentType: resp.headers.get('content-type') }; 28 | }; 29 | 30 | @singleton() 31 | export class CanvasService extends AsyncService { 32 | 33 | logger = this.globalLogger.child({ service: this.constructor.name }); 34 | svg2png!: typeof svg2png; 35 | canvas!: typeof canvas; 36 | 37 | constructor( 38 | protected temp: TempFileManager, 39 | protected globalLogger: GlobalLogger, 40 | ) { 41 | super(...arguments); 42 | } 43 | 44 | override async init() { 45 | await this.dependencyReady(); 46 | if (!isMainThread) { 47 | const { createSvg2png, initialize } = require('svg2png-wasm'); 48 | const wasmBuff = await readFile(path.resolve(path.dirname(require.resolve('svg2png-wasm')), '../svg2png_wasm_bg.wasm')); 49 | const fontBuff = await readFile(path.resolve(__dirname, '../../licensed/SourceHanSansSC-Regular.otf')); 50 | await initialize(wasmBuff); 51 | this.svg2png = createSvg2png({ 52 | fonts: [Uint8Array.from(fontBuff)], 53 | defaultFontFamily: { 54 | serifFamily: 'Source Han Sans SC', 55 | sansSerifFamily: 'Source Han Sans SC', 56 | cursiveFamily: 'Source Han Sans SC', 57 | fantasyFamily: 'Source Han Sans SC', 58 | monospaceFamily: 'Source Han Sans SC', 59 | } 60 | }); 61 | } 62 | this.canvas = require('@napi-rs/canvas'); 63 | 64 | this.emit('ready'); 65 | } 66 | 67 | 
    // Rasterizes SVG markup into PNG bytes. @Threaded dispatches the call to a
    // worker thread, where init() has set up the svg2png wasm pipeline.
    @Threaded()
    async renderSvgToPng(svgContent: string,) {
        // Light-gray background so fully transparent SVGs stay visible.
        return this.svg2png(svgContent, { backgroundColor: '#D3D3D3' });
    }

    // Decodes `input` (data: URL, http(s) URL, or raw Buffer) into a canvas Image.
    // SVG payloads are rasterized to PNG first. The do { … } while (false) acts as
    // a structured "goto end": each accepted input form `break`s out once
    // buff/contentType have been set.
    protected async _loadImage(input: string | Buffer) {
        let buff;
        let contentType;
        do {
            if (typeof input === 'string') {
                if (input.startsWith('data:')) {
                    // Shape: data:<mediatype>[;base64],<payload>
                    const firstComma = input.indexOf(',');
                    const header = input.slice(0, firstComma);
                    const data = input.slice(firstComma + 1);
                    const encoding = header.split(';')[1];
                    contentType = header.split(';')[0].split(':')[1];
                    if (encoding?.startsWith('base64')) {
                        buff = Buffer.from(data, 'base64');
                    } else {
                        // Percent-encoded textual payload (e.g. inline SVG markup).
                        buff = Buffer.from(decodeURIComponent(data), 'utf-8');
                    }
                    break;
                }
                if (input.startsWith('http')) {
                    const r = await downloadFile(input);
                    buff = Buffer.from(r.buff);
                    contentType = r.contentType;
                    break;
                }
            }
            if (Buffer.isBuffer(input)) {
                buff = input;
                // Sniff the mime type from the magic bytes.
                const mime = await mimeOf(buff);
                contentType = `${mime.mediaType}/${mime.subType}`;
                break;
            }
            throw new ParamValidationError('Invalid input');
        } while (false);

        if (!buff) {
            throw new ParamValidationError('Invalid input');
        }

        if (contentType?.includes('svg')) {
            // @napi-rs/canvas cannot decode SVG; go through the wasm rasterizer.
            buff = await this.renderSvgToPng(buff.toString('utf-8'));
        }

        const img = await this.canvas.loadImage(buff);
        // Stash the detected content type on the image object for downstream use.
        Reflect.set(img, 'contentType', contentType);

        return img;
    }

    // Public wrapper around _loadImage: adds timing and normalizes decoder
    // "unsupported format" failures into SubmittedDataMalformedError.
    async loadImage(uri: string | Buffer) {
        const t0 = Date.now();
        try {
            const theImage = await this._loadImage(uri);
            const t1 = Date.now();
            this.logger.debug(`Image loaded in ${t1 - t0}ms`);

            return theImage;
        } catch (err: any) {
            if (err?.message?.includes('Unsupported image type') || err?.message?.includes('unsupported')) {
                this.logger.warn(`Failed to load image ${uri.slice(0, 128)}`, { err });
                throw new SubmittedDataMalformedError(`Unknown image format for ${uri.slice(0, 128)}`);
            }
            throw err;
        }
    }

    // Scales the image down (never up) so it fits a size×size box, preserving
    // aspect ratio; always returns a Canvas (copying an Image 1:1 when small enough).
    fitImageToSquareBox(image: canvas.Image | canvas.Canvas, size: number = 1024) {
        // this.logger.debug(`Fitting image(${ image.width }x${ image.height }) to ${ size } box`);
        // const t0 = Date.now();
        if (image.width <= size && image.height <= size) {
            if (image instanceof this.canvas.Canvas) {
                return image;
            }
            const canvasInstance = this.canvas.createCanvas(image.width, image.height);
            const ctx = canvasInstance.getContext('2d');
            ctx.drawImage(image, 0, 0, image.width, image.height, 0, 0, canvasInstance.width, canvasInstance.height);
            // this.logger.debug(`No need to resize, copied to canvas in ${ Date.now() - t0 } ms`);

            return canvasInstance;
        }

        const aspectRatio = image.width / image.height;

        // Landscape (ratio > 1): clamp width to `size`; portrait/square: clamp height.
        const resizedWidth = Math.round(aspectRatio > 1 ? size : size * aspectRatio);
        const resizedHeight = Math.round(aspectRatio > 1 ?
size / aspectRatio : size); 156 | 157 | const canvasInstance = this.canvas.createCanvas(resizedWidth, resizedHeight); 158 | const ctx = canvasInstance.getContext('2d'); 159 | ctx.drawImage(image, 0, 0, image.width, image.height, 0, 0, resizedWidth, resizedHeight); 160 | // this.logger.debug(`Resized to ${ resizedWidth }x${ resizedHeight } in ${ Date.now() - t0 } ms`); 161 | 162 | return canvasInstance; 163 | } 164 | 165 | corpImage(image: canvas.Image | canvas.Canvas, x: number, y: number, w: number, h: number) { 166 | // this.logger.debug(`Cropping image(${ image.width }x${ image.height }) to ${ w }x${ h } at ${ x },${ y } `); 167 | // const t0 = Date.now(); 168 | const canvasInstance = this.canvas.createCanvas(w, h); 169 | const ctx = canvasInstance.getContext('2d'); 170 | ctx.drawImage(image, x, y, w, h, 0, 0, w, h); 171 | // this.logger.debug(`Crop complete in ${ Date.now() - t0 } ms`); 172 | 173 | return canvasInstance; 174 | } 175 | 176 | canvasToDataUrl(canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') { 177 | // this.logger.debug(`Exporting canvas(${ canvas.width }x${ canvas.height })`); 178 | // const t0 = Date.now(); 179 | return canvas.toDataURLAsync((mimeType || 'image/png') as 'image/png'); 180 | } 181 | 182 | async canvasToBuffer(canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') { 183 | // this.logger.debug(`Exporting canvas(${ canvas.width }x${ canvas.height })`); 184 | // const t0 = Date.now(); 185 | return canvas.toBuffer((mimeType || 'image/png') as 'image/png'); 186 | } 187 | 188 | } 189 | 190 | const instance = container.resolve(CanvasService); 191 | export default instance; 192 | -------------------------------------------------------------------------------- /src/services/cf-browser-rendering.ts: -------------------------------------------------------------------------------- 1 | import { container, singleton } from 'tsyringe'; 2 | import { AsyncService } from 'civkit/async-service'; 3 | import { SecretExposer } from 
'../shared/services/secrets'; 4 | import { GlobalLogger } from './logger'; 5 | import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare'; 6 | import { HTTPServiceError } from 'civkit/http'; 7 | import { ServiceNodeResourceDrainError } from './errors'; 8 | 9 | @singleton() 10 | export class CFBrowserRendering extends AsyncService { 11 | 12 | logger = this.globalLogger.child({ service: this.constructor.name }); 13 | client!: CloudFlareHTTP; 14 | 15 | constructor( 16 | protected globalLogger: GlobalLogger, 17 | protected secretExposer: SecretExposer, 18 | ) { 19 | super(...arguments); 20 | } 21 | 22 | 23 | override async init() { 24 | await this.dependencyReady(); 25 | const [account, key] = this.secretExposer.CLOUD_FLARE_API_KEY?.split(':'); 26 | this.client = new CloudFlareHTTP(account, key); 27 | 28 | this.emit('ready'); 29 | } 30 | 31 | async fetchContent(url: string) { 32 | try { 33 | const r = await this.client.fetchBrowserRenderedHTML({ url }); 34 | 35 | return r.parsed.result; 36 | } catch (err) { 37 | if (err instanceof HTTPServiceError) { 38 | if (err.status === 429) { 39 | // Rate limit exceeded, return empty result 40 | this.logger.warn('Cloudflare browser rendering rate limit exceeded', { url }); 41 | 42 | throw new ServiceNodeResourceDrainError(`Cloudflare browser rendering (our account) is at capacity, please try again later or switch to another engine.`,); 43 | } 44 | } 45 | 46 | throw err; 47 | } 48 | } 49 | 50 | } 51 | 52 | const instance = container.resolve(CFBrowserRendering); 53 | 54 | export default instance; 55 | -------------------------------------------------------------------------------- /src/services/errors.ts: -------------------------------------------------------------------------------- 1 | import { ApplicationError, StatusCode } from 'civkit/civ-rpc'; 2 | import _ from 'lodash'; 3 | import dayjs from 'dayjs'; 4 | import utc from 'dayjs/plugin/utc'; 5 | 6 | dayjs.extend(utc); 7 | 8 | @StatusCode(50301) 9 | export class 
ServiceDisabledError extends ApplicationError { }

// 503xx — service-side availability problems.
@StatusCode(50302)
export class ServiceCrashedError extends ApplicationError { }

@StatusCode(50303)
export class ServiceNodeResourceDrainError extends ApplicationError { }

@StatusCode(50304)
export class ServiceBadAttemptError extends ApplicationError { }

@StatusCode(50305)
export class ServiceBadApproachError extends ServiceBadAttemptError { }

// 401xx / 402xx — authentication and billing problems.
@StatusCode(40104)
export class EmailUnverifiedError extends ApplicationError { }

@StatusCode(40201)
export class InsufficientCreditsError extends ApplicationError { }

@StatusCode(40202)
export class TierFeatureConstraintError extends ApplicationError { }

@StatusCode(40203)
export class InsufficientBalanceError extends ApplicationError { }

// 409xx — conflict-style failures.
@StatusCode(40903)
export class LockConflictError extends ApplicationError { }

@StatusCode(40904)
export class BudgetExceededError extends ApplicationError { }

// 451xx — content objections.
@StatusCode(45101)
export class HarmfulContentError extends ApplicationError { }

@StatusCode(45102)
export class SecurityCompromiseError extends ApplicationError { }

@StatusCode(41201)
export class BatchSizeTooLargeError extends ApplicationError { }
--------------------------------------------------------------------------------
/src/services/finalizer.ts:
--------------------------------------------------------------------------------
import { AbstractFinalizerService } from 'civkit/finalizer';
import { container, singleton } from 'tsyringe';
import { isMainThread } from 'worker_threads';
import { GlobalLogger } from './logger';

// Monkey-patch process.exit: on the MAIN thread it becomes a no-op, so nothing
// can terminate the process directly — shutdown must go through
// FinalizerService.quitProcess below, which holds the real exit.
// Worker threads keep the original behavior.
const realProcessExit = process.exit;
process.exit = ((code?: number) => {
    if (isMainThread) {
        return;
    }
    return realProcessExit(code);
}) as typeof process.exit;

@singleton()
export class
FinalizerService extends AbstractFinalizerService { 16 | 17 | container = container; 18 | logger = this.globalLogger.child({ service: this.constructor.name }); 19 | 20 | override quitProcess(code?: string | number | null | undefined): never { 21 | return realProcessExit(code); 22 | } 23 | 24 | constructor(protected globalLogger: GlobalLogger) { 25 | super(...arguments); 26 | } 27 | 28 | override onUnhandledRejection(err: unknown, _triggeringPromise: Promise): void { 29 | this.logger.warn(`Unhandled promise rejection in pid ${process.pid}`, { err }); 30 | } 31 | } 32 | 33 | const instance = container.resolve(FinalizerService); 34 | export const { Finalizer } = instance.decorators(); 35 | export default instance; 36 | 37 | if (isMainThread) { 38 | instance.serviceReady(); 39 | } 40 | -------------------------------------------------------------------------------- /src/services/geoip.ts: -------------------------------------------------------------------------------- 1 | import { container, singleton } from 'tsyringe'; 2 | import fsp from 'fs/promises'; 3 | import { CityResponse, Reader } from 'maxmind'; 4 | import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit'; 5 | import { GlobalLogger } from './logger'; 6 | import path from 'path'; 7 | import { Threaded } from './threaded'; 8 | 9 | export enum GEOIP_SUPPORTED_LANGUAGES { 10 | EN = 'en', 11 | ZH_CN = 'zh-CN', 12 | JA = 'ja', 13 | DE = 'de', 14 | FR = 'fr', 15 | ES = 'es', 16 | PT_BR = 'pt-BR', 17 | RU = 'ru', 18 | } 19 | 20 | export class GeoIPInfo extends AutoCastable { 21 | @Prop() 22 | code?: string; 23 | 24 | @Prop() 25 | name?: string; 26 | } 27 | 28 | export class GeoIPCountryInfo extends GeoIPInfo { 29 | @Prop() 30 | eu?: boolean; 31 | } 32 | 33 | export class GeoIPCityResponse extends AutoCastable { 34 | @Prop() 35 | continent?: GeoIPInfo; 36 | 37 | @Prop() 38 | country?: GeoIPCountryInfo; 39 | 40 | @Prop({ 41 | arrayOf: GeoIPInfo 42 | }) 43 | subdivisions?: GeoIPInfo[]; 44 | 45 | @Prop() 46 | 
city?: string; 47 | 48 | @Prop({ 49 | arrayOf: Number 50 | }) 51 | coordinates?: [number, number, number]; 52 | 53 | @Prop() 54 | timezone?: string; 55 | } 56 | 57 | @singleton() 58 | export class GeoIPService extends AsyncService { 59 | 60 | logger = this.globalLogger.child({ service: this.constructor.name }); 61 | 62 | mmdbCity!: Reader; 63 | 64 | constructor( 65 | protected globalLogger: GlobalLogger, 66 | ) { 67 | super(...arguments); 68 | } 69 | 70 | 71 | override async init() { 72 | await this.dependencyReady(); 73 | 74 | this.emit('ready'); 75 | } 76 | 77 | @runOnce() 78 | async _lazyload() { 79 | const mmdpPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb'); 80 | 81 | const dbBuff = await fsp.readFile(mmdpPath, { flag: 'r', encoding: null }); 82 | 83 | this.mmdbCity = new Reader(dbBuff); 84 | 85 | this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`); 86 | } 87 | 88 | 89 | @Threaded() 90 | async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) { 91 | await this._lazyload(); 92 | 93 | const r = this.mmdbCity.get(ip); 94 | 95 | if (!r) { 96 | return undefined; 97 | } 98 | 99 | return GeoIPCityResponse.from({ 100 | continent: r.continent ? { 101 | code: r.continent?.code, 102 | name: r.continent?.names?.[lang] || r.continent?.names?.en, 103 | } : undefined, 104 | country: r.country ? { 105 | code: r.country?.iso_code, 106 | name: r.country?.names?.[lang] || r.country?.names.en, 107 | eu: r.country?.is_in_european_union, 108 | } : undefined, 109 | city: r.city?.names?.[lang] || r.city?.names?.en, 110 | subdivisions: r.subdivisions?.map((x) => ({ 111 | code: x.iso_code, 112 | name: x.names?.[lang] || x.names?.en, 113 | })), 114 | coordinates: r.location ? 
[
                r.location.latitude, r.location.longitude, r.location.accuracy_radius
            ] : undefined,
            timezone: r.location?.time_zone,
        });
    }

    // Batch variant of lookupCity; entries with no DB hit are dropped.
    @Threaded()
    async lookupCities(ips: string[], lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
        const r = (await Promise.all(ips.map((ip) => this.lookupCity(ip, lang)))).filter(Boolean) as GeoIPCityResponse[];

        return r;
    }

}

const instance = container.resolve(GeoIPService);

export default instance;
--------------------------------------------------------------------------------
/src/services/lm.ts:
--------------------------------------------------------------------------------
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';

import { PageSnapshot } from './puppeteer';
import { GlobalLogger } from './logger';
import _ from 'lodash';
import { AssertionFailureError } from 'civkit';
import { LLMManager } from '../shared/services/common-llm';
import { JSDomControl } from './jsdom';

const tripleBackTick = '```';

// LLM-backed snapshot-to-markdown/JSON conversions.
@singleton()
export class LmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        protected commonLLM: LLMManager,
        protected jsdomControl: JSDomControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    // Streams a markdown rendition of the page produced by Gemini from the page
    // HTML plus a screenshot. Yields a fresh PageSnapshot per streamed delta,
    // with parsed.textContent holding everything accumulated so far.
    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        if (!pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        // Strip scripts/styles/etc. to keep the prompt small.
        const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');

        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                `HTML: \n${html}\n\nSCREENSHOT: \n`,
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        const chunks: string[] = [];
        for await (const txt of it) {
            chunks.push(txt);
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: chunks.join(''),
                }
            };
            yield output;
        }

        return;
    }

    // Streams a markdown conversion of the snapshot HTML via readerlm-v2,
    // yielding an updated PageSnapshot per streamed delta.
    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n`,

            options: {
                // system: 'You are an AI assistant developed by VENDOR_NAME',
                stream: true,
                // Deterministic decoding with mild anti-repetition penalties.
                modelSpecific: {
                    top_k: 1,
                    temperature: 0,
                    repetition_penalty: 1.13,
                    presence_penalty: 0.25,
                    frequency_penalty: 0.25,
                    max_tokens: 8192,
                }
            },
            maxTry: 1,
        });

        const chunks: string[] = [];
        for await (const txt of it) {
            chunks.push(txt);
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: chunks.join(''),
                }
            };
            yield output;
        }
return; 111 | } 112 | 113 | async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) { 114 | if (!snapshot) { 115 | throw new AssertionFailureError('Snapshot of the page is not available'); 116 | } 117 | 118 | const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg'); 119 | 120 | const it = this.commonLLM.iterRun('readerlm-v2', { 121 | prompt: `${instruction}\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`, 122 | options: { 123 | // system: 'You are an AI assistant developed by VENDOR_NAME', 124 | stream: true, 125 | modelSpecific: { 126 | top_k: 1, 127 | temperature: 0, 128 | repetition_penalty: 1.13, 129 | presence_penalty: 0.25, 130 | frequency_penalty: 0.25, 131 | max_tokens: 8192, 132 | } 133 | }, 134 | maxTry: 1, 135 | }); 136 | 137 | const chunks: string[] = []; 138 | for await (const txt of it) { 139 | chunks.push(txt); 140 | const output: PageSnapshot = { 141 | ...snapshot, 142 | parsed: { 143 | ...snapshot?.parsed, 144 | textContent: chunks.join(''), 145 | } 146 | }; 147 | yield output; 148 | } 149 | 150 | return; 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/services/logger.ts: -------------------------------------------------------------------------------- 1 | import { AbstractPinoLogger } from 'civkit/pino-logger'; 2 | import { singleton, container } from 'tsyringe'; 3 | import { threadId } from 'node:worker_threads'; 4 | import { getTraceCtx } from 'civkit/async-context'; 5 | 6 | 7 | const levelToSeverityMap: { [k: string]: string | undefined; } = { 8 | trace: 'DEFAULT', 9 | debug: 'DEBUG', 10 | info: 'INFO', 11 | warn: 'WARNING', 12 | error: 'ERROR', 13 | fatal: 'CRITICAL', 14 | }; 15 | 16 | @singleton() 17 | export 
class GlobalLogger extends AbstractPinoLogger {
    loggerOptions = {
        level: 'debug',
        base: {
            tid: threadId, // worker thread id, for disambiguating interleaved logs
        }
    };

    override init(): void {
        // Production: raw JSON to stdout (for Cloud Logging ingestion);
        // otherwise: human-friendly single-line pretty printing.
        if (process.env['NODE_ENV']?.startsWith('prod')) {
            super.init(process.stdout);
        } else {
            const PinoPretty = require('pino-pretty').PinoPretty;
            super.init(PinoPretty({
                singleLine: true,
                colorize: true,
                messageFormat(log: any, messageKey: any) {
                    return `${log['tid'] ? `[${log['tid']}]` : ''}[${log['service'] || 'ROOT'}] ${log[messageKey]}`;
                },
            }));
        }


        this.emit('ready');
    }

    // Decorates every record with a Cloud-Logging `severity` and, when a trace
    // context is active under GCP, the cross-service trace correlation field.
    override log(...args: any[]) {
        const [levelObj, ...rest] = args;
        const severity = levelToSeverityMap[levelObj?.level];
        const traceCtx = getTraceCtx();
        // FIX: was `const patched: any=` — typed as an open record instead of `any`.
        const patched: Record<string, any> = { ...levelObj, severity };
        const traceId = traceCtx?.googleTraceId || traceCtx?.traceId;
        if (traceId && process.env['GCLOUD_PROJECT']) {
            patched['logging.googleapis.com/trace'] = `projects/${process.env['GCLOUD_PROJECT']}/traces/${traceId}`;
        }
        return super.log(patched, ...rest);
    }
}

const instance = container.resolve(GlobalLogger);
export default instance;
--------------------------------------------------------------------------------
/src/services/misc.ts:
--------------------------------------------------------------------------------
import { singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { ParamValidationError } from 'civkit/civ-rpc';
import { SecurityCompromiseError } from '../shared/lib/errors';
import { isIP } from 'node:net';
import { isIPInNonPublicRange } from '../utils/ip';
import { GlobalLogger } from './logger';
import { lookup } from 'node:dns/promises';
import { Threaded } from './threaded';

const normalizeUrl = require('@esm2cjs/normalize-url').default;

@singleton()
export class
MiscService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    // Normalizes `input` into a URL and verifies it is safe to fetch (SSRF guard):
    // http(s)/blob only, no localhost, no private/reserved IPs — whether given as
    // a literal or reached via DNS. Returns the parsed URL plus the IPs it is
    // known to resolve to.
    @Threaded()
    async assertNormalizedUrl(input: string) {
        let result: URL;
        try {
            result = new URL(
                normalizeUrl(
                    input,
                    {
                        // Preserve the URL as much as possible; only canonicalize.
                        stripWWW: false,
                        removeTrailingSlash: false,
                        removeSingleSlash: false,
                        sortQueryParameters: false,
                    }
                )
            );
        } catch (err) {
            throw new ParamValidationError({
                message: `${err}`,
                path: 'url'
            });
        }

        if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
            throw new ParamValidationError({
                message: `Invalid protocol ${result.protocol}`,
                path: 'url'
            });
        }

        // Strip brackets from IPv6 literals (e.g. [::1]) before the IP checks.
        const normalizedHostname = result.hostname.startsWith('[') ?
result.hostname.slice(1, -1) : result.hostname;
        // FIX: never reassigned — declare as const.
        const ips: string[] = [];
        const isIp = isIP(normalizedHostname);
        if (isIp) {
            ips.push(normalizedHostname);
        }
        if (
            (result.hostname === 'localhost') ||
            (isIp && isIPInNonPublicRange(normalizedHostname))
        ) {
            this.logger.warn(`Suspicious action: Request to localhost or non-public IP: ${normalizedHostname}`, { href: result.href });
            throw new SecurityCompromiseError({
                message: `Suspicious action: Request to localhost or non-public IP: ${normalizedHostname}`,
                path: 'url'
            });
        }
        if (!isIp && result.protocol !== 'blob:') {
            // Resolve the hostname to catch domains that point at internal addresses.
            // Non-ENOTFOUND resolver errors are deliberately swallowed (best effort):
            // `resolved` stays undefined and the URL passes without IP info.
            const resolved = await lookup(result.hostname, { all: true }).catch((err) => {
                if (err.code === 'ENOTFOUND') {
                    return Promise.reject(new ParamValidationError({
                        message: `Domain '${result.hostname}' could not be resolved`,
                        path: 'url'
                    }));
                }

                return;
            });
            if (resolved) {
                for (const x of resolved) {
                    if (isIPInNonPublicRange(x.address)) {
                        this.logger.warn(`Suspicious action: Domain resolved to non-public IP: ${result.hostname} => ${x.address}`, { href: result.href, ip: x.address });
                        throw new SecurityCompromiseError({
                            message: `Suspicious action: Domain resolved to non-public IP: ${x.address}`,
                            path: 'url'
                        });
                    }
                    ips.push(x.address);
                }

            }
        }

        return {
            url: result,
            ips
        };
    }

}
--------------------------------------------------------------------------------
/src/services/pdf-extract.ts:
--------------------------------------------------------------------------------
import { singleton } from 'tsyringe';
import _ from 'lodash';
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
import { GlobalLogger } from './logger';
import { PDFContent } from '../db/pdf';
import
dayjs from 'dayjs'; 8 | import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; 9 | import { randomUUID } from 'crypto'; 10 | import type { PDFDocumentLoadingTask } from 'pdfjs-dist'; 11 | import path from 'path'; 12 | import { AsyncLocalContext } from './async-context'; 13 | const utc = require('dayjs/plugin/utc'); // Import the UTC plugin 14 | dayjs.extend(utc); // Extend dayjs with the UTC plugin 15 | const timezone = require('dayjs/plugin/timezone'); 16 | dayjs.extend(timezone); 17 | 18 | const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs'); 19 | const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/'; 20 | 21 | const md5Hasher = new HashManager('md5', 'hex'); 22 | 23 | function stdDev(numbers: number[]) { 24 | const mean = _.mean(numbers); 25 | const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2)); 26 | const avgSquareDiff = _.mean(squareDiffs); 27 | return Math.sqrt(avgSquareDiff); 28 | } 29 | 30 | function isRotatedByAtLeast35Degrees(transform?: [number, number, number, number, number, number]): boolean { 31 | if (!transform) { 32 | return false; 33 | } 34 | const [a, b, c, d, _e, _f] = transform; 35 | 36 | // Calculate the rotation angles using arctan(b/a) and arctan(-c/d) 37 | const angle1 = Math.atan2(b, a) * (180 / Math.PI); // from a, b 38 | const angle2 = Math.atan2(-c, d) * (180 / Math.PI); // from c, d 39 | 40 | // Either angle1 or angle2 can be used to determine the rotation, they should be equivalent 41 | const rotationAngle1 = Math.abs(angle1); 42 | const rotationAngle2 = Math.abs(angle2); 43 | 44 | // Check if the absolute rotation angle is greater than or equal to 35 degrees 45 | return rotationAngle1 >= 35 || rotationAngle2 >= 35; 46 | } 47 | 48 | @singleton() 49 | export class PDFExtractor extends AsyncService { 50 | 51 | logger = this.globalLogger.child({ service: this.constructor.name }); 52 | pdfjs!: Awaited; 53 | 54 | cacheRetentionMs = 1000 * 3600 * 24 * 7; 55 
| 56 | constructor( 57 | protected globalLogger: GlobalLogger, 58 | protected firebaseObjectStorage: FirebaseStorageBucketControl, 59 | protected asyncLocalContext: AsyncLocalContext, 60 | ) { 61 | super(...arguments); 62 | } 63 | 64 | override async init() { 65 | await this.dependencyReady(); 66 | this.pdfjs = await pPdfjs; 67 | 68 | this.emit('ready'); 69 | } 70 | 71 | isDataUrl(url: string) { 72 | return url.startsWith('data:'); 73 | } 74 | 75 | parseDataUrl(url: string) { 76 | const protocol = url.slice(0, url.indexOf(':')); 77 | const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';')); 78 | const data = url.slice(url.indexOf(',') + 1); 79 | if (protocol !== 'data' || !data) { 80 | throw new Error('Invalid data URL'); 81 | } 82 | 83 | if (contentType !== 'application/pdf') { 84 | throw new Error('Invalid data URL type'); 85 | } 86 | 87 | return { 88 | type: contentType, 89 | data: data 90 | }; 91 | } 92 | 93 | async extract(url: string | URL) { 94 | let loadingTask: PDFDocumentLoadingTask; 95 | 96 | if (typeof url === 'string' && this.isDataUrl(url)) { 97 | const { data } = this.parseDataUrl(url); 98 | const binary = Uint8Array.from(Buffer.from(data, 'base64')); 99 | loadingTask = this.pdfjs.getDocument({ 100 | data: binary, 101 | disableFontFace: true, 102 | verbosity: 0, 103 | cMapUrl: nodeCmapUrl, 104 | }); 105 | } else { 106 | loadingTask = this.pdfjs.getDocument({ 107 | url, 108 | disableFontFace: true, 109 | verbosity: 0, 110 | cMapUrl: nodeCmapUrl, 111 | }); 112 | } 113 | 114 | 115 | const doc = await loadingTask.promise; 116 | const meta = await doc.getMetadata(); 117 | 118 | const textItems: TextItem[][] = []; 119 | 120 | for (const pg of _.range(0, doc.numPages)) { 121 | const page = await doc.getPage(pg + 1); 122 | const textContent = await page.getTextContent({ includeMarkedContent: true }); 123 | textItems.push((textContent.items as TextItem[])); 124 | } 125 | 126 | const articleCharHeights: number[] = []; 127 | for (const textItem of 
textItems.flat()) {
            if (textItem.height) {
                // Weight each item by its character count so long runs dominate the stats.
                articleCharHeights.push(...Array(textItem.str.length).fill(textItem.height));
            }
        }
        const articleAvgHeight = _.mean(articleCharHeights);
        const articleStdDevHeight = stdDev(articleCharHeights);
        // const articleMedianHeight = articleCharHeights.sort()[Math.floor(articleCharHeights.length / 2)];
        const mdOps: Array<{
            text: string;
            op?: 'new' | 'append';
            mode: 'h1' | 'h2' | 'p' | 'appendix' | 'space';
        }> = [];

        const rawChunks: string[] = [];

        let op: 'append' | 'new' = 'new';
        let mode: 'h1' | 'h2' | 'p' | 'space' | 'appendix' = 'p';
        for (const pageTextItems of textItems) {
            // Per-page height stats, used for the "smaller than page average" test below.
            const charHeights = [];
            for (const textItem of pageTextItems as TextItem[]) {
                if (textItem.height) {
                    charHeights.push(...Array(textItem.str.length).fill(textItem.height));
                }
                rawChunks.push(`${textItem.str}${textItem.hasEOL ? '\n' : ''}`);
            }

            const avgHeight = _.mean(charHeights);
            const stdDevHeight = stdDev(charHeights);
            // const medianHeight = charHeights.sort()[Math.floor(charHeights.length / 2)];

            for (const textItem of pageTextItems) {
                // Classify each item by glyph height relative to document-wide stats:
                // >3σ above mean => h1, >2σ => h2; well below the page mean => appendix
                // (footnotes etc.); zero height => whitespace/layout item.
                if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
                    mode = 'h1';
                } else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
                    mode = 'h2';
                } else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
                    mode = 'appendix';
                } else if (textItem.height) {
                    mode = 'p';
                } else {
                    mode = 'space';
                }

                // Heavily rotated text (>=35°) is treated as marginalia.
                if (isRotatedByAtLeast35Degrees(textItem.transform as any)) {
                    mode = 'appendix';
                }

                mdOps.push({
                    op,
                    mode,
                    text: textItem.str
                });

                // An empty item that carries EOL terminates the current paragraph.
                if (textItem.hasEOL && !textItem.str) {
                    op = 'new';
                } else {
                    op = 'append';
                }
            }
        }

        const mdChunks = [];
        const appendixChunks = [];
        mode = 'space';
        // Replay mdOps into markdown chunks; appendix text is buffered separately
        // and flushed as a blockquote when the mode switches back to body text.
        for (const x of mdOps) {
            const previousMode: string = mode;
            const changeToMdChunks = [];

            const isNewStart = x.mode !== 'space' && (x.op === 'new' || (previousMode === 'appendix' && x.mode !== previousMode));

            if (isNewStart) {
                switch (x.mode) {
                    case 'h1': {
                        changeToMdChunks.push(`\n\n# `);
                        mode = x.mode;
                        break;
                    }

                    case 'h2': {
                        changeToMdChunks.push(`\n\n## `);
                        mode = x.mode;
                        break;
                    }

                    case 'p': {
                        changeToMdChunks.push(`\n\n`);
                        mode = x.mode;
                        break;
                    }

                    case 'appendix': {
                        mode = x.mode;
                        appendixChunks.push(`\n\n`);
                        break;
                    }

                    default: {
                        break;
                    }
                }
            } else {
                // Continuation: insert a joining space unless the previous chunk
                // already ends with whitespace/hyphen, or is a single character.
                if (x.mode === 'appendix' && appendixChunks.length) {
                    const lastChunk = appendixChunks[appendixChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        appendixChunks.push(' ');
                    }
                } else if (mdChunks.length) {
                    const lastChunk = mdChunks[mdChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        changeToMdChunks.push(' ');
                    }
                }
            }

            if (x.text) {
                if (x.mode == 'appendix') {
                    if (appendixChunks.length || isNewStart) {
                        appendixChunks.push(x.text);
                    } else {
                        // No appendix run open yet: keep the text inline.
                        changeToMdChunks.push(x.text);
                    }
                } else {
                    changeToMdChunks.push(x.text);
                }
            }

            // Leaving appendix mode: flush the buffered appendix as a blockquote
            // ahead of the new body-text chunk.
            if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
                const appendix = appendixChunks.join('').split(/\r?\n/).map((x) => x.trim()).filter(Boolean).map((x) => `> ${x}`).join('\n');
                changeToMdChunks.unshift(appendix);
                changeToMdChunks.unshift(`\n\n`);
                appendixChunks.length = 0;
            }

            // Whitespace-only ops contribute at most one separator chunk.
            if (x.mode === 'space' && changeToMdChunks.length) {
                changeToMdChunks.length = 1;
            }
    /**
     * Extracts PDF content for `url`, backed by a two-tier cache:
     * a Firestore record (`PDFContent`) for metadata lookup, and a JSON blob in
     * object storage (`pdfs/<id>`) holding the full extraction payload.
     *
     * @param url            the PDF location (may be a data: URL).
     * @param cacheTolerance max cache age in ms before a re-extract (default 24h).
     * @param alternativeUrl optional canonical URL used for cache keying instead of `url`.
     * @returns `{ meta, content, text }`, or undefined for empty input / unreadable cache payloads.
     * @throws AssertionFailureError when extraction itself fails.
     */
    async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
        if (!url) {
            return undefined;
        }
        // The cache key is derived from the alternative (canonical) URL when provided.
        let nameUrl = alternativeUrl || url;
        const digest = md5Hasher.hash(nameUrl);

        // data: URLs get a synthetic blob:// name so huge payloads never leak into logs/records.
        if (this.isDataUrl(url)) {
            nameUrl = `blob://pdf:${digest}`;
        }

        // blob:-named inputs are never looked up in Firestore; otherwise take the newest record for this digest.
        const cache: PDFContent | undefined = nameUrl.startsWith('blob:') ? undefined :
            (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

        if (cache) {
            const age = Date.now() - cache?.createdAt.valueOf();
            const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
                data: url, url: nameUrl, digest, age, stale, cacheTolerance
            });

            if (!stale) {
                // Fast path: older records carried content/text inline on the Firestore doc.
                if (cache.content && cache.text) {
                    return {
                        meta: cache.meta,
                        content: cache.content,
                        text: cache.text
                    };
                }

                // Newer records keep the payload in object storage only.
                try {
                    const r = await this.firebaseObjectStorage.downloadFile(`pdfs/${cache._id}`);
                    let cached = JSON.parse(r.toString('utf-8'));

                    return {
                        meta: cached.meta,
                        content: cached.content,
                        text: cached.text
                    };
                } catch (err) {
                    // Corrupt/missing blob: log and fall through to undefined (caller treats as cache miss upstream).
                    this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });

                    return undefined;
                }
            }
        }

        let extracted;

        try {
            extracted = await this.extract(url);
        } catch (err: any) {
            this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
            throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
        }

        // Persist only when the request is not Do-Not-Track and the input is addressable (not a blob).
        if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
            const theID = randomUUID();
            // Payload goes to object storage first (awaited); the Firestore record is fire-and-forget.
            await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
                Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
            PDFContent.save(
                PDFContent.from({
                    _id: theID,
                    src: nameUrl,
                    meta: extracted?.meta || {},
                    urlDigest: digest,
                    createdAt: new Date(),
                    expireAt: new Date(Date.now() + this.cacheRetentionMs)
                }).degradeForFireStore()
            ).catch((r) => {
                this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
            });
        }

        return extracted;
    }
pdfDate.slice(2); 359 | 360 | // Define the format without the timezone part first 361 | const dateTimePart = cleanedDate.slice(0, 14); 362 | const timezonePart = cleanedDate.slice(14); 363 | 364 | // Construct the full date string in a standard format 365 | const formattedDate = `${dateTimePart}${timezonePart.replace("'", "").replace("'", "")}`; 366 | 367 | // Parse the date with timezone 368 | const parsedDate = dayjs(formattedDate, "YYYYMMDDHHmmssZ"); 369 | 370 | const date = parsedDate.toDate(); 371 | 372 | if (!date.valueOf()) { 373 | return undefined; 374 | } 375 | 376 | return date; 377 | } 378 | } 379 | -------------------------------------------------------------------------------- /src/services/pseudo-transfer.ts: -------------------------------------------------------------------------------- 1 | import { marshalErrorLike } from 'civkit'; 2 | import { AbstractPseudoTransfer, SYM_PSEUDO_TRANSFERABLE } from 'civkit/pseudo-transfer'; 3 | import { container, singleton } from 'tsyringe'; 4 | 5 | 6 | @singleton() 7 | export class PseudoTransfer extends AbstractPseudoTransfer { 8 | 9 | override async init() { 10 | await this.dependencyReady(); 11 | this.emit('ready'); 12 | } 13 | 14 | } 15 | 16 | const instance = container.resolve(PseudoTransfer); 17 | 18 | Object.defineProperty(Error.prototype, SYM_PSEUDO_TRANSFERABLE, { 19 | value: function () { 20 | const prototype = this; 21 | return { 22 | copyOwnProperty: 'all', 23 | marshall: (input: Error) => marshalErrorLike(input), 24 | unMarshall: (input: object) => { 25 | Object.setPrototypeOf(input, prototype); 26 | return input; 27 | }, 28 | }; 29 | }, 30 | enumerable: false, 31 | }); 32 | instance.expectPseudoTransferableType(Error); 33 | for (const x of [...Object.values(require('./errors')), ...Object.values(require('civkit/civ-rpc'))]) { 34 | if (typeof x === 'function' && x.prototype instanceof Error) { 35 | instance.expectPseudoTransferableType(x as any); 36 | } 37 | } 38 | 39 | 40 | 
// Teach the pseudo-transfer layer how to move URL instances across thread
// boundaries: serialize to a plain { href } object, rebuild with new URL().
// The symbol-keyed factory is non-enumerable so it never shows up in spreads
// or JSON serialization of URL objects.
Object.defineProperty(URL.prototype, SYM_PSEUDO_TRANSFERABLE, {
    value: function () {
        return {
            copyOwnProperty: 'none',
            marshall: (input: URL) => ({ href: input.href }),
            unMarshall: (input: { href: string; }) => new URL(input.href),
        };
    },
    enumerable: false,
});
instance.expectPseudoTransferableType(URL);

// Buffers cross the boundary as raw bytes; on the receiving side a plain
// Uint8Array is re-wrapped into a Buffer (no copy when already a Buffer).
Object.defineProperty(Buffer.prototype, SYM_PSEUDO_TRANSFERABLE, {
    value: function () {
        return {
            copyOwnProperty: 'none',
            unMarshall: (input: Uint8Array | Buffer) => Buffer.isBuffer(input) ? input : Buffer.from(input),
            marshall: (input: Uint8Array | Buffer) => input,
        };
    },
    enumerable: false,
});
instance.expectPseudoTransferableType(Buffer);


export default instance;
/**
 * Central RPC registry for the Reader API, built on civkit's Koa integration.
 * Declares body-parsing middleware, wires request lifecycle events into the
 * BlackHoleDetector liveness watchdog, and exposes the method decorators used
 * across the API modules.
 */
@singleton()
export class RPCRegistry extends KoaRPCRegistry {

    title = 'Jina Reader API';
    container = container;
    logger = this.globalLogger.child({ service: this.constructor.name });
    // All responses are wrapped in an integrity envelope (checksummed payloads).
    static override envelope = IntegrityEnvelope;
    // Generous limit: clients may POST whole documents/PDFs inline.
    override _BODY_PARSER_LIMIT = '102mb';
    override _RESPONSE_STREAM_MODE = 'koa' as const;

    // Order matters: CORS first, then JSON/form body parsing, then the
    // multipart and raw-binary parsers provided by the base class.
    override koaMiddlewares = [
        this.__CORSAllowAllMiddleware.bind(this),
        bodyParser({
            encoding: 'utf-8',
            enableTypes: ['json', 'form'],
            jsonLimit: this._BODY_PARSER_LIMIT,
            xmlLimit: this._BODY_PARSER_LIMIT,
            formLimit: this._BODY_PARSER_LIMIT,
        }),
        this.__multiParse.bind(this),
        this.__binaryParse.bind(this),
    ];

    constructor(
        protected globalLogger: GlobalLogger,
        protected ctxMgr: AsyncLocalContext,
        protected tempFileManager: TempFileManager,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);

        // Feed request begin/end events to the liveness watchdog; both 'ran'
        // and 'fail' count as completion so in-flight bookkeeping balances.
        this.on('run', () => this.blackHoleDetector.incomingRequest());
        this.on('ran', () => this.blackHoleDetector.doneWithRequest());
        this.on('fail', () => this.blackHoleDetector.doneWithRequest());
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

}

// Singleton instance plus the decorator set (@Method, @Param, etc.) consumed
// by API classes throughout the codebase.
const instance = container.resolve(RPCRegistry);
export default instance;
export const { Method, RPCMethod, RPCReflect, Param, Ctx, } = instance.decorators();
/**
 * Fetches, caches, and evaluates robots.txt policies.
 *
 * NOTE(review): the parser below is a deliberate simplification of the robots
 * exclusion protocol — it matches user-agent groups by exact (lowercased)
 * token equality, does not support multiple consecutive User-Agent lines
 * forming one group, and applies first-match rather than longest-match
 * precedence between Allow/Disallow. Confirm these trade-offs are intended
 * before "fixing" them.
 */
@singleton()
export class RobotsTxtService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Returns the robots.txt body for `origin`, serving from the storage
     * bucket cache when available, otherwise fetching (5s timeout) and caching
     * best-effort.
     *
     * @throws DownstreamServiceFailureError on a non-2xx fetch response.
     */
    async getCachedRobotTxt(origin: string) {
        // Cache key: md5 of the lowercased origin.
        const digest = md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `robots-txt/${digest}`;
        let buff;
        // Any download error (including not-found) is treated as a cache miss.
        buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (buff) {
            return buff.toString();
        }

        const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
        }
        buff = Buffer.from(await r.arrayBuffer());

        // Fire-and-forget cache write; a failure only degrades to re-fetching.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) });
        });

        return buff.toString();
    }

    /**
     * Asserts that `url` may be crawled under the site's robots.txt for the
     * given user-agent token. Returns true when allowed (including when the
     * site provides no usable robots.txt); throws ResourcePolicyDenyError when
     * a matching Disallow rule applies.
     */
    @Threaded()
    async assertAccessAllowed(url: URL, inputMyUa = '*') {
        let robotTxt: string = '';
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        } catch (err) {
            if (err instanceof DownstreamServiceFailureError) {
                // Remote server is reachable but cannot provide a robot.txt; this is treated as public access
                return true;
            }
            throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
        }
        const myUa = inputMyUa.toLowerCase();
        const lines = robotTxt.split(/\r?\n/g);

        // Rules appearing before any User-Agent line are applied to us.
        let currentUa = myUa || '*';
        let uaLine = 'User-Agent: *';
        // Trailing '?' makes startsWith treat an exact-path Disallow as a match
        // (e.g. Disallow: /foo matches path /foo, not only /foo...).
        const pathNormalized = `${url.pathname}?`;

        for (const line of lines) {
            const trimmed = line.trim();
            if (trimmed.startsWith('#') || !trimmed) {
                continue;
            }
            // Re-join on ':' so values containing colons (URLs) survive the split.
            const [k, ...rest] = trimmed.split(':');
            const key = k.trim().toLowerCase();
            const value = rest.join(':').trim();

            if (key === 'user-agent') {
                currentUa = value.toLowerCase();
                // A wildcard group is treated as addressing our own UA.
                if (value === '*') {
                    currentUa = myUa;
                }
                uaLine = line;
                continue;
            }

            // Skip rules belonging to other user-agent groups.
            if (currentUa !== myUa) {
                continue;
            }

            if (key === 'disallow') {
                // Empty Disallow means "allow everything" for this group.
                if (!value) {
                    return true;
                }
                if (value.includes('*')) {
                    // Single-wildcard patterns only: prefix before '*', suffix after.
                    const [head, tail] = value.split('*');
                    if (url.pathname.startsWith(head) && url.pathname.endsWith(tail)) {
                        throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
                    }
                } else if (pathNormalized.startsWith(value)) {
                    throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
                }

                continue;
            }

            if (key === 'allow') {
                // NOTE(review): an empty Allow short-circuits to permit — spec-wise
                // it should be a no-op; confirm this lenient default is intended.
                if (!value) {
                    return true;
                }
                if (pathNormalized.startsWith(value)) {
                    return true;
                }
                continue;
            }
        }

        // No rule matched: default allow.
        return true;
    }

}
from 'tsyringe'; 2 | import { AsyncService } from 'civkit/async-service'; 3 | import { GlobalLogger } from '../logger'; 4 | import { JSDomControl } from '../jsdom'; 5 | import { isMainThread } from 'worker_threads'; 6 | import _ from 'lodash'; 7 | import { WebSearchEntry } from './compat'; 8 | import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer'; 9 | import { CurlControl } from '../curl'; 10 | import { readFile } from 'fs/promises'; 11 | import { ApplicationError } from 'civkit/civ-rpc'; 12 | import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors'; 13 | import { parseJSONText } from 'civkit/vectorize'; 14 | import { retryWith } from 'civkit/decorators'; 15 | import { ProxyProviderService } from '../../shared/services/proxy-provider'; 16 | 17 | @singleton() 18 | export class GoogleSERP extends AsyncService { 19 | logger = this.globalLogger.child({ service: this.constructor.name }); 20 | googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com'; 21 | 22 | constructor( 23 | protected globalLogger: GlobalLogger, 24 | protected puppeteerControl: SERPSpecializedPuppeteerControl, 25 | protected jsDomControl: JSDomControl, 26 | protected curlControl: CurlControl, 27 | protected proxyProvider: ProxyProviderService, 28 | ) { 29 | const filteredDeps = isMainThread ? 
arguments : _.without(arguments, puppeteerControl); 30 | super(...filteredDeps); 31 | } 32 | 33 | override async init() { 34 | await this.dependencyReady(); 35 | 36 | this.emit('ready'); 37 | } 38 | 39 | @retryWith((err) => { 40 | if (err instanceof ServiceBadApproachError) { 41 | return false; 42 | } 43 | if (err instanceof ServiceBadAttemptError) { 44 | // Keep trying 45 | return true; 46 | } 47 | if (err instanceof ApplicationError) { 48 | // Quit with this error 49 | return false; 50 | } 51 | return undefined; 52 | }, 3) 53 | async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) { 54 | if (opts?.allocProxy === 'none') { 55 | return this.curlControl.sideLoad(url, opts); 56 | } 57 | 58 | const proxy = await this.proxyProvider.alloc( 59 | process.env.PREFERRED_PROXY_COUNTRY || 'auto' 60 | ); 61 | this.logger.debug(`Proxy allocated`, { proxy: proxy.href }); 62 | const r = await this.curlControl.sideLoad(url, { 63 | ...opts, 64 | proxyUrl: proxy.href, 65 | }); 66 | 67 | if (r.status === 429) { 68 | throw new ServiceBadAttemptError('Google returned a 429 error. 
This may happen due to various reasons, including rate limiting or other issues.'); 69 | } 70 | 71 | if (opts && opts.allocProxy) { 72 | opts.proxyUrl ??= proxy.href; 73 | } 74 | 75 | return { ...r, proxy }; 76 | } 77 | 78 | digestQuery(query: { [k: string]: any; }) { 79 | const url = new URL(`https://${this.googleDomain}/search`); 80 | const clone = { ...query }; 81 | const num = clone.num || 10; 82 | if (clone.page) { 83 | const page = parseInt(clone.page); 84 | delete clone.page; 85 | clone.start = (page - 1) * num; 86 | if (clone.start === 0) { 87 | delete clone.start; 88 | } 89 | } 90 | if (clone.location) { 91 | delete clone.location; 92 | } 93 | 94 | for (const [k, v] of Object.entries(clone)) { 95 | if (v === undefined || v === null) { 96 | continue; 97 | } 98 | url.searchParams.set(k, `${v}`); 99 | } 100 | 101 | return url; 102 | } 103 | 104 | async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { 105 | const url = this.digestQuery(query); 106 | 107 | const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts); 108 | if (opts && sideLoaded.sideLoadOpts) { 109 | opts.sideLoad = sideLoaded.sideLoadOpts; 110 | } 111 | 112 | const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts); 113 | 114 | return snapshot; 115 | } 116 | 117 | async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { 118 | const url = this.digestQuery(query); 119 | 120 | url.searchParams.set('tbm', 'nws'); 121 | 122 | const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts); 123 | if (opts && sideLoaded.sideLoadOpts) { 124 | opts.sideLoad = sideLoaded.sideLoadOpts; 125 | } 126 | 127 | const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts); 128 | 129 | return snapshot; 130 | } 131 | 132 | async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { 133 | const url = this.digestQuery(query); 134 | 135 | url.searchParams.set('tbm', 'isch'); 136 | 
/**
 * Extracts organic web-search results from a rendered Google SERP.
 *
 * IMPORTANT: this function is serialized and evaluated INSIDE the browser page
 * by `controlledScrap`; it must stay self-contained (no references to module
 * scope) and may only use DOM APIs plus the page-injected `window.waitForSelector`.
 *
 * @returns parsed entries, or undefined when the page carries no query context.
 * @throws when Google served its /sorry or /error interstitial.
 */
async function getWebSearchResults() {
    if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    // Wait for either the results wrapper or the "no results" container.
    // @ts-ignore
    await Promise.race([window.waitForSelector('div[data-async-context^="query"]'), window.waitForSelector('#botstuff .mnr-c')]);

    const wrapper1 = document.querySelector('div[data-async-context^="query"]');

    if (!wrapper1) {
        return undefined;
    }

    // The wrapper encodes the original query; absence means a non-SERP page.
    const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || '');

    if (!query) {
        return undefined;
    }

    // Organic result containers carry either a lang attribute or data-surl.
    const candidates = Array.from(wrapper1.querySelectorAll('div[lang],div[data-surl]'));

    return candidates.map((x, pos) => {
        const primaryLink = x.querySelector('a:not([href="#"])');
        if (!primaryLink) {
            return undefined;
        }
        const url = primaryLink.getAttribute('href');

        // A heading inside the link marks a video card — skipped for now.
        if (primaryLink.querySelector('div[role="heading"]')) {
            // const spans = primaryLink.querySelectorAll('span');
            // const title = spans[0]?.textContent;
            // const source = spans[1]?.textContent;
            // const date = spans[spans.length - 1].textContent;

            // return {
            //     link: url,
            //     title,
            //     source,
            //     date,
            //     variant: 'video'
            // };
            return undefined;
        }

        const title = primaryLink.querySelector('h3')?.textContent;
        const source = Array.from(primaryLink.querySelectorAll('span')).find((x) => x.textContent)?.textContent;
        const cite = primaryLink.querySelector('cite[role=text]')?.textContent;
        // cite text looks like "example.com · 2 days ago" — date after the dot.
        let date = cite?.split('·')[1]?.trim();
        const snippets = Array.from(x.querySelectorAll('div[data-sncf*="1"] span'));
        let snippet = snippets[snippets.length - 1]?.textContent;
        if (!snippet) {
            // Fallback for the legacy snippet container class.
            snippet = x.querySelector('div.IsZvec')?.textContent?.trim() || null;
        }
        // When cite carried no date, the second-to-last snippet span may.
        date ??= snippets[snippets.length - 2]?.textContent?.trim();
        const imageUrl = x.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');
        let siteLinks = Array.from(x.querySelectorAll('div[data-sncf*="3"] a[href]')).map((l) => {
            return {
                link: l.getAttribute('href'),
                title: l.textContent,
            };
        });
        // Expanded sitelinks (table layout) live on an ancestor container.
        const perhapsParent = x.parentElement?.closest('div[data-hveid]');
        if (!siteLinks?.length && perhapsParent) {
            const candidates = Array.from(perhapsParent.querySelectorAll('td h3'));
            if (candidates.length) {
                siteLinks = candidates.map((l) => {
                    const link = l.querySelector('a');
                    if (!link) {
                        return undefined;
                    }
                    const snippet = l.nextElementSibling?.textContent;
                    return {
                        link: link.getAttribute('href'),
                        title: link.textContent,
                        snippet,
                    };
                }).filter(Boolean) as any;
            }
        }

        return {
            link: url,
            title,
            source,
            date,
            snippet: snippet ?? undefined,
            imageUrl: imageUrl?.startsWith('data:') ? undefined : imageUrl,
            siteLinks: siteLinks.length ? siteLinks : undefined,
            variant: 'web',
        };
    }).filter(Boolean) as WebSearchEntry[];
}
/**
 * Extracts news-search results from a rendered Google News SERP (tbm=nws).
 *
 * IMPORTANT: like getWebSearchResults, this is serialized and evaluated INSIDE
 * the browser page; it must stay self-contained and rely only on DOM APIs plus
 * the page-injected `window.waitForSelector`.
 *
 * @returns parsed entries, or undefined when the page carries no query context.
 * @throws when Google served its /sorry or /error interstitial.
 */
async function getNewsSearchResults() {
    if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    // Wait for either the results wrapper or the "no results" container.
    // @ts-ignore
    await Promise.race([window.waitForSelector('div[data-async-context^="query"]'), window.waitForSelector('#botstuff .mnr-c')]);

    const wrapper1 = document.querySelector('div[data-async-context^="query"]');

    if (!wrapper1) {
        return undefined;
    }

    const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || '');

    if (!query) {
        return undefined;
    }

    // Each news card carries a data-news-doc-id attribute.
    const candidates = Array.from(wrapper1.querySelectorAll('div[data-news-doc-id]'));

    return candidates.map((x) => {
        const primaryLink = x.querySelector('a:not([href="#"])');
        if (!primaryLink) {
            return undefined;
        }
        const url = primaryLink.getAttribute('href');
        const titleElem = primaryLink.querySelector('div[role="heading"]');

        if (!titleElem) {
            return undefined;
        }

        // Layout: source sits right before the heading, snippet right after.
        const title = titleElem.textContent?.trim();
        const source = titleElem.previousElementSibling?.textContent?.trim();
        const snippet = titleElem.nextElementSibling?.textContent?.trim();

        // The publish date is the last span within the card's text column.
        const innerSpans = Array.from(titleElem.parentElement?.querySelectorAll('span') || []);
        const date = innerSpans[innerSpans.length - 1]?.textContent?.trim();

        return {
            link: url,
            title,
            source,
            date,
            snippet,
            variant: 'news',
        };
    }).filter(Boolean) as WebSearchEntry[];
}
/**
 * SERP provider backed by the internal Jina SERP API.
 * Exposes the same webSearch/imageSearch/newsSearch surface as the other
 * SERP services, but only the 'web' variant is currently wired up — the
 * image/news branches are commented out below, so those calls fall through
 * to a web search.
 */
@singleton()
export class InternalJinaSerpService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    // Assigned in init(); `!` because construction happens before init.
    client!: JinaSerpApiHTTP;

    constructor(
        protected globalLogger: GlobalLogger,
        protected secretExposer: SecretExposer,
        protected threadLocal: AsyncLocalContext,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        // NOTE(review): 'ready' is emitted before `client` is assigned; this
        // mirrors the sibling Serper services — confirm listeners never touch
        // `client` synchronously from the 'ready' event.
        this.emit('ready');

        this.client = new JinaSerpApiHTTP(this.secretExposer.JINA_SERP_API_KEY);
    }


    /**
     * Performs one search against the internal SERP API.
     * Only the 'web' branch is active; results are normalized so each entry
     * carries a `link` field (copied from the upstream `url`).
     */
    async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
        this.logger.debug(`Doing external search`, query);
        let results;
        switch (variant) {
            // case 'images': {
            //     const r = await this.client.imageSearch(query);

            //     results = r.parsed.images;
            //     break;
            // }
            // case 'news': {
            //     const r = await this.client.newsSearch(query);

            //     results = r.parsed.news;
            //     break;
            // }
            case 'web':
            default: {
                const r = await this.client.webSearch(query);

                results = r.parsed.results?.map((x) => ({ ...x, link: x.url }));
                break;
            }
        }

        // A successful round-trip counts as liveness evidence for the process.
        this.blackHoleDetector.itWorked();

        return results as WebSearchEntry[];
    }


    async webSearch(query: SerperSearchQueryParams) {
        return this.doSearch('web', query);
    }
    async imageSearch(query: SerperSearchQueryParams) {
        return this.doSearch('images', query);
    }
    async newsSearch(query: SerperSearchQueryParams) {
        return this.doSearch('news', query);
    }

}
(variant) { 43 | case 'images': { 44 | const r = await this.client.imageSearch(query); 45 | 46 | results = r.parsed.images; 47 | break; 48 | } 49 | case 'news': { 50 | const r = await this.client.newsSearch(query); 51 | 52 | results = r.parsed.news; 53 | break; 54 | } 55 | case 'web': 56 | default: { 57 | const r = await this.client.webSearch(query); 58 | 59 | results = r.parsed.organic; 60 | break; 61 | } 62 | } 63 | 64 | this.blackHoleDetector.itWorked(); 65 | 66 | return results; 67 | } 68 | 69 | 70 | async webSearch(query: SerperSearchQueryParams) { 71 | return this.doSearch('web', query); 72 | } 73 | async imageSearch(query: SerperSearchQueryParams) { 74 | return this.doSearch('images', query); 75 | } 76 | async newsSearch(query: SerperSearchQueryParams) { 77 | return this.doSearch('news', query); 78 | } 79 | 80 | } 81 | 82 | @singleton() 83 | export class SerperBingSearchService extends SerperGoogleSearchService { 84 | override client!: SerperBingHTTP; 85 | 86 | override async init() { 87 | await this.dependencyReady(); 88 | this.emit('ready'); 89 | 90 | this.client = new SerperBingHTTP(this.secretExposer.SERPER_SEARCH_API_KEY); 91 | } 92 | } 93 | 94 | export class GoogleSearchExplicitOperatorsDto extends AutoCastable { 95 | @Prop({ 96 | arrayOf: String, 97 | desc: `Returns web pages with a specific file extension. Example: to find the Honda GX120 Owner’s manual in PDF, type “Honda GX120 ownners manual ext:pdf”.` 98 | }) 99 | ext?: string | string[]; 100 | 101 | @Prop({ 102 | arrayOf: String, 103 | desc: `Returns web pages created in the specified file type. Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.` 104 | }) 105 | filetype?: string | string[]; 106 | 107 | @Prop({ 108 | arrayOf: String, 109 | desc: `Returns webpages containing the specified term in the title of the page. 
Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.` 110 | }) 111 | intitle?: string | string[]; 112 | 113 | @Prop({ 114 | arrayOf: String, 115 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 116 | }) 117 | loc?: string | string[]; 118 | 119 | @Prop({ 120 | arrayOf: String, 121 | desc: `Returns web pages coming only from a specific web site. Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.` 122 | }) 123 | site?: string | string[]; 124 | 125 | addTo(searchTerm: string) { 126 | const chunks = []; 127 | for (const [key, value] of Object.entries(this)) { 128 | if (value) { 129 | const values = Array.isArray(value) ? value : [value]; 130 | const textValue = values.map((v) => `${key}:${v}`).join(' OR '); 131 | if (textValue) { 132 | chunks.push(textValue); 133 | } 134 | } 135 | } 136 | const opPart = chunks.length > 1 ? 
chunks.map((x) => `(${x})`).join(' AND ') : chunks; 137 | 138 | if (opPart.length) { 139 | return [searchTerm, opPart].join(' '); 140 | } 141 | 142 | return searchTerm; 143 | } 144 | 145 | static override from(input: any) { 146 | const instance = super.from(input) as GoogleSearchExplicitOperatorsDto; 147 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; 148 | 149 | const params = ['ext', 'filetype', 'intitle', 'loc', 'site']; 150 | 151 | for (const p of params) { 152 | const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`); 153 | if (!customValue) { 154 | continue; 155 | } 156 | 157 | const filtered = customValue.split(', ').filter(Boolean); 158 | if (filtered.length) { 159 | Reflect.set(instance, p, filtered); 160 | } 161 | } 162 | 163 | return instance; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/services/serper-search.ts: -------------------------------------------------------------------------------- 1 | import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit'; 2 | import { singleton } from 'tsyringe'; 3 | import { GlobalLogger } from './logger'; 4 | import { SecretExposer } from '../shared/services/secrets'; 5 | import { AsyncLocalContext } from './async-context'; 6 | import { SerperBingHTTP, SerperGoogleHTTP, SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperWebSearchResponse } from '../shared/3rd-party/serper-search'; 7 | import { BlackHoleDetector } from './blackhole-detector'; 8 | import { Context } from './registry'; 9 | import { ServiceBadAttemptError } from '../shared'; 10 | 11 | @singleton() 12 | export class SerperSearchService extends AsyncService { 13 | 14 | logger = this.globalLogger.child({ service: this.constructor.name }); 15 | 16 | serperGoogleSearchHTTP!: SerperGoogleHTTP; 17 | serperBingSearchHTTP!: SerperBingHTTP; 18 | 19 | 
    /**
     * Performs one external search with retry and provider failover.
     *
     * The client iterator yields Bing first when the request marked
     * 'bing-preferred', then Google indefinitely. On an empty result set from
     * a non-final client, the next client is adopted and the attempt is
     * retried (ServiceBadAttemptError). HTTP 429 responses are retried on the
     * same client after a randomized 0.5–1.5s backoff. At most 3 attempts.
     *
     * @param variant which result vertical to query.
     * @param query   upstream Serper query parameters.
     * @returns the parsed upstream response body.
     * @throws DownstreamServiceFailureError when all attempts are exhausted
     *         or a non-retryable error occurs.
     */
    async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
        const clientIt = this.iterClient();
        let client = clientIt.next().value;
        if (!client) {
            throw new Error(`Error iterating serper client`);
        }

        let maxTries = 3;

        while (maxTries--) {
            const t0 = Date.now();
            try {
                this.logger.debug(`Doing external search`, query);
                let r;
                switch (variant) {
                    case 'images': {
                        r = await client.imageSearch(query);
                        // Peek at the next client: only fail over on empty
                        // results if a *different* provider is still available.
                        const nextClient = clientIt.next().value;
                        if (nextClient && nextClient !== client) {
                            const results = r.parsed.images;
                            if (!results.length) {
                                client = nextClient;
                                throw new ServiceBadAttemptError('No results found');
                            }
                        }

                        break;
                    }
                    case 'news': {
                        r = await client.newsSearch(query);
                        const nextClient = clientIt.next().value;
                        if (nextClient && nextClient !== client) {
                            const results = r.parsed.news;
                            if (!results.length) {
                                client = nextClient;
                                throw new ServiceBadAttemptError('No results found');
                            }
                        }

                        break;
                    }
                    case 'web':
                    default: {
                        r = await client.webSearch(query);
                        const nextClient = clientIt.next().value;
                        if (nextClient && nextClient !== client) {
                            const results = r.parsed.organic;
                            if (!results.length) {
                                client = nextClient;
                                throw new ServiceBadAttemptError('No results found');
                            }
                        }

                        break;
                    }
                }
                const dt = Date.now() - t0;
                // Successful upstream round-trip counts as process liveness.
                this.blackHoleDetector.itWorked();
                this.logger.debug(`External search took ${dt}ms`, { searchDt: dt, variant });

                return r.parsed;
            } catch (err: any) {
                const dt = Date.now() - t0;
                this.logger.error(`${variant} search failed: ${err?.message}`, { searchDt: dt, err: marshalErrorLike(err) });
                if (err?.status === 429) {
                    // Rate limited: jittered backoff, then retry (same client).
                    await delay(500 + 1000 * Math.random());
                    continue;
                }
                if (err instanceof ServiceBadAttemptError) {
                    // Empty-result failover raised above; retry with the new client.
                    continue;
                }

                // Non-retryable upstream failure.
                throw new DownstreamServiceFailureError({ message: `Search(${variant}) failed` });
            }
        }

        // All retry attempts exhausted.
        throw new DownstreamServiceFailureError({ message: `Search(${variant}) failed` });
    }
Example: to find the Honda GX120 Owner’s manual in PDF, type “Honda GX120 ownners manual ext:pdf”.` 145 | }) 146 | ext?: string | string[]; 147 | 148 | @Prop({ 149 | arrayOf: String, 150 | desc: `Returns web pages created in the specified file type. Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.` 151 | }) 152 | filetype?: string | string[]; 153 | 154 | @Prop({ 155 | arrayOf: String, 156 | desc: `Returns webpages containing the specified term in the title of the page. Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.` 157 | }) 158 | intitle?: string | string[]; 159 | 160 | @Prop({ 161 | arrayOf: String, 162 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 163 | }) 164 | loc?: string | string[]; 165 | 166 | @Prop({ 167 | arrayOf: String, 168 | desc: `Returns web pages coming only from a specific web site. Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.` 169 | }) 170 | site?: string | string[]; 171 | 172 | addTo(searchTerm: string) { 173 | const chunks = []; 174 | for (const [key, value] of Object.entries(this)) { 175 | if (value) { 176 | const values = Array.isArray(value) ? value : [value]; 177 | const textValue = values.map((v) => `${key}:${v}`).join(' OR '); 178 | if (textValue) { 179 | chunks.push(textValue); 180 | } 181 | } 182 | } 183 | const opPart = chunks.length > 1 ? 
chunks.map((x) => `(${x})`).join(' AND ') : chunks; 184 | 185 | if (opPart.length) { 186 | return [searchTerm, opPart].join(' '); 187 | } 188 | 189 | return searchTerm; 190 | } 191 | 192 | static override from(input: any) { 193 | const instance = super.from(input) as GoogleSearchExplicitOperatorsDto; 194 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; 195 | 196 | const params = ['ext', 'filetype', 'intitle', 'loc', 'site']; 197 | 198 | for (const p of params) { 199 | const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`); 200 | if (!customValue) { 201 | continue; 202 | } 203 | 204 | const filtered = customValue.split(', ').filter(Boolean); 205 | if (filtered.length) { 206 | Reflect.set(instance, p, filtered); 207 | } 208 | } 209 | 210 | return instance; 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /src/services/temp-file.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTempFileManger } from 'civkit/temp'; 2 | import { rm } from 'fs/promises'; 3 | import { singleton } from 'tsyringe'; 4 | import { Finalizer } from './finalizer'; 5 | 6 | @singleton() 7 | export class TempFileManager extends AbstractTempFileManger { 8 | 9 | rootDir = ''; 10 | 11 | override async init() { 12 | await this.dependencyReady(); 13 | await super.init(); 14 | this.emit('ready'); 15 | } 16 | 17 | @Finalizer() 18 | override async standDown() { 19 | await super.standDown(); 20 | 21 | await rm(this.rootDir, { recursive: true, force: true }); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/services/threaded.ts: -------------------------------------------------------------------------------- 1 | import 'reflect-metadata'; 2 | 3 | import { singleton, container } from 'tsyringe'; 4 | import { AbstractThreadedServiceRegistry } from 'civkit/threaded'; 5 | import _ from 'lodash'; 6 | 7 | import { 
GlobalLogger } from './logger'; 8 | import { AsyncLocalContext } from './async-context'; 9 | import { PseudoTransfer } from './pseudo-transfer'; 10 | import { cpus } from 'os'; 11 | import { isMainThread } from 'worker_threads'; 12 | 13 | @singleton() 14 | export class ThreadedServiceRegistry extends AbstractThreadedServiceRegistry { 15 | container = container; 16 | 17 | logger = this.globalLogger.child({ service: this.constructor.name }); 18 | 19 | constructor( 20 | protected globalLogger: GlobalLogger, 21 | public asyncContext: AsyncLocalContext, 22 | public pseudoTransfer: PseudoTransfer, 23 | ) { 24 | super(...arguments); 25 | } 26 | 27 | setMaxWorkersByCpu() { 28 | const cpuStat = cpus(); 29 | 30 | const evenCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 0).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0); 31 | const oddCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 1).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0); 32 | 33 | const isLikelyHyperThreaded = (oddCpuCycles / evenCpuCycles) < 0.5; 34 | 35 | this.maxWorkers = isLikelyHyperThreaded ? 
cpuStat.length / 2 : cpuStat.length; 36 | } 37 | 38 | override async init() { 39 | await this.dependencyReady(); 40 | await super.init(); 41 | 42 | if (isMainThread) { 43 | this.setMaxWorkersByCpu(); 44 | await Promise.all( 45 | _.range(0, 2).map( 46 | (_n) => 47 | new Promise( 48 | (resolve, reject) => { 49 | this.createWorker() 50 | .once('message', resolve) 51 | .once('error', reject); 52 | } 53 | ) 54 | ) 55 | ); 56 | } 57 | 58 | this.emit('ready'); 59 | } 60 | 61 | } 62 | 63 | 64 | const instance = container.resolve(ThreadedServiceRegistry); 65 | export default instance; 66 | export const { Method, Param, Ctx, RPCReflect, Threaded } = instance.decorators(); 67 | -------------------------------------------------------------------------------- /src/shared: -------------------------------------------------------------------------------- 1 | ../thinapps-shared/backend -------------------------------------------------------------------------------- /src/stand-alone/crawl.ts: -------------------------------------------------------------------------------- 1 | import 'reflect-metadata'; 2 | import { container, singleton } from 'tsyringe'; 3 | 4 | import { KoaServer } from 'civkit/civ-rpc/koa'; 5 | import http2 from 'http2'; 6 | import http from 'http'; 7 | import { CrawlerHost } from '../api/crawler'; 8 | import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; 9 | import path from 'path'; 10 | import fs from 'fs'; 11 | import { mimeOfExt } from 'civkit/mime'; 12 | import { Context, Next } from 'koa'; 13 | import { RPCRegistry } from '../services/registry'; 14 | import { AsyncResource } from 'async_hooks'; 15 | import { runOnce } from 'civkit/decorators'; 16 | import { randomUUID } from 'crypto'; 17 | import { ThreadedServiceRegistry } from '../services/threaded'; 18 | import { GlobalLogger } from '../services/logger'; 19 | import { AsyncLocalContext } from '../services/async-context'; 20 | import finalizer, { Finalizer } from '../services/finalizer'; 21 | import 
koaCompress from 'koa-compress'; 22 | 23 | @singleton() 24 | export class CrawlStandAloneServer extends KoaServer { 25 | logger = this.globalLogger.child({ service: this.constructor.name }); 26 | 27 | httpAlternativeServer?: typeof this['httpServer']; 28 | assets = new Map(); 29 | 30 | constructor( 31 | protected globalLogger: GlobalLogger, 32 | protected registry: RPCRegistry, 33 | protected crawlerHost: CrawlerHost, 34 | protected threadLocal: AsyncLocalContext, 35 | protected threads: ThreadedServiceRegistry, 36 | ) { 37 | super(...arguments); 38 | } 39 | 40 | h2c() { 41 | this.httpAlternativeServer = this.httpServer; 42 | const fn = this.koaApp.callback(); 43 | this.httpServer = http2.createServer((req, res) => { 44 | const ar = new AsyncResource('HTTP2ServerRequest'); 45 | ar.runInAsyncScope(fn, this.koaApp, req, res); 46 | }); 47 | // useResourceBasedDefaultTracker(); 48 | 49 | return this; 50 | } 51 | 52 | override async init() { 53 | await this.walkForAssets(); 54 | await super.init(); 55 | } 56 | 57 | async walkForAssets() { 58 | const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); 59 | 60 | for (const file of files) { 61 | if (file.type !== 'file') { 62 | continue; 63 | } 64 | this.assets.set(file.relativePath.toString(), file); 65 | } 66 | } 67 | 68 | override listen(port: number) { 69 | const r = super.listen(port); 70 | if (this.httpAlternativeServer) { 71 | const altPort = port + 1; 72 | this.httpAlternativeServer.listen(altPort, () => { 73 | this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); 74 | }); 75 | } 76 | 77 | return r; 78 | } 79 | 80 | makeAssetsServingController() { 81 | return (ctx: Context, next: Next) => { 82 | const requestPath = ctx.path; 83 | const file = requestPath.slice(1); 84 | if (!file) { 85 | return next(); 86 | } 87 | 88 | const asset = this.assets.get(file); 89 | if (asset?.type !== 'file') { 90 | return next(); 91 | } 92 | 93 | ctx.body = 
fs.createReadStream(asset.path); 94 | ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'; 95 | ctx.set('Content-Length', asset.stats.size.toString()); 96 | 97 | return; 98 | }; 99 | } 100 | 101 | registerRoutes(): void { 102 | this.koaApp.use(koaCompress({ 103 | filter(type) { 104 | if (type.startsWith('text/')) { 105 | return true; 106 | } 107 | 108 | if (type.includes('application/json') || type.includes('+json') || type.includes('+xml')) { 109 | return true; 110 | } 111 | 112 | if (type.includes('application/x-ndjson')) { 113 | return true; 114 | } 115 | 116 | return false; 117 | } 118 | })); 119 | this.koaApp.use(this.makeAssetsServingController()); 120 | this.koaApp.use(this.registry.makeShimController()); 121 | } 122 | 123 | // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context 124 | // TraceId is expected to be request-bound and unique. So these two has to be distinguished. 125 | @runOnce() 126 | override insertAsyncHookMiddleware() { 127 | const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { 128 | const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; 129 | this.threadLocal.setup({ 130 | traceId: randomUUID(), 131 | traceT0: new Date(), 132 | googleTraceId, 133 | }); 134 | 135 | return next(); 136 | }; 137 | 138 | this.koaApp.use(asyncHookMiddleware); 139 | } 140 | 141 | @Finalizer() 142 | override async standDown() { 143 | const tasks: Promise[] = []; 144 | if (this.httpAlternativeServer?.listening) { 145 | (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); 146 | this.httpAlternativeServer.close(); 147 | tasks.push(new Promise((resolve, reject) => { 148 | this.httpAlternativeServer!.close((err) => { 149 | if (err) { 150 | return reject(err); 151 | } 152 | resolve(); 153 | }); 154 | })); 155 | } 156 | tasks.push(super.standDown()); 157 | await Promise.all(tasks); 158 | } 159 | 160 | } 161 | const 
instance = container.resolve(CrawlStandAloneServer); 162 | 163 | export default instance; 164 | 165 | if (process.env.NODE_ENV?.includes('dry-run')) { 166 | instance.serviceReady().then(() => finalizer.terminate()); 167 | } else { 168 | instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); 169 | } -------------------------------------------------------------------------------- /src/stand-alone/search.ts: -------------------------------------------------------------------------------- 1 | import 'reflect-metadata'; 2 | import { container, singleton } from 'tsyringe'; 3 | 4 | import { KoaServer } from 'civkit/civ-rpc/koa'; 5 | import http2 from 'http2'; 6 | import http from 'http'; 7 | import { SearcherHost } from '../api/searcher'; 8 | import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; 9 | import path from 'path'; 10 | import fs from 'fs'; 11 | import { mimeOfExt } from 'civkit/mime'; 12 | import { Context, Next } from 'koa'; 13 | import { RPCRegistry } from '../services/registry'; 14 | import { AsyncResource } from 'async_hooks'; 15 | import { runOnce } from 'civkit/decorators'; 16 | import { randomUUID } from 'crypto'; 17 | import { ThreadedServiceRegistry } from '../services/threaded'; 18 | import { GlobalLogger } from '../services/logger'; 19 | import { AsyncLocalContext } from '../services/async-context'; 20 | import finalizer, { Finalizer } from '../services/finalizer'; 21 | import koaCompress from 'koa-compress'; 22 | 23 | @singleton() 24 | export class SearchStandAloneServer extends KoaServer { 25 | logger = this.globalLogger.child({ service: this.constructor.name }); 26 | 27 | httpAlternativeServer?: typeof this['httpServer']; 28 | assets = new Map(); 29 | 30 | constructor( 31 | protected globalLogger: GlobalLogger, 32 | protected registry: RPCRegistry, 33 | protected searcherHost: SearcherHost, 34 | protected threadLocal: AsyncLocalContext, 35 | protected threads: ThreadedServiceRegistry, 36 | ) { 37 | 
super(...arguments); 38 | } 39 | 40 | h2c() { 41 | this.httpAlternativeServer = this.httpServer; 42 | const fn = this.koaApp.callback(); 43 | this.httpServer = http2.createServer((req, res) => { 44 | const ar = new AsyncResource('HTTP2ServerRequest'); 45 | ar.runInAsyncScope(fn, this.koaApp, req, res); 46 | }); 47 | // useResourceBasedDefaultTracker(); 48 | 49 | return this; 50 | } 51 | 52 | override async init() { 53 | await this.walkForAssets(); 54 | await this.dependencyReady(); 55 | 56 | for (const [k, v] of this.registry.conf.entries()) { 57 | if (v.tags?.includes('crawl')) { 58 | this.registry.conf.delete(k); 59 | } 60 | } 61 | 62 | await super.init(); 63 | } 64 | 65 | async walkForAssets() { 66 | const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); 67 | 68 | for (const file of files) { 69 | if (file.type !== 'file') { 70 | continue; 71 | } 72 | this.assets.set(file.relativePath.toString(), file); 73 | } 74 | } 75 | 76 | override listen(port: number) { 77 | const r = super.listen(port); 78 | if (this.httpAlternativeServer) { 79 | const altPort = port + 1; 80 | this.httpAlternativeServer.listen(altPort, () => { 81 | this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); 82 | }); 83 | } 84 | 85 | return r; 86 | } 87 | 88 | makeAssetsServingController() { 89 | return (ctx: Context, next: Next) => { 90 | const requestPath = ctx.path; 91 | const file = requestPath.slice(1); 92 | if (!file) { 93 | return next(); 94 | } 95 | 96 | const asset = this.assets.get(file); 97 | if (asset?.type !== 'file') { 98 | return next(); 99 | } 100 | 101 | ctx.body = fs.createReadStream(asset.path); 102 | ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'; 103 | ctx.set('Content-Length', asset.stats.size.toString()); 104 | 105 | return; 106 | }; 107 | } 108 | 109 | registerRoutes(): void { 110 | this.koaApp.use(koaCompress({ 111 | filter(type) { 112 | if 
(type.startsWith('text/')) { 113 | return true; 114 | } 115 | 116 | if (type.includes('application/json') || type.includes('+json') || type.includes('+xml')) { 117 | return true; 118 | } 119 | 120 | if (type.includes('application/x-ndjson')) { 121 | return true; 122 | } 123 | 124 | return false; 125 | } 126 | })); 127 | this.koaApp.use(this.makeAssetsServingController()); 128 | this.koaApp.use(this.registry.makeShimController()); 129 | } 130 | 131 | 132 | // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context 133 | // TraceId is expected to be request-bound and unique. So these two has to be distinguished. 134 | @runOnce() 135 | override insertAsyncHookMiddleware() { 136 | const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { 137 | const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; 138 | this.threadLocal.setup({ 139 | traceId: randomUUID(), 140 | traceT0: new Date(), 141 | googleTraceId, 142 | }); 143 | 144 | return next(); 145 | }; 146 | 147 | this.koaApp.use(asyncHookMiddleware); 148 | } 149 | 150 | @Finalizer() 151 | override async standDown() { 152 | const tasks: Promise[] = []; 153 | if (this.httpAlternativeServer?.listening) { 154 | (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); 155 | this.httpAlternativeServer.close(); 156 | tasks.push(new Promise((resolve, reject) => { 157 | this.httpAlternativeServer!.close((err) => { 158 | if (err) { 159 | return reject(err); 160 | } 161 | resolve(); 162 | }); 163 | })); 164 | } 165 | tasks.push(super.standDown()); 166 | await Promise.all(tasks); 167 | } 168 | 169 | } 170 | const instance = container.resolve(SearchStandAloneServer); 171 | 172 | export default instance; 173 | 174 | if (process.env.NODE_ENV?.includes('dry-run')) { 175 | instance.serviceReady().then(() => finalizer.terminate()); 176 | } else { 177 | instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') 
    /**
     * Switches the primary listener to an HTTP/2 cleartext (h2c) server while
     * keeping the original server around as `httpAlternativeServer` (listen()
     * later binds it to port+1). Returns `this` for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const fn = this.koaApp.callback();
        this.httpServer = http2.createServer((req, res) => {
            // Wrap each request in its own AsyncResource so async-context
            // tracking stays per-request even though h2c multiplexes many
            // requests over one connection.
            const ar = new AsyncResource('HTTP2ServerRequest');
            ar.runInAsyncScope(fn, this.koaApp, req, res);
        });
        // useResourceBasedDefaultTracker();

        return this;
    }
(type.includes('application/x-ndjson')) { 123 | return true; 124 | } 125 | 126 | return false; 127 | } 128 | })); 129 | this.koaApp.use(this.makeAssetsServingController()); 130 | this.koaApp.use(this.registry.makeShimController()); 131 | } 132 | 133 | 134 | // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context 135 | // TraceId is expected to be request-bound and unique. So these two has to be distinguished. 136 | @runOnce() 137 | override insertAsyncHookMiddleware() { 138 | const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { 139 | const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; 140 | this.threadLocal.setup({ 141 | traceId: randomUUID(), 142 | traceT0: new Date(), 143 | googleTraceId, 144 | }); 145 | 146 | return next(); 147 | }; 148 | 149 | this.koaApp.use(asyncHookMiddleware); 150 | } 151 | 152 | @Finalizer() 153 | override async standDown() { 154 | const tasks: Promise[] = []; 155 | if (this.httpAlternativeServer?.listening) { 156 | (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); 157 | this.httpAlternativeServer.close(); 158 | tasks.push(new Promise((resolve, reject) => { 159 | this.httpAlternativeServer!.close((err) => { 160 | if (err) { 161 | return reject(err); 162 | } 163 | resolve(); 164 | }); 165 | })); 166 | } 167 | tasks.push(super.standDown()); 168 | await Promise.all(tasks); 169 | } 170 | 171 | } 172 | const instance = container.resolve(SERPStandAloneServer); 173 | 174 | export default instance; 175 | 176 | if (process.env.NODE_ENV?.includes('dry-run')) { 177 | instance.serviceReady().then(() => finalizer.terminate()); 178 | } else { 179 | instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); 180 | } 181 | -------------------------------------------------------------------------------- /src/types.d.ts: -------------------------------------------------------------------------------- 
1 | declare module 'langdetect' { 2 | interface DetectionResult { 3 | lang: string; 4 | prob: number; 5 | } 6 | 7 | export function detect(text: string): DetectionResult[]; 8 | export function detectOne(text: string): string | null; 9 | } 10 | 11 | declare module 'jsdom' { 12 | import EventEmitter from 'events'; 13 | export class JSDOM { 14 | constructor(html: string, options?: any); 15 | window: typeof window; 16 | } 17 | export class VirtualConsole extends EventEmitter { 18 | constructor(); 19 | sendTo(console: any, options?: any); 20 | } 21 | } 22 | 23 | declare module 'simple-zstd' { 24 | import { Duplex } from 'stream'; 25 | export function ZSTDCompress(lvl: Number): Duplex; 26 | export function ZSTDDecompress(): Duplex; 27 | export function ZSTDDecompressMaybe(): Duplex; 28 | } 29 | -------------------------------------------------------------------------------- /src/utils/encoding.ts: -------------------------------------------------------------------------------- 1 | import { createReadStream } from 'fs'; 2 | import { Readable } from 'stream'; 3 | import { TextDecoderStream } from 'stream/web'; 4 | 5 | export async function decodeFileStream( 6 | fileStream: Readable, 7 | encoding: string = 'utf-8', 8 | ): Promise { 9 | const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); 10 | Readable.toWeb(fileStream).pipeThrough(decodeStream); 11 | const chunks = []; 12 | 13 | for await (const chunk of decodeStream.readable) { 14 | chunks.push(chunk); 15 | } 16 | 17 | return chunks.join(''); 18 | } 19 | 20 | 21 | export async function readFile( 22 | filePath: string, 23 | encoding: string = 'utf-8', 24 | ): Promise { 25 | const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); 26 | Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream); 27 | const chunks = []; 28 | 29 | for await (const chunk of decodeStream.readable) { 30 | chunks.push(chunk); 31 | } 32 | 33 | return chunks.join(''); 34 
| } -------------------------------------------------------------------------------- /src/utils/get-function-url.ts: -------------------------------------------------------------------------------- 1 | import { GoogleAuth } from 'google-auth-library'; 2 | 3 | /** 4 | * Get the URL of a given v2 cloud function. 5 | * 6 | * @param {string} name the function's name 7 | * @param {string} location the function's location 8 | * @return {Promise} The URL of the function 9 | */ 10 | export async function getFunctionUrl(name: string, location = "us-central1") { 11 | const projectId = `reader-6b7dc`; 12 | const url = "https://cloudfunctions.googleapis.com/v2beta/" + 13 | `projects/${projectId}/locations/${location}/functions/${name}`; 14 | const auth = new GoogleAuth({ 15 | scopes: 'https://www.googleapis.com/auth/cloud-platform', 16 | }); 17 | const client = await auth.getClient(); 18 | const res = await client.request({ url }); 19 | const uri = res.data?.serviceConfig?.uri; 20 | if (!uri) { 21 | throw new Error(`Unable to retreive uri for function at ${url}`); 22 | } 23 | return uri; 24 | } 25 | -------------------------------------------------------------------------------- /src/utils/ip.ts: -------------------------------------------------------------------------------- 1 | import { isIPv4, isIPv6 } from 'net'; 2 | 3 | export function parseIp(ip: string): Buffer { 4 | if (isIPv4(ip)) { 5 | const [a, b, c, d] = ip.split('.').map(Number); 6 | 7 | const buf = Buffer.alloc(4); 8 | buf.writeUInt8(a, 0); 9 | buf.writeUInt8(b, 1); 10 | buf.writeUInt8(c, 2); 11 | buf.writeUInt8(d, 3); 12 | 13 | return buf; 14 | } 15 | 16 | if (isIPv6(ip)) { 17 | if (ip.includes('.')) { 18 | const parts = ip.split(':'); 19 | const ipv4Part = parts.pop(); 20 | if (!ipv4Part) throw new Error('Invalid IPv6 address'); 21 | const ipv4Bytes = parseIp(ipv4Part); 22 | parts.push('0'); 23 | const ipv6Bytes = parseIp(parts.join(':')); 24 | ipv6Bytes.writeUInt32BE(ipv4Bytes.readUInt32BE(0), 12); 25 | 26 | 
return ipv6Bytes; 27 | } 28 | 29 | const buf = Buffer.alloc(16); 30 | 31 | // Expand :: notation 32 | let expanded = ip; 33 | if (ip.includes('::')) { 34 | const sides = ip.split('::'); 35 | const left = sides[0] ? sides[0].split(':') : []; 36 | const right = sides[1] ? sides[1].split(':') : []; 37 | const middle = Array(8 - left.length - right.length).fill('0'); 38 | expanded = [...left, ...middle, ...right].join(':'); 39 | } 40 | 41 | // Convert to buffer 42 | const parts = expanded.split(':'); 43 | let offset = 0; 44 | for (const part of parts) { 45 | buf.writeUInt16BE(parseInt(part, 16), offset); 46 | offset += 2; 47 | } 48 | 49 | return buf; 50 | } 51 | 52 | throw new Error('Invalid IP address'); 53 | } 54 | 55 | 56 | export function parseCIDR(cidr: string): [Buffer, Buffer] { 57 | const [ip, prefixTxt] = cidr.split('/'); 58 | const buf = parseIp(ip); 59 | const maskBuf = Buffer.alloc(buf.byteLength, 0xff); 60 | const prefixBits = parseInt(prefixTxt); 61 | 62 | let offsetBits = 0; 63 | while (offsetBits < (buf.byteLength * 8)) { 64 | if (offsetBits <= (prefixBits - 8)) { 65 | offsetBits += 8; 66 | continue; 67 | } 68 | const bitsRemain = prefixBits - offsetBits; 69 | const byteOffset = Math.floor(offsetBits / 8); 70 | 71 | if (bitsRemain > 0) { 72 | const theByte = buf[byteOffset]; 73 | const mask = 0xff << (8 - bitsRemain); 74 | maskBuf[byteOffset] = mask; 75 | buf[byteOffset] = theByte & mask; 76 | 77 | offsetBits += 8; 78 | continue; 79 | }; 80 | buf[byteOffset] = 0; 81 | maskBuf[byteOffset] = 0; 82 | 83 | offsetBits += 8; 84 | } 85 | 86 | return [buf, maskBuf]; 87 | } 88 | 89 | export class CIDR { 90 | buff: Buffer; 91 | mask: Buffer; 92 | text: string; 93 | constructor(cidr: string) { 94 | this.text = cidr; 95 | [this.buff, this.mask] = parseCIDR(cidr); 96 | } 97 | 98 | toString() { 99 | return this.text; 100 | } 101 | 102 | get family() { 103 | return this.buff.byteLength === 4 ? 
4 : 6; 104 | } 105 | 106 | test(ip: string | Buffer): boolean { 107 | const parsedIp = typeof ip === 'string' ? parseIp(ip) : ip; 108 | 109 | if (parsedIp.byteLength !== this.buff.byteLength) { 110 | return false; 111 | } 112 | 113 | for (const i of Array(this.buff.byteLength).keys()) { 114 | const t = parsedIp[i]; 115 | const m = this.mask[i]; 116 | 117 | if (m === 0) { 118 | return true; 119 | } 120 | 121 | const r = this.buff[i]; 122 | if ((t & m) !== r) { 123 | return false; 124 | } 125 | } 126 | 127 | return true; 128 | } 129 | } 130 | 131 | const nonPublicNetworks4 = [ 132 | '10.0.0.0/8', 133 | '172.16.0.0/12', 134 | '192.168.0.0/16', 135 | 136 | '127.0.0.0/8', 137 | '255.255.255.255/32', 138 | '169.254.0.0/16', 139 | '224.0.0.0/4', 140 | 141 | '100.64.0.0/10', 142 | '0.0.0.0/32', 143 | ]; 144 | 145 | 146 | const nonPublicNetworks6 = [ 147 | 'fc00::/7', 148 | 'fe80::/10', 149 | 'ff00::/8', 150 | 151 | '::127.0.0.0/104', 152 | '::/128', 153 | ]; 154 | 155 | const nonPublicCIDRs = [...nonPublicNetworks4, ...nonPublicNetworks6].map(cidr => new CIDR(cidr)); 156 | 157 | export function isIPInNonPublicRange(ip: string) { 158 | const parsed = parseIp(ip); 159 | 160 | for (const cidr of nonPublicCIDRs) { 161 | if (cidr.test(parsed)) { 162 | return true; 163 | } 164 | } 165 | 166 | return false; 167 | } 168 | -------------------------------------------------------------------------------- /src/utils/markdown.ts: -------------------------------------------------------------------------------- 1 | 2 | export function tidyMarkdown(markdown: string): string { 3 | 4 | // Step 1: Handle complex broken links with text and optional images spread across multiple lines 5 | let normalizedMarkdown = markdown.replace(/\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => { 6 | // Remove internal new lines and excessive spaces within the text 7 | text = text.replace(/\s+/g, ' ').trim(); 8 | url = url.replace(/\s+/g, '').trim(); 9 | return `[${text}](${url})`; 10 | }); 
11 | 12 | normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => { 13 | // Normalize by removing excessive spaces and new lines 14 | text = text.replace(/\s+/g, ' ').trim(); 15 | alt = alt ? alt.replace(/\s+/g, ' ').trim() : ''; 16 | imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : ''; 17 | linkUrl = linkUrl.replace(/\s+/g, '').trim(); 18 | if (imgUrl) { 19 | return `[${text} ![${alt}](${imgUrl})](${linkUrl})`; 20 | } else { 21 | return `[${text}](${linkUrl})`; 22 | } 23 | }); 24 | 25 | // Step 2: Normalize regular links that may be broken across lines 26 | normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => { 27 | text = text.replace(/\s+/g, ' ').trim(); 28 | url = url.replace(/\s+/g, '').trim(); 29 | return `[${text}](${url})`; 30 | }); 31 | 32 | // Step 3: Replace more than two consecutive empty lines with exactly two empty lines 33 | normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n'); 34 | 35 | // Step 4: Remove leading spaces from each line 36 | normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, ''); 37 | 38 | return normalizedMarkdown.trim(); 39 | } 40 | -------------------------------------------------------------------------------- /src/utils/misc.ts: -------------------------------------------------------------------------------- 1 | import { ParamValidationError } from 'civkit'; 2 | 3 | export function cleanAttribute(attribute: string | null) { 4 | return attribute ? 
import { ParamValidationError } from 'civkit';

/**
 * Collapse every run of newlines (and trailing whitespace) inside an HTML
 * attribute value down to a single newline; null/empty becomes ''.
 */
export function cleanAttribute(attribute: string | null) {
    if (!attribute) {
        return '';
    }

    return attribute.replace(/(\n+\s*)+/g, '\n');
}


/**
 * decodeURIComponent that tolerates inputs which are not valid
 * percent-encodings: if the raw input still parses as a (relative) URL it is
 * returned unchanged.
 *
 * @throws ParamValidationError when the input neither decodes nor parses.
 */
export function tryDecodeURIComponent(input: string) {
    try {
        return decodeURIComponent(input);
    } catch (err) {
        // Malformed escape sequence — accept the raw value when it is at
        // least URL-parseable against a dummy base.
        if (URL.canParse(input, 'http://localhost:3000')) {
            return input;
        }

        throw new ParamValidationError(`Invalid URIComponent: ${input}`);
    }
}


/** Wrap a single value into a one-shot async generator. */
export async function* toAsyncGenerator<T>(val: T) {
    yield val;
}

/**
 * Wrap a single value into a one-shot async generator.
 * NOTE(review): identical to toAsyncGenerator despite the name suggesting a
 * sync generator — confirm whether callers depend on the async form before
 * consolidating the two.
 */
export async function* toGenerator<T>(val: T) {
    yield val;
}
{
  "compilerOptions": {
    "module": "node16",

    "noImplicitReturns": true,
    "noUnusedLocals": true,
    "outDir": "build",
    "sourceMap": true,
    "strict": true,
    "allowJs": true,
    "target": "es2022",
    "lib": ["es2022"],
    "skipLibCheck": true,
    "useDefineForClassFields": false,
    "experimentalDecorators": true,
    "emitDecoratorMetadata": true,
    "esModuleInterop": true,
    "noImplicitOverride": true,
  },
  "compileOnSave": true,
  "include": ["src"]
}