├── .github └── workflows │ ├── .keep │ └── cd.yml ├── .gitignore ├── .gitmodules ├── .vscode ├── exensions.json ├── launch.json ├── settings.json └── tasks.json ├── Dockerfile ├── LICENSE ├── README.md ├── integrity-check.cjs ├── package-lock.json ├── package.json ├── public ├── favicon.ico └── robots.txt ├── src ├── api │ ├── crawler.ts │ ├── searcher.ts │ └── serp.ts ├── cloud-functions │ ├── adaptive-crawler.ts │ └── data-crunching.ts ├── db │ ├── adaptive-crawl-task.ts │ ├── crawled.ts │ ├── domain-blockade.ts │ ├── domain-profile.ts │ ├── img-alt.ts │ ├── pdf.ts │ └── searched.ts ├── dto │ ├── adaptive-crawler-options.ts │ ├── crawler-options.ts │ ├── jina-embeddings-auth.ts │ └── turndown-tweakable-options.ts ├── fetch.d.ts ├── lib │ └── transform-server-event-stream.ts ├── services │ ├── alt-text.ts │ ├── async-context.ts │ ├── blackhole-detector.ts │ ├── brave-search.ts │ ├── canvas.ts │ ├── cf-browser-rendering.ts │ ├── curl.ts │ ├── errors.ts │ ├── finalizer.ts │ ├── geoip.ts │ ├── jsdom.ts │ ├── lm.ts │ ├── logger.ts │ ├── minimal-stealth.js │ ├── misc.ts │ ├── pdf-extract.ts │ ├── pseudo-transfer.ts │ ├── puppeteer.ts │ ├── registry.ts │ ├── robots-text.ts │ ├── serp │ │ ├── compat.ts │ │ ├── google.ts │ │ ├── internal.ts │ │ ├── puppeteer.ts │ │ └── serper.ts │ ├── serper-search.ts │ ├── snapshot-formatter.ts │ ├── temp-file.ts │ └── threaded.ts ├── shared ├── stand-alone │ ├── crawl.ts │ ├── search.ts │ └── serp.ts ├── types.d.ts └── utils │ ├── encoding.ts │ ├── get-function-url.ts │ ├── ip.ts │ ├── markdown.ts │ ├── misc.ts │ └── tailwind-classes.ts └── tsconfig.json /.github/workflows/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/reader/5f07900eabe07b1dd0e8d09e6c8ea022e6b2c176/.github/workflows/.keep -------------------------------------------------------------------------------- /.github/workflows/cd.yml: 
-------------------------------------------------------------------------------- 1 | run-name: Build push and deploy (CD) 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - ci-debug 7 | - dev 8 | tags: 9 | - '*' 10 | 11 | jobs: 12 | build-and-push-to-gcr: 13 | runs-on: ubuntu-latest 14 | concurrency: 15 | group: ${{ github.ref_type == 'branch' && github.ref }} 16 | cancel-in-progress: true 17 | permissions: 18 | contents: read 19 | steps: 20 | - uses: actions/checkout@v4 21 | with: 22 | lfs: true 23 | submodules: true 24 | token: ${{ secrets.THINAPPS_SHARED_READ_TOKEN }} 25 | - uses: 'google-github-actions/auth@v2' 26 | with: 27 | credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}' 28 | - name: 'Set up Cloud SDK' 29 | uses: 'google-github-actions/setup-gcloud@v2' 30 | with: 31 | install_components: beta 32 | - name: "Docker auth" 33 | run: |- 34 | gcloud auth configure-docker us-docker.pkg.dev --quiet 35 | - name: Set controller release version 36 | run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 37 | - name: Set up Node.js 38 | uses: actions/setup-node@v4 39 | with: 40 | node-version: 22.12.0 41 | cache: npm 42 | 43 | - name: npm install 44 | run: npm ci 45 | - name: get maxmind mmdb 46 | run: mkdir -p licensed && curl -o licensed/GeoLite2-City.mmdb https://raw.githubusercontent.com/P3TERX/GeoLite.mmdb/download/GeoLite2-City.mmdb 47 | - name: get source han sans font 48 | run: curl -o licensed/SourceHanSansSC-Regular.otf https://raw.githubusercontent.com/adobe-fonts/source-han-sans/refs/heads/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf 49 | - name: build application 50 | run: npm run build 51 | - name: Set package version 52 | run: npm version --no-git-tag-version ${{ env.RELEASE_VERSION }} 53 | if: github.ref_type == 'tag' 54 | - name: Docker meta 55 | id: meta 56 | uses: docker/metadata-action@v5 57 | with: 58 | images: | 59 | us-docker.pkg.dev/reader-6b7dc/jina-reader/reader 60 | - name: Set up QEMU 61 | uses: 
docker/setup-qemu-action@v3 62 | - name: Set up Docker Buildx 63 | uses: docker/setup-buildx-action@v3 64 | - name: Build and push 65 | id: container 66 | uses: docker/build-push-action@v6 67 | with: 68 | context: . 69 | push: true 70 | tags: ${{ steps.meta.outputs.tags }} 71 | labels: ${{ steps.meta.outputs.labels }} 72 | - name: Deploy CRAWL with Tag 73 | run: | 74 | gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 75 | - name: Deploy SEARCH with Tag 76 | run: | 77 | gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 78 | - name: Deploy SERP with Tag 79 | run: | 80 | gcloud beta run deploy serp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 81 | - name: Deploy CRAWL-EU with Tag 82 | run: | 83 | gcloud beta run deploy crawl-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2 84 | - name: Deploy SEARCH-EU with Tag 85 | run: | 86 | gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 
--deploy-health-check --use-http2 87 | - name: Deploy SERP-HK with Tag 88 | run: | 89 | gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | firebase-debug.log* 8 | firebase-debug.*.log* 9 | 10 | # Firebase cache 11 | .firebase/ 12 | 13 | # Firebase config 14 | 15 | # Uncomment this if you'd like others to create their own Firebase project. 16 | # For a team working on the same Firebase project(s), it is recommended to leave 17 | # it commented so all members can deploy to the same project(s) in .firebaserc. 
18 | # .firebaserc 19 | 20 | # Runtime data 21 | pids 22 | *.pid 23 | *.seed 24 | *.pid.lock 25 | 26 | # Directory for instrumented libs generated by jscoverage/JSCover 27 | lib-cov 28 | 29 | # Coverage directory used by tools like istanbul 30 | coverage 31 | 32 | # nyc test coverage 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 36 | .grunt 37 | 38 | # Bower dependency directory (https://bower.io/) 39 | bower_components 40 | 41 | # node-waf configuration 42 | .lock-wscript 43 | 44 | # Compiled binary addons (http://nodejs.org/api/addons.html) 45 | build/Release 46 | 47 | # Dependency directories 48 | node_modules/ 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | .node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | .secret.local 68 | 69 | toy*.ts 70 | 71 | .DS_Store 72 | build/ 73 | .firebase-emu/ 74 | *.log 75 | .DS_Store 76 | 77 | *.local 78 | .secret.* 79 | licensed/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thinapps-shared"] 2 | path = thinapps-shared 3 | url = git@github.com:jina-ai/thinapps-shared.git 4 | -------------------------------------------------------------------------------- /.vscode/exensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "editorconfig.editorconfig", 4 | "octref.vetur", 5 | "redhat.vscode-yaml", 6 | "dbaeumer.vscode-eslint", 7 | "esbenp.prettier-vscode", 8 | "streetsidesoftware.code-spell-checker" 9 | ] 10 | } -------------------------------------------------------------------------------- /.vscode/launch.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach", 6 | "port": 9229, 7 | "request": "attach", 8 | "skipFiles": [ 9 | "/**" 10 | ], 11 | "type": "node" 12 | }, 13 | { 14 | "name": "Attach by Process ID", 15 | "processId": "${command:PickProcess}", 16 | "request": "attach", 17 | "skipFiles": [ 18 | "/**" 19 | ], 20 | "type": "node" 21 | }, 22 | { 23 | "name": "Debug Stand Alone Crawl", 24 | "request": "launch", 25 | "runtimeArgs": [ 26 | "--env-file=.secret.local", 27 | ], 28 | "env": { 29 | "GCLOUD_PROJECT": "reader-6b7dc", 30 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 31 | }, 32 | "cwd": "${workspaceFolder}", 33 | "program": "build/stand-alone/crawl.js", 34 | "skipFiles": [ 35 | "/**" 36 | ], 37 | "type": "node", 38 | "outputCapture": "std", 39 | "preLaunchTask": "Backend:build:watch", 40 | "killBehavior": "forceful" 41 | }, 42 | { 43 | "name": "Debug Stand Alone Crawl + Browser", 44 | "request": "launch", 45 | "runtimeArgs": [ 46 | "--env-file=.secret.local", 47 | ], 48 | "env": { 49 | "GCLOUD_PROJECT": "reader-6b7dc", 50 | "DEBUG_BROWSER": "true", 51 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 52 | }, 53 | "cwd": "${workspaceFolder}", 54 | "program": "build/stand-alone/crawl.js", 55 | "skipFiles": [ 56 | "/**" 57 | ], 58 | "type": "node", 59 | "outputCapture": "std", 60 | "preLaunchTask": "Backend:build:watch", 61 | "killBehavior": "forceful" 62 | }, 63 | { 64 | "name": "Debug Stand Alone Crawl - EU", 65 | "request": "launch", 66 | "runtimeArgs": [ 67 | "--env-file=.secret.local", 68 | ], 69 | "env": { 70 | "GCLOUD_PROJECT": "reader-6b7dc", 71 | "FIRESTORE_DATABASE": "reader-eu", 72 | "GCP_STORAGE_BUCKET": "reader-eu", 73 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 74 | }, 75 | "cwd": "${workspaceFolder}", 76 | "program": "build/stand-alone/crawl.js", 77 | "skipFiles": [ 78 | "/**" 79 | 
], 80 | "type": "node", 81 | "outputCapture": "std", 82 | "preLaunchTask": "Backend:build:watch", 83 | "killBehavior": "forceful" 84 | }, 85 | { 86 | "name": "Debug Stand Alone Search", 87 | "request": "launch", 88 | "runtimeArgs": [ 89 | "--env-file=.secret.local", 90 | ], 91 | "env": { 92 | "GCLOUD_PROJECT": "reader-6b7dc", 93 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 94 | }, 95 | "cwd": "${workspaceFolder}", 96 | "program": "build/stand-alone/search.js", 97 | "skipFiles": [ 98 | "/**" 99 | ], 100 | "type": "node", 101 | "outputCapture": "std", 102 | "preLaunchTask": "Backend:build:watch", 103 | "killBehavior": "forceful" 104 | }, 105 | { 106 | "name": "Debug Stand Alone SERP", 107 | "request": "launch", 108 | "runtimeArgs": [ 109 | "--env-file=.secret.local", 110 | ], 111 | "env": { 112 | "GCLOUD_PROJECT": "reader-6b7dc", 113 | "PREFERRED_PROXY_COUNTRY": "hk", 114 | "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk", 115 | "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" 116 | }, 117 | "cwd": "${workspaceFolder}", 118 | "program": "build/stand-alone/serp.js", 119 | "skipFiles": [ 120 | "/**" 121 | ], 122 | "type": "node", 123 | "outputCapture": "std", 124 | "preLaunchTask": "Backend:build:watch", 125 | "killBehavior": "forceful" 126 | }, 127 | ] 128 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.wordWrap": "on", 3 | "editor.wordWrapColumn": 120, 4 | "files.trimTrailingWhitespace": true, 5 | "files.trimFinalNewlines": true, 6 | "[javascript]": { 7 | "editor.defaultFormatter": "vscode.typescript-language-features" 8 | }, 9 | "[jsonc]": { 10 | "editor.defaultFormatter": "vscode.json-language-features" 11 | }, 12 | "[typescript]": { 13 | "editor.defaultFormatter": "vscode.typescript-language-features" 14 | }, 15 | "[json]": { 16 | "editor.defaultFormatter": 
"vscode.json-language-features" 17 | }, 18 | "[yaml]": { 19 | "editor.defaultFormatter": "redhat.vscode-yaml" 20 | }, 21 | "[markdown]": { 22 | "files.trimTrailingWhitespace": false 23 | }, 24 | "typescript.tsdk": "node_modules/typescript/lib", 25 | "typescript.preferences.quoteStyle": "single", 26 | "typescript.format.semicolons": "insert", 27 | "typescript.preferences.importModuleSpecifier": "project-relative", 28 | "typescript.locale": "en", 29 | "cSpell.enabled": true, 30 | "cSpell.words": [ 31 | ], 32 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "npm", 6 | "script": "build", 7 | "group": "build", 8 | "options": { 9 | "cwd": "${workspaceFolder}" 10 | }, 11 | "problemMatcher": [], 12 | "label": "Backend:rebuild", 13 | "detail": "Backend:rebuild" 14 | }, 15 | { 16 | "type": "typescript", 17 | "options": { 18 | "cwd": "${workspaceFolder}" 19 | }, 20 | "tsconfig": "tsconfig.json", 21 | "option": "watch", 22 | "isBackground": true, 23 | "problemMatcher": [ 24 | "$tsc-watch" 25 | ], 26 | "group": "build", 27 | "label": "Backend:build:watch" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | FROM lwthiker/curl-impersonate:0.6-chrome-slim-bullseye 3 | 4 | FROM node:22 5 | 6 | RUN apt-get update \ 7 | && apt-get install -y wget gnupg \ 8 | && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ 9 | && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ 10 | && apt-get update \ 11 | && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst 
fonts-freefont-ttf libxss1 zstd \ 12 | --no-install-recommends \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | COPY --from=0 /usr/local/lib/libcurl-impersonate.so /usr/local/lib/libcurl-impersonate.so 16 | 17 | RUN groupadd -r jina 18 | RUN useradd -g jina -G audio,video -m jina 19 | USER jina 20 | 21 | WORKDIR /app 22 | 23 | COPY package.json package-lock.json ./ 24 | RUN npm ci 25 | 26 | COPY build ./build 27 | COPY public ./public 28 | COPY licensed ./licensed 29 | 30 | RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium 31 | 32 | RUN NODE_COMPILE_CACHE=node_modules npm run dry-run 33 | 34 | ENV OVERRIDE_CHROME_EXECUTABLE_PATH=/usr/bin/google-chrome-stable 35 | ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no 36 | ENV NODE_COMPILE_CACHE=node_modules 37 | ENV PORT=8080 38 | 39 | EXPOSE 3000 3001 8080 8081 40 | ENTRYPOINT ["node"] 41 | CMD [ "build/stand-alone/crawl.js" ] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020-2024 Jina AI Limited. All rights reserved. 2 | 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. 
For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 
50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. 
Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. 
You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 
125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. 
In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | 181 | Copyright 2020-2021 Jina AI Limited 182 | 183 | Licensed under the Apache License, Version 2.0 (the "License"); 184 | you may not use this file except in compliance with the License. 
185 | You may obtain a copy of the License at 186 | 187 | http://www.apache.org/licenses/LICENSE-2.0 188 | 189 | Unless required by applicable law or agreed to in writing, software 190 | distributed under the License is distributed on an "AS IS" BASIS, 191 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 192 | See the License for the specific language governing permissions and 193 | limitations under the License. 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reader 2 | 3 | Your LLMs deserve better input. 4 | 5 | Reader does two things: 6 | - **Read**: It converts any URL to an **LLM-friendly** input with `https://r.jina.ai/https://your.url`. Get improved output for your agent and RAG systems at no cost. 7 | - **Search**: It searches the web for a given query with `https://s.jina.ai/your+query`. This allows your LLMs to access the latest world knowledge from the web. 8 | 9 | Check out [the live demo](https://jina.ai/reader#demo) 10 | 11 | Or just visit these URLs (**Read**) https://r.jina.ai/https://github.com/jina-ai/reader, (**Search**) https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F and see yourself. 12 | 13 | > Feel free to use Reader API in production. It is free, stable and scalable. We are maintaining it actively as one of the core products of Jina AI. [Check out rate limit](https://jina.ai/reader#pricing) 14 | 15 | image 16 | image 17 | 18 | 19 | ## Updates 20 | 21 | - **2024-07-15**: To restrict the results of `s.jina.ai` to certain domain/website, you can set e.g. `site=jina.ai` in the query parameters, which enables in-site search. For more options, [try our updated live-demo](https://jina.ai/reader/#apiform). 22 | - **2024-05-30**: Reader can now read arbitrary PDF from any URL!
Check out [this PDF result from NASA.gov](https://r.jina.ai/https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf) vs [the original](https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf). 23 | - **2024-05-15**: We introduced a new endpoint `s.jina.ai` that searches on the web and return top-5 results, each in a LLM-friendly format. [Read more about this new feature here](https://jina.ai/news/jina-reader-for-search-grounding-to-improve-factuality-of-llms). 24 | - **2024-05-08**: Image caption is off by default for better latency. To turn it on, set `x-with-generated-alt: true` in the request header. 25 | - **2024-04-24**: You now have more fine-grained control over Reader API [using headers](#using-request-headers), e.g. forwarding cookies, using HTTP proxy. 26 | - **2024-04-15**: Reader now supports image reading! It captions all images at the specified URL and adds `Image [idx]: [caption]` as an alt tag (if they initially lack one). This enables downstream LLMs to interact with the images in reasoning, summarizing etc. [See example here](https://x.com/JinaAI_/status/1780094402071023926). 27 | 28 | ## Usage 29 | 30 | ### Using `r.jina.ai` for single URL fetching 31 | Simply prepend `https://r.jina.ai/` to any URL. For example, to convert the URL `https://en.wikipedia.org/wiki/Artificial_intelligence` to an LLM-friendly input, use the following URL: 32 | 33 | [https://r.jina.ai/https://en.wikipedia.org/wiki/Artificial_intelligence](https://r.jina.ai/https://en.wikipedia.org/wiki/Artificial_intelligence) 34 | 35 | ### [Using `r.jina.ai` for a full website fetching (Google Colab)](https://colab.research.google.com/drive/1uoBy6_7BhxqpFQ45vuhgDDDGwstaCt4P#scrollTo=5LQjzJiT9ewT) 36 | 37 | ### Using `s.jina.ai` for web search 38 | Simply prepend `https://s.jina.ai/` to your search query. Note that if you are using this in the code, make sure to encode your search query first, e.g. 
if your query is `Who will win 2024 US presidential election?` then your url should look like: 39 | 40 | [https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F](https://s.jina.ai/Who%20will%20win%202024%20US%20presidential%20election%3F) 41 | 42 | Behind the scenes, Reader searches the web, fetches the top 5 results, visits each URL, and applies `r.jina.ai` to it. This is different from many `web search function-calling` in agent/RAG frameworks, which often return only the title, URL, and description provided by the search engine API. If you want to read one result more deeply, you have to fetch the content yourself from that URL. With Reader, `http://s.jina.ai` automatically fetches the content from the top 5 search result URLs for you (reusing the tech stack behind `http://r.jina.ai`). This means you don't have to handle browser rendering, blocking, or any issues related to JavaScript and CSS yourself. 43 | 44 | ### Using `s.jina.ai` for in-site search 45 | Simply specify `site` in the query parameters such as: 46 | 47 | ```bash 48 | curl 'https://s.jina.ai/When%20was%20Jina%20AI%20founded%3F?site=jina.ai&site=github.com' 49 | ``` 50 | 51 | ### [Interactive Code Snippet Builder](https://jina.ai/reader#apiform) 52 | 53 | We highly recommend using the code builder to explore different parameter combinations of the Reader API. 54 | 55 | image 56 | 57 | 58 | ### Using request headers 59 | 60 | As you have already seen above, one can control the behavior of the Reader API using request headers. Here is a complete list of supported headers. 61 | 62 | - You can enable the image caption feature via the `x-with-generated-alt: true` header. 63 | - You can ask the Reader API to forward cookies settings via the `x-set-cookie` header. 64 | - Note that requests with cookies will not be cached. 
65 | - You can bypass `readability` filtering via the `x-respond-with` header, specifically: 66 | - `x-respond-with: markdown` returns markdown *without* going through `reability` 67 | - `x-respond-with: html` returns `documentElement.outerHTML` 68 | - `x-respond-with: text` returns `document.body.innerText` 69 | - `x-respond-with: screenshot` returns the URL of the webpage's screenshot 70 | - You can specify a proxy server via the `x-proxy-url` header. 71 | - You can customize cache tolerance via the `x-cache-tolerance` header (integer in seconds). 72 | - You can bypass the cached page (lifetime 3600s) via the `x-no-cache: true` header (equivalent of `x-cache-tolerance: 0`). 73 | - If you already know the HTML structure of your target page, you may specify `x-target-selector` or `x-wait-for-selector` to direct the Reader API to focus on a specific part of the page. 74 | - By setting `x-target-selector` header to a CSS selector, the Reader API return the content within the matched element, instead of the full HTML. Setting this header is useful when the automatic content extraction fails to capture the desired content and you can manually select the correct target. 75 | - By setting `x-wait-for-selector` header to a CSS selector, the Reader API will wait until the matched element is rendered before returning the content. If you already specified `x-wait-for-selector`, this header can be omitted if you plan to wait for the same element. 76 | 77 | ### Using `r.jina.ai` for single page application (SPA) fetching 78 | Many websites nowadays rely on JavaScript frameworks and client-side rendering. Usually known as Single Page Application (SPA). Thanks to [Puppeteer](https://github.com/puppeteer/puppeteer) and headless Chrome browser, Reader natively supports fetching these websites. However, due to specific approach some SPA are developed, there may be some extra precautions to take. 
79 | 80 | #### SPAs with hash-based routing 81 | By definition of the web standards, content come after `#` in a URL is not sent to the server. To mitigate this issue, use `POST` method with `url` parameter in body. 82 | 83 | ```bash 84 | curl -X POST 'https://r.jina.ai/' -d 'url=https://example.com/#/route' 85 | ``` 86 | 87 | #### SPAs with preloading contents 88 | Some SPAs, or even some websites that are not strictly SPAs, may show preload contents before later loading the main content dynamically. In this case, Reader may be capturing the preload content instead of the main content. To mitigate this issue, here are some possible solutions: 89 | 90 | ##### Specifying `x-timeout` 91 | When timeout is explicitly specified, Reader will not attempt to return early and will wait for network idle until the timeout is reached. This is useful when the target website will eventually come to a network idle. 92 | 93 | ```bash 94 | curl 'https://example.com/' -H 'x-timeout: 30' 95 | ``` 96 | 97 | ##### Specifying `x-wait-for-selector` 98 | When wait-for-selector is explicitly specified, Reader will wait for the appearance of the specified CSS selector until timeout is reached. This is useful when you know exactly what element to wait for. 99 | 100 | ```bash 101 | curl 'https://example.com/' -H 'x-wait-for-selector: #content' 102 | ``` 103 | 104 | ### Streaming mode 105 | 106 | Streaming mode is useful when you find that the standard mode provides an incomplete result. This is because the Reader will wait a bit longer until the page is *stablely* rendered. Use the accept-header to toggle the streaming mode: 107 | 108 | ```bash 109 | curl -H "Accept: text/event-stream" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page 110 | ``` 111 | 112 | The data comes in a stream; each subsequent chunk contains more complete information. 
**The last chunk should provide the most complete and final result.** If you come from LLMs, please note that it is a different behavior than the LLMs' text-generation streaming. 113 | 114 | For example, compare these two curl commands below. You can see streaming one gives you complete information at last, whereas standard mode does not. This is because the content loading on this particular site is triggered by some js *after* the page is fully loaded, and standard mode returns the page "too soon". 115 | ```bash 116 | curl -H 'x-no-cache: true' https://access.redhat.com/security/cve/CVE-2023-45853 117 | curl -H "Accept: text/event-stream" -H 'x-no-cache: true' https://r.jina.ai/https://access.redhat.com/security/cve/CVE-2023-45853 118 | ``` 119 | 120 | > Note: `-H 'x-no-cache: true'` is used only for demonstration purposes to bypass the cache. 121 | 122 | Streaming mode is also useful if your downstream LLM/agent system requires immediate content delivery or needs to process data in chunks to interleave I/O and LLM processing times. This allows for quicker access and more efficient data handling: 123 | 124 | ```text 125 | Reader API: streamContent1 ----> streamContent2 ----> streamContent3 ---> ... 126 | | | | 127 | v | | 128 | Your LLM: LLM(streamContent1) | | 129 | v | 130 | LLM(streamContent2) | 131 | v 132 | LLM(streamContent3) 133 | ``` 134 | 135 | Note that in terms of completeness: `... > streamContent3 > streamContent2 > streamContent1`, each subsequent chunk contains more complete information. 136 | 137 | ### JSON mode 138 | 139 | This is still very early and the result is not really a "useful" JSON. It contains three fields `url`, `title` and `content` only. Nonetheless, you can use accept-header to control the output format: 140 | ```bash 141 | curl -H "Accept: application/json" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page 142 | ``` 143 | 144 | JSON mode is probably more useful in `s.jina.ai` than `r.jina.ai`. 
For `s.jina.ai` with JSON mode, it returns 5 results in a list, each in the structure of `{'title', 'content', 'url'}`. 145 | 146 | ### Generated alt 147 | 148 | All images in that page that lack `alt` tag can be auto-captioned by a VLM (vision langauge model) and formatted as `!(Image [idx]: [VLM_caption])[img_URL]`. This should give your downstream text-only LLM *just enough* hints to include those images into reasoning, selecting, and summarization. Use the x-with-generated-alt header to toggle the streaming mode: 149 | 150 | ```bash 151 | curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page 152 | ``` 153 | 154 | ## How it works 155 | [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/jina-ai/reader) 156 | 157 | ## What is `thinapps-shared` submodule? 158 | 159 | You might notice a reference to `thinapps-shared` submodule, an internal package we use to share code across our products. While it’s not open-sourced and isn't integral to the Reader's functions, it mainly helps with decorators, logging, secrets management, etc. Feel free to ignore it for now. 160 | 161 | That said, this is *the single codebase* behind `https://r.jina.ai`, so everytime we commit here, we will deploy the new version to the `https://r.jina.ai`. 162 | 163 | ## Having trouble on some websites? 164 | Please raise an issue with the URL you are having trouble with. We will look into it and try to fix it. 165 | 166 | ## License 167 | Reader is backed by [Jina AI](https://jina.ai) and licensed under [Apache-2.0](./LICENSE). 
168 | -------------------------------------------------------------------------------- /integrity-check.cjs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs'); 4 | const path = require('path'); 5 | 6 | const file = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb'); 7 | 8 | if (!fs.existsSync(file)) { 9 | console.error(`Integrity check failed: ${file} does not exist.`); 10 | process.exit(1); 11 | } 12 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reader", 3 | "scripts": { 4 | "lint": "eslint --ext .js,.ts .", 5 | "build": "node ./integrity-check.cjs && tsc -p .", 6 | "build:watch": "tsc --watch", 7 | "build:clean": "rm -rf ./build", 8 | "serve": "npm run build && npm run start", 9 | "debug": "npm run build && npm run dev", 10 | "start": "node ./build/stand-alone/crawl.js", 11 | "dry-run": "NODE_ENV=dry-run node ./build/stand-alone/search.js" 12 | }, 13 | "engines": { 14 | "node": ">=18" 15 | }, 16 | "main": "build/index.js", 17 | "dependencies": { 18 | "@esm2cjs/normalize-url": "^8.0.0", 19 | "@google-cloud/translate": "^8.2.0", 20 | "@koa/bodyparser": "^5.1.1", 21 | "@mozilla/readability": "^0.6.0", 22 | "@napi-rs/canvas": "^0.1.68", 23 | "@types/turndown": "^5.0.4", 24 | "@xmldom/xmldom": "^0.9.3", 25 | "archiver": "^6.0.1", 26 | "axios": "^1.3.3", 27 | "bcrypt": "^5.1.0", 28 | "busboy": "^1.6.0", 29 | "civkit": "^0.9.0-2570394", 30 | "cors": "^2.8.5", 31 | "dayjs": "^1.11.9", 32 | "express": "^4.19.2", 33 | "firebase-admin": "^12.1.0", 34 | "firebase-functions": "^6.1.1", 35 | "htmlparser2": "^9.0.0", 36 | "jose": "^5.1.0", 37 | "koa": "^2.16.0", 38 | "koa-compress": "^5.1.1", 39 | "langdetect": "^0.2.1", 40 | "linkedom": "^0.18.4", 41 | "lru-cache": "^11.0.2", 42 | "maxmind": "^4.3.18", 43 | "minio": "^7.1.3", 44 | 
"node-libcurl": "^4.1.0", 45 | "openai": "^4.20.0", 46 | "pdfjs-dist": "^4.10.38", 47 | "puppeteer": "^23.3.0", 48 | "puppeteer-extra": "^3.3.6", 49 | "puppeteer-extra-plugin-block-resources": "^2.4.3", 50 | "robots-parser": "^3.0.1", 51 | "set-cookie-parser": "^2.6.0", 52 | "simple-zstd": "^1.4.2", 53 | "stripe": "^11.11.0", 54 | "svg2png-wasm": "^1.4.1", 55 | "tiktoken": "^1.0.16", 56 | "tld-extract": "^2.1.0", 57 | "turndown": "^7.1.3", 58 | "turndown-plugin-gfm": "^1.0.2", 59 | "undici": "^7.8.0" 60 | }, 61 | "devDependencies": { 62 | "@types/archiver": "^5.3.4", 63 | "@types/bcrypt": "^5.0.0", 64 | "@types/busboy": "^1.5.4", 65 | "@types/cors": "^2.8.17", 66 | "@types/koa": "^2.15.0", 67 | "@types/koa-compress": "^4.0.6", 68 | "@types/node": "^20.14.13", 69 | "@types/set-cookie-parser": "^2.4.7", 70 | "@types/xmldom": "^0.1.34", 71 | "@typescript-eslint/eslint-plugin": "^5.12.0", 72 | "@typescript-eslint/parser": "^5.12.0", 73 | "eslint": "^8.9.0", 74 | "eslint-config-google": "^0.14.0", 75 | "eslint-plugin-import": "^2.25.4", 76 | "firebase-functions-test": "^3.0.0", 77 | "pino-pretty": "^13.0.0", 78 | "replicate": "^0.16.1", 79 | "typescript": "^5.5.4" 80 | }, 81 | "private": true, 82 | "exports": { 83 | ".": "./build/index.js" 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/reader/5f07900eabe07b1dd0e8d09e6c8ea022e6b2c176/public/favicon.ico -------------------------------------------------------------------------------- /public/robots.txt: -------------------------------------------------------------------------------- 1 | User-Agent: * 2 | Disallow: / 3 | -------------------------------------------------------------------------------- /src/cloud-functions/data-crunching.ts: -------------------------------------------------------------------------------- 1 | import 
{ 2 | Defer, 3 | PromiseThrottle, 4 | RPCHost, 5 | } from 'civkit'; 6 | import { singleton } from 'tsyringe'; 7 | import { 8 | // CloudScheduleV2, CloudTaskV2, 9 | FirebaseStorageBucketControl, Logger, Param, TempFileManager 10 | } from '../shared'; 11 | import _ from 'lodash'; 12 | import { CrawlerHost } from '../api/crawler'; 13 | 14 | import { Crawled } from '../db/crawled'; 15 | import dayjs from 'dayjs'; 16 | import { createReadStream } from 'fs'; 17 | import { appendFile } from 'fs/promises'; 18 | import { createGzip } from 'zlib'; 19 | import { getFunctions } from 'firebase-admin/functions'; 20 | import { SnapshotFormatter } from '../services/snapshot-formatter'; 21 | import { getFunctionUrl } from '../utils/get-function-url'; 22 | 23 | dayjs.extend(require('dayjs/plugin/utc')); 24 | 25 | @singleton() 26 | export class DataCrunchingHost extends RPCHost { 27 | logger = this.globalLogger.child({ service: this.constructor.name }); 28 | 29 | pageCacheCrunchingPrefix = 'crunched-pages'; 30 | pageCacheCrunchingBatchSize = 5000; 31 | pageCacheCrunchingTMinus = 6 * 24 * 60 * 60 * 1000; 32 | rev = 7; 33 | 34 | constructor( 35 | protected globalLogger: Logger, 36 | 37 | protected crawler: CrawlerHost, 38 | protected snapshotFormatter: SnapshotFormatter, 39 | protected tempFileManager: TempFileManager, 40 | protected firebaseObjectStorage: FirebaseStorageBucketControl, 41 | ) { 42 | super(..._.without(arguments, crawler)); 43 | } 44 | 45 | override async init() { 46 | await this.dependencyReady(); 47 | 48 | this.emit('ready'); 49 | } 50 | 51 | // @CloudTaskV2({ 52 | // runtime: { 53 | // cpu: 2, 54 | // memory: '4GiB', 55 | // timeoutSeconds: 3600, 56 | // concurrency: 2, 57 | // maxInstances: 200, 58 | // retryConfig: { 59 | // maxAttempts: 3, 60 | // minBackoffSeconds: 60, 61 | // }, 62 | // rateLimits: { 63 | // maxConcurrentDispatches: 150, 64 | // maxDispatchesPerSecond: 2, 65 | // }, 66 | // }, 67 | // tags: ['DataCrunching'], 68 | // }) 69 | async 
crunchPageCacheWorker( 70 | @Param('date') date: string, 71 | @Param('offset', { default: 0 }) offset: number 72 | ) { 73 | this.logger.info(`Crunching page cache @${date}+${offset}...`); 74 | for await (const { fileName, records } of this.iterPageCacheRecords(date, offset)) { 75 | this.logger.info(`Crunching ${fileName}...`); 76 | const fileOnDrive = await this.crunchCacheRecords(records); 77 | const fstream = createReadStream(fileOnDrive.path); 78 | const gzipStream = createGzip(); 79 | fstream.pipe(gzipStream, { end: true }); 80 | await this.firebaseObjectStorage.bucket.file(fileName).save(gzipStream, { 81 | contentType: 'application/jsonl+gzip', 82 | }); 83 | } 84 | 85 | this.logger.info(`Crunching page cache @${date}+${offset} done.`); 86 | 87 | return true; 88 | } 89 | 90 | // @CloudScheduleV2('2 0 * * *', { 91 | // name: 'crunchPageCacheEveryday', 92 | // runtime: { 93 | // cpu: 2, 94 | // memory: '4GiB', 95 | // timeoutSeconds: 1800, 96 | // timeZone: 'UTC', 97 | // retryCount: 3, 98 | // minBackoffSeconds: 60, 99 | // }, 100 | // tags: ['DataCrunching'], 101 | // }) 102 | async dispatchPageCacheCrunching() { 103 | for await (const { fileName, date, offset } of this.iterPageCacheChunks()) { 104 | this.logger.info(`Dispatching ${fileName}...`); 105 | // sse.write({ data: `Dispatching ${fileName}...` }); 106 | 107 | await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, { 108 | dispatchDeadlineSeconds: 1800, 109 | uri: await getFunctionUrl('crunchPageCacheWorker'), 110 | }); 111 | } 112 | 113 | return true; 114 | } 115 | 116 | // @CloudHTTPv2({ 117 | // runtime: { 118 | // cpu: 2, 119 | // memory: '4GiB', 120 | // timeoutSeconds: 3600, 121 | // concurrency: 2, 122 | // maxInstances: 200, 123 | // }, 124 | // tags: ['DataCrunching'], 125 | // }) 126 | // async dispatchPageCacheCrunching( 127 | // @RPCReflect() rpcReflect: RPCReflection 128 | // ) { 129 | // const sse = new OutputServerEventStream({ highWaterMark: 4096 }); 130 | // 
rpcReflect.return(sse); 131 | // rpcReflect.catch((err) => { 132 | // sse.end({ data: `Error: ${err.message}` }); 133 | // }); 134 | // for await (const { fileName, date, offset } of this.iterPageCacheChunks()) { 135 | // this.logger.info(`Dispatching ${fileName}...`); 136 | // sse.write({ data: `Dispatching ${fileName}...` }); 137 | 138 | // await getFunctions().taskQueue('crunchPageCacheWorker').enqueue({ date, offset }, { 139 | // dispatchDeadlineSeconds: 1800, 140 | // uri: await getFunctionUrl('crunchPageCacheWorker'), 141 | // }); 142 | // } 143 | 144 | // sse.end({ data: 'done' }); 145 | 146 | // return true; 147 | // } 148 | 149 | async* iterPageCacheRecords(date?: string, inputOffset?: number | string) { 150 | const startOfToday = dayjs().utc().startOf('day'); 151 | const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day'); 152 | let theDay = startingPoint; 153 | 154 | if (date) { 155 | theDay = dayjs(date).utc().startOf('day'); 156 | } 157 | 158 | let counter = 0; 159 | if (inputOffset) { 160 | counter = parseInt(inputOffset as string, 10); 161 | } 162 | 163 | while (theDay.isBefore(startOfToday)) { 164 | const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`; 165 | const offset = counter; 166 | counter += this.pageCacheCrunchingBatchSize; 167 | const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0]; 168 | if (fileExists) { 169 | continue; 170 | } 171 | 172 | const records = await Crawled.fromFirestoreQuery(Crawled.COLLECTION 173 | .where('createdAt', '>=', theDay.toDate()) 174 | .where('createdAt', '<', theDay.add(1, 'day').toDate()) 175 | .orderBy('createdAt', 'asc') 176 | .offset(offset) 177 | .limit(this.pageCacheCrunchingBatchSize) 178 | ); 179 | 180 | this.logger.info(`Found ${records.length} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter }); 181 | 182 | if (!records.length) { 183 
| if (date) { 184 | break; 185 | } 186 | theDay = theDay.add(1, 'day'); 187 | counter = 0; 188 | continue; 189 | } 190 | 191 | yield { fileName, records }; 192 | 193 | if (offset) { 194 | break; 195 | } 196 | } 197 | } 198 | 199 | async* iterPageCacheChunks() { 200 | const startOfToday = dayjs().utc().startOf('day'); 201 | const startingPoint = dayjs().utc().subtract(this.pageCacheCrunchingTMinus, 'ms').startOf('day'); 202 | let theDay = startingPoint; 203 | 204 | let counter = 0; 205 | 206 | while (theDay.isBefore(startOfToday)) { 207 | const fileName = `${this.pageCacheCrunchingPrefix}/r${this.rev}/${theDay.format('YYYY-MM-DD')}/${counter}.jsonl.gz`; 208 | const offset = counter; 209 | counter += this.pageCacheCrunchingBatchSize; 210 | const fileExists = (await this.firebaseObjectStorage.bucket.file(fileName).exists())[0]; 211 | if (fileExists) { 212 | continue; 213 | } 214 | 215 | const nRecords = (await Crawled.COLLECTION 216 | .where('createdAt', '>=', theDay.toDate()) 217 | .where('createdAt', '<', theDay.add(1, 'day').toDate()) 218 | .orderBy('createdAt', 'asc') 219 | .offset(offset) 220 | .limit(this.pageCacheCrunchingBatchSize) 221 | .count().get()).data().count; 222 | 223 | this.logger.info(`Found ${nRecords} records for ${theDay.format('YYYY-MM-DD')} at offset ${offset}`, { fileName, counter }); 224 | if (nRecords < this.pageCacheCrunchingBatchSize) { 225 | theDay = theDay.add(1, 'day'); 226 | counter = 0; 227 | } 228 | if (nRecords) { 229 | yield { fileName, date: theDay.toISOString(), offset }; 230 | } 231 | } 232 | } 233 | 234 | async crunchCacheRecords(records: Crawled[]) { 235 | const throttle = new PromiseThrottle(30); 236 | const localFilePath = this.tempFileManager.alloc(); 237 | let nextDrainDeferred = Defer(); 238 | nextDrainDeferred.resolve(); 239 | 240 | for (const record of records) { 241 | await throttle.acquire(); 242 | this.firebaseObjectStorage.downloadFile(`snapshots/${record._id}`) 243 | .then(async (snapshotTxt) => { 244 | try { 245 | 
const snapshot = JSON.parse(snapshotTxt.toString('utf-8')); 246 | 247 | let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot); 248 | if (!formatted.content) { 249 | formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot); 250 | } 251 | 252 | await nextDrainDeferred.promise; 253 | await appendFile(localFilePath, JSON.stringify({ 254 | url: snapshot.href, 255 | title: snapshot.title || '', 256 | html: snapshot.html || '', 257 | text: snapshot.text || '', 258 | content: formatted.content || '', 259 | }) + '\n', { encoding: 'utf-8' }); 260 | 261 | } catch (err) { 262 | this.logger.warn(`Failed to parse snapshot for ${record._id}`, { err }); 263 | } 264 | }) 265 | .finally(() => { 266 | throttle.release(); 267 | }); 268 | } 269 | 270 | await throttle.nextDrain(); 271 | 272 | 273 | const ro = { 274 | path: localFilePath 275 | }; 276 | 277 | this.tempFileManager.bindPathTo(ro, localFilePath); 278 | 279 | return ro; 280 | } 281 | } 282 | -------------------------------------------------------------------------------- /src/db/adaptive-crawl-task.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop, parseJSONText } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | export enum AdaptiveCrawlTaskStatus { 6 | PENDING = 'pending', 7 | PROCESSING = 'processing', 8 | COMPLETED = 'completed', 9 | FAILED = 'failed', 10 | } 11 | 12 | @Also({ 13 | dictOf: Object 14 | }) 15 | export class AdaptiveCrawlTask extends FirestoreRecord { 16 | static override collectionName = 'adaptiveCrawlTasks'; 17 | 18 | override _id!: string; 19 | 20 | @Prop({ 21 | required: true 22 | }) 23 | status!: AdaptiveCrawlTaskStatus; 24 | 25 | @Prop({ 26 | required: true 27 | }) 28 | statusText!: string; 29 | 30 | @Prop() 31 | meta!: { 32 | useSitemap: boolean; 33 | maxPages: number; 34 | targetUrl: string; 35 | }; 36 | 37 | @Prop() 38 | urls!: 
string[]; 39 | 40 | @Prop() 41 | processed!: { 42 | [url: string]: string; 43 | }; 44 | 45 | @Prop() 46 | failed!: { 47 | [url: string]: any; 48 | }; 49 | 50 | @Prop() 51 | createdAt!: Date; 52 | 53 | @Prop() 54 | finishedAt?: Date; 55 | 56 | @Prop() 57 | duration?: number; 58 | 59 | static patchedFields = [ 60 | 'meta', 61 | ]; 62 | 63 | static override from(input: any) { 64 | for (const field of this.patchedFields) { 65 | if (typeof input[field] === 'string') { 66 | input[field] = parseJSONText(input[field]); 67 | } 68 | } 69 | 70 | return super.from(input) as AdaptiveCrawlTask; 71 | } 72 | 73 | override degradeForFireStore() { 74 | const copy: any = { ...this }; 75 | 76 | for (const field of (this.constructor as typeof AdaptiveCrawlTask).patchedFields) { 77 | if (typeof copy[field] === 'object') { 78 | copy[field] = JSON.stringify(copy[field]) as any; 79 | } 80 | } 81 | 82 | return copy; 83 | } 84 | 85 | [k: string]: any; 86 | } 87 | -------------------------------------------------------------------------------- /src/db/crawled.ts: -------------------------------------------------------------------------------- 1 | import { Also, parseJSONText, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | import type { PageSnapshot } from '../services/puppeteer'; 5 | 6 | @Also({ 7 | dictOf: Object 8 | }) 9 | export class Crawled extends FirestoreRecord { 10 | static override collectionName = 'crawled'; 11 | 12 | override _id!: string; 13 | 14 | @Prop({ 15 | required: true 16 | }) 17 | url!: string; 18 | 19 | @Prop({ 20 | required: true 21 | }) 22 | urlPathDigest!: string; 23 | 24 | @Prop() 25 | htmlSignificantlyModifiedByJs?: boolean; 26 | 27 | @Prop() 28 | snapshot?: PageSnapshot & { screenshot: never; pageshot: never; }; 29 | 30 | @Prop() 31 | screenshotAvailable?: boolean; 32 | 33 | @Prop() 34 | pageshotAvailable?: boolean; 35 | 36 | @Prop() 37 | snapshotAvailable?: boolean; 38 | 39 | @Prop() 40 | 
createdAt!: Date; 41 | 42 | @Prop() 43 | expireAt!: Date; 44 | 45 | static patchedFields = [ 46 | 'snapshot' 47 | ]; 48 | 49 | static override from(input: any) { 50 | for (const field of this.patchedFields) { 51 | if (typeof input[field] === 'string') { 52 | input[field] = parseJSONText(input[field]); 53 | } 54 | } 55 | 56 | return super.from(input) as Crawled; 57 | } 58 | 59 | override degradeForFireStore() { 60 | const copy: any = { ...this }; 61 | 62 | for (const field of (this.constructor as typeof Crawled).patchedFields) { 63 | if (typeof copy[field] === 'object') { 64 | copy[field] = JSON.stringify(copy[field]) as any; 65 | } 66 | } 67 | 68 | return copy; 69 | } 70 | 71 | [k: string]: any; 72 | } 73 | -------------------------------------------------------------------------------- /src/db/domain-blockade.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | 4 | @Also({ 5 | dictOf: Object 6 | }) 7 | export class DomainBlockade extends FirestoreRecord { 8 | static override collectionName = 'domainBlockades'; 9 | 10 | override _id!: string; 11 | 12 | @Prop({ 13 | required: true 14 | }) 15 | domain!: string; 16 | 17 | @Prop({ required: true }) 18 | triggerReason!: string; 19 | 20 | @Prop() 21 | triggerUrl?: string; 22 | 23 | @Prop() 24 | createdAt!: Date; 25 | 26 | @Prop() 27 | expireAt?: Date; 28 | 29 | [k: string]: any; 30 | } 31 | -------------------------------------------------------------------------------- /src/db/domain-profile.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import { ENGINE_TYPE } from '../dto/crawler-options'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class DomainProfile extends FirestoreRecord { 9 | static override collectionName = 
'domainProfiles'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | path!: string; 17 | 18 | @Prop() 19 | triggerUrl?: string; 20 | 21 | @Prop({ required: true, type: ENGINE_TYPE }) 22 | engine!: string; 23 | 24 | @Prop() 25 | createdAt!: Date; 26 | 27 | @Prop() 28 | expireAt?: Date; 29 | 30 | [k: string]: any; 31 | } 32 | -------------------------------------------------------------------------------- /src/db/img-alt.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class ImgAlt extends FirestoreRecord { 9 | static override collectionName = 'imgAlts'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | src!: string; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | urlDigest!: string; 22 | 23 | @Prop() 24 | width?: number; 25 | 26 | @Prop() 27 | height?: number; 28 | 29 | @Prop() 30 | generatedAlt?: string; 31 | 32 | @Prop() 33 | originalAlt?: string; 34 | 35 | @Prop() 36 | createdAt!: Date; 37 | 38 | @Prop() 39 | expireAt?: Date; 40 | 41 | [k: string]: any; 42 | } 43 | -------------------------------------------------------------------------------- /src/db/pdf.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop, parseJSONText } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class PDFContent extends FirestoreRecord { 9 | static override collectionName = 'pdfs'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | src!: string; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | urlDigest!: string; 22 | 23 | @Prop() 24 | meta?: { [k: string]: any; }; 25 | 26 | @Prop() 27 | text?: string; 
28 | 29 | @Prop() 30 | content?: string; 31 | 32 | @Prop() 33 | createdAt!: Date; 34 | 35 | @Prop() 36 | expireAt?: Date; 37 | 38 | static patchedFields = [ 39 | 'meta' 40 | ]; 41 | 42 | static override from(input: any) { 43 | for (const field of this.patchedFields) { 44 | if (typeof input[field] === 'string') { 45 | input[field] = parseJSONText(input[field]); 46 | } 47 | } 48 | 49 | return super.from(input) as PDFContent; 50 | } 51 | 52 | override degradeForFireStore() { 53 | const copy: any = { ...this }; 54 | 55 | for (const field of (this.constructor as typeof PDFContent).patchedFields) { 56 | if (typeof copy[field] === 'object') { 57 | copy[field] = JSON.stringify(copy[field]) as any; 58 | } 59 | } 60 | 61 | return copy; 62 | } 63 | 64 | [k: string]: any; 65 | } 66 | -------------------------------------------------------------------------------- /src/db/searched.ts: -------------------------------------------------------------------------------- 1 | import { Also, parseJSONText, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class SearchResult extends FirestoreRecord { 9 | static override collectionName = 'searchResults'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | query!: any; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | queryDigest!: string; 22 | 23 | @Prop() 24 | response?: any; 25 | 26 | @Prop() 27 | createdAt!: Date; 28 | 29 | @Prop() 30 | expireAt?: Date; 31 | 32 | [k: string]: any; 33 | 34 | static patchedFields = [ 35 | 'query', 36 | 'response', 37 | ]; 38 | 39 | static override from(input: any) { 40 | for (const field of this.patchedFields) { 41 | if (typeof input[field] === 'string') { 42 | input[field] = parseJSONText(input[field]); 43 | } 44 | } 45 | 46 | return super.from(input) as SearchResult; 47 | } 48 | 49 | override degradeForFireStore() { 50 | const copy: any = { ...this }; 51 | 
52 | for (const field of (this.constructor as typeof SearchResult).patchedFields) { 53 | if (typeof copy[field] === 'object') { 54 | copy[field] = JSON.stringify(copy[field]) as any; 55 | } 56 | } 57 | 58 | return copy; 59 | } 60 | } 61 | 62 | export class SerperSearchResult extends SearchResult { 63 | static override collectionName = 'serperSearchResults'; 64 | } 65 | 66 | export class SERPResult extends SearchResult { 67 | static override collectionName = 'SERPResults'; 68 | } -------------------------------------------------------------------------------- /src/dto/adaptive-crawler-options.ts: -------------------------------------------------------------------------------- 1 | import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; 2 | import type { Request, Response } from 'express'; 3 | 4 | 5 | @Also({ 6 | openapi: { 7 | operation: { 8 | parameters: { 9 | 'X-Use-Sitemap': { 10 | description: 'Use sitemap to crawl the website.', 11 | in: 'header', 12 | schema: { type: 'string' } 13 | }, 14 | 'X-Max-Depth': { 15 | description: 'Max deep level to crawl.', 16 | in: 'header', 17 | schema: { type: 'string' } 18 | }, 19 | 'X-Max-Pages': { 20 | description: 'Max number of pages to crawl.', 21 | in: 'header', 22 | schema: { type: 'string' } 23 | }, 24 | } 25 | } 26 | } 27 | }) 28 | export class AdaptiveCrawlerOptions extends AutoCastable { 29 | @Prop({ 30 | default: true, 31 | desc: 'Use sitemap to crawl the website.', 32 | }) 33 | useSitemap!: boolean; 34 | 35 | @Prop({ 36 | default: 10, 37 | desc: 'Max number of pages to crawl.', 38 | validate: (v: number) => v >= 1 && v <= 100, 39 | }) 40 | maxPages!: number; 41 | 42 | static override from(input: any) { 43 | const instance = super.from(input) as AdaptiveCrawlerOptions; 44 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { 45 | req: Request, 46 | res: Response, 47 | } | undefined; 48 | 49 | let maxPages = parseInt(ctx?.req.get('x-max-pages') || ''); 50 | if (!isNaN(maxPages) && maxPages > 0) 
import _ from 'lodash';
import {
    Also, AuthenticationFailedError, AuthenticationRequiredError,
    RPC_CALL_ENVIRONMENT,
    AutoCastable,
    DownstreamServiceError,
} from 'civkit/civ-rpc';
import { htmlEscape } from 'civkit/escape';
import { marshalErrorLike } from 'civkit/lang';

import type { Context } from 'koa';

import logger from '../services/logger';
import { InjectProperty } from '../services/registry';
import { AsyncLocalContext } from '../services/async-context';

import envConfig from '../shared/services/secrets';
import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { TierFeatureConstraintError } from '../services/errors';

const authDtoLogger = logger.child({ service: 'JinaAuthDTO' });

// Single shared dashboard HTTP client — one instance reused by every DTO object.
const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY);

@Also({
    openapi: {
        operation: {
            parameters: {
                'Authorization': {
                    description: htmlEscape`Jina Token for authentication.\n\n` +
                        htmlEscape`- Member of \n\n` +
                        `- Authorization: Bearer {YOUR_JINA_TOKEN}`
                    ,
                    in: 'header',
                    schema: {
                        anyOf: [
                            { type: 'string', format: 'token' }
                        ]
                    }
                }
            }
        }
    }
})
/**
 * Request-scoped authentication DTO for Jina API keys.
 *
 * Extracts the bearer token from the incoming request, resolves the token to a
 * `JinaEmbeddingsTokenAccount` (Firestore cache first, upstream dashboard as
 * source of truth / fallback), and exposes assertion helpers used by RPC
 * handlers (`assertUID`, `assertUser`, `assertTier`).
 */
export class JinaEmbeddingsAuthDTO extends AutoCastable {
    // Resolved Jina user id; populated by getBrief()/solveUID().
    uid?: string;
    // Raw API key taken from the Authorization header (or the `_token` input field).
    bearerToken?: string;
    // Account record cached on this DTO once authentication succeeded.
    user?: JinaEmbeddingsTokenAccount;

    @InjectProperty(AsyncLocalContext)
    ctxMgr!: AsyncLocalContext;

    jinaEmbeddingsDashboard = THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT;

    /**
     * Build the DTO from RPC input; pulls the bearer token out of the
     * `Authorization` header ("Bearer <token>" or a bare token), with
     * `input._token` as a fallback channel.
     */
    static override from(input: any) {
        const instance = super.from(input) as JinaEmbeddingsAuthDTO;

        const ctx = input[RPC_CALL_ENVIRONMENT] as Context;

        if (ctx) {
            const authorization = ctx.get('authorization');

            if (authorization) {
                // Accepts both "Bearer xxx" (takes the 2nd word) and a bare token string.
                const authToken = authorization.split(' ')[1] || authorization;
                instance.bearerToken = authToken;
            }

        }

        if (!instance.bearerToken && input._token) {
            instance.bearerToken = input._token;
        }

        return instance;
    }

    /**
     * Resolve the account behind `bearerToken`.
     *
     * Resolution order:
     *  1. Firestore cache, accepted when fresh (< ~3 min, jittered) and the
     *     wallet still has balance;
     *  2. if Firestore itself failed ("degradation"), authorize against the
     *     remote dashboard without writing the cache back;
     *  3. otherwise re-validate upstream and refresh the Firestore cache,
     *     falling back to the stale cached account if the upstream call fails
     *     with anything other than 401.
     *
     * Side effects: sets `this.user` / `this.uid` on success.
     *
     * @param ignoreCache skip the fresh-cache fast path when truthy
     * @throws AuthenticationRequiredError when no token was provided
     * @throws AuthenticationFailedError on upstream 401
     * @throws DownstreamServiceError on other upstream failures with no usable cache
     */
    async getBrief(ignoreCache?: boolean | string) {
        if (!this.bearerToken) {
            throw new AuthenticationRequiredError({
                message: 'Jina API key is required to authenticate. Please get one from https://jina.ai'
            });
        }

        let firestoreDegradation = false;
        let account;
        try {
            account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
        } catch (err) {
            // FireStore would not accept any string as input and may throw if not happy with it
            firestoreDegradation = true;
            // NOTE(review): uses the root `logger` while the rest of the class uses
            // `authDtoLogger` — presumably unintentional; confirm before unifying.
            logger.warn(`Firestore issue`, { err });
        }


        // Cache age; Infinity forces a refresh when the account was never synced.
        const age = account?.lastSyncedAt ? Date.now() - account.lastSyncedAt.valueOf() : Infinity;
        // Random 0-30s jitter so many concurrent requests don't all refresh at once.
        const jitter = Math.ceil(Math.random() * 30 * 1000);

        if (account && !ignoreCache) {
            // Fast path: cache is fresh and the wallet is not empty.
            if ((age < (180_000 - jitter)) && (account.wallet?.total_balance > 0)) {
                this.user = account;
                this.uid = this.user?.user_id;

                return account;
            }
        }

        if (firestoreDegradation) {
            // Firestore is unhealthy: authenticate upstream but do NOT write the cache back.
            logger.debug(`Using remote UC cached user`);
            let r;
            try {
                r = await this.jinaEmbeddingsDashboard.authorization(this.bearerToken);
            } catch (err: any) {
                if (err?.status === 401) {
                    throw new AuthenticationFailedError({
                        message: 'Invalid API key, please get a new one from https://jina.ai'
                    });
                }
                logger.warn(`Failed load remote cached user: ${err}`, { err });
                throw new DownstreamServiceError(`Failed to authenticate: ${err}`);
            }
            const brief = r?.data;
            const draftAccount = JinaEmbeddingsTokenAccount.from({
                ...account, ...brief, _id: this.bearerToken,
                lastSyncedAt: new Date()
            });
            this.user = draftAccount;
            this.uid = this.user?.user_id;

            return draftAccount;
        }

        try {
            // TODO: go back using validateToken after performance issue fixed
            // Accounts with a positive cached balance use the cheaper authorization
            // endpoint; zero/unknown balance goes through full token validation.
            const r = ((account?.wallet?.total_balance || 0) > 0) ?
                await this.jinaEmbeddingsDashboard.authorization(this.bearerToken) :
                await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
            const brief = r.data;
            const draftAccount = JinaEmbeddingsTokenAccount.from({
                ...account, ...brief, _id: this.bearerToken,
                lastSyncedAt: new Date()
            });
            // Refresh the Firestore cache (merge keeps fields not present in the brief).
            await JinaEmbeddingsTokenAccount.save(draftAccount.degradeForFireStore(), undefined, { merge: true });

            this.user = draftAccount;
            this.uid = this.user?.user_id;

            return draftAccount;
        } catch (err: any) {
            authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) });

            if (err?.status === 401) {
                throw new AuthenticationFailedError({
                    message: 'Invalid API key, please get a new one from https://jina.ai'
                });
            }

            // Upstream is down but we still hold a (possibly stale) cached account: serve it.
            if (account) {
                this.user = account;
                this.uid = this.user?.user_id;

                return account;
            }


            throw new DownstreamServiceError(`Failed to authenticate: ${err}`);
        }
    }

    /**
     * Report token consumption upstream and decrement the cached wallet balance.
     *
     * Best-effort: on upstream failure the local balance deduction is rolled
     * back and the error is only logged (resolves to undefined), so callers
     * are never failed by billing reporting.
     *
     * @param tokenCount number of tokens consumed
     * @param mdl model name to bill against
     * @param endpoint API endpoint label, defaults to '/encode'
     */
    async reportUsage(tokenCount: number, mdl: string, endpoint: string = '/encode') {
        const user = await this.assertUser();
        const uid = user.user_id;
        // Optimistic local deduction; rolled back in the .catch below on failure.
        user.wallet.total_balance -= tokenCount;

        return this.jinaEmbeddingsDashboard.reportUsage(this.bearerToken!, {
            model_name: mdl,
            api_endpoint: endpoint,
            consumer: {
                id: uid,
                user_id: uid,
            },
            usage: {
                total_tokens: tokenCount
            },
            labels: {
                model_name: mdl
            }
        }).then((r) => {
            // Fire-and-forget cache update; failure only logged.
            JinaEmbeddingsTokenAccount.COLLECTION.doc(this.bearerToken!)
                .update({ 'wallet.total_balance': JinaEmbeddingsTokenAccount.OPS.increment(-tokenCount) })
                .catch((err) => {
                    authDtoLogger.warn(`Failed to update cache for ${uid}: ${err}`, { err: marshalErrorLike(err) });
                });

            return r;
        }).catch((err) => {
            user.wallet.total_balance += tokenCount;
            authDtoLogger.warn(`Failed to report usage for ${uid}: ${err}`, { err: marshalErrorLike(err) });
        });
    }

    /**
     * Resolve the uid (authenticating lazily if needed) and record it in the
     * async-local context. Returns undefined when no token is present.
     */
    async solveUID() {
        if (this.uid) {
            this.ctxMgr.set('uid', this.uid);

            return this.uid;
        }

        if (this.bearerToken) {
            await this.getBrief();
            this.ctxMgr.set('uid', this.uid);

            return this.uid;
        }

        return undefined;
    }

    /**
     * Like solveUID() but mandatory.
     * @throws AuthenticationRequiredError when no uid could be resolved
     */
    async assertUID() {
        const uid = await this.solveUID();

        if (!uid) {
            throw new AuthenticationRequiredError('Authentication failed');
        }

        return uid;
    }

    /** Return the authenticated account, resolving it via getBrief() on first use. */
    async assertUser() {
        if (this.user) {
            return this.user;
        }

        await this.getBrief();

        return this.user!;
    }

    /**
     * Require the user's plan tier (metadata.speed_level) to be at least `n`.
     *
     * @param n minimum tier required
     * @param feature optional feature name, included in error messages
     * @throws AuthenticationRequiredError when unauthenticated
     * @throws TierFeatureConstraintError when the tier is missing or too low
     */
    async assertTier(n: number, feature?: string) {
        let user;
        try {
            user = await this.assertUser();
        } catch (err) {
            if (err instanceof AuthenticationRequiredError) {
                // Re-throw with a feature-specific message.
                throw new AuthenticationRequiredError({
                    message: `Authentication is required to use this feature${feature ? ` (${feature})` : ''}. Please provide a valid API key.`
                });
            }

            throw err;
        }

        const tier = parseInt(user.metadata?.speed_level);
        if (isNaN(tier) || tier < n) {
            throw new TierFeatureConstraintError({
                message: `Your current plan does not support this feature${feature ? ` (${feature})` : ''}. Please upgrade your plan.`
            });
        }

        return true;
    }

    /**
     * Collect the user's effective custom rate-limit descriptors for the given
     * tags; undefined when none apply (caller falls back to defaults).
     */
    getRateLimits(...tags: string[]) {
        const descs = tags.map((x) => this.user?.customRateLimits?.[x] || []).flat().filter((x) => x.isEffective());

        if (descs.length) {
            return descs;
        }

        return undefined;
    }
}
Object.entries(ctx.headers)) { 52 | if (k.startsWith(prefix)) { 53 | const prop = k.slice(prefix.length); 54 | const sk = _.camelCase(prop); 55 | draft[sk] = v as string; 56 | } 57 | } 58 | 59 | return this.from(draft); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/fetch.d.ts: -------------------------------------------------------------------------------- 1 | declare global { 2 | export const { 3 | fetch, 4 | FormData, 5 | Headers, 6 | Request, 7 | Response, 8 | File, 9 | }: typeof import('undici'); 10 | export type { FormData, Headers, Request, RequestInit, Response, RequestInit, File } from 'undici'; 11 | } 12 | 13 | export { }; 14 | -------------------------------------------------------------------------------- /src/lib/transform-server-event-stream.ts: -------------------------------------------------------------------------------- 1 | import { TPM, parseJSONText } from 'civkit'; 2 | import { Transform, TransformCallback, TransformOptions } from 'stream'; 3 | 4 | export class InputServerEventStream extends Transform { 5 | cache: string[] = []; 6 | 7 | constructor(options?: TransformOptions) { 8 | super({ 9 | ...options, 10 | readableObjectMode: true 11 | }); 12 | } 13 | 14 | decodeRoutine() { 15 | if (!this.cache.length) { 16 | return; 17 | } 18 | 19 | const vecs = this.cache.join('').split(/\r?\n\r?\n/); 20 | this.cache.length = 0; 21 | const lastVec = vecs.pop(); 22 | if (lastVec) { 23 | this.cache.push(lastVec); 24 | } 25 | 26 | for (const x of vecs) { 27 | const lines: string[] = x.split(/\r?\n/); 28 | 29 | const event: { 30 | event?: string; 31 | data?: string; 32 | id?: string; 33 | retry?: number; 34 | } = {}; 35 | 36 | for (const l of lines) { 37 | const columnPos = l.indexOf(':'); 38 | if (columnPos <= 0) { 39 | continue; 40 | } 41 | const key = l.substring(0, columnPos); 42 | const rawValue = l.substring(columnPos + 1); 43 | const value = rawValue.startsWith(' ') ? 
rawValue.slice(1) : rawValue; 44 | if (key === 'data') { 45 | if (event.data) { 46 | event.data += value || '\n'; 47 | } else if (event.data === '') { 48 | event.data += '\n'; 49 | event.data += value || '\n'; 50 | } else { 51 | event.data = value; 52 | } 53 | } else if (key === 'retry') { 54 | event.retry = parseInt(value, 10); 55 | } else { 56 | Reflect.set(event, key, value); 57 | } 58 | } 59 | 60 | if (event.data) { 61 | const parsed = parseJSONText(event.data); 62 | if (parsed && typeof parsed === 'object') { 63 | event.data = parsed; 64 | } 65 | } 66 | 67 | if (Object.keys(event).length) { 68 | this.push(event); 69 | } 70 | } 71 | } 72 | 73 | override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void { 74 | if (chunk === null) { 75 | this.push(null); 76 | } 77 | 78 | this.cache.push(chunk.toString()); 79 | this.decodeRoutine(); 80 | 81 | callback(); 82 | } 83 | 84 | override _final(callback: (error?: Error | null | undefined) => void): void { 85 | this.decodeRoutine(); 86 | callback(); 87 | } 88 | } 89 | 90 | @TPM({ 91 | contentType: 'text/event-stream', 92 | }) 93 | export class OutputServerEventStream extends Transform { 94 | n: number = 0; 95 | 96 | constructor(options?: TransformOptions) { 97 | super({ 98 | ...options, writableObjectMode: true, encoding: 'utf-8' 99 | }); 100 | } 101 | 102 | encodeRoutine(chunk: { 103 | event?: string; 104 | data?: any; 105 | id?: string; 106 | retry?: number; 107 | } | string) { 108 | if (typeof chunk === 'object') { 109 | const lines: string[] = []; 110 | 111 | if (chunk.event) { 112 | lines.push(`event: ${chunk.event}`); 113 | } 114 | if (chunk.data) { 115 | if (typeof chunk.data === 'string') { 116 | for (const x of chunk.data.split(/\r?\n/)) { 117 | lines.push(`data: ${x}`); 118 | } 119 | } else { 120 | lines.push(`data: ${JSON.stringify(chunk.data)}`); 121 | } 122 | } 123 | if (chunk.id) { 124 | lines.push(`id: ${chunk.id}`); 125 | } 126 | if (chunk.retry) { 127 | lines.push(`retry: 
${chunk.retry}`); 128 | } 129 | if (!lines.length) { 130 | lines.push(`data: ${JSON.stringify(chunk)}`); 131 | } 132 | this.push(lines.join('\n')); 133 | this.push('\n\n'); 134 | this.n++; 135 | 136 | return; 137 | } else if (typeof chunk === 'string') { 138 | const lines: string[] = []; 139 | for (const x of chunk.split(/\r?\n/)) { 140 | lines.push(`data: ${x}`); 141 | } 142 | 143 | this.push(lines.join('\n')); 144 | this.push('\n\n'); 145 | this.n++; 146 | } 147 | } 148 | 149 | override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void { 150 | if (chunk === null) { 151 | this.push(null); 152 | } 153 | 154 | this.encodeRoutine(chunk); 155 | 156 | callback(); 157 | } 158 | } 159 | 160 | export interface OutputServerEventStream extends Transform { 161 | write(chunk: string | { 162 | event?: string; 163 | data?: any; 164 | id?: string; 165 | retry?: number; 166 | }, callback?: (error: Error | null | undefined) => void): boolean; 167 | write(chunk: any, callback?: (error: Error | null | undefined) => void): boolean; 168 | write(chunk: any, encoding: BufferEncoding, callback?: (error: Error | null | undefined) => void): boolean; 169 | } 170 | -------------------------------------------------------------------------------- /src/services/alt-text.ts: -------------------------------------------------------------------------------- 1 | import { AssertionFailureError, AsyncService, HashManager } from 'civkit'; 2 | import { singleton } from 'tsyringe'; 3 | import { GlobalLogger } from './logger'; 4 | import { CanvasService } from './canvas'; 5 | import { ImageInterrogationManager } from '../shared/services/common-iminterrogate'; 6 | import { ImgBrief } from './puppeteer'; 7 | import { ImgAlt } from '../db/img-alt'; 8 | import { AsyncLocalContext } from './async-context'; 9 | 10 | const md5Hasher = new HashManager('md5', 'hex'); 11 | 12 | @singleton() 13 | export class AltTextService extends AsyncService { 14 | 15 | altsToIgnore = 
'image,img,photo,picture,pic,alt,figure,fig'.split(','); 16 | logger = this.globalLogger.child({ service: this.constructor.name }); 17 | 18 | constructor( 19 | protected globalLogger: GlobalLogger, 20 | protected imageInterrogator: ImageInterrogationManager, 21 | protected canvasService: CanvasService, 22 | protected asyncLocalContext: AsyncLocalContext 23 | ) { 24 | super(...arguments); 25 | } 26 | 27 | override async init() { 28 | await this.dependencyReady(); 29 | this.emit('ready'); 30 | } 31 | 32 | async caption(url: string) { 33 | try { 34 | const img = await this.canvasService.loadImage(url); 35 | const contentTypeHint = Reflect.get(img, 'contentType'); 36 | if (Math.min(img.naturalHeight, img.naturalWidth) <= 1) { 37 | return `A ${img.naturalWidth}x${img.naturalHeight} image, likely be a tacker probe`; 38 | } 39 | if (Math.min(img.naturalHeight, img.naturalWidth) < 64) { 40 | return `A ${img.naturalWidth}x${img.naturalHeight} small image, likely a logo, icon or avatar`; 41 | } 42 | const resized = this.canvasService.fitImageToSquareBox(img, 1024); 43 | const exported = await this.canvasService.canvasToBuffer(resized, 'image/png'); 44 | 45 | const svgHint = contentTypeHint.includes('svg') ? `Beware this image is a SVG rendered on a gray background, the gray background is not part of the image.\n\n` : ''; 46 | const svgSystemHint = contentTypeHint.includes('svg') ? ` Sometimes the system renders SVG on a gray background. When this happens, you must not include the gray background in the description.` : ''; 47 | 48 | const r = await this.imageInterrogator.interrogate('vertex-gemini-2.0-flash', { 49 | image: exported, 50 | prompt: `${svgHint}Give a concise image caption descriptive sentence in third person. Start directly with the description.`, 51 | system: `You are BLIP2, an image caption model. You will generate Alt Text (in web pages) for any image for a11y purposes. 
You must not start with "This image is sth...", instead, start direly with "sth..."${svgSystemHint}`, 52 | }); 53 | 54 | return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim(); 55 | } catch (err) { 56 | throw new AssertionFailureError({ message: `Could not generate alt text for url ${url}`, cause: err }); 57 | } 58 | } 59 | 60 | async getAltText(imgBrief: ImgBrief) { 61 | if (!imgBrief.src) { 62 | return undefined; 63 | } 64 | if (imgBrief.alt && !this.altsToIgnore.includes(imgBrief.alt.trim().toLowerCase())) { 65 | return imgBrief.alt; 66 | } 67 | const digest = md5Hasher.hash(imgBrief.src); 68 | const shortDigest = Buffer.from(digest, 'hex').toString('base64url'); 69 | let dims: number[] = []; 70 | do { 71 | if (imgBrief.loaded) { 72 | if (imgBrief.naturalWidth && imgBrief.naturalHeight) { 73 | if (Math.min(imgBrief.naturalWidth, imgBrief.naturalHeight) < 64) { 74 | dims = [imgBrief.naturalWidth, imgBrief.naturalHeight]; 75 | break; 76 | } 77 | } 78 | } 79 | 80 | if (imgBrief.width && imgBrief.height) { 81 | if (Math.min(imgBrief.width, imgBrief.height) < 64) { 82 | dims = [imgBrief.width, imgBrief.height]; 83 | break; 84 | } 85 | } 86 | 87 | } while (false); 88 | 89 | if (Math.min(...dims) <= 1) { 90 | return `A ${dims[0]}x${dims[1]} image, likely be a tacker probe`; 91 | } 92 | if (Math.min(...dims) < 64) { 93 | return `A ${dims[0]}x${dims[1]} small image, likely a logo, icon or avatar`; 94 | } 95 | 96 | const existing = await ImgAlt.fromFirestore(shortDigest); 97 | 98 | if (existing) { 99 | return existing.generatedAlt || existing.originalAlt || ''; 100 | } 101 | 102 | let generatedCaption = ''; 103 | 104 | try { 105 | generatedCaption = await this.caption(imgBrief.src); 106 | } catch (err) { 107 | this.logger.warn(`Unable to generate alt text for ${imgBrief.src}`, { err }); 108 | } 109 | 110 | if (this.asyncLocalContext.ctx.DNT) { 111 | // Don't cache alt text if DNT is set 112 | return generatedCaption; 113 | } 114 | 115 | // Don't try again until the next day 
116 | const expireMixin = generatedCaption ? {} : { expireAt: new Date(Date.now() + 1000 * 3600 * 24) }; 117 | 118 | await ImgAlt.COLLECTION.doc(shortDigest).set( 119 | { 120 | _id: shortDigest, 121 | src: imgBrief.src || '', 122 | width: imgBrief.naturalWidth || 0, 123 | height: imgBrief.naturalHeight || 0, 124 | urlDigest: digest, 125 | originalAlt: imgBrief.alt || '', 126 | generatedAlt: generatedCaption || '', 127 | createdAt: new Date(), 128 | ...expireMixin 129 | }, { merge: true } 130 | ); 131 | 132 | return generatedCaption; 133 | } 134 | }; 135 | -------------------------------------------------------------------------------- /src/services/async-context.ts: -------------------------------------------------------------------------------- 1 | import { GlobalAsyncContext } from 'civkit/async-context'; 2 | import { container, singleton } from 'tsyringe'; 3 | 4 | @singleton() 5 | export class AsyncLocalContext extends GlobalAsyncContext { } 6 | 7 | const instance = container.resolve(AsyncLocalContext); 8 | Reflect.set(process, 'asyncLocalContext', instance); 9 | 10 | export default instance; 11 | -------------------------------------------------------------------------------- /src/services/blackhole-detector.ts: -------------------------------------------------------------------------------- 1 | import { singleton } from 'tsyringe'; 2 | import { AsyncService } from 'civkit/async-service'; 3 | import { GlobalLogger } from './logger'; 4 | import { delay } from 'civkit/timeout'; 5 | 6 | 7 | @singleton() 8 | export class BlackHoleDetector extends AsyncService { 9 | 10 | logger = this.globalLogger.child({ service: this.constructor.name }); 11 | lastWorkedTs?: number; 12 | lastDoneRequestTs?: number; 13 | lastIncomingRequestTs?: number; 14 | 15 | maxDelay = 1000 * 30; 16 | concurrentRequests = 0; 17 | 18 | strikes = 0; 19 | 20 | constructor(protected globalLogger: GlobalLogger) { 21 | super(...arguments); 22 | 23 | if (process.env.NODE_ENV?.startsWith('prod')) { 24 | 
setInterval(() => { 25 | this.routine(); 26 | }, 1000 * 30).unref(); 27 | } 28 | } 29 | 30 | override async init() { 31 | await this.dependencyReady(); 32 | this.logger.debug('BlackHoleDetector started'); 33 | this.emit('ready'); 34 | } 35 | 36 | async routine() { 37 | // We give routine a 3s grace period for potentially paused CPU to spin up and process some requests 38 | await delay(3000); 39 | const now = Date.now(); 40 | const lastWorked = this.lastWorkedTs; 41 | if (!lastWorked) { 42 | return; 43 | } 44 | const dt = (now - lastWorked); 45 | if (this.concurrentRequests > 1 && 46 | this.lastIncomingRequestTs && lastWorked && 47 | this.lastIncomingRequestTs >= lastWorked && 48 | (dt > (this.maxDelay * (this.strikes + 1))) 49 | ) { 50 | this.logger.warn(`BlackHole detected, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`); 51 | this.strikes += 1; 52 | } 53 | 54 | if (this.strikes >= 3) { 55 | this.logger.error(`BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`); 56 | process.nextTick(() => { 57 | this.emit('error', new Error(`BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`)); 58 | // process.exit(1); 59 | }); 60 | } 61 | } 62 | 63 | incomingRequest() { 64 | this.lastIncomingRequestTs = Date.now(); 65 | this.lastWorkedTs ??= Date.now(); 66 | this.concurrentRequests++; 67 | } 68 | doneWithRequest() { 69 | this.concurrentRequests--; 70 | this.lastDoneRequestTs = Date.now(); 71 | } 72 | 73 | itWorked() { 74 | this.lastWorkedTs = Date.now(); 75 | this.strikes = 0; 76 | } 77 | 78 | }; 79 | -------------------------------------------------------------------------------- /src/services/brave-search.ts: -------------------------------------------------------------------------------- 1 | import { AsyncService, AutoCastable, DownstreamServiceFailureError, 
@singleton()
/**
 * Thin wrapper around the Brave Search HTTP API that enriches queries with the
 * caller's geo context (derived from the request IP) and retries on rate limits.
 */
export class BraveSearchService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    // Initialized in init() once secrets are available.
    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: GlobalLogger,
        protected secretExposer: SecretExposer,
        protected geoipControl: GeoIPService,
        protected threadLocal: AsyncLocalContext,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');

        this.braveSearchHTTP = new BraveSearchHTTP(this.secretExposer.BRAVE_SEARCH_API_KEY);
    }

    /**
     * Run a Brave web search.
     *
     * Adds X-Loc-* headers resolved from the caller's IP (city, country,
     * timezone, coordinates, state) and forwards the caller's User-Agent, so
     * Brave can localize results. Retries on HTTP 429 with a short random
     * backoff, up to 11 attempts total.
     *
     * @param query Brave web-search query parameters
     * @returns the parsed Brave response body
     * @throws DownstreamServiceFailureError on any non-429 failure or when retries are exhausted
     */
    async webSearch(query: WebSearchQueryParams) {
        const ip = this.threadLocal.get('ip');
        const extraHeaders: WebSearchOptionalHeaderOptions = {};
        if (ip) {
            const geoip = await this.geoipControl.lookupCity(ip, GEOIP_SUPPORTED_LANGUAGES.EN);

            if (geoip?.city) {
                extraHeaders['X-Loc-City'] = encodeURIComponent(geoip.city);
            }
            if (geoip?.country) {
                extraHeaders['X-Loc-Country'] = geoip.country.code;
            }
            if (geoip?.timezone) {
                extraHeaders['X-Loc-Timezone'] = geoip.timezone;
            }
            if (geoip?.coordinates) {
                extraHeaders['X-Loc-Lat'] = `${geoip.coordinates[0]}`;
                extraHeaders['X-Loc-Long'] = `${geoip.coordinates[1]}`;
            }
            if (geoip?.subdivisions?.length) {
                extraHeaders['X-Loc-State'] = encodeURIComponent(`${geoip.subdivisions[0].code}`);
                extraHeaders['X-Loc-State-Name'] = encodeURIComponent(`${geoip.subdivisions[0].name}`);
            }
        }
        if (this.threadLocal.get('userAgent')) {
            extraHeaders['User-Agent'] = this.threadLocal.get('userAgent');
        }

        const encoded = { ...query };
        if (encoded.q) {
            // Pass pure-ASCII queries through untouched; percent-encode anything else
            // (the ascii round-trip only differs when non-ASCII bytes are present).
            encoded.q = (Buffer.from(encoded.q).toString('ascii') === encoded.q) ? encoded.q : encodeURIComponent(encoded.q);
        }

        let maxTries = 11;

        while (maxTries--) {
            try {
                const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
                // Tell the watchdog real work completed.
                this.blackHoleDetector.itWorked();

                return r.parsed;
            } catch (err: any) {
                this.logger.error(`Web search failed: ${err?.message}`, { err: marshalErrorLike(err) });
                if (err?.status === 429) {
                    // Rate-limited: back off 0.5-1.5s and retry.
                    await delay(500 + 1000 * Math.random());
                    continue;
                }

                throw new DownstreamServiceFailureError({ message: `Search failed` });
            }
        }

        throw new DownstreamServiceFailureError({ message: `Search failed` });
    }

}
Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.` 104 | }) 105 | filetype?: string | string[]; 106 | 107 | @Prop({ 108 | arrayOf: String, 109 | desc: `Returns web pages containing the specified term in the body of the page. Example: to find information about the Nvidia GeForce GTX 1080 Ti, making sure the page contains the keywords “founders edition” in the body, type “nvidia 1080 ti inbody:“founders edition””.` 110 | }) 111 | inbody?: string | string[]; 112 | 113 | @Prop({ 114 | arrayOf: String, 115 | desc: `Returns webpages containing the specified term in the title of the page. Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.` 116 | }) 117 | intitle?: string | string[]; 118 | 119 | @Prop({ 120 | arrayOf: String, 121 | desc: `Returns webpages containing the specified term either in the title or in the body of the page. Example: to find pages about the 2024 Oscars containing the keywords “best costume design” in the page, type “oscars 2024 inpage:“best costume design””.` 122 | }) 123 | inpage?: string | string[]; 124 | 125 | @Prop({ 126 | arrayOf: String, 127 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 128 | }) 129 | lang?: string | string[]; 130 | 131 | @Prop({ 132 | arrayOf: String, 133 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 134 | }) 135 | loc?: string | string[]; 136 | 137 | @Prop({ 138 | arrayOf: String, 139 | desc: `Returns web pages coming only from a specific web site. 
Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.` 140 | }) 141 | site?: string | string[]; 142 | 143 | addTo(searchTerm: string) { 144 | const chunks = []; 145 | for (const [key, value] of Object.entries(this)) { 146 | if (value) { 147 | const values = Array.isArray(value) ? value : [value]; 148 | const textValue = values.map((v) => `${key}:${v}`).join(' OR '); 149 | if (textValue) { 150 | chunks.push(textValue); 151 | } 152 | } 153 | } 154 | const opPart = chunks.length > 1 ? chunks.map((x) => `(${x})`).join(' AND ') : chunks; 155 | 156 | if (opPart.length) { 157 | return [searchTerm, opPart].join(' '); 158 | } 159 | 160 | return searchTerm; 161 | } 162 | 163 | static override from(input: any) { 164 | const instance = super.from(input) as BraveSearchExplicitOperatorsDto; 165 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { 166 | req: Request, 167 | res: Response, 168 | } | undefined; 169 | 170 | const params = ['ext', 'filetype', 'inbody', 'intitle', 'inpage', 'lang', 'loc', 'site']; 171 | 172 | for (const p of params) { 173 | const customValue = ctx?.req.get(`x-${p}`) || ctx?.req.get(`${p}`); 174 | if (!customValue) { 175 | continue; 176 | } 177 | 178 | const filtered = customValue.split(', ').filter(Boolean); 179 | if (filtered.length) { 180 | Reflect.set(instance, p, filtered); 181 | } 182 | } 183 | 184 | return instance; 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/services/canvas.ts: -------------------------------------------------------------------------------- 1 | import { singleton, container } from 'tsyringe'; 2 | import { AsyncService, mimeOf, ParamValidationError, SubmittedDataMalformedError, /* downloadFile */ } from 'civkit'; 3 | import { readFile } from 'fs/promises'; 4 | 5 | import type canvas from '@napi-rs/canvas'; 6 | export type { Canvas, Image } from '@napi-rs/canvas'; 7 | 8 | import { GlobalLogger } from './logger'; 9 | 
import { TempFileManager } from './temp-file'; 10 | 11 | import { isMainThread } from 'worker_threads'; 12 | import type { svg2png } from 'svg2png-wasm' with { 'resolution-mode': 'import' }; 13 | import path from 'path'; 14 | import { Threaded } from './threaded'; 15 | 16 | const downloadFile = async (uri: string) => { 17 | const resp = await fetch(uri); 18 | if (!(resp.ok && resp.body)) { 19 | throw new Error(`Unexpected response ${resp.statusText}`); 20 | } 21 | const contentLength = parseInt(resp.headers.get('content-length') || '0'); 22 | if (contentLength > 1024 * 1024 * 100) { 23 | throw new Error('File too large'); 24 | } 25 | const buff = await resp.arrayBuffer(); 26 | 27 | return { buff, contentType: resp.headers.get('content-type') }; 28 | }; 29 | 30 | @singleton() 31 | export class CanvasService extends AsyncService { 32 | 33 | logger = this.globalLogger.child({ service: this.constructor.name }); 34 | svg2png!: typeof svg2png; 35 | canvas!: typeof canvas; 36 | 37 | constructor( 38 | protected temp: TempFileManager, 39 | protected globalLogger: GlobalLogger, 40 | ) { 41 | super(...arguments); 42 | } 43 | 44 | override async init() { 45 | await this.dependencyReady(); 46 | if (!isMainThread) { 47 | const { createSvg2png, initialize } = require('svg2png-wasm'); 48 | const wasmBuff = await readFile(path.resolve(path.dirname(require.resolve('svg2png-wasm')), '../svg2png_wasm_bg.wasm')); 49 | const fontBuff = await readFile(path.resolve(__dirname, '../../licensed/SourceHanSansSC-Regular.otf')); 50 | await initialize(wasmBuff); 51 | this.svg2png = createSvg2png({ 52 | fonts: [Uint8Array.from(fontBuff)], 53 | defaultFontFamily: { 54 | serifFamily: 'Source Han Sans SC', 55 | sansSerifFamily: 'Source Han Sans SC', 56 | cursiveFamily: 'Source Han Sans SC', 57 | fantasyFamily: 'Source Han Sans SC', 58 | monospaceFamily: 'Source Han Sans SC', 59 | } 60 | }); 61 | } 62 | this.canvas = require('@napi-rs/canvas'); 63 | 64 | this.emit('ready'); 65 | } 66 | 67 | 
    // Rasterizes SVG markup into PNG bytes. @Threaded dispatches the call to a
    // worker thread, where init() has set up the svg2png wasm pipeline.
    @Threaded()
    async renderSvgToPng(svgContent: string,) {
        // Light-gray background so fully transparent SVGs stay visible.
        return this.svg2png(svgContent, { backgroundColor: '#D3D3D3' });
    }

    // Decodes `input` (data: URL, http(s) URL, or raw Buffer) into a canvas Image.
    // SVG payloads are rasterized to PNG first. The do { … } while (false) acts as
    // a structured "goto end": each accepted input form `break`s out once
    // buff/contentType have been set.
    protected async _loadImage(input: string | Buffer) {
        let buff;
        let contentType;
        do {
            if (typeof input === 'string') {
                if (input.startsWith('data:')) {
                    // Shape: data:<mediatype>[;base64],<payload>
                    const firstComma = input.indexOf(',');
                    const header = input.slice(0, firstComma);
                    const data = input.slice(firstComma + 1);
                    const encoding = header.split(';')[1];
                    contentType = header.split(';')[0].split(':')[1];
                    if (encoding?.startsWith('base64')) {
                        buff = Buffer.from(data, 'base64');
                    } else {
                        // Percent-encoded textual payload (e.g. inline SVG markup).
                        buff = Buffer.from(decodeURIComponent(data), 'utf-8');
                    }
                    break;
                }
                if (input.startsWith('http')) {
                    const r = await downloadFile(input);
                    buff = Buffer.from(r.buff);
                    contentType = r.contentType;
                    break;
                }
            }
            if (Buffer.isBuffer(input)) {
                buff = input;
                // Sniff the mime type from the magic bytes.
                const mime = await mimeOf(buff);
                contentType = `${mime.mediaType}/${mime.subType}`;
                break;
            }
            throw new ParamValidationError('Invalid input');
        } while (false);

        if (!buff) {
            throw new ParamValidationError('Invalid input');
        }

        if (contentType?.includes('svg')) {
            // @napi-rs/canvas cannot decode SVG; go through the wasm rasterizer.
            buff = await this.renderSvgToPng(buff.toString('utf-8'));
        }

        const img = await this.canvas.loadImage(buff);
        // Stash the detected content type on the image object for downstream use.
        Reflect.set(img, 'contentType', contentType);

        return img;
    }

    // Public wrapper around _loadImage: adds timing and normalizes decoder
    // "unsupported format" failures into SubmittedDataMalformedError.
    async loadImage(uri: string | Buffer) {
        const t0 = Date.now();
        try {
            const theImage = await this._loadImage(uri);
            const t1 = Date.now();
            this.logger.debug(`Image loaded in ${t1 - t0}ms`);

            return theImage;
        } catch (err: any) {
            if (err?.message?.includes('Unsupported image type') || err?.message?.includes('unsupported')) {
                this.logger.warn(`Failed to load image ${uri.slice(0, 128)}`, { err });
                throw new SubmittedDataMalformedError(`Unknown image format for ${uri.slice(0, 128)}`);
            }
            throw err;
        }
    }

    // Scales the image down (never up) so it fits a size×size box, preserving
    // aspect ratio; always returns a Canvas (copying an Image 1:1 when small enough).
    fitImageToSquareBox(image: canvas.Image | canvas.Canvas, size: number = 1024) {
        // this.logger.debug(`Fitting image(${ image.width }x${ image.height }) to ${ size } box`);
        // const t0 = Date.now();
        if (image.width <= size && image.height <= size) {
            if (image instanceof this.canvas.Canvas) {
                return image;
            }
            const canvasInstance = this.canvas.createCanvas(image.width, image.height);
            const ctx = canvasInstance.getContext('2d');
            ctx.drawImage(image, 0, 0, image.width, image.height, 0, 0, canvasInstance.width, canvasInstance.height);
            // this.logger.debug(`No need to resize, copied to canvas in ${ Date.now() - t0 } ms`);

            return canvasInstance;
        }

        const aspectRatio = image.width / image.height;

        // Landscape (ratio > 1): clamp width to `size`; portrait/square: clamp height.
        const resizedWidth = Math.round(aspectRatio > 1 ? size : size * aspectRatio);
        const resizedHeight = Math.round(aspectRatio > 1 ?
size / aspectRatio : size); 156 | 157 | const canvasInstance = this.canvas.createCanvas(resizedWidth, resizedHeight); 158 | const ctx = canvasInstance.getContext('2d'); 159 | ctx.drawImage(image, 0, 0, image.width, image.height, 0, 0, resizedWidth, resizedHeight); 160 | // this.logger.debug(`Resized to ${ resizedWidth }x${ resizedHeight } in ${ Date.now() - t0 } ms`); 161 | 162 | return canvasInstance; 163 | } 164 | 165 | corpImage(image: canvas.Image | canvas.Canvas, x: number, y: number, w: number, h: number) { 166 | // this.logger.debug(`Cropping image(${ image.width }x${ image.height }) to ${ w }x${ h } at ${ x },${ y } `); 167 | // const t0 = Date.now(); 168 | const canvasInstance = this.canvas.createCanvas(w, h); 169 | const ctx = canvasInstance.getContext('2d'); 170 | ctx.drawImage(image, x, y, w, h, 0, 0, w, h); 171 | // this.logger.debug(`Crop complete in ${ Date.now() - t0 } ms`); 172 | 173 | return canvasInstance; 174 | } 175 | 176 | canvasToDataUrl(canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') { 177 | // this.logger.debug(`Exporting canvas(${ canvas.width }x${ canvas.height })`); 178 | // const t0 = Date.now(); 179 | return canvas.toDataURLAsync((mimeType || 'image/png') as 'image/png'); 180 | } 181 | 182 | async canvasToBuffer(canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') { 183 | // this.logger.debug(`Exporting canvas(${ canvas.width }x${ canvas.height })`); 184 | // const t0 = Date.now(); 185 | return canvas.toBuffer((mimeType || 'image/png') as 'image/png'); 186 | } 187 | 188 | } 189 | 190 | const instance = container.resolve(CanvasService); 191 | export default instance; 192 | -------------------------------------------------------------------------------- /src/services/cf-browser-rendering.ts: -------------------------------------------------------------------------------- 1 | import { container, singleton } from 'tsyringe'; 2 | import { AsyncService } from 'civkit/async-service'; 3 | import { SecretExposer } from 
'../shared/services/secrets'; 4 | import { GlobalLogger } from './logger'; 5 | import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare'; 6 | import { HTTPServiceError } from 'civkit/http'; 7 | import { ServiceNodeResourceDrainError } from './errors'; 8 | 9 | @singleton() 10 | export class CFBrowserRendering extends AsyncService { 11 | 12 | logger = this.globalLogger.child({ service: this.constructor.name }); 13 | client!: CloudFlareHTTP; 14 | 15 | constructor( 16 | protected globalLogger: GlobalLogger, 17 | protected secretExposer: SecretExposer, 18 | ) { 19 | super(...arguments); 20 | } 21 | 22 | 23 | override async init() { 24 | await this.dependencyReady(); 25 | const [account, key] = this.secretExposer.CLOUD_FLARE_API_KEY?.split(':'); 26 | this.client = new CloudFlareHTTP(account, key); 27 | 28 | this.emit('ready'); 29 | } 30 | 31 | async fetchContent(url: string) { 32 | try { 33 | const r = await this.client.fetchBrowserRenderedHTML({ url }); 34 | 35 | return r.parsed.result; 36 | } catch (err) { 37 | if (err instanceof HTTPServiceError) { 38 | if (err.status === 429) { 39 | // Rate limit exceeded, return empty result 40 | this.logger.warn('Cloudflare browser rendering rate limit exceeded', { url }); 41 | 42 | throw new ServiceNodeResourceDrainError(`Cloudflare browser rendering (our account) is at capacity, please try again later or switch to another engine.`,); 43 | } 44 | } 45 | 46 | throw err; 47 | } 48 | } 49 | 50 | } 51 | 52 | const instance = container.resolve(CFBrowserRendering); 53 | 54 | export default instance; 55 | -------------------------------------------------------------------------------- /src/services/errors.ts: -------------------------------------------------------------------------------- 1 | import { ApplicationError, StatusCode } from 'civkit/civ-rpc'; 2 | import _ from 'lodash'; 3 | import dayjs from 'dayjs'; 4 | import utc from 'dayjs/plugin/utc'; 5 | 6 | dayjs.extend(utc); 7 | 8 | @StatusCode(50301) 9 | export class 
ServiceDisabledError extends ApplicationError { }

// 503xx — service-side availability problems.
@StatusCode(50302)
export class ServiceCrashedError extends ApplicationError { }

@StatusCode(50303)
export class ServiceNodeResourceDrainError extends ApplicationError { }

@StatusCode(50304)
export class ServiceBadAttemptError extends ApplicationError { }

@StatusCode(50305)
export class ServiceBadApproachError extends ServiceBadAttemptError { }

// 401xx / 402xx — authentication and billing problems.
@StatusCode(40104)
export class EmailUnverifiedError extends ApplicationError { }

@StatusCode(40201)
export class InsufficientCreditsError extends ApplicationError { }

@StatusCode(40202)
export class TierFeatureConstraintError extends ApplicationError { }

@StatusCode(40203)
export class InsufficientBalanceError extends ApplicationError { }

// 409xx — conflict-style failures.
@StatusCode(40903)
export class LockConflictError extends ApplicationError { }

@StatusCode(40904)
export class BudgetExceededError extends ApplicationError { }

// 451xx — content objections.
@StatusCode(45101)
export class HarmfulContentError extends ApplicationError { }

@StatusCode(45102)
export class SecurityCompromiseError extends ApplicationError { }

@StatusCode(41201)
export class BatchSizeTooLargeError extends ApplicationError { }
--------------------------------------------------------------------------------
/src/services/finalizer.ts:
--------------------------------------------------------------------------------
import { AbstractFinalizerService } from 'civkit/finalizer';
import { container, singleton } from 'tsyringe';
import { isMainThread } from 'worker_threads';
import { GlobalLogger } from './logger';

// Monkey-patch process.exit: on the MAIN thread it becomes a no-op, so nothing
// can terminate the process directly — shutdown must go through
// FinalizerService.quitProcess below, which holds the real exit.
// Worker threads keep the original behavior.
const realProcessExit = process.exit;
process.exit = ((code?: number) => {
    if (isMainThread) {
        return;
    }
    return realProcessExit(code);
}) as typeof process.exit;

@singleton()
export class
FinalizerService extends AbstractFinalizerService { 16 | 17 | container = container; 18 | logger = this.globalLogger.child({ service: this.constructor.name }); 19 | 20 | override quitProcess(code?: string | number | null | undefined): never { 21 | return realProcessExit(code); 22 | } 23 | 24 | constructor(protected globalLogger: GlobalLogger) { 25 | super(...arguments); 26 | } 27 | 28 | override onUnhandledRejection(err: unknown, _triggeringPromise: Promise): void { 29 | this.logger.warn(`Unhandled promise rejection in pid ${process.pid}`, { err }); 30 | } 31 | } 32 | 33 | const instance = container.resolve(FinalizerService); 34 | export const { Finalizer } = instance.decorators(); 35 | export default instance; 36 | 37 | if (isMainThread) { 38 | instance.serviceReady(); 39 | } 40 | -------------------------------------------------------------------------------- /src/services/geoip.ts: -------------------------------------------------------------------------------- 1 | import { container, singleton } from 'tsyringe'; 2 | import fsp from 'fs/promises'; 3 | import { CityResponse, Reader } from 'maxmind'; 4 | import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit'; 5 | import { GlobalLogger } from './logger'; 6 | import path from 'path'; 7 | import { Threaded } from './threaded'; 8 | 9 | export enum GEOIP_SUPPORTED_LANGUAGES { 10 | EN = 'en', 11 | ZH_CN = 'zh-CN', 12 | JA = 'ja', 13 | DE = 'de', 14 | FR = 'fr', 15 | ES = 'es', 16 | PT_BR = 'pt-BR', 17 | RU = 'ru', 18 | } 19 | 20 | export class GeoIPInfo extends AutoCastable { 21 | @Prop() 22 | code?: string; 23 | 24 | @Prop() 25 | name?: string; 26 | } 27 | 28 | export class GeoIPCountryInfo extends GeoIPInfo { 29 | @Prop() 30 | eu?: boolean; 31 | } 32 | 33 | export class GeoIPCityResponse extends AutoCastable { 34 | @Prop() 35 | continent?: GeoIPInfo; 36 | 37 | @Prop() 38 | country?: GeoIPCountryInfo; 39 | 40 | @Prop({ 41 | arrayOf: GeoIPInfo 42 | }) 43 | subdivisions?: GeoIPInfo[]; 44 | 45 | @Prop() 46 | 
city?: string; 47 | 48 | @Prop({ 49 | arrayOf: Number 50 | }) 51 | coordinates?: [number, number, number]; 52 | 53 | @Prop() 54 | timezone?: string; 55 | } 56 | 57 | @singleton() 58 | export class GeoIPService extends AsyncService { 59 | 60 | logger = this.globalLogger.child({ service: this.constructor.name }); 61 | 62 | mmdbCity!: Reader; 63 | 64 | constructor( 65 | protected globalLogger: GlobalLogger, 66 | ) { 67 | super(...arguments); 68 | } 69 | 70 | 71 | override async init() { 72 | await this.dependencyReady(); 73 | 74 | this.emit('ready'); 75 | } 76 | 77 | @runOnce() 78 | async _lazyload() { 79 | const mmdpPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb'); 80 | 81 | const dbBuff = await fsp.readFile(mmdpPath, { flag: 'r', encoding: null }); 82 | 83 | this.mmdbCity = new Reader(dbBuff); 84 | 85 | this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`); 86 | } 87 | 88 | 89 | @Threaded() 90 | async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) { 91 | await this._lazyload(); 92 | 93 | const r = this.mmdbCity.get(ip); 94 | 95 | if (!r) { 96 | return undefined; 97 | } 98 | 99 | return GeoIPCityResponse.from({ 100 | continent: r.continent ? { 101 | code: r.continent?.code, 102 | name: r.continent?.names?.[lang] || r.continent?.names?.en, 103 | } : undefined, 104 | country: r.country ? { 105 | code: r.country?.iso_code, 106 | name: r.country?.names?.[lang] || r.country?.names.en, 107 | eu: r.country?.is_in_european_union, 108 | } : undefined, 109 | city: r.city?.names?.[lang] || r.city?.names?.en, 110 | subdivisions: r.subdivisions?.map((x) => ({ 111 | code: x.iso_code, 112 | name: x.names?.[lang] || x.names?.en, 113 | })), 114 | coordinates: r.location ? 
[
                r.location.latitude, r.location.longitude, r.location.accuracy_radius
            ] : undefined,
            timezone: r.location?.time_zone,
        });
    }

    // Batch variant of lookupCity; entries with no DB hit are dropped.
    @Threaded()
    async lookupCities(ips: string[], lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) {
        const r = (await Promise.all(ips.map((ip) => this.lookupCity(ip, lang)))).filter(Boolean) as GeoIPCityResponse[];

        return r;
    }

}

const instance = container.resolve(GeoIPService);

export default instance;
--------------------------------------------------------------------------------
/src/services/lm.ts:
--------------------------------------------------------------------------------
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';

import { PageSnapshot } from './puppeteer';
import { GlobalLogger } from './logger';
import _ from 'lodash';
import { AssertionFailureError } from 'civkit';
import { LLMManager } from '../shared/services/common-llm';
import { JSDomControl } from './jsdom';

const tripleBackTick = '```';

// LLM-backed snapshot-to-markdown/JSON conversions.
@singleton()
export class LmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        protected commonLLM: LLMManager,
        protected jsdomControl: JSDomControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    // Streams a markdown rendition of the page produced by Gemini from the page
    // HTML plus a screenshot. Yields a fresh PageSnapshot per streamed delta,
    // with parsed.textContent holding everything accumulated so far.
    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        if (!pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        // Strip scripts/styles/etc. to keep the prompt small.
        const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');

        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                `HTML: \n${html}\n\nSCREENSHOT: \n`,
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        const chunks: string[] = [];
        for await (const txt of it) {
            chunks.push(txt);
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: chunks.join(''),
                }
            };
            yield output;
        }

        return;
    }

    // Streams a markdown conversion of the snapshot HTML via readerlm-v2,
    // yielding an updated PageSnapshot per streamed delta.
    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg');

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n`,

            options: {
                // system: 'You are an AI assistant developed by VENDOR_NAME',
                stream: true,
                // Deterministic decoding with mild anti-repetition penalties.
                modelSpecific: {
                    top_k: 1,
                    temperature: 0,
                    repetition_penalty: 1.13,
                    presence_penalty: 0.25,
                    frequency_penalty: 0.25,
                    max_tokens: 8192,
                }
            },
            maxTry: 1,
        });

        const chunks: string[] = [];
        for await (const txt of it) {
            chunks.push(txt);
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: chunks.join(''),
                }
            };
            yield output;
        }
return; 111 | } 112 | 113 | async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) { 114 | if (!snapshot) { 115 | throw new AssertionFailureError('Snapshot of the page is not available'); 116 | } 117 | 118 | const html = await this.jsdomControl.cleanHTMLforLMs(snapshot.html, 'script,link,style,textarea,select>option,svg'); 119 | 120 | const it = this.commonLLM.iterRun('readerlm-v2', { 121 | prompt: `${instruction}\n\n${tripleBackTick}html\n${html}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`, 122 | options: { 123 | // system: 'You are an AI assistant developed by VENDOR_NAME', 124 | stream: true, 125 | modelSpecific: { 126 | top_k: 1, 127 | temperature: 0, 128 | repetition_penalty: 1.13, 129 | presence_penalty: 0.25, 130 | frequency_penalty: 0.25, 131 | max_tokens: 8192, 132 | } 133 | }, 134 | maxTry: 1, 135 | }); 136 | 137 | const chunks: string[] = []; 138 | for await (const txt of it) { 139 | chunks.push(txt); 140 | const output: PageSnapshot = { 141 | ...snapshot, 142 | parsed: { 143 | ...snapshot?.parsed, 144 | textContent: chunks.join(''), 145 | } 146 | }; 147 | yield output; 148 | } 149 | 150 | return; 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/services/logger.ts: -------------------------------------------------------------------------------- 1 | import { AbstractPinoLogger } from 'civkit/pino-logger'; 2 | import { singleton, container } from 'tsyringe'; 3 | import { threadId } from 'node:worker_threads'; 4 | import { getTraceCtx } from 'civkit/async-context'; 5 | 6 | 7 | const levelToSeverityMap: { [k: string]: string | undefined; } = { 8 | trace: 'DEFAULT', 9 | debug: 'DEBUG', 10 | info: 'INFO', 11 | warn: 'WARNING', 12 | error: 'ERROR', 13 | fatal: 'CRITICAL', 14 | }; 15 | 16 | @singleton() 17 | export 
class GlobalLogger extends AbstractPinoLogger {
    loggerOptions = {
        level: 'debug',
        base: {
            tid: threadId, // worker thread id, for disambiguating interleaved logs
        }
    };

    override init(): void {
        // Production: raw JSON to stdout (for Cloud Logging ingestion);
        // otherwise: human-friendly single-line pretty printing.
        if (process.env['NODE_ENV']?.startsWith('prod')) {
            super.init(process.stdout);
        } else {
            const PinoPretty = require('pino-pretty').PinoPretty;
            super.init(PinoPretty({
                singleLine: true,
                colorize: true,
                messageFormat(log: any, messageKey: any) {
                    return `${log['tid'] ? `[${log['tid']}]` : ''}[${log['service'] || 'ROOT'}] ${log[messageKey]}`;
                },
            }));
        }


        this.emit('ready');
    }

    // Decorates every record with a Cloud-Logging `severity` and, when a trace
    // context is active under GCP, the cross-service trace correlation field.
    override log(...args: any[]) {
        const [levelObj, ...rest] = args;
        const severity = levelToSeverityMap[levelObj?.level];
        const traceCtx = getTraceCtx();
        // FIX: was `const patched: any=` — typed as an open record instead of `any`.
        const patched: Record<string, any> = { ...levelObj, severity };
        const traceId = traceCtx?.googleTraceId || traceCtx?.traceId;
        if (traceId && process.env['GCLOUD_PROJECT']) {
            patched['logging.googleapis.com/trace'] = `projects/${process.env['GCLOUD_PROJECT']}/traces/${traceId}`;
        }
        return super.log(patched, ...rest);
    }
}

const instance = container.resolve(GlobalLogger);
export default instance;
--------------------------------------------------------------------------------
/src/services/misc.ts:
--------------------------------------------------------------------------------
import { singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { ParamValidationError } from 'civkit/civ-rpc';
import { SecurityCompromiseError } from '../shared/lib/errors';
import { isIP } from 'node:net';
import { isIPInNonPublicRange } from '../utils/ip';
import { GlobalLogger } from './logger';
import { lookup } from 'node:dns/promises';
import { Threaded } from './threaded';

const normalizeUrl = require('@esm2cjs/normalize-url').default;

@singleton()
export class
MiscService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    // Normalizes `input` into a URL and verifies it is safe to fetch (SSRF guard):
    // http(s)/blob only, no localhost, no private/reserved IPs — whether given as
    // a literal or reached via DNS. Returns the parsed URL plus the IPs it is
    // known to resolve to.
    @Threaded()
    async assertNormalizedUrl(input: string) {
        let result: URL;
        try {
            result = new URL(
                normalizeUrl(
                    input,
                    {
                        // Preserve the URL as much as possible; only canonicalize.
                        stripWWW: false,
                        removeTrailingSlash: false,
                        removeSingleSlash: false,
                        sortQueryParameters: false,
                    }
                )
            );
        } catch (err) {
            throw new ParamValidationError({
                message: `${err}`,
                path: 'url'
            });
        }

        if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
            throw new ParamValidationError({
                message: `Invalid protocol ${result.protocol}`,
                path: 'url'
            });
        }

        // Strip brackets from IPv6 literals (e.g. [::1]) before the IP checks.
        const normalizedHostname = result.hostname.startsWith('[') ?
result.hostname.slice(1, -1) : result.hostname;
        // FIX: never reassigned — declare as const.
        const ips: string[] = [];
        const isIp = isIP(normalizedHostname);
        if (isIp) {
            ips.push(normalizedHostname);
        }
        if (
            (result.hostname === 'localhost') ||
            (isIp && isIPInNonPublicRange(normalizedHostname))
        ) {
            this.logger.warn(`Suspicious action: Request to localhost or non-public IP: ${normalizedHostname}`, { href: result.href });
            throw new SecurityCompromiseError({
                message: `Suspicious action: Request to localhost or non-public IP: ${normalizedHostname}`,
                path: 'url'
            });
        }
        if (!isIp && result.protocol !== 'blob:') {
            // Resolve the hostname to catch domains that point at internal addresses.
            // Non-ENOTFOUND resolver errors are deliberately swallowed (best effort):
            // `resolved` stays undefined and the URL passes without IP info.
            const resolved = await lookup(result.hostname, { all: true }).catch((err) => {
                if (err.code === 'ENOTFOUND') {
                    return Promise.reject(new ParamValidationError({
                        message: `Domain '${result.hostname}' could not be resolved`,
                        path: 'url'
                    }));
                }

                return;
            });
            if (resolved) {
                for (const x of resolved) {
                    if (isIPInNonPublicRange(x.address)) {
                        this.logger.warn(`Suspicious action: Domain resolved to non-public IP: ${result.hostname} => ${x.address}`, { href: result.href, ip: x.address });
                        throw new SecurityCompromiseError({
                            message: `Suspicious action: Domain resolved to non-public IP: ${x.address}`,
                            path: 'url'
                        });
                    }
                    ips.push(x.address);
                }

            }
        }

        return {
            url: result,
            ips
        };
    }

}
--------------------------------------------------------------------------------
/src/services/pdf-extract.ts:
--------------------------------------------------------------------------------
import { singleton } from 'tsyringe';
import _ from 'lodash';
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { AssertionFailureError, AsyncService, HashManager } from 'civkit';
import { GlobalLogger } from './logger';
import { PDFContent } from '../db/pdf';
import
dayjs from 'dayjs'; 8 | import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; 9 | import { randomUUID } from 'crypto'; 10 | import type { PDFDocumentLoadingTask } from 'pdfjs-dist'; 11 | import path from 'path'; 12 | import { AsyncLocalContext } from './async-context'; 13 | const utc = require('dayjs/plugin/utc'); // Import the UTC plugin 14 | dayjs.extend(utc); // Extend dayjs with the UTC plugin 15 | const timezone = require('dayjs/plugin/timezone'); 16 | dayjs.extend(timezone); 17 | 18 | const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs'); 19 | const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/'; 20 | 21 | const md5Hasher = new HashManager('md5', 'hex'); 22 | 23 | function stdDev(numbers: number[]) { 24 | const mean = _.mean(numbers); 25 | const squareDiffs = numbers.map((num) => Math.pow(num - mean, 2)); 26 | const avgSquareDiff = _.mean(squareDiffs); 27 | return Math.sqrt(avgSquareDiff); 28 | } 29 | 30 | function isRotatedByAtLeast35Degrees(transform?: [number, number, number, number, number, number]): boolean { 31 | if (!transform) { 32 | return false; 33 | } 34 | const [a, b, c, d, _e, _f] = transform; 35 | 36 | // Calculate the rotation angles using arctan(b/a) and arctan(-c/d) 37 | const angle1 = Math.atan2(b, a) * (180 / Math.PI); // from a, b 38 | const angle2 = Math.atan2(-c, d) * (180 / Math.PI); // from c, d 39 | 40 | // Either angle1 or angle2 can be used to determine the rotation, they should be equivalent 41 | const rotationAngle1 = Math.abs(angle1); 42 | const rotationAngle2 = Math.abs(angle2); 43 | 44 | // Check if the absolute rotation angle is greater than or equal to 35 degrees 45 | return rotationAngle1 >= 35 || rotationAngle2 >= 35; 46 | } 47 | 48 | @singleton() 49 | export class PDFExtractor extends AsyncService { 50 | 51 | logger = this.globalLogger.child({ service: this.constructor.name }); 52 | pdfjs!: Awaited; 53 | 54 | cacheRetentionMs = 1000 * 3600 * 24 * 7; 55 
| 56 | constructor( 57 | protected globalLogger: GlobalLogger, 58 | protected firebaseObjectStorage: FirebaseStorageBucketControl, 59 | protected asyncLocalContext: AsyncLocalContext, 60 | ) { 61 | super(...arguments); 62 | } 63 | 64 | override async init() { 65 | await this.dependencyReady(); 66 | this.pdfjs = await pPdfjs; 67 | 68 | this.emit('ready'); 69 | } 70 | 71 | isDataUrl(url: string) { 72 | return url.startsWith('data:'); 73 | } 74 | 75 | parseDataUrl(url: string) { 76 | const protocol = url.slice(0, url.indexOf(':')); 77 | const contentType = url.slice(url.indexOf(':') + 1, url.indexOf(';')); 78 | const data = url.slice(url.indexOf(',') + 1); 79 | if (protocol !== 'data' || !data) { 80 | throw new Error('Invalid data URL'); 81 | } 82 | 83 | if (contentType !== 'application/pdf') { 84 | throw new Error('Invalid data URL type'); 85 | } 86 | 87 | return { 88 | type: contentType, 89 | data: data 90 | }; 91 | } 92 | 93 | async extract(url: string | URL) { 94 | let loadingTask: PDFDocumentLoadingTask; 95 | 96 | if (typeof url === 'string' && this.isDataUrl(url)) { 97 | const { data } = this.parseDataUrl(url); 98 | const binary = Uint8Array.from(Buffer.from(data, 'base64')); 99 | loadingTask = this.pdfjs.getDocument({ 100 | data: binary, 101 | disableFontFace: true, 102 | verbosity: 0, 103 | cMapUrl: nodeCmapUrl, 104 | }); 105 | } else { 106 | loadingTask = this.pdfjs.getDocument({ 107 | url, 108 | disableFontFace: true, 109 | verbosity: 0, 110 | cMapUrl: nodeCmapUrl, 111 | }); 112 | } 113 | 114 | 115 | const doc = await loadingTask.promise; 116 | const meta = await doc.getMetadata(); 117 | 118 | const textItems: TextItem[][] = []; 119 | 120 | for (const pg of _.range(0, doc.numPages)) { 121 | const page = await doc.getPage(pg + 1); 122 | const textContent = await page.getTextContent({ includeMarkedContent: true }); 123 | textItems.push((textContent.items as TextItem[])); 124 | } 125 | 126 | const articleCharHeights: number[] = []; 127 | for (const textItem of 
textItems.flat()) {
            if (textItem.height) {
                // Weight each item by its character count so long runs dominate the stats.
                articleCharHeights.push(...Array(textItem.str.length).fill(textItem.height));
            }
        }
        const articleAvgHeight = _.mean(articleCharHeights);
        const articleStdDevHeight = stdDev(articleCharHeights);
        // const articleMedianHeight = articleCharHeights.sort()[Math.floor(articleCharHeights.length / 2)];
        const mdOps: Array<{
            text: string;
            op?: 'new' | 'append';
            mode: 'h1' | 'h2' | 'p' | 'appendix' | 'space';
        }> = [];

        const rawChunks: string[] = [];

        let op: 'append' | 'new' = 'new';
        let mode: 'h1' | 'h2' | 'p' | 'space' | 'appendix' = 'p';
        for (const pageTextItems of textItems) {
            // Per-page height stats, used for the "smaller than page average" test below.
            const charHeights = [];
            for (const textItem of pageTextItems as TextItem[]) {
                if (textItem.height) {
                    charHeights.push(...Array(textItem.str.length).fill(textItem.height));
                }
                rawChunks.push(`${textItem.str}${textItem.hasEOL ? '\n' : ''}`);
            }

            const avgHeight = _.mean(charHeights);
            const stdDevHeight = stdDev(charHeights);
            // const medianHeight = charHeights.sort()[Math.floor(charHeights.length / 2)];

            for (const textItem of pageTextItems) {
                // Classify each item by glyph height relative to document-wide stats:
                // >3σ above mean => h1, >2σ => h2; well below the page mean => appendix
                // (footnotes etc.); zero height => whitespace/layout item.
                if (textItem.height > articleAvgHeight + 3 * articleStdDevHeight) {
                    mode = 'h1';
                } else if (textItem.height > articleAvgHeight + 2 * articleStdDevHeight) {
                    mode = 'h2';
                } else if (textItem.height && textItem.height < avgHeight - stdDevHeight) {
                    mode = 'appendix';
                } else if (textItem.height) {
                    mode = 'p';
                } else {
                    mode = 'space';
                }

                // Heavily rotated text (>=35°) is treated as marginalia.
                if (isRotatedByAtLeast35Degrees(textItem.transform as any)) {
                    mode = 'appendix';
                }

                mdOps.push({
                    op,
                    mode,
                    text: textItem.str
                });

                // An empty item that carries EOL terminates the current paragraph.
                if (textItem.hasEOL && !textItem.str) {
                    op = 'new';
                } else {
                    op = 'append';
                }
            }
        }

        const mdChunks = [];
        const appendixChunks = [];
        mode = 'space';
        // Replay mdOps into markdown chunks; appendix text is buffered separately
        // and flushed as a blockquote when the mode switches back to body text.
        for (const x of mdOps) {
            const previousMode: string = mode;
            const changeToMdChunks = [];

            const isNewStart = x.mode !== 'space' && (x.op === 'new' || (previousMode === 'appendix' && x.mode !== previousMode));

            if (isNewStart) {
                switch (x.mode) {
                    case 'h1': {
                        changeToMdChunks.push(`\n\n# `);
                        mode = x.mode;
                        break;
                    }

                    case 'h2': {
                        changeToMdChunks.push(`\n\n## `);
                        mode = x.mode;
                        break;
                    }

                    case 'p': {
                        changeToMdChunks.push(`\n\n`);
                        mode = x.mode;
                        break;
                    }

                    case 'appendix': {
                        mode = x.mode;
                        appendixChunks.push(`\n\n`);
                        break;
                    }

                    default: {
                        break;
                    }
                }
            } else {
                // Continuation: insert a joining space unless the previous chunk
                // already ends with whitespace/hyphen, or is a single character.
                if (x.mode === 'appendix' && appendixChunks.length) {
                    const lastChunk = appendixChunks[appendixChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        appendixChunks.push(' ');
                    }
                } else if (mdChunks.length) {
                    const lastChunk = mdChunks[mdChunks.length - 1];
                    if (!lastChunk.match(/(\s+|-)$/) && lastChunk.length !== 1) {
                        changeToMdChunks.push(' ');
                    }
                }
            }

            if (x.text) {
                if (x.mode == 'appendix') {
                    if (appendixChunks.length || isNewStart) {
                        appendixChunks.push(x.text);
                    } else {
                        // No appendix run open yet: keep the text inline.
                        changeToMdChunks.push(x.text);
                    }
                } else {
                    changeToMdChunks.push(x.text);
                }
            }

            // Leaving appendix mode: flush the buffered appendix as a blockquote
            // ahead of the new body-text chunk.
            if (isNewStart && x.mode !== 'appendix' && appendixChunks.length) {
                const appendix = appendixChunks.join('').split(/\r?\n/).map((x) => x.trim()).filter(Boolean).map((x) => `> ${x}`).join('\n');
                changeToMdChunks.unshift(appendix);
                changeToMdChunks.unshift(`\n\n`);
                appendixChunks.length = 0;
            }

            // Whitespace-only ops contribute at most one separator chunk.
            if (x.mode === 'space' && changeToMdChunks.length) {
                changeToMdChunks.length = 1;
            }
    /**
     * Extracts PDF content for `url`, backed by a two-tier cache:
     * a Firestore record (`PDFContent`) for metadata lookup, and a JSON blob in
     * object storage (`pdfs/<id>`) holding the full extraction payload.
     *
     * @param url            the PDF location (may be a data: URL).
     * @param cacheTolerance max cache age in ms before a re-extract (default 24h).
     * @param alternativeUrl optional canonical URL used for cache keying instead of `url`.
     * @returns `{ meta, content, text }`, or undefined for empty input / unreadable cache payloads.
     * @throws AssertionFailureError when extraction itself fails.
     */
    async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
        if (!url) {
            return undefined;
        }
        // The cache key is derived from the alternative (canonical) URL when provided.
        let nameUrl = alternativeUrl || url;
        const digest = md5Hasher.hash(nameUrl);

        // data: URLs get a synthetic blob:// name so huge payloads never leak into logs/records.
        if (this.isDataUrl(url)) {
            nameUrl = `blob://pdf:${digest}`;
        }

        // blob:-named inputs are never looked up in Firestore; otherwise take the newest record for this digest.
        const cache: PDFContent | undefined = nameUrl.startsWith('blob:') ? undefined :
            (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

        if (cache) {
            const age = Date.now() - cache?.createdAt.valueOf();
            const stale = cache.createdAt.valueOf() < (Date.now() - cacheTolerance);
            this.logger.info(`${stale ? 'Stale cache exists' : 'Cache hit'} for PDF ${nameUrl}, normalized digest: ${digest}, ${age}ms old, tolerance ${cacheTolerance}ms`, {
                data: url, url: nameUrl, digest, age, stale, cacheTolerance
            });

            if (!stale) {
                // Fast path: older records carried content/text inline on the Firestore doc.
                if (cache.content && cache.text) {
                    return {
                        meta: cache.meta,
                        content: cache.content,
                        text: cache.text
                    };
                }

                // Newer records keep the payload in object storage only.
                try {
                    const r = await this.firebaseObjectStorage.downloadFile(`pdfs/${cache._id}`);
                    let cached = JSON.parse(r.toString('utf-8'));

                    return {
                        meta: cached.meta,
                        content: cached.content,
                        text: cached.text
                    };
                } catch (err) {
                    // Corrupt/missing blob: log and fall through to undefined (caller treats as cache miss upstream).
                    this.logger.warn(`Unable to load cached content for ${nameUrl}`, { err });

                    return undefined;
                }
            }
        }

        let extracted;

        try {
            extracted = await this.extract(url);
        } catch (err: any) {
            this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
            throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
        }

        // Persist only when the request is not Do-Not-Track and the input is addressable (not a blob).
        if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
            const theID = randomUUID();
            // Payload goes to object storage first (awaited); the Firestore record is fire-and-forget.
            await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
                Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
            PDFContent.save(
                PDFContent.from({
                    _id: theID,
                    src: nameUrl,
                    meta: extracted?.meta || {},
                    urlDigest: digest,
                    createdAt: new Date(),
                    expireAt: new Date(Date.now() + this.cacheRetentionMs)
                }).degradeForFireStore()
            ).catch((r) => {
                this.logger.warn(`Unable to cache PDF content for ${nameUrl}`, { err: r });
            });
        }

        return extracted;
    }
pdfDate.slice(2); 359 | 360 | // Define the format without the timezone part first 361 | const dateTimePart = cleanedDate.slice(0, 14); 362 | const timezonePart = cleanedDate.slice(14); 363 | 364 | // Construct the full date string in a standard format 365 | const formattedDate = `${dateTimePart}${timezonePart.replace("'", "").replace("'", "")}`; 366 | 367 | // Parse the date with timezone 368 | const parsedDate = dayjs(formattedDate, "YYYYMMDDHHmmssZ"); 369 | 370 | const date = parsedDate.toDate(); 371 | 372 | if (!date.valueOf()) { 373 | return undefined; 374 | } 375 | 376 | return date; 377 | } 378 | } 379 | -------------------------------------------------------------------------------- /src/services/pseudo-transfer.ts: -------------------------------------------------------------------------------- 1 | import { marshalErrorLike } from 'civkit'; 2 | import { AbstractPseudoTransfer, SYM_PSEUDO_TRANSFERABLE } from 'civkit/pseudo-transfer'; 3 | import { container, singleton } from 'tsyringe'; 4 | 5 | 6 | @singleton() 7 | export class PseudoTransfer extends AbstractPseudoTransfer { 8 | 9 | override async init() { 10 | await this.dependencyReady(); 11 | this.emit('ready'); 12 | } 13 | 14 | } 15 | 16 | const instance = container.resolve(PseudoTransfer); 17 | 18 | Object.defineProperty(Error.prototype, SYM_PSEUDO_TRANSFERABLE, { 19 | value: function () { 20 | const prototype = this; 21 | return { 22 | copyOwnProperty: 'all', 23 | marshall: (input: Error) => marshalErrorLike(input), 24 | unMarshall: (input: object) => { 25 | Object.setPrototypeOf(input, prototype); 26 | return input; 27 | }, 28 | }; 29 | }, 30 | enumerable: false, 31 | }); 32 | instance.expectPseudoTransferableType(Error); 33 | for (const x of [...Object.values(require('./errors')), ...Object.values(require('civkit/civ-rpc'))]) { 34 | if (typeof x === 'function' && x.prototype instanceof Error) { 35 | instance.expectPseudoTransferableType(x as any); 36 | } 37 | } 38 | 39 | 40 | 
// Teach the pseudo-transfer layer how to move URL instances across thread
// boundaries: serialize to a plain { href } object, rebuild with new URL().
// The symbol-keyed factory is non-enumerable so it never shows up in spreads
// or JSON serialization of URL objects.
Object.defineProperty(URL.prototype, SYM_PSEUDO_TRANSFERABLE, {
    value: function () {
        return {
            copyOwnProperty: 'none',
            marshall: (input: URL) => ({ href: input.href }),
            unMarshall: (input: { href: string; }) => new URL(input.href),
        };
    },
    enumerable: false,
});
instance.expectPseudoTransferableType(URL);

// Buffers cross the boundary as raw bytes; on the receiving side a plain
// Uint8Array is re-wrapped into a Buffer (no copy when already a Buffer).
Object.defineProperty(Buffer.prototype, SYM_PSEUDO_TRANSFERABLE, {
    value: function () {
        return {
            copyOwnProperty: 'none',
            unMarshall: (input: Uint8Array | Buffer) => Buffer.isBuffer(input) ? input : Buffer.from(input),
            marshall: (input: Uint8Array | Buffer) => input,
        };
    },
    enumerable: false,
});
instance.expectPseudoTransferableType(Buffer);


export default instance;
/**
 * Central RPC registry for the Reader API, built on civkit's Koa integration.
 * Declares body-parsing middleware, wires request lifecycle events into the
 * BlackHoleDetector liveness watchdog, and exposes the method decorators used
 * across the API modules.
 */
@singleton()
export class RPCRegistry extends KoaRPCRegistry {

    title = 'Jina Reader API';
    container = container;
    logger = this.globalLogger.child({ service: this.constructor.name });
    // All responses are wrapped in an integrity envelope (checksummed payloads).
    static override envelope = IntegrityEnvelope;
    // Generous limit: clients may POST whole documents/PDFs inline.
    override _BODY_PARSER_LIMIT = '102mb';
    override _RESPONSE_STREAM_MODE = 'koa' as const;

    // Order matters: CORS first, then JSON/form body parsing, then the
    // multipart and raw-binary parsers provided by the base class.
    override koaMiddlewares = [
        this.__CORSAllowAllMiddleware.bind(this),
        bodyParser({
            encoding: 'utf-8',
            enableTypes: ['json', 'form'],
            jsonLimit: this._BODY_PARSER_LIMIT,
            xmlLimit: this._BODY_PARSER_LIMIT,
            formLimit: this._BODY_PARSER_LIMIT,
        }),
        this.__multiParse.bind(this),
        this.__binaryParse.bind(this),
    ];

    constructor(
        protected globalLogger: GlobalLogger,
        protected ctxMgr: AsyncLocalContext,
        protected tempFileManager: TempFileManager,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);

        // Feed request begin/end events to the liveness watchdog; both 'ran'
        // and 'fail' count as completion so in-flight bookkeeping balances.
        this.on('run', () => this.blackHoleDetector.incomingRequest());
        this.on('ran', () => this.blackHoleDetector.doneWithRequest());
        this.on('fail', () => this.blackHoleDetector.doneWithRequest());
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

}

// Singleton instance plus the decorator set (@Method, @Param, etc.) consumed
// by API classes throughout the codebase.
const instance = container.resolve(RPCRegistry);
export default instance;
export const { Method, RPCMethod, RPCReflect, Param, Ctx, } = instance.decorators();
/**
 * Fetches, caches, and evaluates robots.txt policies.
 *
 * NOTE(review): the parser below is a deliberate simplification of the robots
 * exclusion protocol — it matches user-agent groups by exact (lowercased)
 * token equality, does not support multiple consecutive User-Agent lines
 * forming one group, and applies first-match rather than longest-match
 * precedence between Allow/Disallow. Confirm these trade-offs are intended
 * before "fixing" them.
 */
@singleton()
export class RobotsTxtService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Returns the robots.txt body for `origin`, serving from the storage
     * bucket cache when available, otherwise fetching (5s timeout) and caching
     * best-effort.
     *
     * @throws DownstreamServiceFailureError on a non-2xx fetch response.
     */
    async getCachedRobotTxt(origin: string) {
        // Cache key: md5 of the lowercased origin.
        const digest = md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `robots-txt/${digest}`;
        let buff;
        // Any download error (including not-found) is treated as a cache miss.
        buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (buff) {
            return buff.toString();
        }

        const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}: ${r.status} ${r.statusText}`);
        }
        buff = Buffer.from(await r.arrayBuffer());

        // Fire-and-forget cache write; a failure only degrades to re-fetching.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) });
        });

        return buff.toString();
    }

    /**
     * Asserts that `url` may be crawled under the site's robots.txt for the
     * given user-agent token. Returns true when allowed (including when the
     * site provides no usable robots.txt); throws ResourcePolicyDenyError when
     * a matching Disallow rule applies.
     */
    @Threaded()
    async assertAccessAllowed(url: URL, inputMyUa = '*') {
        let robotTxt: string = '';
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        } catch (err) {
            if (err instanceof DownstreamServiceFailureError) {
                // Remote server is reachable but cannot provide a robot.txt; this is treated as public access
                return true;
            }
            throw new AssertionFailureError(`Failed to load robots.txt from ${url.origin}: ${err}`);
        }
        const myUa = inputMyUa.toLowerCase();
        const lines = robotTxt.split(/\r?\n/g);

        // Rules appearing before any User-Agent line are applied to us.
        let currentUa = myUa || '*';
        let uaLine = 'User-Agent: *';
        // Trailing '?' makes startsWith treat an exact-path Disallow as a match
        // (e.g. Disallow: /foo matches path /foo, not only /foo...).
        const pathNormalized = `${url.pathname}?`;

        for (const line of lines) {
            const trimmed = line.trim();
            if (trimmed.startsWith('#') || !trimmed) {
                continue;
            }
            // Re-join on ':' so values containing colons (URLs) survive the split.
            const [k, ...rest] = trimmed.split(':');
            const key = k.trim().toLowerCase();
            const value = rest.join(':').trim();

            if (key === 'user-agent') {
                currentUa = value.toLowerCase();
                // A wildcard group is treated as addressing our own UA.
                if (value === '*') {
                    currentUa = myUa;
                }
                uaLine = line;
                continue;
            }

            // Skip rules belonging to other user-agent groups.
            if (currentUa !== myUa) {
                continue;
            }

            if (key === 'disallow') {
                // Empty Disallow means "allow everything" for this group.
                if (!value) {
                    return true;
                }
                if (value.includes('*')) {
                    // Single-wildcard patterns only: prefix before '*', suffix after.
                    const [head, tail] = value.split('*');
                    if (url.pathname.startsWith(head) && url.pathname.endsWith(tail)) {
                        throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
                    }
                } else if (pathNormalized.startsWith(value)) {
                    throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
                }

                continue;
            }

            if (key === 'allow') {
                // NOTE(review): an empty Allow short-circuits to permit — spec-wise
                // it should be a no-op; confirm this lenient default is intended.
                if (!value) {
                    return true;
                }
                if (pathNormalized.startsWith(value)) {
                    return true;
                }
                continue;
            }
        }

        // No rule matched: default allow.
        return true;
    }

}
from 'tsyringe'; 2 | import { AsyncService } from 'civkit/async-service'; 3 | import { GlobalLogger } from '../logger'; 4 | import { JSDomControl } from '../jsdom'; 5 | import { isMainThread } from 'worker_threads'; 6 | import _ from 'lodash'; 7 | import { WebSearchEntry } from './compat'; 8 | import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer'; 9 | import { CurlControl } from '../curl'; 10 | import { readFile } from 'fs/promises'; 11 | import { ApplicationError } from 'civkit/civ-rpc'; 12 | import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors'; 13 | import { parseJSONText } from 'civkit/vectorize'; 14 | import { retryWith } from 'civkit/decorators'; 15 | import { ProxyProviderService } from '../../shared/services/proxy-provider'; 16 | 17 | @singleton() 18 | export class GoogleSERP extends AsyncService { 19 | logger = this.globalLogger.child({ service: this.constructor.name }); 20 | googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com'; 21 | 22 | constructor( 23 | protected globalLogger: GlobalLogger, 24 | protected puppeteerControl: SERPSpecializedPuppeteerControl, 25 | protected jsDomControl: JSDomControl, 26 | protected curlControl: CurlControl, 27 | protected proxyProvider: ProxyProviderService, 28 | ) { 29 | const filteredDeps = isMainThread ? 
arguments : _.without(arguments, puppeteerControl); 30 | super(...filteredDeps); 31 | } 32 | 33 | override async init() { 34 | await this.dependencyReady(); 35 | 36 | this.emit('ready'); 37 | } 38 | 39 | @retryWith((err) => { 40 | if (err instanceof ServiceBadApproachError) { 41 | return false; 42 | } 43 | if (err instanceof ServiceBadAttemptError) { 44 | // Keep trying 45 | return true; 46 | } 47 | if (err instanceof ApplicationError) { 48 | // Quit with this error 49 | return false; 50 | } 51 | return undefined; 52 | }, 3) 53 | async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) { 54 | if (opts?.allocProxy === 'none') { 55 | return this.curlControl.sideLoad(url, opts); 56 | } 57 | 58 | const proxy = await this.proxyProvider.alloc( 59 | process.env.PREFERRED_PROXY_COUNTRY || 'auto' 60 | ); 61 | this.logger.debug(`Proxy allocated`, { proxy: proxy.href }); 62 | const r = await this.curlControl.sideLoad(url, { 63 | ...opts, 64 | proxyUrl: proxy.href, 65 | }); 66 | 67 | if (r.status === 429) { 68 | throw new ServiceBadAttemptError('Google returned a 429 error. 
This may happen due to various reasons, including rate limiting or other issues.'); 69 | } 70 | 71 | if (opts && opts.allocProxy) { 72 | opts.proxyUrl ??= proxy.href; 73 | } 74 | 75 | return { ...r, proxy }; 76 | } 77 | 78 | digestQuery(query: { [k: string]: any; }) { 79 | const url = new URL(`https://${this.googleDomain}/search`); 80 | const clone = { ...query }; 81 | const num = clone.num || 10; 82 | if (clone.page) { 83 | const page = parseInt(clone.page); 84 | delete clone.page; 85 | clone.start = (page - 1) * num; 86 | if (clone.start === 0) { 87 | delete clone.start; 88 | } 89 | } 90 | if (clone.location) { 91 | delete clone.location; 92 | } 93 | 94 | for (const [k, v] of Object.entries(clone)) { 95 | if (v === undefined || v === null) { 96 | continue; 97 | } 98 | url.searchParams.set(k, `${v}`); 99 | } 100 | 101 | return url; 102 | } 103 | 104 | async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { 105 | const url = this.digestQuery(query); 106 | 107 | const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts); 108 | if (opts && sideLoaded.sideLoadOpts) { 109 | opts.sideLoad = sideLoaded.sideLoadOpts; 110 | } 111 | 112 | const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts); 113 | 114 | return snapshot; 115 | } 116 | 117 | async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { 118 | const url = this.digestQuery(query); 119 | 120 | url.searchParams.set('tbm', 'nws'); 121 | 122 | const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts); 123 | if (opts && sideLoaded.sideLoadOpts) { 124 | opts.sideLoad = sideLoaded.sideLoadOpts; 125 | } 126 | 127 | const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts); 128 | 129 | return snapshot; 130 | } 131 | 132 | async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { 133 | const url = this.digestQuery(query); 134 | 135 | url.searchParams.set('tbm', 'isch'); 136 | 
/**
 * Extracts organic web-search results from a rendered Google SERP.
 *
 * IMPORTANT: this function is serialized and evaluated INSIDE the browser page
 * by `controlledScrap`; it must stay self-contained (no references to module
 * scope) and may only use DOM APIs plus the page-injected `window.waitForSelector`.
 *
 * @returns parsed entries, or undefined when the page carries no query context.
 * @throws when Google served its /sorry or /error interstitial.
 */
async function getWebSearchResults() {
    if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    // Wait for either the results wrapper or the "no results" container.
    // @ts-ignore
    await Promise.race([window.waitForSelector('div[data-async-context^="query"]'), window.waitForSelector('#botstuff .mnr-c')]);

    const wrapper1 = document.querySelector('div[data-async-context^="query"]');

    if (!wrapper1) {
        return undefined;
    }

    // The wrapper encodes the original query; absence means a non-SERP page.
    const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || '');

    if (!query) {
        return undefined;
    }

    // Organic result containers carry either a lang attribute or data-surl.
    const candidates = Array.from(wrapper1.querySelectorAll('div[lang],div[data-surl]'));

    return candidates.map((x, pos) => {
        const primaryLink = x.querySelector('a:not([href="#"])');
        if (!primaryLink) {
            return undefined;
        }
        const url = primaryLink.getAttribute('href');

        // A heading inside the link marks a video card — skipped for now.
        if (primaryLink.querySelector('div[role="heading"]')) {
            // const spans = primaryLink.querySelectorAll('span');
            // const title = spans[0]?.textContent;
            // const source = spans[1]?.textContent;
            // const date = spans[spans.length - 1].textContent;

            // return {
            //     link: url,
            //     title,
            //     source,
            //     date,
            //     variant: 'video'
            // };
            return undefined;
        }

        const title = primaryLink.querySelector('h3')?.textContent;
        const source = Array.from(primaryLink.querySelectorAll('span')).find((x) => x.textContent)?.textContent;
        const cite = primaryLink.querySelector('cite[role=text]')?.textContent;
        // cite text looks like "example.com · 2 days ago" — date after the dot.
        let date = cite?.split('·')[1]?.trim();
        const snippets = Array.from(x.querySelectorAll('div[data-sncf*="1"] span'));
        let snippet = snippets[snippets.length - 1]?.textContent;
        if (!snippet) {
            // Fallback for the legacy snippet container class.
            snippet = x.querySelector('div.IsZvec')?.textContent?.trim() || null;
        }
        // When cite carried no date, the second-to-last snippet span may.
        date ??= snippets[snippets.length - 2]?.textContent?.trim();
        const imageUrl = x.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');
        let siteLinks = Array.from(x.querySelectorAll('div[data-sncf*="3"] a[href]')).map((l) => {
            return {
                link: l.getAttribute('href'),
                title: l.textContent,
            };
        });
        // Expanded sitelinks (table layout) live on an ancestor container.
        const perhapsParent = x.parentElement?.closest('div[data-hveid]');
        if (!siteLinks?.length && perhapsParent) {
            const candidates = Array.from(perhapsParent.querySelectorAll('td h3'));
            if (candidates.length) {
                siteLinks = candidates.map((l) => {
                    const link = l.querySelector('a');
                    if (!link) {
                        return undefined;
                    }
                    const snippet = l.nextElementSibling?.textContent;
                    return {
                        link: link.getAttribute('href'),
                        title: link.textContent,
                        snippet,
                    };
                }).filter(Boolean) as any;
            }
        }

        return {
            link: url,
            title,
            source,
            date,
            snippet: snippet ?? undefined,
            imageUrl: imageUrl?.startsWith('data:') ? undefined : imageUrl,
            siteLinks: siteLinks.length ? siteLinks : undefined,
            variant: 'web',
        };
    }).filter(Boolean) as WebSearchEntry[];
}
/**
 * Extracts news-search results from a rendered Google News SERP (tbm=nws).
 *
 * IMPORTANT: like getWebSearchResults, this is serialized and evaluated INSIDE
 * the browser page; it must stay self-contained and rely only on DOM APIs plus
 * the page-injected `window.waitForSelector`.
 *
 * @returns parsed entries, or undefined when the page carries no query context.
 * @throws when Google served its /sorry or /error interstitial.
 */
async function getNewsSearchResults() {
    if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
        throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
    }

    // Wait for either the results wrapper or the "no results" container.
    // @ts-ignore
    await Promise.race([window.waitForSelector('div[data-async-context^="query"]'), window.waitForSelector('#botstuff .mnr-c')]);

    const wrapper1 = document.querySelector('div[data-async-context^="query"]');

    if (!wrapper1) {
        return undefined;
    }

    const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || '');

    if (!query) {
        return undefined;
    }

    // Each news card carries a data-news-doc-id attribute.
    const candidates = Array.from(wrapper1.querySelectorAll('div[data-news-doc-id]'));

    return candidates.map((x) => {
        const primaryLink = x.querySelector('a:not([href="#"])');
        if (!primaryLink) {
            return undefined;
        }
        const url = primaryLink.getAttribute('href');
        const titleElem = primaryLink.querySelector('div[role="heading"]');

        if (!titleElem) {
            return undefined;
        }

        // Layout: source sits right before the heading, snippet right after.
        const title = titleElem.textContent?.trim();
        const source = titleElem.previousElementSibling?.textContent?.trim();
        const snippet = titleElem.nextElementSibling?.textContent?.trim();

        // The publish date is the last span within the card's text column.
        const innerSpans = Array.from(titleElem.parentElement?.querySelectorAll('span') || []);
        const date = innerSpans[innerSpans.length - 1]?.textContent?.trim();

        return {
            link: url,
            title,
            source,
            date,
            snippet,
            variant: 'news',
        };
    }).filter(Boolean) as WebSearchEntry[];
}
/**
 * SERP provider backed by the internal Jina SERP API.
 * Exposes the same webSearch/imageSearch/newsSearch surface as the other
 * SERP services, but only the 'web' variant is currently wired up — the
 * image/news branches are commented out below, so those calls fall through
 * to a web search.
 */
@singleton()
export class InternalJinaSerpService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    // Assigned in init(); `!` because construction happens before init.
    client!: JinaSerpApiHTTP;

    constructor(
        protected globalLogger: GlobalLogger,
        protected secretExposer: SecretExposer,
        protected threadLocal: AsyncLocalContext,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        // NOTE(review): 'ready' is emitted before `client` is assigned; this
        // mirrors the sibling Serper services — confirm listeners never touch
        // `client` synchronously from the 'ready' event.
        this.emit('ready');

        this.client = new JinaSerpApiHTTP(this.secretExposer.JINA_SERP_API_KEY);
    }


    /**
     * Performs one search against the internal SERP API.
     * Only the 'web' branch is active; results are normalized so each entry
     * carries a `link` field (copied from the upstream `url`).
     */
    async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
        this.logger.debug(`Doing external search`, query);
        let results;
        switch (variant) {
            // case 'images': {
            //     const r = await this.client.imageSearch(query);

            //     results = r.parsed.images;
            //     break;
            // }
            // case 'news': {
            //     const r = await this.client.newsSearch(query);

            //     results = r.parsed.news;
            //     break;
            // }
            case 'web':
            default: {
                const r = await this.client.webSearch(query);

                results = r.parsed.results?.map((x) => ({ ...x, link: x.url }));
                break;
            }
        }

        // A successful round-trip counts as liveness evidence for the process.
        this.blackHoleDetector.itWorked();

        return results as WebSearchEntry[];
    }


    async webSearch(query: SerperSearchQueryParams) {
        return this.doSearch('web', query);
    }
    async imageSearch(query: SerperSearchQueryParams) {
        return this.doSearch('images', query);
    }
    async newsSearch(query: SerperSearchQueryParams) {
        return this.doSearch('news', query);
    }

}
(variant) { 43 | case 'images': { 44 | const r = await this.client.imageSearch(query); 45 | 46 | results = r.parsed.images; 47 | break; 48 | } 49 | case 'news': { 50 | const r = await this.client.newsSearch(query); 51 | 52 | results = r.parsed.news; 53 | break; 54 | } 55 | case 'web': 56 | default: { 57 | const r = await this.client.webSearch(query); 58 | 59 | results = r.parsed.organic; 60 | break; 61 | } 62 | } 63 | 64 | this.blackHoleDetector.itWorked(); 65 | 66 | return results; 67 | } 68 | 69 | 70 | async webSearch(query: SerperSearchQueryParams) { 71 | return this.doSearch('web', query); 72 | } 73 | async imageSearch(query: SerperSearchQueryParams) { 74 | return this.doSearch('images', query); 75 | } 76 | async newsSearch(query: SerperSearchQueryParams) { 77 | return this.doSearch('news', query); 78 | } 79 | 80 | } 81 | 82 | @singleton() 83 | export class SerperBingSearchService extends SerperGoogleSearchService { 84 | override client!: SerperBingHTTP; 85 | 86 | override async init() { 87 | await this.dependencyReady(); 88 | this.emit('ready'); 89 | 90 | this.client = new SerperBingHTTP(this.secretExposer.SERPER_SEARCH_API_KEY); 91 | } 92 | } 93 | 94 | export class GoogleSearchExplicitOperatorsDto extends AutoCastable { 95 | @Prop({ 96 | arrayOf: String, 97 | desc: `Returns web pages with a specific file extension. Example: to find the Honda GX120 Owner’s manual in PDF, type “Honda GX120 ownners manual ext:pdf”.` 98 | }) 99 | ext?: string | string[]; 100 | 101 | @Prop({ 102 | arrayOf: String, 103 | desc: `Returns web pages created in the specified file type. Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.` 104 | }) 105 | filetype?: string | string[]; 106 | 107 | @Prop({ 108 | arrayOf: String, 109 | desc: `Returns webpages containing the specified term in the title of the page. 
Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.` 110 | }) 111 | intitle?: string | string[]; 112 | 113 | @Prop({ 114 | arrayOf: String, 115 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 116 | }) 117 | loc?: string | string[]; 118 | 119 | @Prop({ 120 | arrayOf: String, 121 | desc: `Returns web pages coming only from a specific web site. Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.` 122 | }) 123 | site?: string | string[]; 124 | 125 | addTo(searchTerm: string) { 126 | const chunks = []; 127 | for (const [key, value] of Object.entries(this)) { 128 | if (value) { 129 | const values = Array.isArray(value) ? value : [value]; 130 | const textValue = values.map((v) => `${key}:${v}`).join(' OR '); 131 | if (textValue) { 132 | chunks.push(textValue); 133 | } 134 | } 135 | } 136 | const opPart = chunks.length > 1 ? 
chunks.map((x) => `(${x})`).join(' AND ') : chunks; 137 | 138 | if (opPart.length) { 139 | return [searchTerm, opPart].join(' '); 140 | } 141 | 142 | return searchTerm; 143 | } 144 | 145 | static override from(input: any) { 146 | const instance = super.from(input) as GoogleSearchExplicitOperatorsDto; 147 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; 148 | 149 | const params = ['ext', 'filetype', 'intitle', 'loc', 'site']; 150 | 151 | for (const p of params) { 152 | const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`); 153 | if (!customValue) { 154 | continue; 155 | } 156 | 157 | const filtered = customValue.split(', ').filter(Boolean); 158 | if (filtered.length) { 159 | Reflect.set(instance, p, filtered); 160 | } 161 | } 162 | 163 | return instance; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/services/serper-search.ts: -------------------------------------------------------------------------------- 1 | import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit'; 2 | import { singleton } from 'tsyringe'; 3 | import { GlobalLogger } from './logger'; 4 | import { SecretExposer } from '../shared/services/secrets'; 5 | import { AsyncLocalContext } from './async-context'; 6 | import { SerperBingHTTP, SerperGoogleHTTP, SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperWebSearchResponse } from '../shared/3rd-party/serper-search'; 7 | import { BlackHoleDetector } from './blackhole-detector'; 8 | import { Context } from './registry'; 9 | import { ServiceBadAttemptError } from '../shared'; 10 | 11 | @singleton() 12 | export class SerperSearchService extends AsyncService { 13 | 14 | logger = this.globalLogger.child({ service: this.constructor.name }); 15 | 16 | serperGoogleSearchHTTP!: SerperGoogleHTTP; 17 | serperBingSearchHTTP!: SerperBingHTTP; 18 | 19 | 
    /**
     * Performs one external search with retry and provider failover.
     *
     * The client iterator yields Bing first when the request marked
     * 'bing-preferred', then Google indefinitely. On an empty result set from
     * a non-final client, the next client is adopted and the attempt is
     * retried (ServiceBadAttemptError). HTTP 429 responses are retried on the
     * same client after a randomized 0.5–1.5s backoff. At most 3 attempts.
     *
     * @param variant which result vertical to query.
     * @param query   upstream Serper query parameters.
     * @returns the parsed upstream response body.
     * @throws DownstreamServiceFailureError when all attempts are exhausted
     *         or a non-retryable error occurs.
     */
    async doSearch(variant: 'web' | 'images' | 'news', query: SerperSearchQueryParams) {
        const clientIt = this.iterClient();
        let client = clientIt.next().value;
        if (!client) {
            throw new Error(`Error iterating serper client`);
        }

        let maxTries = 3;

        while (maxTries--) {
            const t0 = Date.now();
            try {
                this.logger.debug(`Doing external search`, query);
                let r;
                switch (variant) {
                    case 'images': {
                        r = await client.imageSearch(query);
                        // Peek at the next client: only fail over on empty
                        // results if a *different* provider is still available.
                        const nextClient = clientIt.next().value;
                        if (nextClient && nextClient !== client) {
                            const results = r.parsed.images;
                            if (!results.length) {
                                client = nextClient;
                                throw new ServiceBadAttemptError('No results found');
                            }
                        }

                        break;
                    }
                    case 'news': {
                        r = await client.newsSearch(query);
                        const nextClient = clientIt.next().value;
                        if (nextClient && nextClient !== client) {
                            const results = r.parsed.news;
                            if (!results.length) {
                                client = nextClient;
                                throw new ServiceBadAttemptError('No results found');
                            }
                        }

                        break;
                    }
                    case 'web':
                    default: {
                        r = await client.webSearch(query);
                        const nextClient = clientIt.next().value;
                        if (nextClient && nextClient !== client) {
                            const results = r.parsed.organic;
                            if (!results.length) {
                                client = nextClient;
                                throw new ServiceBadAttemptError('No results found');
                            }
                        }

                        break;
                    }
                }
                const dt = Date.now() - t0;
                // Successful upstream round-trip counts as process liveness.
                this.blackHoleDetector.itWorked();
                this.logger.debug(`External search took ${dt}ms`, { searchDt: dt, variant });

                return r.parsed;
            } catch (err: any) {
                const dt = Date.now() - t0;
                this.logger.error(`${variant} search failed: ${err?.message}`, { searchDt: dt, err: marshalErrorLike(err) });
                if (err?.status === 429) {
                    // Rate limited: jittered backoff, then retry (same client).
                    await delay(500 + 1000 * Math.random());
                    continue;
                }
                if (err instanceof ServiceBadAttemptError) {
                    // Empty-result failover raised above; retry with the new client.
                    continue;
                }

                // Non-retryable upstream failure.
                throw new DownstreamServiceFailureError({ message: `Search(${variant}) failed` });
            }
        }

        // All retry attempts exhausted.
        throw new DownstreamServiceFailureError({ message: `Search(${variant}) failed` });
    }
Example: to find the Honda GX120 Owner’s manual in PDF, type “Honda GX120 ownners manual ext:pdf”.` 145 | }) 146 | ext?: string | string[]; 147 | 148 | @Prop({ 149 | arrayOf: String, 150 | desc: `Returns web pages created in the specified file type. Example: to find a web page created in PDF format about the evaluation of age-related cognitive changes, type “evaluation of age cognitive changes filetype:pdf”.` 151 | }) 152 | filetype?: string | string[]; 153 | 154 | @Prop({ 155 | arrayOf: String, 156 | desc: `Returns webpages containing the specified term in the title of the page. Example: to find pages about SEO conferences making sure the results contain 2023 in the title, type “seo conference intitle:2023”.` 157 | }) 158 | intitle?: string | string[]; 159 | 160 | @Prop({ 161 | arrayOf: String, 162 | desc: `Returns web pages written in the specified language. The language code must be in the ISO 639-1 two-letter code format. Example: to find information on visas only in Spanish, type “visas lang:es”.` 163 | }) 164 | loc?: string | string[]; 165 | 166 | @Prop({ 167 | arrayOf: String, 168 | desc: `Returns web pages coming only from a specific web site. Example: to find information about Goggles only on Brave pages, type “goggles site:brave.com”.` 169 | }) 170 | site?: string | string[]; 171 | 172 | addTo(searchTerm: string) { 173 | const chunks = []; 174 | for (const [key, value] of Object.entries(this)) { 175 | if (value) { 176 | const values = Array.isArray(value) ? value : [value]; 177 | const textValue = values.map((v) => `${key}:${v}`).join(' OR '); 178 | if (textValue) { 179 | chunks.push(textValue); 180 | } 181 | } 182 | } 183 | const opPart = chunks.length > 1 ? 
chunks.map((x) => `(${x})`).join(' AND ') : chunks; 184 | 185 | if (opPart.length) { 186 | return [searchTerm, opPart].join(' '); 187 | } 188 | 189 | return searchTerm; 190 | } 191 | 192 | static override from(input: any) { 193 | const instance = super.from(input) as GoogleSearchExplicitOperatorsDto; 194 | const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; 195 | 196 | const params = ['ext', 'filetype', 'intitle', 'loc', 'site']; 197 | 198 | for (const p of params) { 199 | const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`); 200 | if (!customValue) { 201 | continue; 202 | } 203 | 204 | const filtered = customValue.split(', ').filter(Boolean); 205 | if (filtered.length) { 206 | Reflect.set(instance, p, filtered); 207 | } 208 | } 209 | 210 | return instance; 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /src/services/temp-file.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTempFileManger } from 'civkit/temp'; 2 | import { rm } from 'fs/promises'; 3 | import { singleton } from 'tsyringe'; 4 | import { Finalizer } from './finalizer'; 5 | 6 | @singleton() 7 | export class TempFileManager extends AbstractTempFileManger { 8 | 9 | rootDir = ''; 10 | 11 | override async init() { 12 | await this.dependencyReady(); 13 | await super.init(); 14 | this.emit('ready'); 15 | } 16 | 17 | @Finalizer() 18 | override async standDown() { 19 | await super.standDown(); 20 | 21 | await rm(this.rootDir, { recursive: true, force: true }); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/services/threaded.ts: -------------------------------------------------------------------------------- 1 | import 'reflect-metadata'; 2 | 3 | import { singleton, container } from 'tsyringe'; 4 | import { AbstractThreadedServiceRegistry } from 'civkit/threaded'; 5 | import _ from 'lodash'; 6 | 7 | import { 
GlobalLogger } from './logger'; 8 | import { AsyncLocalContext } from './async-context'; 9 | import { PseudoTransfer } from './pseudo-transfer'; 10 | import { cpus } from 'os'; 11 | import { isMainThread } from 'worker_threads'; 12 | 13 | @singleton() 14 | export class ThreadedServiceRegistry extends AbstractThreadedServiceRegistry { 15 | container = container; 16 | 17 | logger = this.globalLogger.child({ service: this.constructor.name }); 18 | 19 | constructor( 20 | protected globalLogger: GlobalLogger, 21 | public asyncContext: AsyncLocalContext, 22 | public pseudoTransfer: PseudoTransfer, 23 | ) { 24 | super(...arguments); 25 | } 26 | 27 | setMaxWorkersByCpu() { 28 | const cpuStat = cpus(); 29 | 30 | const evenCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 0).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0); 31 | const oddCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 1).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0); 32 | 33 | const isLikelyHyperThreaded = (oddCpuCycles / evenCpuCycles) < 0.5; 34 | 35 | this.maxWorkers = isLikelyHyperThreaded ? 
cpuStat.length / 2 : cpuStat.length; 36 | } 37 | 38 | override async init() { 39 | await this.dependencyReady(); 40 | await super.init(); 41 | 42 | if (isMainThread) { 43 | this.setMaxWorkersByCpu(); 44 | await Promise.all( 45 | _.range(0, 2).map( 46 | (_n) => 47 | new Promise( 48 | (resolve, reject) => { 49 | this.createWorker() 50 | .once('message', resolve) 51 | .once('error', reject); 52 | } 53 | ) 54 | ) 55 | ); 56 | } 57 | 58 | this.emit('ready'); 59 | } 60 | 61 | } 62 | 63 | 64 | const instance = container.resolve(ThreadedServiceRegistry); 65 | export default instance; 66 | export const { Method, Param, Ctx, RPCReflect, Threaded } = instance.decorators(); 67 | -------------------------------------------------------------------------------- /src/shared: -------------------------------------------------------------------------------- 1 | ../thinapps-shared/backend -------------------------------------------------------------------------------- /src/stand-alone/crawl.ts: -------------------------------------------------------------------------------- 1 | import 'reflect-metadata'; 2 | import { container, singleton } from 'tsyringe'; 3 | 4 | import { KoaServer } from 'civkit/civ-rpc/koa'; 5 | import http2 from 'http2'; 6 | import http from 'http'; 7 | import { CrawlerHost } from '../api/crawler'; 8 | import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; 9 | import path from 'path'; 10 | import fs from 'fs'; 11 | import { mimeOfExt } from 'civkit/mime'; 12 | import { Context, Next } from 'koa'; 13 | import { RPCRegistry } from '../services/registry'; 14 | import { AsyncResource } from 'async_hooks'; 15 | import { runOnce } from 'civkit/decorators'; 16 | import { randomUUID } from 'crypto'; 17 | import { ThreadedServiceRegistry } from '../services/threaded'; 18 | import { GlobalLogger } from '../services/logger'; 19 | import { AsyncLocalContext } from '../services/async-context'; 20 | import finalizer, { Finalizer } from '../services/finalizer'; 21 | import 
koaCompress from 'koa-compress'; 22 | 23 | @singleton() 24 | export class CrawlStandAloneServer extends KoaServer { 25 | logger = this.globalLogger.child({ service: this.constructor.name }); 26 | 27 | httpAlternativeServer?: typeof this['httpServer']; 28 | assets = new Map(); 29 | 30 | constructor( 31 | protected globalLogger: GlobalLogger, 32 | protected registry: RPCRegistry, 33 | protected crawlerHost: CrawlerHost, 34 | protected threadLocal: AsyncLocalContext, 35 | protected threads: ThreadedServiceRegistry, 36 | ) { 37 | super(...arguments); 38 | } 39 | 40 | h2c() { 41 | this.httpAlternativeServer = this.httpServer; 42 | const fn = this.koaApp.callback(); 43 | this.httpServer = http2.createServer((req, res) => { 44 | const ar = new AsyncResource('HTTP2ServerRequest'); 45 | ar.runInAsyncScope(fn, this.koaApp, req, res); 46 | }); 47 | // useResourceBasedDefaultTracker(); 48 | 49 | return this; 50 | } 51 | 52 | override async init() { 53 | await this.walkForAssets(); 54 | await super.init(); 55 | } 56 | 57 | async walkForAssets() { 58 | const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); 59 | 60 | for (const file of files) { 61 | if (file.type !== 'file') { 62 | continue; 63 | } 64 | this.assets.set(file.relativePath.toString(), file); 65 | } 66 | } 67 | 68 | override listen(port: number) { 69 | const r = super.listen(port); 70 | if (this.httpAlternativeServer) { 71 | const altPort = port + 1; 72 | this.httpAlternativeServer.listen(altPort, () => { 73 | this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); 74 | }); 75 | } 76 | 77 | return r; 78 | } 79 | 80 | makeAssetsServingController() { 81 | return (ctx: Context, next: Next) => { 82 | const requestPath = ctx.path; 83 | const file = requestPath.slice(1); 84 | if (!file) { 85 | return next(); 86 | } 87 | 88 | const asset = this.assets.get(file); 89 | if (asset?.type !== 'file') { 90 | return next(); 91 | } 92 | 93 | ctx.body = 
fs.createReadStream(asset.path); 94 | ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'; 95 | ctx.set('Content-Length', asset.stats.size.toString()); 96 | 97 | return; 98 | }; 99 | } 100 | 101 | registerRoutes(): void { 102 | this.koaApp.use(koaCompress({ 103 | filter(type) { 104 | if (type.startsWith('text/')) { 105 | return true; 106 | } 107 | 108 | if (type.includes('application/json') || type.includes('+json') || type.includes('+xml')) { 109 | return true; 110 | } 111 | 112 | if (type.includes('application/x-ndjson')) { 113 | return true; 114 | } 115 | 116 | return false; 117 | } 118 | })); 119 | this.koaApp.use(this.makeAssetsServingController()); 120 | this.koaApp.use(this.registry.makeShimController()); 121 | } 122 | 123 | // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context 124 | // TraceId is expected to be request-bound and unique. So these two has to be distinguished. 125 | @runOnce() 126 | override insertAsyncHookMiddleware() { 127 | const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { 128 | const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; 129 | this.threadLocal.setup({ 130 | traceId: randomUUID(), 131 | traceT0: new Date(), 132 | googleTraceId, 133 | }); 134 | 135 | return next(); 136 | }; 137 | 138 | this.koaApp.use(asyncHookMiddleware); 139 | } 140 | 141 | @Finalizer() 142 | override async standDown() { 143 | const tasks: Promise[] = []; 144 | if (this.httpAlternativeServer?.listening) { 145 | (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); 146 | this.httpAlternativeServer.close(); 147 | tasks.push(new Promise((resolve, reject) => { 148 | this.httpAlternativeServer!.close((err) => { 149 | if (err) { 150 | return reject(err); 151 | } 152 | resolve(); 153 | }); 154 | })); 155 | } 156 | tasks.push(super.standDown()); 157 | await Promise.all(tasks); 158 | } 159 | 160 | } 161 | const 
instance = container.resolve(CrawlStandAloneServer); 162 | 163 | export default instance; 164 | 165 | if (process.env.NODE_ENV?.includes('dry-run')) { 166 | instance.serviceReady().then(() => finalizer.terminate()); 167 | } else { 168 | instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); 169 | } -------------------------------------------------------------------------------- /src/stand-alone/search.ts: -------------------------------------------------------------------------------- 1 | import 'reflect-metadata'; 2 | import { container, singleton } from 'tsyringe'; 3 | 4 | import { KoaServer } from 'civkit/civ-rpc/koa'; 5 | import http2 from 'http2'; 6 | import http from 'http'; 7 | import { SearcherHost } from '../api/searcher'; 8 | import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; 9 | import path from 'path'; 10 | import fs from 'fs'; 11 | import { mimeOfExt } from 'civkit/mime'; 12 | import { Context, Next } from 'koa'; 13 | import { RPCRegistry } from '../services/registry'; 14 | import { AsyncResource } from 'async_hooks'; 15 | import { runOnce } from 'civkit/decorators'; 16 | import { randomUUID } from 'crypto'; 17 | import { ThreadedServiceRegistry } from '../services/threaded'; 18 | import { GlobalLogger } from '../services/logger'; 19 | import { AsyncLocalContext } from '../services/async-context'; 20 | import finalizer, { Finalizer } from '../services/finalizer'; 21 | import koaCompress from 'koa-compress'; 22 | 23 | @singleton() 24 | export class SearchStandAloneServer extends KoaServer { 25 | logger = this.globalLogger.child({ service: this.constructor.name }); 26 | 27 | httpAlternativeServer?: typeof this['httpServer']; 28 | assets = new Map(); 29 | 30 | constructor( 31 | protected globalLogger: GlobalLogger, 32 | protected registry: RPCRegistry, 33 | protected searcherHost: SearcherHost, 34 | protected threadLocal: AsyncLocalContext, 35 | protected threads: ThreadedServiceRegistry, 36 | ) { 37 | 
super(...arguments); 38 | } 39 | 40 | h2c() { 41 | this.httpAlternativeServer = this.httpServer; 42 | const fn = this.koaApp.callback(); 43 | this.httpServer = http2.createServer((req, res) => { 44 | const ar = new AsyncResource('HTTP2ServerRequest'); 45 | ar.runInAsyncScope(fn, this.koaApp, req, res); 46 | }); 47 | // useResourceBasedDefaultTracker(); 48 | 49 | return this; 50 | } 51 | 52 | override async init() { 53 | await this.walkForAssets(); 54 | await this.dependencyReady(); 55 | 56 | for (const [k, v] of this.registry.conf.entries()) { 57 | if (v.tags?.includes('crawl')) { 58 | this.registry.conf.delete(k); 59 | } 60 | } 61 | 62 | await super.init(); 63 | } 64 | 65 | async walkForAssets() { 66 | const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); 67 | 68 | for (const file of files) { 69 | if (file.type !== 'file') { 70 | continue; 71 | } 72 | this.assets.set(file.relativePath.toString(), file); 73 | } 74 | } 75 | 76 | override listen(port: number) { 77 | const r = super.listen(port); 78 | if (this.httpAlternativeServer) { 79 | const altPort = port + 1; 80 | this.httpAlternativeServer.listen(altPort, () => { 81 | this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); 82 | }); 83 | } 84 | 85 | return r; 86 | } 87 | 88 | makeAssetsServingController() { 89 | return (ctx: Context, next: Next) => { 90 | const requestPath = ctx.path; 91 | const file = requestPath.slice(1); 92 | if (!file) { 93 | return next(); 94 | } 95 | 96 | const asset = this.assets.get(file); 97 | if (asset?.type !== 'file') { 98 | return next(); 99 | } 100 | 101 | ctx.body = fs.createReadStream(asset.path); 102 | ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'; 103 | ctx.set('Content-Length', asset.stats.size.toString()); 104 | 105 | return; 106 | }; 107 | } 108 | 109 | registerRoutes(): void { 110 | this.koaApp.use(koaCompress({ 111 | filter(type) { 112 | if 
(type.startsWith('text/')) { 113 | return true; 114 | } 115 | 116 | if (type.includes('application/json') || type.includes('+json') || type.includes('+xml')) { 117 | return true; 118 | } 119 | 120 | if (type.includes('application/x-ndjson')) { 121 | return true; 122 | } 123 | 124 | return false; 125 | } 126 | })); 127 | this.koaApp.use(this.makeAssetsServingController()); 128 | this.koaApp.use(this.registry.makeShimController()); 129 | } 130 | 131 | 132 | // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context 133 | // TraceId is expected to be request-bound and unique. So these two has to be distinguished. 134 | @runOnce() 135 | override insertAsyncHookMiddleware() { 136 | const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { 137 | const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; 138 | this.threadLocal.setup({ 139 | traceId: randomUUID(), 140 | traceT0: new Date(), 141 | googleTraceId, 142 | }); 143 | 144 | return next(); 145 | }; 146 | 147 | this.koaApp.use(asyncHookMiddleware); 148 | } 149 | 150 | @Finalizer() 151 | override async standDown() { 152 | const tasks: Promise[] = []; 153 | if (this.httpAlternativeServer?.listening) { 154 | (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); 155 | this.httpAlternativeServer.close(); 156 | tasks.push(new Promise((resolve, reject) => { 157 | this.httpAlternativeServer!.close((err) => { 158 | if (err) { 159 | return reject(err); 160 | } 161 | resolve(); 162 | }); 163 | })); 164 | } 165 | tasks.push(super.standDown()); 166 | await Promise.all(tasks); 167 | } 168 | 169 | } 170 | const instance = container.resolve(SearchStandAloneServer); 171 | 172 | export default instance; 173 | 174 | if (process.env.NODE_ENV?.includes('dry-run')) { 175 | instance.serviceReady().then(() => finalizer.terminate()); 176 | } else { 177 | instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') 
    /**
     * Switches the primary listener to an HTTP/2 cleartext (h2c) server while
     * keeping the original server around as `httpAlternativeServer` (listen()
     * later binds it to port+1). Returns `this` for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const fn = this.koaApp.callback();
        this.httpServer = http2.createServer((req, res) => {
            // Wrap each request in its own AsyncResource so async-context
            // tracking stays per-request even though h2c multiplexes many
            // requests over one connection.
            const ar = new AsyncResource('HTTP2ServerRequest');
            ar.runInAsyncScope(fn, this.koaApp, req, res);
        });
        // useResourceBasedDefaultTracker();

        return this;
    }
(type.includes('application/x-ndjson')) { 123 | return true; 124 | } 125 | 126 | return false; 127 | } 128 | })); 129 | this.koaApp.use(this.makeAssetsServingController()); 130 | this.koaApp.use(this.registry.makeShimController()); 131 | } 132 | 133 | 134 | // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context 135 | // TraceId is expected to be request-bound and unique. So these two has to be distinguished. 136 | @runOnce() 137 | override insertAsyncHookMiddleware() { 138 | const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { 139 | const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; 140 | this.threadLocal.setup({ 141 | traceId: randomUUID(), 142 | traceT0: new Date(), 143 | googleTraceId, 144 | }); 145 | 146 | return next(); 147 | }; 148 | 149 | this.koaApp.use(asyncHookMiddleware); 150 | } 151 | 152 | @Finalizer() 153 | override async standDown() { 154 | const tasks: Promise[] = []; 155 | if (this.httpAlternativeServer?.listening) { 156 | (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); 157 | this.httpAlternativeServer.close(); 158 | tasks.push(new Promise((resolve, reject) => { 159 | this.httpAlternativeServer!.close((err) => { 160 | if (err) { 161 | return reject(err); 162 | } 163 | resolve(); 164 | }); 165 | })); 166 | } 167 | tasks.push(super.standDown()); 168 | await Promise.all(tasks); 169 | } 170 | 171 | } 172 | const instance = container.resolve(SERPStandAloneServer); 173 | 174 | export default instance; 175 | 176 | if (process.env.NODE_ENV?.includes('dry-run')) { 177 | instance.serviceReady().then(() => finalizer.terminate()); 178 | } else { 179 | instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); 180 | } 181 | -------------------------------------------------------------------------------- /src/types.d.ts: -------------------------------------------------------------------------------- 
1 | declare module 'langdetect' { 2 | interface DetectionResult { 3 | lang: string; 4 | prob: number; 5 | } 6 | 7 | export function detect(text: string): DetectionResult[]; 8 | export function detectOne(text: string): string | null; 9 | } 10 | 11 | declare module 'jsdom' { 12 | import EventEmitter from 'events'; 13 | export class JSDOM { 14 | constructor(html: string, options?: any); 15 | window: typeof window; 16 | } 17 | export class VirtualConsole extends EventEmitter { 18 | constructor(); 19 | sendTo(console: any, options?: any); 20 | } 21 | } 22 | 23 | declare module 'simple-zstd' { 24 | import { Duplex } from 'stream'; 25 | export function ZSTDCompress(lvl: Number): Duplex; 26 | export function ZSTDDecompress(): Duplex; 27 | export function ZSTDDecompressMaybe(): Duplex; 28 | } 29 | -------------------------------------------------------------------------------- /src/utils/encoding.ts: -------------------------------------------------------------------------------- 1 | import { createReadStream } from 'fs'; 2 | import { Readable } from 'stream'; 3 | import { TextDecoderStream } from 'stream/web'; 4 | 5 | export async function decodeFileStream( 6 | fileStream: Readable, 7 | encoding: string = 'utf-8', 8 | ): Promise { 9 | const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); 10 | Readable.toWeb(fileStream).pipeThrough(decodeStream); 11 | const chunks = []; 12 | 13 | for await (const chunk of decodeStream.readable) { 14 | chunks.push(chunk); 15 | } 16 | 17 | return chunks.join(''); 18 | } 19 | 20 | 21 | export async function readFile( 22 | filePath: string, 23 | encoding: string = 'utf-8', 24 | ): Promise { 25 | const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); 26 | Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream); 27 | const chunks = []; 28 | 29 | for await (const chunk of decodeStream.readable) { 30 | chunks.push(chunk); 31 | } 32 | 33 | return chunks.join(''); 34 
| } -------------------------------------------------------------------------------- /src/utils/get-function-url.ts: -------------------------------------------------------------------------------- 1 | import { GoogleAuth } from 'google-auth-library'; 2 | 3 | /** 4 | * Get the URL of a given v2 cloud function. 5 | * 6 | * @param {string} name the function's name 7 | * @param {string} location the function's location 8 | * @return {Promise} The URL of the function 9 | */ 10 | export async function getFunctionUrl(name: string, location = "us-central1") { 11 | const projectId = `reader-6b7dc`; 12 | const url = "https://cloudfunctions.googleapis.com/v2beta/" + 13 | `projects/${projectId}/locations/${location}/functions/${name}`; 14 | const auth = new GoogleAuth({ 15 | scopes: 'https://www.googleapis.com/auth/cloud-platform', 16 | }); 17 | const client = await auth.getClient(); 18 | const res = await client.request({ url }); 19 | const uri = res.data?.serviceConfig?.uri; 20 | if (!uri) { 21 | throw new Error(`Unable to retreive uri for function at ${url}`); 22 | } 23 | return uri; 24 | } 25 | -------------------------------------------------------------------------------- /src/utils/ip.ts: -------------------------------------------------------------------------------- 1 | import { isIPv4, isIPv6 } from 'net'; 2 | 3 | export function parseIp(ip: string): Buffer { 4 | if (isIPv4(ip)) { 5 | const [a, b, c, d] = ip.split('.').map(Number); 6 | 7 | const buf = Buffer.alloc(4); 8 | buf.writeUInt8(a, 0); 9 | buf.writeUInt8(b, 1); 10 | buf.writeUInt8(c, 2); 11 | buf.writeUInt8(d, 3); 12 | 13 | return buf; 14 | } 15 | 16 | if (isIPv6(ip)) { 17 | if (ip.includes('.')) { 18 | const parts = ip.split(':'); 19 | const ipv4Part = parts.pop(); 20 | if (!ipv4Part) throw new Error('Invalid IPv6 address'); 21 | const ipv4Bytes = parseIp(ipv4Part); 22 | parts.push('0'); 23 | const ipv6Bytes = parseIp(parts.join(':')); 24 | ipv6Bytes.writeUInt32BE(ipv4Bytes.readUInt32BE(0), 12); 25 | 26 | 
return ipv6Bytes; 27 | } 28 | 29 | const buf = Buffer.alloc(16); 30 | 31 | // Expand :: notation 32 | let expanded = ip; 33 | if (ip.includes('::')) { 34 | const sides = ip.split('::'); 35 | const left = sides[0] ? sides[0].split(':') : []; 36 | const right = sides[1] ? sides[1].split(':') : []; 37 | const middle = Array(8 - left.length - right.length).fill('0'); 38 | expanded = [...left, ...middle, ...right].join(':'); 39 | } 40 | 41 | // Convert to buffer 42 | const parts = expanded.split(':'); 43 | let offset = 0; 44 | for (const part of parts) { 45 | buf.writeUInt16BE(parseInt(part, 16), offset); 46 | offset += 2; 47 | } 48 | 49 | return buf; 50 | } 51 | 52 | throw new Error('Invalid IP address'); 53 | } 54 | 55 | 56 | export function parseCIDR(cidr: string): [Buffer, Buffer] { 57 | const [ip, prefixTxt] = cidr.split('/'); 58 | const buf = parseIp(ip); 59 | const maskBuf = Buffer.alloc(buf.byteLength, 0xff); 60 | const prefixBits = parseInt(prefixTxt); 61 | 62 | let offsetBits = 0; 63 | while (offsetBits < (buf.byteLength * 8)) { 64 | if (offsetBits <= (prefixBits - 8)) { 65 | offsetBits += 8; 66 | continue; 67 | } 68 | const bitsRemain = prefixBits - offsetBits; 69 | const byteOffset = Math.floor(offsetBits / 8); 70 | 71 | if (bitsRemain > 0) { 72 | const theByte = buf[byteOffset]; 73 | const mask = 0xff << (8 - bitsRemain); 74 | maskBuf[byteOffset] = mask; 75 | buf[byteOffset] = theByte & mask; 76 | 77 | offsetBits += 8; 78 | continue; 79 | }; 80 | buf[byteOffset] = 0; 81 | maskBuf[byteOffset] = 0; 82 | 83 | offsetBits += 8; 84 | } 85 | 86 | return [buf, maskBuf]; 87 | } 88 | 89 | export class CIDR { 90 | buff: Buffer; 91 | mask: Buffer; 92 | text: string; 93 | constructor(cidr: string) { 94 | this.text = cidr; 95 | [this.buff, this.mask] = parseCIDR(cidr); 96 | } 97 | 98 | toString() { 99 | return this.text; 100 | } 101 | 102 | get family() { 103 | return this.buff.byteLength === 4 ? 
4 : 6; 104 | } 105 | 106 | test(ip: string | Buffer): boolean { 107 | const parsedIp = typeof ip === 'string' ? parseIp(ip) : ip; 108 | 109 | if (parsedIp.byteLength !== this.buff.byteLength) { 110 | return false; 111 | } 112 | 113 | for (const i of Array(this.buff.byteLength).keys()) { 114 | const t = parsedIp[i]; 115 | const m = this.mask[i]; 116 | 117 | if (m === 0) { 118 | return true; 119 | } 120 | 121 | const r = this.buff[i]; 122 | if ((t & m) !== r) { 123 | return false; 124 | } 125 | } 126 | 127 | return true; 128 | } 129 | } 130 | 131 | const nonPublicNetworks4 = [ 132 | '10.0.0.0/8', 133 | '172.16.0.0/12', 134 | '192.168.0.0/16', 135 | 136 | '127.0.0.0/8', 137 | '255.255.255.255/32', 138 | '169.254.0.0/16', 139 | '224.0.0.0/4', 140 | 141 | '100.64.0.0/10', 142 | '0.0.0.0/32', 143 | ]; 144 | 145 | 146 | const nonPublicNetworks6 = [ 147 | 'fc00::/7', 148 | 'fe80::/10', 149 | 'ff00::/8', 150 | 151 | '::127.0.0.0/104', 152 | '::/128', 153 | ]; 154 | 155 | const nonPublicCIDRs = [...nonPublicNetworks4, ...nonPublicNetworks6].map(cidr => new CIDR(cidr)); 156 | 157 | export function isIPInNonPublicRange(ip: string) { 158 | const parsed = parseIp(ip); 159 | 160 | for (const cidr of nonPublicCIDRs) { 161 | if (cidr.test(parsed)) { 162 | return true; 163 | } 164 | } 165 | 166 | return false; 167 | } 168 | -------------------------------------------------------------------------------- /src/utils/markdown.ts: -------------------------------------------------------------------------------- 1 | 2 | export function tidyMarkdown(markdown: string): string { 3 | 4 | // Step 1: Handle complex broken links with text and optional images spread across multiple lines 5 | let normalizedMarkdown = markdown.replace(/\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => { 6 | // Remove internal new lines and excessive spaces within the text 7 | text = text.replace(/\s+/g, ' ').trim(); 8 | url = url.replace(/\s+/g, '').trim(); 9 | return `[${text}](${url})`; 10 | }); 
11 | 12 | normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => { 13 | // Normalize by removing excessive spaces and new lines 14 | text = text.replace(/\s+/g, ' ').trim(); 15 | alt = alt ? alt.replace(/\s+/g, ' ').trim() : ''; 16 | imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : ''; 17 | linkUrl = linkUrl.replace(/\s+/g, '').trim(); 18 | if (imgUrl) { 19 | return `[${text} ![${alt}](${imgUrl})](${linkUrl})`; 20 | } else { 21 | return `[${text}](${linkUrl})`; 22 | } 23 | }); 24 | 25 | // Step 2: Normalize regular links that may be broken across lines 26 | normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => { 27 | text = text.replace(/\s+/g, ' ').trim(); 28 | url = url.replace(/\s+/g, '').trim(); 29 | return `[${text}](${url})`; 30 | }); 31 | 32 | // Step 3: Replace more than two consecutive empty lines with exactly two empty lines 33 | normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n'); 34 | 35 | // Step 4: Remove leading spaces from each line 36 | normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, ''); 37 | 38 | return normalizedMarkdown.trim(); 39 | } 40 | -------------------------------------------------------------------------------- /src/utils/misc.ts: -------------------------------------------------------------------------------- 1 | import { ParamValidationError } from 'civkit'; 2 | 3 | export function cleanAttribute(attribute: string | null) { 4 | return attribute ? 
import { ParamValidationError } from 'civkit';

/**
 * Collapse every run of newlines (and trailing whitespace) inside an HTML
 * attribute value down to a single newline; null/empty becomes ''.
 */
export function cleanAttribute(attribute: string | null) {
    if (!attribute) {
        return '';
    }

    return attribute.replace(/(\n+\s*)+/g, '\n');
}


/**
 * decodeURIComponent that tolerates inputs which are not valid
 * percent-encodings: if the raw input still parses as a (relative) URL it is
 * returned unchanged.
 *
 * @throws ParamValidationError when the input neither decodes nor parses.
 */
export function tryDecodeURIComponent(input: string) {
    try {
        return decodeURIComponent(input);
    } catch (err) {
        // Malformed escape sequence — accept the raw value when it is at
        // least URL-parseable against a dummy base.
        if (URL.canParse(input, 'http://localhost:3000')) {
            return input;
        }

        throw new ParamValidationError(`Invalid URIComponent: ${input}`);
    }
}


/** Wrap a single value into a one-shot async generator. */
export async function* toAsyncGenerator<T>(val: T) {
    yield val;
}

/**
 * Wrap a single value into a one-shot async generator.
 * NOTE(review): identical to toAsyncGenerator despite the name suggesting a
 * sync generator — confirm whether callers depend on the async form before
 * consolidating the two.
 */
export async function* toGenerator<T>(val: T) {
    yield val;
}
{
  "compilerOptions": {
    "module": "node16",

    "noImplicitReturns": true,
    "noUnusedLocals": true,
    "outDir": "build",
    "sourceMap": true,
    "strict": true,
    "allowJs": true,
    "target": "es2022",
    "lib": ["es2022"],
    "skipLibCheck": true,
    "useDefineForClassFields": false,
    "experimentalDecorators": true,
    "emitDecoratorMetadata": true,
    "esModuleInterop": true,
    "noImplicitOverride": true,
  },
  "compileOnSave": true,
  "include": ["src"]
}