├── .github └── workflows │ └── .keep ├── .gitignore ├── .vscode ├── exensions.json ├── launch.json ├── settings.json └── tasks.json ├── Dockerfile ├── LICENSE ├── README.md ├── backend ├── .firebaserc ├── .gitignore ├── firebase.json ├── firestore.indexes.json ├── firestore.rules ├── functions │ ├── .editorconfig │ ├── .env.example │ ├── .puppeteerrc.cjs │ ├── integrity-check.cjs │ ├── package-lock.json │ ├── package.json │ ├── src │ │ ├── cloud-functions │ │ │ └── crawler.ts │ │ ├── db │ │ │ ├── crawled.ts │ │ │ ├── domain-blockade.ts │ │ │ ├── img-alt.ts │ │ │ ├── pdf.ts │ │ │ └── searched.ts │ │ ├── dto │ │ │ └── scrapping-options.ts │ │ ├── fetch.d.ts │ │ ├── index.ts │ │ ├── server.ts │ │ ├── services │ │ │ ├── geoip.ts │ │ │ ├── jsdom.ts │ │ │ └── puppeteer.ts │ │ ├── shared │ │ │ ├── 3rd-party │ │ │ │ └── brave-search.ts │ │ │ ├── decorators.ts │ │ │ ├── errors.ts │ │ │ ├── index.ts │ │ │ ├── lib │ │ │ │ └── firestore.ts │ │ │ ├── logger.ts │ │ │ ├── output-stream.ts │ │ │ ├── rpc-reflect.ts │ │ │ ├── services │ │ │ │ ├── canvas.ts │ │ │ │ ├── rate-limit.ts │ │ │ │ └── secrets.ts │ │ │ └── types.ts │ │ ├── types.d.ts │ │ └── utils │ │ │ ├── markdown.ts │ │ │ └── misc.ts │ └── tsconfig.json └── storage.rules ├── docker-compose.yaml ├── package.json └── screenshots └── .gitkeep /.github/workflows/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intergalacticalvariable/reader/d5eee9517578c1a31e8beb1fbac1e3a638c940e7/.github/workflows/.keep -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | .DS_Store 3 | /package-lock.json 4 | .aider* 5 | -------------------------------------------------------------------------------- /.vscode/exensions.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"recommendations": [ 3 | "editorconfig.editorconfig", 4 | "octref.vetur", 5 | "redhat.vscode-yaml", 6 | "dbaeumer.vscode-eslint", 7 | "esbenp.prettier-vscode", 8 | "streetsidesoftware.code-spell-checker" 9 | ] 10 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Debug Fullstack: attach", 6 | "request": "attach", 7 | "cwd": "${workspaceFolder}/backend/functions", 8 | "skipFiles": [ 9 | "/**" 10 | ], 11 | "type": "node", 12 | "preLaunchTask": "Fullstack:debug" 13 | }, 14 | { 15 | "name": "Debug Fullstack: attach: with proxy", 16 | "request": "attach", 17 | "cwd": "${workspaceFolder}/backend/functions", 18 | "skipFiles": [ 19 | "/**" 20 | ], 21 | "type": "node", 22 | "preLaunchTask": "Fullstack:debug:with-proxy" 23 | }, 24 | { 25 | "name": "Attach", 26 | "port": 9229, 27 | "request": "attach", 28 | "skipFiles": [ 29 | "/**" 30 | ], 31 | "type": "node" 32 | }, 33 | { 34 | "name": "Attach by Process ID", 35 | "processId": "${command:PickProcess}", 36 | "request": "attach", 37 | "skipFiles": [ 38 | "/**" 39 | ], 40 | "type": "node" 41 | }, 42 | { 43 | "name": "Debug Fullstack", 44 | "request": "launch", 45 | "runtimeArgs": [ 46 | "emulators:start", 47 | "--import=../.firebase-emu", 48 | "--export-on-exit=../.firebase-emu", 49 | ], 50 | "cwd": "${workspaceFolder}/backend/functions", 51 | "runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase", 52 | "skipFiles": [ 53 | "/**" 54 | ], 55 | "type": "node", 56 | "preLaunchTask": "Fullstack:prepare", 57 | "killBehavior": "polite" 58 | }, 59 | ] 60 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.wordWrap": "on", 3 | "editor.wordWrapColumn": 120, 4 | 
"files.trimTrailingWhitespace": true, 5 | "files.trimFinalNewlines": true, 6 | "[javascript]": { 7 | "editor.defaultFormatter": "vscode.typescript-language-features" 8 | }, 9 | "[jsonc]": { 10 | "editor.defaultFormatter": "vscode.json-language-features" 11 | }, 12 | "[typescript]": { 13 | "editor.defaultFormatter": "vscode.typescript-language-features" 14 | }, 15 | "[json]": { 16 | "editor.defaultFormatter": "vscode.json-language-features" 17 | }, 18 | "[yaml]": { 19 | "editor.defaultFormatter": "redhat.vscode-yaml" 20 | }, 21 | "[markdown]": { 22 | "files.trimTrailingWhitespace": false 23 | }, 24 | "typescript.tsdk": "node_modules/typescript/lib", 25 | "typescript.preferences.quoteStyle": "single", 26 | "typescript.format.semicolons": "insert", 27 | "typescript.preferences.importModuleSpecifier": "project-relative", 28 | "typescript.locale": "en", 29 | "cSpell.enabled": true, 30 | "cSpell.words": [ 31 | ], 32 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "type": "npm", 6 | "script": "build", 7 | "group": "build", 8 | "options": { 9 | "cwd": "${workspaceFolder}/backend/functions" 10 | }, 11 | "problemMatcher": [], 12 | "label": "Backend:rebuild", 13 | "detail": "Backend:rebuild" 14 | }, 15 | { 16 | "type": "npm", 17 | "script": "emu:reset", 18 | "group": "build", 19 | "options": { 20 | "cwd": "${workspaceFolder}/backend/functions" 21 | }, 22 | "problemMatcher": [], 23 | "label": "Backend:reset-emulator", 24 | "detail": "Backend:reset-emulator" 25 | }, 26 | { 27 | "type": "typescript", 28 | "options": { 29 | "cwd": "${workspaceFolder}/backend/functions" 30 | }, 31 | "tsconfig": "backend/functions/tsconfig.json", 32 | "option": "watch", 33 | "isBackground": true, 34 | "problemMatcher": [ 35 | "$tsc-watch" 36 | ], 37 | "group": "build", 38 | "label": "Backend:build:watch" 39 
| }, 40 | { 41 | "type": "npm", 42 | "script": "emu:debug", 43 | "group": "none", 44 | "options": { 45 | "cwd": "${workspaceFolder}/backend/functions" 46 | }, 47 | "problemMatcher": [ 48 | { 49 | "base": "$tsc", 50 | "background": { 51 | "activeOnStart": false, 52 | "beginsPattern": "shutdown requested|Starting emulators", 53 | "endsPattern": "Debugger listening" 54 | } 55 | } 56 | ], 57 | "label": "Backend:start-emulator-debug", 58 | "detail": "Backend:start-emulator-debug", 59 | "dependsOn": [ 60 | "Backend:build:watch" 61 | ], 62 | "isBackground": true, 63 | }, 64 | { 65 | "type": "npm", 66 | "script": "dev", 67 | "options": { 68 | "cwd": "${workspaceFolder}/webapp", 69 | }, 70 | "group": "build", 71 | "label": "Frontend:start:dev", 72 | "detail": "Frontend:start:dev", 73 | "isBackground": true, 74 | "problemMatcher": { 75 | "base": "$vite", 76 | "background": { 77 | "activeOnStart": true, 78 | "endsPattern": "OK", 79 | "beginsPattern": "vite" 80 | } 81 | }, 82 | }, 83 | { 84 | "type": "npm", 85 | "script": "dev", 86 | "options": { 87 | "cwd": "${workspaceFolder}/webapp", 88 | "env": { 89 | "FIREBASE_EMULATE": "true", 90 | } 91 | }, 92 | "group": "build", 93 | "label": "Frontend:start:emu", 94 | "detail": "Frontend:start:emu", 95 | "isBackground": true, 96 | "problemMatcher": { 97 | "base": "$vite", 98 | "background": { 99 | "activeOnStart": true, 100 | "endsPattern": "OK", 101 | "beginsPattern": "vite" 102 | } 103 | }, 104 | }, 105 | { 106 | "type": "npm", 107 | "script": "emu:debug2", 108 | "group": "none", 109 | "options": { 110 | "cwd": "${workspaceFolder}/backend/functions", 111 | "env": { 112 | "https_proxy": "http://127.0.0.1:7890", 113 | "http_proxy": "http://127.0.0.1:7890", 114 | "all_proxy": "socks5://127.0.0.1:7890" 115 | } 116 | }, 117 | "problemMatcher": [ 118 | { 119 | "base": "$tsc", 120 | "background": { 121 | "activeOnStart": false, 122 | "beginsPattern": "shutdown requested|Starting emulators", 123 | "endsPattern": "Debugger listening" 124 | } 
125 | } 126 | ], 127 | "label": "Backend:start-emulator-debug:with-proxy", 128 | "detail": "Backend:start-emulator-debug:with-proxy", 129 | "dependsOn": [ 130 | "Backend:build:watch" 131 | ], 132 | "isBackground": true, 133 | }, 134 | { 135 | "label": "Fullstack:prepare", 136 | "dependsOn": [ 137 | "Frontend:start:emu", 138 | "Backend:build:watch", 139 | ], 140 | }, 141 | { 142 | "label": "Fullstack:debug", 143 | "dependsOn": [ 144 | // "Frontend:start:emu", 145 | "Backend:start-emulator-debug", 146 | ], 147 | }, 148 | { 149 | "label": "Fullstack:debug:with-proxy", 150 | "dependsOn": [ 151 | "Frontend:start:emu", 152 | "Backend:start-emulator-debug:with-proxy", 153 | ], 154 | } 155 | ] 156 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Node.js 18 slim image (Debian-based) 2 | FROM node:18-slim 3 | 4 | # Install necessary tools and libraries 5 | RUN apt-get update && apt-get install -y \ 6 | chromium \ 7 | libmagic-dev \ 8 | build-essential \ 9 | python3 \ 10 | wget \ 11 | gnupg \ 12 | && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ 13 | && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ 14 | && apt-get update \ 15 | && apt-get install -y google-chrome-stable \ 16 | && rm -rf /var/lib/apt/lists/* 17 | 18 | # Set environment variables 19 | ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true 20 | ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable 21 | 22 | # Set working directory 23 | WORKDIR /app 24 | 25 | # Copy package.json and package-lock.json 26 | COPY backend/functions/package*.json ./ 27 | 28 | # Install dependencies 29 | RUN npm ci 30 | 31 | # Copy the rest of the application code 32 | COPY backend/functions . 
33 | 34 | # Build the application 35 | RUN npm run build 36 | 37 | # Create local storage directory and set permissions 38 | RUN mkdir -p /app/local-storage && chmod 777 /app/local-storage 39 | 40 | # Expose the port the app runs on 41 | EXPOSE 3000 42 | 43 | # Start the application 44 | CMD ["node", "build/server.js"] 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020-2024 Jina AI Limited. All rights reserved. 2 | 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 
32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 
64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. 
You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 
125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. 
In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | 181 | Copyright 2020-2021 Jina AI Limited 182 | 183 | Licensed under the Apache License, Version 2.0 (the "License"); 184 | you may not use this file except in compliance with the License. 
185 | You may obtain a copy of the License at 186 | 187 | http://www.apache.org/licenses/LICENSE-2.0 188 | 189 | Unless required by applicable law or agreed to in writing, software 190 | distributed under the License is distributed on an "AS IS" BASIS, 191 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 192 | See the License for the specific language governing permissions and 193 | limitations under the License. 194 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 📚 Reader: Local Deployment Edition 2 | 3 | 🌟 **Live Demo:** 4 | 5 | Example: [https://reader.berlin.cx/https://github.com/intergalacticalvariable/reader/](https://reader.berlin.cx/https://github.com/intergalacticalvariable/reader/) 6 | 7 | This is an adapted version of [Jina AI's Reader](https://github.com/jina-ai/reader) for local deployment using Docker. 8 | 9 | ## 🎯 What it does 10 | It converts any URL to an LLM-friendly input with `http://127.0.0.1:3000/https://google.com`. Get improved output for your agent and RAG systems at no cost. This tool helps you prepare web content for Large Language Models, making it easier to process and analyze online information. 11 | 12 | ## 🚀 Key Features 13 | - 🏠 Runs locally using Docker 14 | - 🔑 No API keys required - works out of the box! 15 | - 🖼️ Saves screenshots locally instead of uploading to Google Cloud Storage 16 | - 📥 Provides download URLs for saved screenshots 17 | - 🌐 Converts web content to LLM-friendly formats 18 | 19 | ## ⚠️ Limitations 20 | - 📄 Currently does not support parsing PDFs 21 | 22 | ## 💻 Demo Environment 23 | The live demo is running on a VPS with the following specifications: 24 | - CPU: 1 vCore 25 | - RAM: 0.5 GB 26 | - Web Server: nginx 27 | 28 | This demonstrates that the Reader can run effectively even on minimal hardware resources. 
29 | ## 🐳 Docker Deployment 30 | 31 | ### Option 1: Using the pre-built image 32 | 1. Pull the latest image: 33 | ```bash 34 | docker pull ghcr.io/intergalacticalvariable/reader:latest 35 | ``` 36 | 2. Run the container: 37 | ```bash 38 | docker run -d -p 3000:3000 -v /path/to/local-storage:/app/local-storage --name reader-container ghcr.io/intergalacticalvariable/reader:latest 39 | ``` 40 | Replace `/path/to/local-storage` with the directory where you want to store screenshots. 41 | This command does the following: 42 | - Maps port 3000 of the container to port 3000 on your host 43 | - Mounts a volume for local storage 44 | - Names the container `reader-container` 45 | 3. To stop the container: 46 | ```bash 47 | docker stop reader-container 48 | ``` 49 | 4. To start the container again: 50 | ```bash 51 | docker start reader-container 52 | ``` 53 | 54 | ### Option 2: Building the image locally 55 | 1. Clone the repository: 56 | ```bash 57 | git clone https://github.com/intergalacticalvariable/reader.git 58 | cd reader 59 | ``` 60 | 2. Build the Docker image: 61 | ```bash 62 | docker build -t reader . 63 | ``` 64 | 3. Run the container: 65 | ```bash 66 | docker run -p 3000:3000 -v /path/to/local-storage:/app/local-storage reader 67 | ``` 68 | 69 | ## 🖥️ Usage 70 | Once the Docker container is running, you can use curl to make requests. Here are examples for different response types: 71 | 72 | 1. 📝 Markdown (bypasses readability processing): 73 | ```bash 74 | curl -H "X-Respond-With: markdown" 'http://127.0.0.1:3000/https://google.com' 75 | ``` 76 | 77 | 2. 🌐 HTML (returns documentElement.outerHTML): 78 | ```bash 79 | curl -H "X-Respond-With: html" 'http://127.0.0.1:3000/https://google.com' 80 | ``` 81 | 82 | 3. 📄 Text (returns document.body.innerText): 83 | ```bash 84 | curl -H "X-Respond-With: text" 'http://127.0.0.1:3000/https://google.com' 85 | ``` 86 | 87 | 4. 
📸 Screen-Size Screenshot (returns the URL of the webpage's screenshot): 88 | ```bash 89 | curl -H "X-Respond-With: screenshot" 'http://127.0.0.1:3000/https://google.com' 90 | ``` 91 | 92 | 5. 📸 Full-Page Screenshot (returns the URL of the webpage's screenshot): 93 | ```bash 94 | curl -H "X-Respond-With: pageshot" 'http://127.0.0.1:3000/https://google.com' 95 | ``` 96 | 97 | ## 🙏 Acknowledgements 98 | This project is based on the excellent work done by multiple contributors: 99 | 1. The original [Jina AI Reader project](https://github.com/jina-ai/reader), which provided the foundation for this tool. 100 | 2. [Harsh Gupta's adaptation](https://github.com/hargup/reader), which served as the immediate basis for this Docker deployment version. 101 | 102 | ## 📜 License 103 | This project is licensed under Apache-2.0 same as the original Jina AI Reader project. 104 | -------------------------------------------------------------------------------- /backend/.firebaserc: -------------------------------------------------------------------------------- 1 | { 2 | "projects": { 3 | "default": "reader-clone" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /backend/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | firebase-debug.log* 8 | firebase-debug.*.log* 9 | 10 | # Firebase cache 11 | .firebase/ 12 | 13 | # Firebase config 14 | 15 | # Uncomment this if you'd like others to create their own Firebase project. 16 | # For a team working on the same Firebase project(s), it is recommended to leave 17 | # it commented so all members can deploy to the same project(s) in .firebaserc. 
18 | # .firebaserc 19 | 20 | # Runtime data 21 | pids 22 | *.pid 23 | *.seed 24 | *.pid.lock 25 | 26 | # Directory for instrumented libs generated by jscoverage/JSCover 27 | lib-cov 28 | 29 | # Coverage directory used by tools like istanbul 30 | coverage 31 | 32 | # nyc test coverage 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 36 | .grunt 37 | 38 | # Bower dependency directory (https://bower.io/) 39 | bower_components 40 | 41 | # node-waf configuration 42 | .lock-wscript 43 | 44 | # Compiled binary addons (http://nodejs.org/api/addons.html) 45 | build/Release 46 | 47 | # Dependency directories 48 | node_modules/ 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | .node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | .secret.local 68 | 69 | toy*.ts 70 | 71 | .DS_Store 72 | build/ 73 | .firebase-emu/ 74 | *.log 75 | .DS_Store 76 | 77 | *.local 78 | .secret.* 79 | licensed/ -------------------------------------------------------------------------------- /backend/firebase.json: -------------------------------------------------------------------------------- 1 | { 2 | "firestore": { 3 | "rules": "firestore.rules", 4 | "indexes": "firestore.indexes.json" 5 | }, 6 | "functions": [ 7 | { 8 | "source": "functions", 9 | "codebase": "default", 10 | "ignore": [ 11 | "node_modules", 12 | "src", 13 | ".git", 14 | "*.log", 15 | "*.local", 16 | ".secret.*", 17 | ".firebase-emu" 18 | ], 19 | "predeploy": [ 20 | "npm --prefix \"$RESOURCE_DIR\" run build:clean", 21 | "npm --prefix \"$RESOURCE_DIR\" run build" 22 | ] 23 | } 24 | ], 25 | "storage": { 26 | "rules": "storage.rules" 27 | }, 28 | "emulators": { 29 | "ui": { 30 | "enabled": true 31 | }, 32 | "singleProjectMode": true, 33 | "functions": { 34 | 
"port": 5001 35 | }, 36 | "firestore": { 37 | "port": 9098 38 | }, 39 | "storage": { 40 | "port": 9097 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /backend/firestore.indexes.json: -------------------------------------------------------------------------------- 1 | { 2 | "indexes": [ 3 | { 4 | "collectionGroup": "prompts", 5 | "queryScope": "COLLECTION_GROUP", 6 | "fields": [ 7 | { 8 | "fieldPath": "id", 9 | "order": "ASCENDING" 10 | }, 11 | { 12 | "fieldPath": "isPublic", 13 | "order": "ASCENDING" 14 | } 15 | ] 16 | } 17 | ], 18 | "fieldOverrides": [] 19 | } -------------------------------------------------------------------------------- /backend/firestore.rules: -------------------------------------------------------------------------------- 1 | rules_version = '2'; 2 | service cloud.firestore { 3 | match /databases/{database}/documents { 4 | // match /questions/{document=**} { 5 | // allow read: if request.auth != null 6 | // } 7 | 8 | // match /answers/{userId}/profiles/default { 9 | // allow read, write: if request.auth != null && request.auth.uid == userId 10 | // } 11 | 12 | match /credits/{userId}/{document=**} { 13 | allow read: if request.auth != null && request.auth.uid == userId 14 | } 15 | 16 | match /users/{userId}/prompts/{document=**} { 17 | allow read: if request.auth != null && request.auth.uid == userId 18 | } 19 | 20 | // match /users/{userId}/profiles/{document=**} { 21 | // allow read: if request.auth != null && request.auth.uid == userId 22 | // } 23 | 24 | match /users/{userId}/creditHistory/{document=**} { 25 | allow read: if request.auth != null && request.auth.uid == userId 26 | } 27 | 28 | match /{document=**} { 29 | allow read, write: if false; 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /backend/functions/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 
| 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | indent_style = space 7 | insert_final_newline = true 8 | trim_trailing_whitespace = true 9 | indent_size = 4 10 | quote_type = single 11 | max_line_length = 120 12 | 13 | [*.py] 14 | indent_size = 4 15 | 16 | [*.ts] 17 | indent_size = 4 18 | 19 | [*.js] 20 | indent_size = 2 21 | 22 | [*.vue] 23 | indent_size = 2 24 | 25 | [*.*sx] 26 | indent_size = 2 27 | 28 | [*.*ml] 29 | indent_size = 2 30 | 31 | [*.json] 32 | indent_size = 2 33 | 34 | [*.md] 35 | indent_size = 2 36 | trim_trailing_whitespace = false 37 | -------------------------------------------------------------------------------- /backend/functions/.env.example: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intergalacticalvariable/reader/d5eee9517578c1a31e8beb1fbac1e3a638c940e7/backend/functions/.env.example -------------------------------------------------------------------------------- /backend/functions/.puppeteerrc.cjs: -------------------------------------------------------------------------------- 1 | const { join } = require('path'); 2 | 3 | /** 4 | * @type {import("puppeteer").Configuration} 5 | */ 6 | module.exports = { 7 | // Changes the cache location for Puppeteer. 
8 | cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'), 9 | }; 10 | -------------------------------------------------------------------------------- /backend/functions/integrity-check.cjs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs'); 4 | const path = require('path'); 5 | 6 | const file = path.resolve(__dirname, 'licensed/GeoLite2-City.mmdb'); 7 | 8 | if (!fs.existsSync(file)) { 9 | console.error(`Integrity check failed: ${file} does not exist.`); 10 | process.exit(1); 11 | } 12 | -------------------------------------------------------------------------------- /backend/functions/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reader", 3 | "scripts": { 4 | "lint": "eslint --ext .js,.ts .", 5 | "build": "tsc -p .", 6 | "build:watch": "tsc --watch", 7 | "build:clean": "rm -rf ./build", 8 | "shell": "npm run build && firebase functions:shell", 9 | "emu:stage": "cd .. 
&& tar -czvf firebase-emu-preset.tgz .firebase-emu", 10 | "emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../", 11 | "emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit", 12 | "emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions", 13 | "emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions", 14 | "emu:kill": "killall java", 15 | "serve": "npm run build && npm run emu:start", 16 | "debug": "npm run build && npm run emu:start -- --inspect-functions", 17 | "from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit", 18 | "from-preset": "npm run build && npm run emu:reset && npm run emu:start", 19 | "start": "npm run shell", 20 | "deploy": "firebase deploy --only functions", 21 | "logs": "firebase functions:log", 22 | "gcp-build": "node node_modules/puppeteer/install.mjs" 23 | }, 24 | "engines": { 25 | "node": "20" 26 | }, 27 | "main": "build/index.js", 28 | "dependencies": { 29 | "@esm2cjs/normalize-url": "^8.0.0", 30 | "@google-cloud/translate": "^8.2.0", 31 | "@mozilla/readability": "^0.5.0", 32 | "@napi-rs/canvas": "^0.1.44", 33 | "@types/turndown": "^5.0.4", 34 | "archiver": "^6.0.1", 35 | "axios": "^1.3.3", 36 | "bcrypt": "^5.1.0", 37 | "civkit": "^0.6.5-047c0d8", 38 | "core-js": "^3.37.1", 39 | "cors": "^2.8.5", 40 | "dayjs": "^1.11.9", 41 | "express": "^4.19.2", 42 | "firebase-admin": "^12.1.0", 43 | "firebase-functions": "^5.0.1", 44 | "htmlparser2": "^9.0.0", 45 | "jose": "^5.1.0", 46 | "jsdom": "^24.0.0", 47 | "langdetect": "^0.2.1", 48 | "maxmind": "^4.3.18", 49 | "minio": "^7.1.3", 50 | "openai": "^4.20.0", 51 | "pdfjs-dist": "^4.2.67", 52 | "puppeteer": "^22.7.1", 53 | "puppeteer-extra": "^3.3.6", 54 | "puppeteer-extra-plugin-block-resources": "^2.4.3", 55 | "puppeteer-extra-plugin-page-proxy": "^2.0.0", 56 | "puppeteer-extra-plugin-stealth": 
"^2.11.2", 57 | "puppeteer-page-proxy": "^1.3.0", 58 | "reflect-metadata": "^0.2.2", 59 | "set-cookie-parser": "^2.6.0", 60 | "stripe": "^11.11.0", 61 | "tiktoken": "^1.0.10", 62 | "tld-extract": "^2.1.0", 63 | "turndown": "^7.1.3", 64 | "turndown-plugin-gfm": "^1.0.2", 65 | "undici": "^5.24.0" 66 | }, 67 | "devDependencies": { 68 | "@types/archiver": "^5.3.4", 69 | "@types/bcrypt": "^5.0.0", 70 | "@types/cors": "^2.8.17", 71 | "@types/generic-pool": "^3.8.1", 72 | "@types/node": "^18", 73 | "@types/set-cookie-parser": "^2.4.7", 74 | "@typescript-eslint/eslint-plugin": "^5.12.0", 75 | "@typescript-eslint/parser": "^5.12.0", 76 | "eslint": "^8.9.0", 77 | "eslint-config-google": "^0.14.0", 78 | "eslint-plugin-import": "^2.25.4", 79 | "firebase-functions-test": "^3.0.0", 80 | "replicate": "^0.16.1", 81 | "typescript": "^5.1.6" 82 | }, 83 | "private": true, 84 | "exports": { 85 | ".": "./build/index.js" 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /backend/functions/src/cloud-functions/crawler.ts: -------------------------------------------------------------------------------- 1 | import { 2 | marshalErrorLike, 3 | RPCHost, RPCReflection, 4 | HashManager, 5 | AssertionFailureError, ParamValidationError, Defer, 6 | } from 'civkit'; 7 | import { singleton } from 'tsyringe'; 8 | import { AsyncContext, CloudHTTPv2, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared/index'; 9 | import _ from 'lodash'; 10 | import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; 11 | import { Request, Response } from 'express'; 12 | const pNormalizeUrl = import("@esm2cjs/normalize-url"); 13 | // import { AltTextService } from '../services/alt-text'; 14 | import TurndownService from 'turndown'; 15 | // import { Crawled } from '../db/crawled'; 16 | import { cleanAttribute } from '../utils/misc'; 17 | import { randomUUID } from 'crypto'; 18 | 19 | 20 | import { 
CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options'; 21 | // import { PDFExtractor } from '../services/pdf-extract'; 22 | import { DomainBlockade } from '../db/domain-blockade'; 23 | import { JSDomControl } from '../services/jsdom'; 24 | 25 | console.log('Initializing CrawlerHost'); 26 | 27 | const md5Hasher = new HashManager('md5', 'hex'); 28 | 29 | // const logger = new Logger('Crawler'); 30 | 31 | import { TransferProtocolMetadata } from 'civkit'; 32 | import * as fs from 'fs'; 33 | import * as path from 'path'; 34 | import { URL } from 'url'; 35 | 36 | function sendResponse(res: Response, data: T, meta: TransferProtocolMetadata): T { 37 | if (meta.code) { 38 | res.status(meta.code); 39 | } 40 | if (meta.contentType) { 41 | res.type(meta.contentType); 42 | } 43 | if (meta.headers) { 44 | for (const [key, value] of Object.entries(meta.headers)) { 45 | if (value !== undefined) { 46 | res.setHeader(key, value); 47 | } 48 | } 49 | } 50 | res.send(data); 51 | return data; 52 | } 53 | 54 | 55 | export interface ExtraScrappingOptions extends ScrappingOptions { 56 | withIframe?: boolean; 57 | targetSelector?: string | string[]; 58 | removeSelector?: string | string[]; 59 | keepImgDataUrl?: boolean; 60 | } 61 | 62 | export interface FormattedPage { 63 | title?: string; 64 | description?: string; 65 | url?: string; 66 | content?: string; 67 | publishedTime?: string; 68 | html?: string; 69 | text?: string; 70 | screenshotUrl?: string; 71 | screenshot?: Buffer; 72 | pageshotUrl?: string; 73 | pageshot?: Buffer; 74 | links?: { [k: string]: string; }; 75 | images?: { [k: string]: string; }; 76 | 77 | toString: () => string; 78 | } 79 | 80 | const indexProto = { 81 | toString: function (): string { 82 | console.log('Converting index to string'); 83 | return _(this) 84 | .toPairs() 85 | .map(([k, v]) => k ? 
`[${_.upperFirst(_.lowerCase(k))}] ${v}` : '') 86 | .value() 87 | .join('\n') + '\n'; 88 | } 89 | }; 90 | 91 | @singleton() 92 | export class CrawlerHost extends RPCHost { 93 | logger = new Logger('Crawler'); 94 | 95 | turnDownPlugins = [require('turndown-plugin-gfm').tables]; 96 | 97 | cacheRetentionMs = 1000 * 3600 * 24 * 7; 98 | cacheValidMs = 1000 * 3600; 99 | urlValidMs = 1000 * 3600 * 4; 100 | abuseBlockMs = 1000 * 3600; 101 | 102 | constructor( 103 | protected puppeteerControl: PuppeteerControl, 104 | protected jsdomControl: JSDomControl, 105 | // protected altTextService: AltTextService, 106 | // protected pdfExtractor: PDFExtractor, 107 | protected firebaseObjectStorage: FirebaseStorageBucketControl, 108 | protected threadLocal: AsyncContext, 109 | ) { 110 | super(...arguments); 111 | console.log('CrawlerHost constructor called'); 112 | console.log('Initializing CrawlerHost with dependencies:', { 113 | puppeteerControl: !!puppeteerControl, 114 | jsdomControl: !!jsdomControl, 115 | firebaseObjectStorage: !!firebaseObjectStorage, 116 | threadLocal: !!threadLocal 117 | }); 118 | 119 | puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => { 120 | console.log('Crawled event received', { url: options.url.toString() }); 121 | if (!snapshot.title?.trim() && !snapshot.pdfs?.length) { 122 | console.log('Skipping snapshot due to empty title and no PDFs'); 123 | return; 124 | } 125 | if (options.cookies?.length) { 126 | console.log('Skipping caching due to cookies'); 127 | // Potential privacy issue, dont cache if cookies are used 128 | return; 129 | } 130 | }); 131 | 132 | puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => { 133 | console.log('Abuse event received', abuseEvent); 134 | this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn }); 135 | 136 | await 
DomainBlockade.save(DomainBlockade.from({ 137 | domain: abuseEvent.url.hostname.toLowerCase(), 138 | triggerReason: `${abuseEvent.reason}`, 139 | triggerUrl: abuseEvent.url.toString(), 140 | createdAt: new Date(), 141 | expireAt: new Date(Date.now() + this.abuseBlockMs), 142 | })).catch((err) => { 143 | console.error('Failed to save domain blockade', err); 144 | this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: marshalErrorLike(err) }); 145 | }); 146 | 147 | }); 148 | } 149 | 150 | override async init() { 151 | console.log('Initializing CrawlerHost'); 152 | await this.dependencyReady(); 153 | 154 | this.emit('ready'); 155 | console.log('CrawlerHost ready'); 156 | console.log('CrawlerHost initialization complete'); 157 | } 158 | 159 | getIndex() { 160 | console.log('Getting index'); 161 | const indexObject: Record = Object.create(indexProto); 162 | 163 | Object.assign(indexObject, { 164 | usage1: 'https://r.jina.ai/YOUR_URL', 165 | usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', 166 | homepage: 'https://jina.ai/reader', 167 | sourceCode: 'https://github.com/jina-ai/reader', 168 | }); 169 | 170 | console.log('Index object created:', indexObject); 171 | return indexObject; 172 | } 173 | 174 | getTurndown(options?: { 175 | noRules?: boolean | string, 176 | url?: string | URL; 177 | imgDataUrlToObjectUrl?: boolean; 178 | }) { 179 | console.log('Getting Turndown service', options); 180 | const turnDownService = new TurndownService({ 181 | codeBlockStyle: 'fenced', 182 | preformattedCode: true, 183 | } as any); 184 | if (!options?.noRules) { 185 | console.log('Adding Turndown rules'); 186 | turnDownService.addRule('remove-irrelevant', { 187 | filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'], 188 | replacement: () => '' 189 | }); 190 | turnDownService.addRule('truncate-svg', { 191 | filter: 'svg' as any, 192 | replacement: () => '' 193 | }); 194 | turnDownService.addRule('title-as-h1', { 195 | filter: 
['title'], 196 | replacement: (innerText) => `${innerText}\n===============\n` 197 | }); 198 | } 199 | 200 | if (options?.imgDataUrlToObjectUrl) { 201 | console.log('Adding data-url-to-pseudo-object-url rule'); 202 | turnDownService.addRule('data-url-to-pseudo-object-url', { 203 | filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')), 204 | replacement: (_content, node: any) => { 205 | const src = (node.getAttribute('src') || '').trim(); 206 | const alt = cleanAttribute(node.getAttribute('alt')) || ''; 207 | 208 | if (options.url) { 209 | const refUrl = new URL(options.url); 210 | const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`); 211 | 212 | return `![${alt}](${mappedUrl})`; 213 | } 214 | 215 | return `![${alt}](blob:${md5Hasher.hash(src)})`; 216 | } 217 | }); 218 | } 219 | 220 | turnDownService.addRule('improved-paragraph', { 221 | filter: 'p', 222 | replacement: (innerText) => { 223 | const trimmed = innerText.trim(); 224 | if (!trimmed) { 225 | return ''; 226 | } 227 | 228 | return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`; 229 | } 230 | }); 231 | turnDownService.addRule('improved-inline-link', { 232 | filter: function (node, options) { 233 | return Boolean( 234 | options.linkStyle === 'inlined' && 235 | node.nodeName === 'A' && 236 | node.getAttribute('href') 237 | ); 238 | }, 239 | 240 | replacement: function (content, node: any) { 241 | let href = node.getAttribute('href'); 242 | if (href) href = href.replace(/([()])/g, '\\$1'); 243 | let title = cleanAttribute(node.getAttribute('title')); 244 | if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; 245 | 246 | const fixedContent = content.replace(/\s+/g, ' ').trim(); 247 | let fixedHref = href.replace(/\s+/g, '').trim(); 248 | if (options?.url) { 249 | try { 250 | fixedHref = new URL(fixedHref, options.url).toString(); 251 | } catch (_err) { 252 | void 0; 253 | } 254 | } 255 | 256 | return `[${fixedContent}](${fixedHref}${title || 
''})`; 257 | } 258 | }); 259 | turnDownService.addRule('improved-code', { 260 | filter: function (node: any) { 261 | let hasSiblings = node.previousSibling || node.nextSibling; 262 | let isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings; 263 | 264 | return node.nodeName === 'CODE' && !isCodeBlock; 265 | }, 266 | 267 | replacement: function (inputContent: any) { 268 | if (!inputContent) return ''; 269 | let content = inputContent; 270 | 271 | let delimiter = '`'; 272 | let matches = content.match(/`+/gm) || []; 273 | while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'; 274 | if (content.includes('\n')) { 275 | delimiter = '```'; 276 | } 277 | 278 | let extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : ''; 279 | 280 | return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter; 281 | } 282 | }); 283 | 284 | console.log('Turndown service configured'); 285 | return turnDownService; 286 | } 287 | 288 | getGeneralSnapshotMixins(snapshot: PageSnapshot) { 289 | console.log('Getting general snapshot mixins'); 290 | let inferred; 291 | const mixin: any = {}; 292 | if (this.threadLocal.get('withImagesSummary')) { 293 | console.log('Generating image summary'); 294 | inferred ??= this.jsdomControl.inferSnapshot(snapshot); 295 | const imageSummary = {} as { [k: string]: string; }; 296 | const imageIdxTrack = new Map(); 297 | 298 | let imgIdx = 0; 299 | 300 | for (const img of inferred.imgs) { 301 | const imgSerial = ++imgIdx; 302 | const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : []; 303 | idxArr.push(imgSerial); 304 | imageIdxTrack.set(img.src, idxArr); 305 | imageSummary[img.src] = img.alt || ''; 306 | } 307 | 308 | mixin.images = 309 | _(imageSummary) 310 | .toPairs() 311 | .map( 312 | ([url, alt], i) => { 313 | return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? 
`: ${alt}` : ''}`, url]; 314 | } 315 | ).fromPairs() 316 | .value(); 317 | console.log(`Generated image summary with ${Object.keys(mixin.images).length} images`); 318 | } 319 | if (this.threadLocal.get('withLinksSummary')) { 320 | console.log('Generating link summary'); 321 | inferred ??= this.jsdomControl.inferSnapshot(snapshot); 322 | mixin.links = _.invert(inferred.links || {}); 323 | console.log(`Generated link summary with ${Object.keys(mixin.links).length} links`); 324 | } 325 | 326 | return mixin; 327 | } 328 | 329 | async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & { 330 | screenshotUrl?: string; 331 | pageshotUrl?: string; 332 | }, nominalUrl?: URL) { 333 | console.log('Formatting snapshot', { mode, url: nominalUrl?.toString() }); 334 | const host = this.threadLocal.get('host') || '192.168.178.100:1337'; 335 | 336 | if (mode === 'screenshot') { 337 | if (snapshot.screenshot && !snapshot.screenshotUrl) { 338 | console.log('Saving screenshot'); 339 | const fileName = `screenshot-${randomUUID()}.png`; 340 | const filePath = await this.saveFileLocally(fileName, snapshot.screenshot); 341 | snapshot.screenshotUrl = `http://${host}/instant-screenshots/${fileName}`; 342 | console.log('Screenshot saved and URL generated', { screenshotUrl: snapshot.screenshotUrl }); 343 | } 344 | 345 | return { 346 | ...this.getGeneralSnapshotMixins(snapshot), 347 | screenshotUrl: snapshot.screenshotUrl, 348 | toString() { 349 | return this.screenshotUrl; 350 | } 351 | } as FormattedPage; 352 | } 353 | if (mode === 'pageshot') { 354 | if (snapshot.pageshot && !snapshot.pageshotUrl) { 355 | console.log('Saving pageshot'); 356 | const fileName = `pageshot-${randomUUID()}.png`; 357 | const filePath = await this.saveFileLocally(fileName, snapshot.pageshot); 358 | snapshot.pageshotUrl = `http://${host}/instant-screenshots/${fileName}`; 359 | console.log('Pageshot saved and URL generated', { pageshotUrl: 
snapshot.pageshotUrl }); 360 | } 361 | 362 | return { 363 | ...this.getGeneralSnapshotMixins(snapshot), 364 | html: snapshot.html, 365 | pageshotUrl: snapshot.pageshotUrl, 366 | toString() { 367 | return this.pageshotUrl; 368 | } 369 | } as FormattedPage; 370 | } 371 | if (mode === 'html') { 372 | console.log('Formatting as HTML'); 373 | return { 374 | ...this.getGeneralSnapshotMixins(snapshot), 375 | html: snapshot.html, 376 | toString() { 377 | return this.html; 378 | } 379 | } as FormattedPage; 380 | } 381 | 382 | let pdfMode = false; 383 | 384 | if (mode === 'text') { 385 | console.log('Formatting as text'); 386 | return { 387 | ...this.getGeneralSnapshotMixins(snapshot), 388 | text: snapshot.text, 389 | toString() { 390 | return this.text; 391 | } 392 | } as FormattedPage; 393 | } 394 | const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl')); 395 | 396 | let contentText = ''; 397 | const imageSummary = {} as { [k: string]: string; }; 398 | const imageIdxTrack = new Map(); 399 | do { 400 | if (pdfMode) { 401 | console.log('PDF mode detected'); 402 | contentText = snapshot.parsed?.content || snapshot.text; 403 | break; 404 | } 405 | 406 | if ( 407 | snapshot.maxElemDepth! > 256 || 408 | snapshot.elemCount! 
> 70_000 409 | ) { 410 | console.log('Degrading to text to protect the server'); 411 | this.logger.warn('Degrading to text to protect the server', { url: snapshot.href }); 412 | contentText = snapshot.text; 413 | break; 414 | } 415 | 416 | console.log('Processing HTML content'); 417 | const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href); 418 | let toBeTurnedToMd = jsDomElementOfHTML; 419 | let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); 420 | if (mode !== 'markdown' && snapshot.parsed?.content) { 421 | console.log('Processing parsed content for non-markdown mode'); 422 | const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); 423 | console.log('Created jsDomElementOfParsed'); 424 | const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); 425 | console.log('Generated par1 from jsDomElementOfHTML'); 426 | const par2 = snapshot.parsed.content ? 
this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : ''; 427 | console.log('Generated par2 from jsDomElementOfParsed'); 428 | 429 | // If Readability did its job 430 | if (par2.length >= 0.3 * par1.length) { 431 | console.log('Readability seems to have done its job, adjusting turnDownService'); 432 | turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); 433 | if (snapshot.parsed.content) { 434 | console.log('Using parsed content for toBeTurnedToMd'); 435 | toBeTurnedToMd = jsDomElementOfParsed; 436 | } 437 | } else { 438 | console.log('Readability output not sufficient, using original HTML'); 439 | } 440 | } else { 441 | console.log('Skipping parsed content processing'); 442 | } 443 | 444 | for (const plugin of this.turnDownPlugins) { 445 | turnDownService = turnDownService.use(plugin); 446 | } 447 | const urlToAltMap: { [k: string]: string | undefined; } = {}; 448 | if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) { 449 | const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => { 450 | const r = "ALT TEXT!!!" 
451 | if (r && x.src) { 452 | urlToAltMap[x.src.trim()] = r; 453 | } 454 | }); 455 | 456 | await Promise.all(tasks); 457 | } 458 | let imgIdx = 0; 459 | turnDownService.addRule('img-generated-alt', { 460 | filter: 'img', 461 | replacement: (_content, node: any) => { 462 | let linkPreferredSrc = (node.getAttribute('src') || '').trim(); 463 | if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) { 464 | const dataSrc = (node.getAttribute('data-src') || '').trim(); 465 | if (dataSrc && !dataSrc.startsWith('data:')) { 466 | linkPreferredSrc = dataSrc; 467 | } 468 | } 469 | 470 | let src; 471 | try { 472 | src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString(); 473 | } catch (_err) { 474 | void 0; 475 | } 476 | const alt = cleanAttribute(node.getAttribute('alt')); 477 | if (!src) { 478 | return ''; 479 | } 480 | const mapped = urlToAltMap[src]; 481 | const imgSerial = ++imgIdx; 482 | const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : []; 483 | idxArr.push(imgSerial); 484 | imageIdxTrack.set(src, idxArr); 485 | 486 | if (mapped) { 487 | imageSummary[src] = mapped || alt; 488 | 489 | if (src?.startsWith('data:') && imgDataUrlToObjectUrl) { 490 | const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`); 491 | mappedUrl.protocol = 'blob:'; 492 | 493 | return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`; 494 | } 495 | 496 | return `![Image ${imgIdx}: ${mapped || alt}](${src})`; 497 | } 498 | 499 | imageSummary[src] = alt || ''; 500 | 501 | if (src?.startsWith('data:') && imgDataUrlToObjectUrl) { 502 | const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`); 503 | mappedUrl.protocol = 'blob:'; 504 | 505 | return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`; 506 | } 507 | 508 | return alt ? 
`![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`; 509 | } 510 | }); 511 | 512 | if (toBeTurnedToMd) { 513 | try { 514 | contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); 515 | } catch (err) { 516 | this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); 517 | const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); 518 | try { 519 | contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); 520 | } catch (err2) { 521 | this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); 522 | } 523 | } 524 | } 525 | 526 | if ( 527 | !contentText || (contentText.startsWith('<') && contentText.endsWith('>')) 528 | && toBeTurnedToMd !== jsDomElementOfHTML 529 | ) { 530 | try { 531 | contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); 532 | } catch (err) { 533 | this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); 534 | const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); 535 | try { 536 | contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); 537 | } catch (err2) { 538 | this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); 539 | } 540 | } 541 | } 542 | if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) { 543 | contentText = snapshot.text; 544 | } 545 | } while (false); 546 | 547 | const cleanText = (contentText || '').trim(); 548 | 549 | const formatted: FormattedPage = { 550 | title: (snapshot.parsed?.title || snapshot.title || '').trim(), 551 | url: nominalUrl?.toString() || snapshot.href?.trim(), 552 | content: cleanText, 553 | publishedTime: snapshot.parsed?.publishedTime || undefined, 554 | 555 | toString() { 556 | if (mode === 'markdown') { 557 | return this.content as string; 558 | } 559 | 560 | const mixins: 
string[] = []; 561 | if (this.publishedTime) { 562 | mixins.push(`Published Time: ${this.publishedTime}`); 563 | } 564 | const suffixMixins: string[] = []; 565 | if (this.images) { 566 | const imageSummaryChunks: string[] = ['Images:']; 567 | for (const [k, v] of Object.entries(this.images)) { 568 | imageSummaryChunks.push(`- ![${k}](${v})`); 569 | } 570 | if (imageSummaryChunks.length === 1) { 571 | imageSummaryChunks.push('This page does not seem to contain any images.'); 572 | } 573 | suffixMixins.push(imageSummaryChunks.join('\n')); 574 | } 575 | if (this.links) { 576 | const linkSummaryChunks = ['Links/Buttons:']; 577 | for (const [k, v] of Object.entries(this.links)) { 578 | linkSummaryChunks.push(`- [${k}](${v})`); 579 | } 580 | if (linkSummaryChunks.length === 1) { 581 | linkSummaryChunks.push('This page does not seem to contain any buttons/links.'); 582 | } 583 | suffixMixins.push(linkSummaryChunks.join('\n')); 584 | } 585 | 586 | return `Title: ${this.title} 587 | 588 | URL Source: ${this.url} 589 | ${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''} 590 | Markdown Content: 591 | ${this.content} 592 | ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; 593 | } 594 | }; 595 | 596 | if (this.threadLocal.get('withImagesSummary')) { 597 | formatted.images = 598 | _(imageSummary) 599 | .toPairs() 600 | .map( 601 | ([url, alt], i) => { 602 | return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? 
`: ${alt}` : ''}`, url]; 603 | } 604 | ).fromPairs() 605 | .value(); 606 | } 607 | if (this.threadLocal.get('withLinksSummary')) { 608 | formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {}); 609 | } 610 | 611 | return formatted as FormattedPage; 612 | } 613 | 614 | async crawl(req: Request, res: Response) { 615 | this.logger.info(`Crawl request received for URL: ${req.url}`); 616 | console.log('Crawl method called with request:', req.url); 617 | const ctx = { req, res }; 618 | console.log(`req.headers: ${JSON.stringify(req.headers)}`); 619 | 620 | try { 621 | const crawlerOptionsHeaderOnly = CrawlerOptionsHeaderOnly.from(req); 622 | const crawlerOptionsParamsAllowed = CrawlerOptions.from(req.method === 'POST' ? req.body : req.query, req); 623 | const noSlashURL = ctx.req.url.slice(1); 624 | const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed; 625 | console.log('Crawler options:', crawlerOptions); 626 | 627 | // Check if the request is for a screenshot 628 | if (noSlashURL.startsWith('instant-screenshots/')) { 629 | return this.serveScreenshot(noSlashURL, res); 630 | } 631 | 632 | // Handle favicon.ico request 633 | if (noSlashURL === 'favicon.ico') { 634 | console.log('Favicon request detected'); 635 | return sendResponse(res, 'Favicon not available', { contentType: 'text/plain', envelope: null, code: 404 }); 636 | } 637 | 638 | // Extract the actual URL to crawl 639 | const urlToCrawl = noSlashURL.startsWith('http') ? 
noSlashURL : `http://${noSlashURL}`; 640 | 641 | // Validate URL 642 | let parsedUrl: URL; 643 | try { 644 | parsedUrl = new URL(urlToCrawl); 645 | if (!['http:', 'https:'].includes(parsedUrl.protocol)) { 646 | throw new Error('Invalid protocol'); 647 | } 648 | // Check if the TLD is valid 649 | if (!this.isValidTLD(parsedUrl.hostname)) { 650 | throw new Error('Invalid TLD'); 651 | } 652 | } catch (error) { 653 | console.log('Invalid URL:', urlToCrawl, error); 654 | return sendResponse(res, 'Invalid URL or TLD', { contentType: 'text/plain', envelope: null, code: 400 }); 655 | } 656 | 657 | // Prevent circular crawling 658 | this.puppeteerControl.circuitBreakerHosts.add(ctx.req.hostname.toLowerCase()); 659 | console.log('Added to circuit breaker hosts:', ctx.req.hostname.toLowerCase()); 660 | 661 | const crawlOpts = this.configure(crawlerOptions, req, parsedUrl); 662 | console.log('Configured crawl options:', crawlOpts); 663 | 664 | let lastScrapped: PageSnapshot | undefined; 665 | 666 | try { 667 | for await (const scrapped of this.scrap(parsedUrl, crawlOpts, crawlerOptions)) { 668 | lastScrapped = scrapped; 669 | if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped.title?.trim()) && !scrapped?.pdfs?.length)) { 670 | continue; 671 | } 672 | 673 | const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, parsedUrl); 674 | 675 | if (crawlerOptions.timeout === undefined) { 676 | return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith); 677 | } 678 | } 679 | } catch (scrapError: any) { 680 | console.error('Error during scraping:', scrapError); 681 | if (scrapError instanceof AssertionFailureError && 682 | (scrapError.message.includes('Invalid TLD') || scrapError.message.includes('ERR_NAME_NOT_RESOLVED'))) { 683 | const errorSnapshot: PageSnapshot = { 684 | title: 'Error: Invalid domain or TLD', 685 | href: parsedUrl.toString(), 686 | html: '', 687 | text: `Failed to access the page due to an 
invalid domain or TLD: ${parsedUrl.toString()}`, 688 | error: 'Invalid domain or TLD' 689 | }; 690 | const formatted = await this.formatSnapshot(crawlerOptions.respondWith, errorSnapshot, parsedUrl); 691 | return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith); 692 | } 693 | throw scrapError; // Re-throw if it's not a handled error 694 | } 695 | 696 | if (!lastScrapped) { 697 | return sendResponse(res, 'No content available', { contentType: 'text/plain', envelope: null, code: 404 }); 698 | } 699 | 700 | const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, parsedUrl); 701 | return this.sendFormattedResponse(res, formatted, crawlerOptions.respondWith); 702 | 703 | } catch (error) { 704 | console.error('Error in crawl method:', error); 705 | return sendResponse(res, 'Internal server error', { contentType: 'text/plain', envelope: null, code: 500 }); 706 | } 707 | } 708 | 709 | private isValidTLD(hostname: string): boolean { 710 | const parts = hostname.split('.'); 711 | return parts.length > 1 && parts[parts.length - 1].length >= 2; 712 | } 713 | 714 | private serveScreenshot(screenshotPath: string, res: Response) { 715 | const fullPath = path.join('/app', 'local-storage', screenshotPath); 716 | console.log(`Attempting to serve screenshot from: ${fullPath}`); 717 | if (fs.existsSync(fullPath)) { 718 | return res.sendFile(fullPath); 719 | } else { 720 | console.log(`Screenshot not found: ${fullPath}`); 721 | return sendResponse(res, 'Screenshot not found', { contentType: 'text/plain', envelope: null, code: 404 }); 722 | } 723 | } 724 | 725 | private sendFormattedResponse(res: Response, formatted: any, respondWith: string) { 726 | if (respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { 727 | return sendResponse(res, `${formatted}`, 728 | { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } 729 | ); 730 | } 731 | if (respondWith === 'pageshot' && 
Reflect.get(formatted, 'pageshotUrl')) { 732 | return sendResponse(res, `${formatted}`, 733 | { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } 734 | ); 735 | } 736 | return sendResponse(res, `${formatted}`, { contentType: 'text/plain', envelope: null }); 737 | } 738 | 739 | getUrlDigest(urlToCrawl: URL) { 740 | const normalizedURL = new URL(urlToCrawl); 741 | if (!normalizedURL.hash.startsWith('#/')) { 742 | normalizedURL.hash = ''; 743 | } 744 | const normalizedUrl = normalizedURL.toString().toLowerCase(); 745 | const digest = md5Hasher.hash(normalizedUrl.toString()); 746 | 747 | return digest; 748 | } 749 | 750 | async *scrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) { 751 | this.logger.info(`Starting scrap for URL: ${urlToCrawl.toString()}`); 752 | console.log('Starting scrap for URL:', urlToCrawl.toString()); 753 | console.log('Crawl options:', crawlOpts); 754 | console.log('Crawler options:', crawlerOpts); 755 | 756 | if (crawlerOpts?.html) { 757 | console.log('Using provided HTML'); 758 | const fakeSnapshot = { 759 | href: urlToCrawl.toString(), 760 | html: crawlerOpts.html, 761 | title: '', 762 | text: '', 763 | } as PageSnapshot; 764 | 765 | yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts); 766 | 767 | return; 768 | } 769 | 770 | if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) { 771 | console.log('Using custom selectors or iframe'); 772 | for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { 773 | console.log('Narrowing snapshot'); 774 | yield this.jsdomControl.narrowSnapshot(x, crawlOpts); 775 | } 776 | 777 | return; 778 | } 779 | 780 | console.log('Using default scraping method'); 781 | yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts); 782 | } 783 | 784 | 785 | 786 | async *scrapMany(urls: URL[], options?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) { 787 | const iterators = 
urls.map((url) => this.scrap(url, options, crawlerOpts)); 788 | 789 | const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined); 790 | 791 | let nextDeferred = Defer(); 792 | let concluded = false; 793 | 794 | const handler = async (it: AsyncGenerator, idx: number) => { 795 | try { 796 | for await (const x of it) { 797 | results[idx] = x; 798 | 799 | if (x) { 800 | nextDeferred.resolve(); 801 | nextDeferred = Defer(); 802 | } 803 | 804 | } 805 | } catch (err: any) { 806 | this.logger.warn(`Failed to scrap ${urls[idx]}`, { err: marshalErrorLike(err) }); 807 | } 808 | }; 809 | 810 | Promise.all( 811 | iterators.map((it, idx) => handler(it, idx)) 812 | ).finally(() => { 813 | concluded = true; 814 | nextDeferred.resolve(); 815 | }); 816 | 817 | yield results; 818 | 819 | try { 820 | while (!concluded) { 821 | await nextDeferred.promise; 822 | 823 | yield results; 824 | } 825 | } finally { 826 | for (const x of iterators) { 827 | x.return(); 828 | } 829 | } 830 | } 831 | 832 | configure(opts: CrawlerOptions, req: Request, urlToCrawl: URL) { 833 | 834 | this.threadLocal.set('withGeneratedAlt', opts.withGeneratedAlt); 835 | this.threadLocal.set('withLinksSummary', opts.withLinksSummary); 836 | this.threadLocal.set('withImagesSummary', opts.withImagesSummary); 837 | this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl); 838 | this.threadLocal.set('cacheTolerance', opts.cacheTolerance); 839 | this.threadLocal.set('userAgent', opts.userAgent); 840 | this.threadLocal.set('host', req.headers.host || '192.168.178.100:1337'); 841 | if (opts.timeout) { 842 | this.threadLocal.set('timeout', opts.timeout * 1000); 843 | } 844 | 845 | const cookies = req.headers['x-set-cookie'] ? 846 | (Array.isArray(req.headers['x-set-cookie']) ? 
req.headers['x-set-cookie'] : [req.headers['x-set-cookie']]) 847 | .map(cookie => { 848 | const [name, value] = cookie.split('='); 849 | return { name, value, url: urlToCrawl.toString() }; 850 | }) 851 | : []; 852 | 853 | console.log('Cookies:', cookies); 854 | const crawlOpts: ExtraScrappingOptions = { 855 | proxyUrl: opts.proxyUrl, 856 | cookies: cookies, 857 | favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith), 858 | removeSelector: opts.removeSelector, 859 | targetSelector: opts.targetSelector, 860 | waitForSelector: opts.waitForSelector, 861 | overrideUserAgent: opts.userAgent, 862 | timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, 863 | withIframe: opts.withIframe, 864 | }; 865 | 866 | return crawlOpts; 867 | } 868 | 869 | async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) { 870 | const it = this.scrap(url, { ...opts, minIntervalMs: 500 }); 871 | 872 | let lastSnapshot; 873 | let goodEnough = false; 874 | try { 875 | for await (const x of it) { 876 | lastSnapshot = x; 877 | 878 | if (goodEnough) { 879 | break; 880 | } 881 | 882 | if (lastSnapshot?.parsed?.content) { 883 | // After it's good enough, wait for next snapshot; 884 | goodEnough = true; 885 | } 886 | } 887 | 888 | } catch (err) { 889 | if (lastSnapshot) { 890 | return this.formatSnapshot(mode, lastSnapshot, url); 891 | } 892 | 893 | throw err; 894 | } 895 | 896 | if (!lastSnapshot) { 897 | throw new AssertionFailureError(`No content available`); 898 | } 899 | 900 | return this.formatSnapshot(mode, lastSnapshot, url); 901 | } 902 | 903 | async saveFileLocally(fileName: string, content: Buffer): Promise { 904 | const localDir = path.join('/app', 'local-storage', 'instant-screenshots'); 905 | console.log(`Attempting to save file in directory: ${localDir}`); 906 | try { 907 | if (!fs.existsSync(localDir)) { 908 | console.log(`Directory ${localDir} does not exist. 
Creating it.`); 909 | fs.mkdirSync(localDir, { recursive: true }); 910 | } 911 | const filePath = path.join(localDir, fileName); 912 | console.log(`Writing file to: ${filePath}`); 913 | await fs.promises.writeFile(filePath, content); 914 | console.log(`File successfully written to: ${filePath}`); 915 | return filePath; 916 | } catch (error) { 917 | console.error(`Error saving file locally: ${error}`); 918 | throw error; 919 | } 920 | } 921 | } -------------------------------------------------------------------------------- /backend/functions/src/db/crawled.ts: -------------------------------------------------------------------------------- 1 | import { Also, parseJSONText, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | import type { PageSnapshot } from '../services/puppeteer'; 5 | 6 | @Also({ 7 | dictOf: Object 8 | }) 9 | export class Crawled extends FirestoreRecord { 10 | static override collectionName = 'crawled'; 11 | 12 | override _id!: string; 13 | 14 | @Prop({ 15 | required: true 16 | }) 17 | url!: string; 18 | 19 | @Prop({ 20 | required: true 21 | }) 22 | urlPathDigest!: string; 23 | 24 | @Prop() 25 | snapshot?: PageSnapshot & { screenshot: never; pageshot: never; }; 26 | 27 | @Prop() 28 | screenshotAvailable?: boolean; 29 | 30 | @Prop() 31 | pageshotAvailable?: boolean; 32 | 33 | @Prop() 34 | snapshotAvailable?: boolean; 35 | 36 | @Prop() 37 | createdAt!: Date; 38 | 39 | @Prop() 40 | expireAt!: Date; 41 | 42 | static patchedFields = [ 43 | 'snapshot' 44 | ]; 45 | 46 | static override from(input: any) { 47 | for (const field of this.patchedFields) { 48 | if (typeof input[field] === 'string') { 49 | input[field] = parseJSONText(input[field]); 50 | } 51 | } 52 | 53 | return super.from(input) as Crawled; 54 | } 55 | 56 | override degradeForFireStore() { 57 | const copy: any = { ...this }; 58 | 59 | for (const field of (this.constructor as typeof Crawled).patchedFields) { 60 | if (typeof 
copy[field] === 'object') { 61 | copy[field] = JSON.stringify(copy[field]) as any; 62 | } 63 | } 64 | 65 | return copy; 66 | } 67 | 68 | [k: string]: any; 69 | } 70 | -------------------------------------------------------------------------------- /backend/functions/src/db/domain-blockade.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | 4 | @Also({ 5 | dictOf: Object 6 | }) 7 | export class DomainBlockade extends FirestoreRecord { 8 | static override collectionName = 'domainBlockades'; 9 | 10 | override _id!: string; 11 | 12 | @Prop({ 13 | required: true 14 | }) 15 | domain!: string; 16 | 17 | @Prop({ required: true }) 18 | triggerReason!: string; 19 | 20 | @Prop() 21 | triggerUrl?: string; 22 | 23 | @Prop() 24 | createdAt!: Date; 25 | 26 | @Prop() 27 | expireAt?: Date; 28 | 29 | [k: string]: any; 30 | } 31 | -------------------------------------------------------------------------------- /backend/functions/src/db/img-alt.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class ImgAlt extends FirestoreRecord { 9 | static override collectionName = 'imgAlts'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | src!: string; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | urlDigest!: string; 22 | 23 | @Prop() 24 | width?: number; 25 | 26 | @Prop() 27 | height?: number; 28 | 29 | @Prop() 30 | generatedAlt?: string; 31 | 32 | @Prop() 33 | originalAlt?: string; 34 | 35 | @Prop() 36 | createdAt!: Date; 37 | 38 | @Prop() 39 | expireAt?: Date; 40 | 41 | [k: string]: any; 42 | } 43 | -------------------------------------------------------------------------------- 
/backend/functions/src/db/pdf.ts: -------------------------------------------------------------------------------- 1 | import { Also, Prop, parseJSONText } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class PDFContent extends FirestoreRecord { 9 | static override collectionName = 'pdfs'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 | required: true 15 | }) 16 | src!: string; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | urlDigest!: string; 22 | 23 | @Prop() 24 | meta?: { [k: string]: any; }; 25 | 26 | @Prop() 27 | text?: string; 28 | 29 | @Prop() 30 | content?: string; 31 | 32 | @Prop() 33 | createdAt!: Date; 34 | 35 | @Prop() 36 | expireAt?: Date; 37 | 38 | static patchedFields = [ 39 | 'meta' 40 | ]; 41 | 42 | static override from(input: any) { 43 | for (const field of this.patchedFields) { 44 | if (typeof input[field] === 'string') { 45 | input[field] = parseJSONText(input[field]); 46 | } 47 | } 48 | 49 | return super.from(input) as PDFContent; 50 | } 51 | 52 | override degradeForFireStore() { 53 | const copy: any = { ...this }; 54 | 55 | for (const field of (this.constructor as typeof PDFContent).patchedFields) { 56 | if (typeof copy[field] === 'object') { 57 | copy[field] = JSON.stringify(copy[field]) as any; 58 | } 59 | } 60 | 61 | return copy; 62 | } 63 | 64 | [k: string]: any; 65 | } 66 | -------------------------------------------------------------------------------- /backend/functions/src/db/searched.ts: -------------------------------------------------------------------------------- 1 | import { Also, parseJSONText, Prop } from 'civkit'; 2 | import { FirestoreRecord } from '../shared/lib/firestore'; 3 | import _ from 'lodash'; 4 | 5 | @Also({ 6 | dictOf: Object 7 | }) 8 | export class SearchResult extends FirestoreRecord { 9 | static override collectionName = 'searchResults'; 10 | 11 | override _id!: string; 12 | 13 | @Prop({ 14 
| required: true 15 | }) 16 | query!: any; 17 | 18 | @Prop({ 19 | required: true 20 | }) 21 | queryDigest!: string; 22 | 23 | @Prop() 24 | response?: any; 25 | 26 | @Prop() 27 | createdAt!: Date; 28 | 29 | @Prop() 30 | expireAt?: Date; 31 | 32 | [k: string]: any; 33 | 34 | static patchedFields = [ 35 | 'query', 36 | 'response', 37 | ]; 38 | 39 | static override from(input: any) { 40 | for (const field of this.patchedFields) { 41 | if (typeof input[field] === 'string') { 42 | input[field] = parseJSONText(input[field]); 43 | } 44 | } 45 | 46 | return super.from(input) as SearchResult; 47 | } 48 | 49 | override degradeForFireStore() { 50 | const copy: any = { ...this }; 51 | 52 | for (const field of (this.constructor as typeof SearchResult).patchedFields) { 53 | if (typeof copy[field] === 'object') { 54 | copy[field] = JSON.stringify(copy[field]) as any; 55 | } 56 | } 57 | 58 | return copy; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /backend/functions/src/dto/scrapping-options.ts: -------------------------------------------------------------------------------- 1 | import { Also, AutoCastable, Prop, AutoCastableMetaClass, Constructor } from 'civkit'; // Adjust the import based on where your decorators are defined 2 | import type { Request, Response } from 'express'; 3 | import type { CookieParam } from 'puppeteer'; 4 | import { parseString as parseSetCookieString } from 'set-cookie-parser'; 5 | 6 | 7 | @Also({ 8 | openapi: { 9 | operation: { 10 | parameters: { 11 | 'Accept': { 12 | description: `Specifies your preference for the response format.\n\n` + 13 | `Supported formats: \n` + 14 | `- text/event-stream\n` + 15 | `- application/json or text/json\n` + 16 | `- text/plain` 17 | , 18 | in: 'header', 19 | schema: { type: 'string' } 20 | }, 21 | 'X-Cache-Tolerance': { 22 | description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`, 23 | in: 'header', 24 | schema: { type: 
'string' } 25 | }, 26 | 'X-No-Cache': { 27 | description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`, 28 | in: 'header', 29 | schema: { type: 'string' } 30 | }, 31 | 'X-Respond-With': { 32 | description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` + 33 | `Supported formats: \n` + 34 | `- markdown\n` + 35 | `- html\n` + 36 | `- text\n` + 37 | `- pageshot\n` + 38 | `- screenshot\n` 39 | , 40 | in: 'header', 41 | schema: { type: 'string' } 42 | }, 43 | 'X-Wait-For-Selector': { 44 | description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` + 45 | 'Example: `X-Wait-For-Selector: .content-block`\n' 46 | , 47 | in: 'header', 48 | schema: { type: 'string' } 49 | }, 50 | 'X-Target-Selector': { 51 | description: `Specifies a CSS selector for return target instead of the full html.\n\n` + 52 | 'Implies `X-Wait-For-Selector: (same selector)`' 53 | , 54 | in: 'header', 55 | schema: { type: 'string' } 56 | }, 57 | 'X-Remove-Selector': { 58 | description: `Specifies a CSS selector to remove elements from the full html.\n\n` + 59 | 'Example `X-Remove-Selector: nav`' 60 | , 61 | in: 'header', 62 | schema: { type: 'string' } 63 | }, 64 | 'X-Keep-Img-Data-Url': { 65 | description: `Keep data-url as it instead of transforming them to object-url. (Only applicable when targeting markdown format)\n\n` + 66 | 'Example `X-Keep-Img-Data-Url: true`' 67 | , 68 | in: 'header', 69 | schema: { type: 'string' } 70 | }, 71 | 'X-Proxy-Url': { 72 | description: `Specifies your custom proxy if you prefer to use one.\n\n` + 73 | `Supported protocols: \n` + 74 | `- http\n` + 75 | `- https\n` + 76 | `- socks4\n` + 77 | `- socks5\n\n` + 78 | `For authentication, https://user:pass@host:port`, 79 | in: 'header', 80 | schema: { type: 'string' } 81 | }, 82 | 'X-Set-Cookie': { 83 | description: `Sets cookie(s) to the headless browser for your request. 
\n\n` + 84 | `Syntax is the same with standard Set-Cookie`, 85 | in: 'header', 86 | schema: { type: 'string' } 87 | }, 88 | 'X-With-Generated-Alt': { 89 | description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` + 90 | `Note: Does not work when \`X-Respond-With\` is specified`, 91 | in: 'header', 92 | schema: { type: 'string' } 93 | }, 94 | 'X-With-Images-Summary': { 95 | description: `Enable dedicated summary section for images on the page.`, 96 | in: 'header', 97 | schema: { type: 'string' } 98 | }, 99 | 'X-With-links-Summary': { 100 | description: `Enable dedicated summary section for hyper links on the page.`, 101 | in: 'header', 102 | schema: { type: 'string' } 103 | }, 104 | 'X-User-Agent': { 105 | description: `Override User-Agent.`, 106 | in: 'header', 107 | schema: { type: 'string' } 108 | }, 109 | 'X-Timeout': { 110 | description: `Specify timeout in seconds. Max 180.`, 111 | in: 'header', 112 | schema: { type: 'string' } 113 | }, 114 | } 115 | } 116 | } 117 | }) 118 | export class CrawlerOptions extends AutoCastable implements AutoCastableMetaClass { 119 | 120 | @Prop() 121 | url?: string; 122 | 123 | @Prop() 124 | html?: string; 125 | 126 | @Prop({ 127 | default: 'default', 128 | }) 129 | respondWith!: string; 130 | 131 | @Prop({ 132 | default: false, 133 | }) 134 | withGeneratedAlt!: boolean; 135 | 136 | @Prop({ 137 | default: false, 138 | }) 139 | withLinksSummary!: boolean; 140 | 141 | @Prop({ 142 | default: false, 143 | }) 144 | withImagesSummary!: boolean; 145 | 146 | @Prop({ 147 | default: false, 148 | }) 149 | noCache!: boolean; 150 | 151 | @Prop() 152 | cacheTolerance?: number; 153 | 154 | @Prop({ arrayOf: String }) 155 | targetSelector?: string | string[]; 156 | 157 | @Prop({ arrayOf: String }) 158 | waitForSelector?: string | string[]; 159 | 160 | @Prop({ arrayOf: String }) 161 | removeSelector?: string | string[]; 162 | 163 | @Prop({ 164 | default: false, 165 | }) 166 | keepImgDataUrl!: boolean; 167 | 
168 |     @Prop({
169 |         default: false,
170 |     })
171 |     withIframe!: boolean;
172 | 
173 |     @Prop({
174 |         arrayOf: String,
175 |     })
176 |     setCookies?: CookieParam[];
177 | 
178 |     @Prop()
179 |     proxyUrl?: string;
180 | 
181 |     @Prop()
182 |     userAgent?: string;
183 | 
184 |     @Prop({
185 |         validate: (v: number) => v > 0 && v <= 180,
186 |         type: Number,
187 |         nullable: true,
188 |     })
189 |     timeout?: number | null;
190 | 
191 |     static override from(this: Constructor, input: any, ...args: any[]): T { // merges body options with X-* header overrides from the request
192 |         const instance = super.from(input, ...args) as T;
193 |         const req = args[0] as Request | undefined;
194 | 
195 |         if (req) {
196 |             console.log('Request headers:', req.headers);
197 | 
198 |             const getHeader = (name: string): string | undefined => {
199 |                 const value = req.headers[name.toLowerCase()];
200 |                 return Array.isArray(value) ? value[0] : value;
201 |             };
202 | 
203 |             const customMode = getHeader('X-Respond-With') || getHeader('X-Return-Format');
204 |             if (customMode) {
205 |                 instance.respondWith = customMode;
206 |             }
207 | 
208 |             const withGeneratedAlt = getHeader('X-With-Generated-Alt');
209 |             if (withGeneratedAlt !== undefined) {
210 |                 instance.withGeneratedAlt = withGeneratedAlt.toLowerCase() === 'true';
211 |             }
212 | 
213 |             const withLinksSummary = getHeader('x-with-links-summary');
214 |             if (withLinksSummary !== undefined) {
215 |                 instance.withLinksSummary = !['', '0', 'false'].includes(withLinksSummary.toLowerCase()); // Boolean('false') === true — an explicit "false"/"0" must turn the flag OFF
216 |             }
217 | 
218 |             const withImagesSummary = getHeader('x-with-images-summary');
219 |             if (withImagesSummary !== undefined) {
220 |                 instance.withImagesSummary = !['', '0', 'false'].includes(withImagesSummary.toLowerCase()); // same fix as links summary
221 |             }
222 | 
223 |             const noCache = getHeader('x-no-cache');
224 |             if (noCache !== undefined) {
225 |                 instance.noCache = !['', '0', 'false'].includes(noCache.toLowerCase()); // previously "X-No-Cache: false" force-ENABLED no-cache
226 |             }
227 | 
228 |             if (instance.noCache && instance.cacheTolerance === undefined) {
229 |                 instance.cacheTolerance = 0;
230 |             }
231 | 
232 |             let cacheTolerance = parseInt(getHeader('x-cache-tolerance') || '');
233 |             if (!isNaN(cacheTolerance)) {
234 | 
instance.cacheTolerance = cacheTolerance;
235 |             }
236 | 
237 |             let timeoutSeconds = parseInt(getHeader('x-timeout') || '');
238 |             if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
239 |                 instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
240 |             } else if (getHeader('x-timeout')) {
241 |                 instance.timeout = null;
242 |             }
243 | 
244 |             const removeSelector = getHeader('x-remove-selector')?.split(', ');
245 |             instance.removeSelector ??= removeSelector;
246 | 
247 |             const targetSelector = getHeader('x-target-selector')?.split(', ');
248 |             instance.targetSelector ??= targetSelector;
249 | 
250 |             const waitForSelector = getHeader('x-wait-for-selector')?.split(', ');
251 |             instance.waitForSelector ??= waitForSelector || instance.targetSelector;
252 | 
253 |             instance.targetSelector = filterSelector(instance.targetSelector);
254 | 
255 |             const overrideUserAgent = getHeader('x-user-agent');
256 |             instance.userAgent ??= overrideUserAgent;
257 | 
258 |             const keepImgDataUrl = getHeader('x-keep-img-data-url');
259 |             if (keepImgDataUrl !== undefined) {
260 |                 instance.keepImgDataUrl = !['', '0', 'false'].includes(keepImgDataUrl.toLowerCase()); // Boolean('false') === true — an explicit "false"/"0" must disable the flag
261 |             }
262 | 
263 |             const withIframe = getHeader('x-with-iframe');
264 |             if (withIframe !== undefined) {
265 |                 instance.withIframe = !['', '0', 'false'].includes(withIframe.toLowerCase()); // same fix: "X-With-Iframe: false" must not enable iframes
266 |             }
267 | 
268 |             if (instance.withIframe) {
269 |                 instance.timeout ??= null;
270 |             }
271 | 
272 |             const cookies: CookieParam[] = [];
273 |             const setCookieHeaders = getHeader('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
274 |             if (Array.isArray(setCookieHeaders)) {
275 |                 for (const setCookie of setCookieHeaders) {
276 |                     cookies.push({
277 |                         ...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
278 |                     });
279 |                 }
280 |             } else if (setCookieHeaders && typeof setCookieHeaders === 'string') {
281 |                 cookies.push({
282 |                     ...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
283 |                 });
284 |             }
285 | 
286 |             const proxyUrl = getHeader('x-proxy-url');
287
| instance.proxyUrl ??= proxyUrl; 288 | 289 | if (instance.cacheTolerance) { 290 | instance.cacheTolerance = instance.cacheTolerance * 1000; 291 | } 292 | } 293 | 294 | return instance; 295 | } 296 | } 297 | 298 | export class CrawlerOptionsHeaderOnly extends CrawlerOptions { 299 | static override from(this: Constructor, ...args: any[]): T { 300 | const req = args[0] as Request; 301 | return super.from({}, req) as T; 302 | } 303 | } 304 | 305 | function filterSelector(s?: string | string[]) { 306 | if (!s) { 307 | return s; 308 | } 309 | const sr = Array.isArray(s) ? s : [s]; 310 | const selectors = sr.filter((i)=> { 311 | const innerSelectors = i.split(',').map((s) => s.trim()); 312 | const someViolation = innerSelectors.find((x) => x.startsWith('*') || x.startsWith(':') || x.includes('*:')); 313 | if (someViolation) { 314 | return false; 315 | } 316 | return true; 317 | }) 318 | 319 | return selectors; 320 | }; 321 | -------------------------------------------------------------------------------- /backend/functions/src/fetch.d.ts: -------------------------------------------------------------------------------- 1 | declare global { 2 | export const { 3 | fetch, 4 | FormData, 5 | Headers, 6 | Request, 7 | Response, 8 | File, 9 | }: typeof import('undici'); 10 | export type { FormData, Headers, Request, RequestInit, Response, RequestInit, File } from 'undici'; 11 | } 12 | 13 | export { }; 14 | -------------------------------------------------------------------------------- /backend/functions/src/index.ts: -------------------------------------------------------------------------------- 1 | import 'reflect-metadata'; 2 | import { initializeApp } from 'firebase-admin/app'; 3 | import { CrawlerHost } from './cloud-functions/crawler'; 4 | import { runWith, https, HttpsFunction } from 'firebase-functions'; 5 | import { Logger } from './shared/logger'; 6 | import { container } from 'tsyringe'; 7 | import { PuppeteerControl } from './services/puppeteer'; 8 | import { 
JSDomControl } from './services/jsdom'; 9 | import { FirebaseStorageBucketControl } from './shared'; 10 | import { AsyncContext } from './shared'; 11 | 12 | initializeApp(); 13 | 14 | container.registerSingleton(Logger); 15 | container.registerSingleton(PuppeteerControl); 16 | container.registerSingleton(JSDomControl); 17 | container.registerSingleton(FirebaseStorageBucketControl); 18 | container.registerSingleton(AsyncContext); 19 | container.registerSingleton(CrawlerHost); 20 | 21 | const crawlerHost = container.resolve(CrawlerHost); 22 | export const crawler = runWith({ 23 | memory: '4GB', 24 | timeoutSeconds: 540, 25 | }).https.onRequest(async (req, res) => { 26 | await crawlerHost.crawl(req, res); 27 | }); 28 | 29 | export const helloWorld: HttpsFunction = https.onRequest((req, res) => { 30 | res.send('Hello World!'); 31 | }); 32 | 33 | 34 | process.on('unhandledRejection', (reason, promise) => { 35 | console.error('Unhandled Rejection at:', promise, 'reason:', reason); 36 | // Application specific logging, throwing an error, or other logic here 37 | }); 38 | 39 | process.on('uncaughtException', (err) => { 40 | console.error('Uncaught Exception:', err); 41 | 42 | // Looks like Firebase runtime does not handle error properly. 43 | // Make sure to quit the process. 
44 |     process.nextTick(() => process.exit(1));
45 |     console.error('Uncaught exception, process quit.');
46 |     throw err;
47 | });
48 | 
--------------------------------------------------------------------------------
/backend/functions/src/server.ts:
--------------------------------------------------------------------------------
 1 | import 'reflect-metadata';
 2 | import express from 'express';
 3 | import { container } from 'tsyringe';
 4 | import { CrawlerHost } from './cloud-functions/crawler';
 5 | import path from 'path';
 6 | 
 7 | const app = express();
 8 | const port = process.env.PORT || 3000;
 9 | 
10 | const crawlerHost = container.resolve(CrawlerHost);
11 | 
12 | app.use(express.json());
13 | 
14 | // Serve static files from the local-storage directory
15 | app.use('/instant-screenshots', express.static(path.join('/app', 'local-storage', 'instant-screenshots')));
16 | 
17 | app.all('*', async (req, res) => {
18 |     try {
19 |         await crawlerHost.crawl(req, res);
20 |     } catch (error: any) {
21 |         console.error('Error during crawl:', error);
22 | 
23 |         // Check the error type; guard with ?. — a non-Error throwable has no .message and would crash this handler
24 |         if (error?.message?.includes('Invalid TLD')) {
25 |             res.status(400).json({ error: 'Invalid URL or TLD' });
26 |         } else {
27 |             // Handle any other error
28 |             res.status(500).json({ error: 'An error occurred during the crawl' });
29 |         }
30 |     }
31 | });
32 | 
33 | app.listen(port, () => {
34 |     console.log(`Server is running on port ${port}`);
35 | });
36 | 
37 | export default app;
38 | 
--------------------------------------------------------------------------------
/backend/functions/src/services/geoip.ts:
--------------------------------------------------------------------------------
 1 | import { container, singleton } from 'tsyringe';
 2 | import fsp from 'fs/promises';
 3 | import { CityResponse, Reader } from 'maxmind';
 4 | import { AsyncService, AutoCastable, Prop, runOnce } from 'civkit';
 5 | import { Logger } from '../shared/index';
 6 | import path from 'path';
 7 | 
 8 | export enum
GEOIP_SUPPORTED_LANGUAGES { 9 | EN = 'en', 10 | ZH_CN = 'zh-CN', 11 | JA = 'ja', 12 | DE = 'de', 13 | FR = 'fr', 14 | ES = 'es', 15 | PT_BR = 'pt-BR', 16 | RU = 'ru', 17 | } 18 | 19 | export class GeoIPInfo extends AutoCastable { 20 | @Prop() 21 | code?: string; 22 | 23 | @Prop() 24 | name?: string; 25 | } 26 | 27 | export class GeoIPCountryInfo extends GeoIPInfo { 28 | @Prop() 29 | eu?: boolean; 30 | } 31 | 32 | export class GeoIPCityResponse extends AutoCastable { 33 | @Prop() 34 | continent?: GeoIPInfo; 35 | 36 | @Prop() 37 | country?: GeoIPCountryInfo; 38 | 39 | @Prop({ 40 | arrayOf: GeoIPInfo 41 | }) 42 | subdivisions?: GeoIPInfo[]; 43 | 44 | @Prop() 45 | city?: string; 46 | 47 | @Prop({ 48 | arrayOf: Number 49 | }) 50 | coordinates?: [number, number, number]; 51 | 52 | @Prop() 53 | timezone?: string; 54 | } 55 | 56 | @singleton() 57 | export class GeoIPService extends AsyncService { 58 | 59 | logger = new Logger('CHANGE_LOGGER_NAME') 60 | 61 | mmdbCity!: Reader; 62 | 63 | constructor( 64 | ) { 65 | super(...arguments); 66 | } 67 | 68 | 69 | override async init() { 70 | await this.dependencyReady(); 71 | 72 | this.emit('ready'); 73 | } 74 | 75 | @runOnce() 76 | async _lazyload() { 77 | const mmdpPath = path.resolve(__dirname, '..', '..', 'licensed', 'GeoLite2-City.mmdb'); 78 | 79 | const dbBuff = await fsp.readFile(mmdpPath, { flag: 'r', encoding: null }); 80 | 81 | this.mmdbCity = new Reader(dbBuff); 82 | 83 | this.logger.info(`Loaded GeoIP database, ${dbBuff.byteLength} bytes`); 84 | } 85 | 86 | 87 | async lookupCity(ip: string, lang: GEOIP_SUPPORTED_LANGUAGES = GEOIP_SUPPORTED_LANGUAGES.EN) { 88 | await this._lazyload(); 89 | 90 | const r = this.mmdbCity.get(ip); 91 | 92 | if (!r) { 93 | return undefined; 94 | } 95 | 96 | return GeoIPCityResponse.from({ 97 | continent: r.continent ? { 98 | code: r.continent?.code, 99 | name: r.continent?.names?.[lang] || r.continent?.names?.en, 100 | } : undefined, 101 | country: r.country ? 
{ 102 | code: r.country?.iso_code, 103 | name: r.country?.names?.[lang] || r.country?.names.en, 104 | eu: r.country?.is_in_european_union, 105 | } : undefined, 106 | city: r.city?.names?.[lang] || r.city?.names?.en, 107 | subdivisions: r.subdivisions?.map((x) => ({ 108 | code: x.iso_code, 109 | name: x.names?.[lang] || x.names?.en, 110 | })), 111 | coordinates: r.location ? [ 112 | r.location.latitude, r.location.longitude, r.location.accuracy_radius 113 | ] : undefined, 114 | timezone: r.location?.time_zone, 115 | }); 116 | } 117 | 118 | } 119 | 120 | const instance = container.resolve(GeoIPService); 121 | 122 | export default instance; 123 | -------------------------------------------------------------------------------- /backend/functions/src/services/jsdom.ts: -------------------------------------------------------------------------------- 1 | import { container, singleton } from 'tsyringe'; 2 | import { AsyncService, marshalErrorLike } from 'civkit'; 3 | import { Logger } from '../shared/index'; 4 | import { ExtendedSnapshot, PageSnapshot } from './puppeteer'; 5 | import { JSDOM, VirtualConsole } from 'jsdom'; 6 | import { Readability } from '@mozilla/readability'; 7 | import TurndownService, { Node } from 'turndown'; 8 | 9 | const virtualConsole = new VirtualConsole(); 10 | virtualConsole.on('error', () => void 0); 11 | 12 | @singleton() 13 | export class JSDomControl extends AsyncService { 14 | 15 | logger = new Logger('CHANGE_LOGGER_NAME') 16 | 17 | constructor( 18 | ) { 19 | super(...arguments); 20 | } 21 | 22 | override async init() { 23 | await this.dependencyReady(); 24 | this.emit('ready'); 25 | } 26 | 27 | narrowSnapshot(snapshot: PageSnapshot | undefined, options?: { 28 | targetSelector?: string | string[]; 29 | removeSelector?: string | string[]; 30 | withIframe?: boolean; 31 | }): PageSnapshot | undefined { 32 | if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) { 33 | return snapshot; 34 | } 35 | 
if (!snapshot?.html) { 36 | return snapshot; 37 | } 38 | const t0 = Date.now(); 39 | const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); 40 | const allNodes: Node[] = []; 41 | jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); 42 | if (options?.withIframe) { 43 | jsdom.window.document.querySelectorAll('iframe[src],frame[src]').forEach((x) => { 44 | const src = x.getAttribute('src'); 45 | const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src); 46 | if (thisSnapshot?.html) { 47 | x.innerHTML = thisSnapshot.html; 48 | x.querySelectorAll('script, style').forEach((s) => s.remove()); 49 | x.querySelectorAll('[src]').forEach((el) => { 50 | el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString()); 51 | }); 52 | x.querySelectorAll('[href]').forEach((el) => { 53 | el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString()); 54 | }); 55 | } 56 | }); 57 | } 58 | 59 | if (Array.isArray(options?.removeSelector)) { 60 | for (const rl of options!.removeSelector) { 61 | jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove()); 62 | } 63 | } else if (options?.removeSelector) { 64 | jsdom.window.document.querySelectorAll(options.removeSelector).forEach((x) => x.remove()); 65 | } 66 | 67 | if (Array.isArray(options?.targetSelector)) { 68 | for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) { 69 | x.forEach((el) => { 70 | if (!allNodes.includes(el)) { 71 | allNodes.push(el); 72 | } 73 | }); 74 | } 75 | } else if (options?.targetSelector) { 76 | jsdom.window.document.querySelectorAll(options.targetSelector).forEach((el) => { 77 | if (!allNodes.includes(el)) { 78 | allNodes.push(el); 79 | } 80 | }); 81 | } else { 82 | allNodes.push(jsdom.window.document); 83 | } 84 | 85 | if (!allNodes.length) { 86 | return snapshot; 87 | } 88 | const textChunks: string[] = []; 89 | let rootDoc; 90 | if (allNodes.length === 1 && 
allNodes[0].nodeName === '#document') { 91 | rootDoc = allNodes[0] as any; 92 | if (rootDoc.body.textContent) { 93 | textChunks.push(rootDoc.body.textContent); 94 | } 95 | } else { 96 | rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document; 97 | for (const n of allNodes) { 98 | rootDoc.body.appendChild(n); 99 | rootDoc.body.appendChild(rootDoc.createTextNode('\n\n')); 100 | if (n.textContent) { 101 | textChunks.push(n.textContent); 102 | } 103 | } 104 | } 105 | 106 | let parsed; 107 | try { 108 | parsed = new Readability(rootDoc.cloneNode(true) as any).parse(); 109 | } catch (err: any) { 110 | this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) }); 111 | } 112 | 113 | // No innerText in jsdom 114 | // https://github.com/jsdom/jsdom/issues/1245 115 | const textContent = textChunks.join('\n\n'); 116 | const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n'); 117 | 118 | const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]')) 119 | .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')]) 120 | .flat() 121 | .map((x) => { 122 | try { 123 | return new URL(x, snapshot.rebase || snapshot.href).toString(); 124 | } catch (err) { 125 | return null; 126 | } 127 | }) 128 | .filter(Boolean); 129 | 130 | const imageSet = new Set(imageTags); 131 | 132 | const r = { 133 | ...snapshot, 134 | title: snapshot.title || jsdom.window.document.title, 135 | parsed, 136 | html: rootDoc.documentElement.outerHTML, 137 | text: cleanedText, 138 | imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [], 139 | } as PageSnapshot; 140 | 141 | const dt = Date.now() - t0; 142 | if (dt > 1000) { 143 | this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt }); 144 | } 145 | 146 | return r; 147 | } 148 | 149 | inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot { 150 | const t0 = Date.now(); 151 | const 
extendedSnapshot = { ...snapshot } as ExtendedSnapshot; 152 | try { 153 | const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); 154 | jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); 155 | const links = Array.from(jsdom.window.document.querySelectorAll('a[href]')) 156 | .map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()]) 157 | .map(([href, text]) => { 158 | if (!text) { 159 | return undefined; 160 | } 161 | try { 162 | const parsed = new URL(href, snapshot.rebase || snapshot.href); 163 | if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') { 164 | return undefined; 165 | } 166 | return [parsed.toString(), text] as const; 167 | } catch (err) { 168 | return undefined; 169 | } 170 | }) 171 | .filter(Boolean) 172 | .reduce((acc, pair) => { 173 | acc[pair![0]] = pair![1]; 174 | return acc; 175 | }, {} as { [k: string]: string; }); 176 | 177 | extendedSnapshot.links = links; 178 | 179 | const imgs = Array.from(jsdom.window.document.querySelectorAll('img[src],img[data-src]')) 180 | .map((x: any) => { 181 | let linkPreferredSrc = x.getAttribute('src') || ''; 182 | if (linkPreferredSrc.startsWith('data:')) { 183 | const dataSrc = x.getAttribute('data-src') || ''; 184 | if (dataSrc && !dataSrc.startsWith('data:')) { 185 | linkPreferredSrc = dataSrc; 186 | } 187 | } 188 | 189 | return { 190 | src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(), 191 | width: parseInt(x.getAttribute('width') || '0'), 192 | height: parseInt(x.getAttribute('height') || '0'), 193 | alt: x.getAttribute('alt') || x.getAttribute('title'), 194 | }; 195 | }); 196 | 197 | extendedSnapshot.imgs = imgs as any; 198 | } catch (_err) { 199 | void 0; 200 | } 201 | 202 | const dt = Date.now() - t0; 203 | if (dt > 1000) { 204 | this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt }); 205 | } 206 | 207 | return extendedSnapshot; 208 | } 209 
| 210 | snippetToElement(snippet?: string, url?: string) { 211 | const parsed = new JSDOM(snippet || '', { url, virtualConsole }); 212 | 213 | return parsed.window.document.documentElement; 214 | } 215 | 216 | runTurndown(turndownService: TurndownService, html: TurndownService.Node | string) { 217 | const t0 = Date.now(); 218 | 219 | try { 220 | return turndownService.turndown(html); 221 | } finally { 222 | const dt = Date.now() - t0; 223 | if (dt > 1000) { 224 | this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt }); 225 | } 226 | } 227 | } 228 | } 229 | 230 | const jsdomControl = container.resolve(JSDomControl); 231 | 232 | export default jsdomControl; 233 | -------------------------------------------------------------------------------- /backend/functions/src/services/puppeteer.ts: -------------------------------------------------------------------------------- 1 | import os from 'os'; 2 | import fs from 'fs'; 3 | import { container, singleton } from 'tsyringe'; 4 | import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; 5 | import { Logger } from '../shared/index'; 6 | 7 | import type { Browser, CookieParam, Page } from 'puppeteer'; 8 | import puppeteer from 'puppeteer-extra'; 9 | 10 | import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; 11 | import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy'; 12 | import { SecurityCompromiseError, ServiceCrashedError } from '../shared/errors'; 13 | import { TimeoutError } from 'puppeteer'; 14 | import tldExtract from 'tld-extract'; 15 | 16 | // Add this new function for cookie validation 17 | const validateCookie = (cookie: CookieParam) => { 18 | const requiredFields = ['name', 'value']; 19 | for (const field of requiredFields) { 20 | if (!cookie[field]) { 21 | throw new Error(`Cookie is missing required field: ${field}`); 22 | } 23 | } 24 | }; 25 | 26 | const READABILITY_JS = 
fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); 27 | 28 | 29 | export interface ImgBrief { 30 | src: string; 31 | loaded?: boolean; 32 | width?: number; 33 | height?: number; 34 | naturalWidth?: number; 35 | naturalHeight?: number; 36 | alt?: string; 37 | } 38 | 39 | export interface ReadabilityParsed { 40 | title: string; 41 | content: string; 42 | textContent: string; 43 | length: number; 44 | excerpt: string; 45 | byline: string; 46 | dir: string; 47 | siteName: string; 48 | lang: string; 49 | publishedTime: string; 50 | } 51 | 52 | export interface PageSnapshot { 53 | title: string; 54 | href: string; 55 | rebase?: string; 56 | html: string; 57 | text: string; 58 | parsed?: Partial | null; 59 | screenshot?: Buffer; 60 | pageshot?: Buffer; 61 | imgs?: ImgBrief[]; 62 | pdfs?: string[]; 63 | maxElemDepth?: number; 64 | elemCount?: number; 65 | childFrames?: PageSnapshot[]; 66 | error?: string; 67 | } 68 | 69 | export interface ExtendedSnapshot extends PageSnapshot { 70 | links: { [url: string]: string; }; 71 | imgs: ImgBrief[]; 72 | } 73 | 74 | export interface ScrappingOptions { 75 | proxyUrl?: string; 76 | cookies?: CookieParam[]; 77 | favorScreenshot?: boolean; 78 | waitForSelector?: string | string[]; 79 | minIntervalMs?: number; 80 | overrideUserAgent?: string; 81 | timeoutMs?: number; 82 | } 83 | 84 | 85 | const puppeteerStealth = require('puppeteer-extra-plugin-stealth'); 86 | puppeteer.use(puppeteerStealth()); 87 | // const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override'); 88 | // puppeteer.use(puppeteerUAOverride({ 89 | // userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`, 90 | // platform: `Linux`, 91 | // })) 92 | 93 | puppeteer.use(puppeteerBlockResources({ 94 | blockedTypes: new Set(['media']), 95 | interceptResolutionPriority: 1, 96 | })); 97 | puppeteer.use(puppeteerPageProxy({ 98 | 
interceptResolutionPriority: 1, 99 | })); 100 | 101 | const SCRIPT_TO_INJECT_INTO_FRAME = ` 102 | ${READABILITY_JS} 103 | 104 | function briefImgs(elem) { 105 | const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]')); 106 | 107 | return imageTags.map((x)=> { 108 | let linkPreferredSrc = x.src; 109 | if (linkPreferredSrc.startsWith('data:')) { 110 | if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) { 111 | linkPreferredSrc = x.dataset.src; 112 | } 113 | } 114 | 115 | return { 116 | src: new URL(linkPreferredSrc, document.baseURI).toString(), 117 | loaded: x.complete, 118 | width: x.width, 119 | height: x.height, 120 | naturalWidth: x.naturalWidth, 121 | naturalHeight: x.naturalHeight, 122 | alt: x.alt || x.title, 123 | }; 124 | }); 125 | } 126 | function briefPDFs() { 127 | const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]')); 128 | 129 | return pdfTags.map((x)=> { 130 | return x.src === 'about:blank' ? document.location.href : x.src; 131 | }); 132 | } 133 | function getMaxDepthAndCountUsingTreeWalker(root) { 134 | let maxDepth = 0; 135 | let currentDepth = 0; 136 | let elementCount = 0; 137 | 138 | const treeWalker = document.createTreeWalker( 139 | root, 140 | NodeFilter.SHOW_ELEMENT, 141 | (node) => { 142 | const nodeName = node.nodeName.toLowerCase(); 143 | return (nodeName === 'svg') ? 
NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT; 144 | }, 145 | false 146 | ); 147 | 148 | while (true) { 149 | maxDepth = Math.max(maxDepth, currentDepth); 150 | elementCount++; // Increment the count for the current node 151 | 152 | if (treeWalker.firstChild()) { 153 | currentDepth++; 154 | } else { 155 | while (!treeWalker.nextSibling() && currentDepth > 0) { 156 | treeWalker.parentNode(); 157 | currentDepth--; 158 | } 159 | 160 | if (currentDepth <= 0) { 161 | break; 162 | } 163 | } 164 | } 165 | 166 | return { 167 | maxDepth: maxDepth + 1, 168 | elementCount: elementCount 169 | }; 170 | } 171 | 172 | function giveSnapshot(stopActiveSnapshot) { 173 | if (stopActiveSnapshot) { 174 | window.haltSnapshot = true; 175 | } 176 | let parsed; 177 | try { 178 | parsed = new Readability(document.cloneNode(true)).parse(); 179 | } catch (err) { 180 | void 0; 181 | } 182 | const domAnalysis = getMaxDepthAndCountUsingTreeWalker(document.documentElement); 183 | const r = { 184 | title: document.title, 185 | href: document.location.href, 186 | html: document.documentElement?.outerHTML, 187 | text: document.body?.innerText, 188 | parsed: parsed, 189 | imgs: [], 190 | pdfs: briefPDFs(), 191 | maxElemDepth: domAnalysis.maxDepth, 192 | elemCount: domAnalysis.elementCount, 193 | }; 194 | if (document.baseURI !== r.href) { 195 | r.rebase = document.baseURI; 196 | } 197 | if (parsed && parsed.content) { 198 | const elem = document.createElement('div'); 199 | elem.innerHTML = parsed.content; 200 | r.imgs = briefImgs(elem); 201 | } else { 202 | const allImgs = briefImgs(); 203 | if (allImgs.length === 1) { 204 | r.imgs = allImgs; 205 | } 206 | } 207 | 208 | return r; 209 | } 210 | `; 211 | 212 | @singleton() 213 | export class PuppeteerControl extends AsyncService { 214 | 215 | _sn = 0; 216 | browser!: Browser; 217 | logger = new Logger('CHANGE_LOGGER_NAME') 218 | 219 | private __healthCheckInterval?: NodeJS.Timeout; 220 | 221 | __loadedPage: Page[] = []; 222 | 223 | finalizerMap = 
new WeakMap>(); 224 | snMap = new WeakMap(); 225 | livePages = new Set(); 226 | lastPageCratedAt: number = 0; 227 | 228 | circuitBreakerHosts: Set = new Set(); 229 | 230 | constructor( 231 | ) { 232 | super(...arguments); 233 | this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95; 234 | 235 | this.on('crippled', () => { 236 | this.__loadedPage.length = 0; 237 | this.livePages.clear(); 238 | }); 239 | } 240 | 241 | briefPages() { 242 | this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`); 243 | } 244 | 245 | override async init() { 246 | if (this.__healthCheckInterval) { 247 | clearInterval(this.__healthCheckInterval); 248 | this.__healthCheckInterval = undefined; 249 | } 250 | await this.dependencyReady(); 251 | 252 | if (this.browser) { 253 | if (this.browser.connected) { 254 | await this.browser.close(); 255 | } else { 256 | this.browser.process()?.kill('SIGKILL'); 257 | } 258 | } 259 | const args = [ 260 | '--no-sandbox', 261 | '--disable-setuid-sandbox', 262 | '--disable-dev-shm-usage', 263 | '--single-process' 264 | ]; 265 | 266 | this.browser = await puppeteer.launch({ 267 | args: args, 268 | timeout: 10_000 269 | }).catch((err: any) => { 270 | this.logger.error(`Unknown firebase issue, just die fast.`, { err }); 271 | process.nextTick(() => { 272 | this.emit('error', err); 273 | // process.exit(1); 274 | }); 275 | return Promise.reject(err); 276 | }); 277 | this.browser.once('disconnected', () => { 278 | this.logger.warn(`Browser disconnected`); 279 | this.emit('crippled'); 280 | process.nextTick(() => this.serviceReady()); 281 | }); 282 | this.logger.info(`Browser launched: ${this.browser.process()?.pid}`); 283 | 284 | this.emit('ready'); 285 | 286 | this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000); 
287 | this.newPage().then((r) => this.__loadedPage.push(r)); 288 | } 289 | 290 | @maxConcurrency(1) 291 | async healthCheck() { 292 | if (Date.now() - this.lastPageCratedAt <= 10_000) { 293 | this.briefPages(); 294 | return; 295 | } 296 | const healthyPage = await this.newPage().catch((err) => { 297 | this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) }); 298 | return null; 299 | }); 300 | 301 | if (healthyPage) { 302 | this.__loadedPage.push(healthyPage); 303 | 304 | if (this.__loadedPage.length > 3) { 305 | this.ditchPage(this.__loadedPage.shift()!); 306 | } 307 | 308 | this.briefPages(); 309 | 310 | return; 311 | } 312 | 313 | this.logger.warn(`Trying to clean up...`); 314 | this.browser.process()?.kill('SIGKILL'); 315 | Reflect.deleteProperty(this, 'browser'); 316 | this.emit('crippled'); 317 | this.logger.warn(`Browser killed`); 318 | } 319 | 320 | private extractDomain(url: string): string { 321 | try { 322 | const { hostname } = new URL(url); 323 | const parts = hostname.split('.'); 324 | return parts.length > 1 ? parts.slice(-2).join('.') : hostname; 325 | } catch (error: any) { 326 | this.logger.warn(`Failed to extract domain from URL: ${url}. 
Error: ${error.message}`); 327 | return url; 328 | } 329 | } 330 | 331 | async newPage() { 332 | await this.serviceReady(); 333 | const dedicatedContext = await this.browser.createBrowserContext(); 334 | const sn = this._sn++; 335 | const page = await dedicatedContext.newPage(); 336 | const preparations: any[] = []; 337 | 338 | // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); 339 | // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); 340 | preparations.push(page.setBypassCSP(true)); 341 | preparations.push(page.setViewport({ width: 1024, height: 1024 })); 342 | preparations.push(page.exposeFunction('reportSnapshot', (snapshot: PageSnapshot) => { 343 | if (snapshot.href === 'about:blank') { 344 | return; 345 | } 346 | page.emit('snapshot', snapshot); 347 | })); 348 | preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME)); 349 | preparations.push(page.setRequestInterception(true)); 350 | 351 | await Promise.all(preparations); 352 | 353 | await page.goto('about:blank', { waitUntil: 'domcontentloaded' }); 354 | 355 | const domainSet = new Set(); 356 | let reqCounter = 0; 357 | let t0: number | undefined; 358 | let halt = false; 359 | 360 | page.on('request', (req) => { 361 | reqCounter++; 362 | if (halt) { 363 | return req.abort('blockedbyclient', 1000); 364 | } 365 | t0 ??= Date.now(); 366 | const requestUrl = req.url(); 367 | if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') { 368 | return req.abort('blockedbyclient', 1000); 369 | } 370 | 371 | try { 372 | const tldParsed = tldExtract(requestUrl); 373 | domainSet.add(tldParsed.domain); 374 | } catch (error) { 375 | this.logger.warn(`Failed to parse TLD for URL: ${requestUrl}. 
Using fallback method.`); 376 | const simpleDomain = this.extractDomain(requestUrl); 377 | domainSet.add(simpleDomain); 378 | } 379 | 380 | const parsedUrl = new URL(requestUrl); 381 | 382 | if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) { 383 | page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` }); 384 | return req.abort('blockedbyclient', 1000); 385 | } 386 | 387 | if ( 388 | parsedUrl.hostname === 'localhost' || 389 | parsedUrl.hostname.startsWith('127.') 390 | ) { 391 | page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` }); 392 | 393 | return req.abort('blockedbyclient', 1000); 394 | } 395 | 396 | const dt = Math.ceil((Date.now() - t0) / 1000); 397 | const rps = reqCounter / dt; 398 | // console.log(`rps: ${rps}`); 399 | 400 | if (reqCounter > 1000) { 401 | if (rps > 60 || reqCounter > 2000) { 402 | page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests` }); 403 | halt = true; 404 | 405 | return req.abort('blockedbyclient', 1000); 406 | } 407 | } 408 | 409 | if (domainSet.size > 200) { 410 | page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains` }); 411 | halt = true; 412 | 413 | return req.abort('blockedbyclient', 1000); 414 | } 415 | 416 | const continueArgs = req.continueRequestOverrides 417 | ? 
[req.continueRequestOverrides(), 0] as const 418 | : []; 419 | 420 | return req.continue(continueArgs[0], continueArgs[1]); 421 | }); 422 | 423 | await page.evaluateOnNewDocument(` 424 | let lastTextLength = 0; 425 | const handlePageLoad = () => { 426 | if (window.haltSnapshot) { 427 | return; 428 | } 429 | const thisTextLength = (document.body.innerText || '').length; 430 | const deltaLength = Math.abs(thisTextLength - lastTextLength); 431 | if (10 * deltaLength < lastTextLength) { 432 | // Change is not significant 433 | return; 434 | } 435 | const r = giveSnapshot(); 436 | window.reportSnapshot(r); 437 | lastTextLength = thisTextLength; 438 | }; 439 | setInterval(handlePageLoad, 800); 440 | document.addEventListener('readystatechange', handlePageLoad); 441 | document.addEventListener('load', handlePageLoad); 442 | `); 443 | 444 | this.snMap.set(page, sn); 445 | this.logger.info(`Page ${sn} created.`); 446 | this.lastPageCratedAt = Date.now(); 447 | this.livePages.add(page); 448 | 449 | return page; 450 | } 451 | 452 | async getNextPage() { 453 | let thePage: Page | undefined; 454 | if (this.__loadedPage.length) { 455 | thePage = this.__loadedPage.shift(); 456 | if (this.__loadedPage.length <= 1) { 457 | this.newPage() 458 | .then((r) => this.__loadedPage.push(r)) 459 | .catch((err) => { 460 | this.logger.warn(`Failed to load new page ahead of time`, { err: marshalErrorLike(err) }); 461 | }); 462 | } 463 | } 464 | 465 | if (!thePage) { 466 | thePage = await this.newPage(); 467 | } 468 | 469 | const timer = setTimeout(() => { 470 | this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage!)}...`); 471 | this.ditchPage(thePage!); 472 | }, 300 * 1000); 473 | 474 | this.finalizerMap.set(thePage, timer); 475 | 476 | return thePage; 477 | } 478 | 479 | async ditchPage(page: Page) { 480 | if (this.finalizerMap.has(page)) { 481 | clearTimeout(this.finalizerMap.get(page)!); 482 | this.finalizerMap.delete(page); 483 | } 484 | if 
(page.isClosed()) { 485 | return; 486 | } 487 | const sn = this.snMap.get(page); 488 | this.logger.info(`Closing page ${sn}`); 489 | this.livePages.delete(page); 490 | await Promise.race([ 491 | (async () => { 492 | const ctx = page.browserContext(); 493 | await page.close(); 494 | await ctx.close(); 495 | })(), delay(5000) 496 | ]).catch((err) => { 497 | this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) }); 498 | }); 499 | } 500 | 501 | async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator { 502 | // parsedUrl.search = ''; 503 | console.log('Scraping options:', options); 504 | const url = parsedUrl.toString(); 505 | 506 | let snapshot: PageSnapshot | undefined; 507 | let screenshot: Buffer | undefined; 508 | let pageshot: Buffer | undefined; 509 | const page = await this.getNextPage(); 510 | const sn = this.snMap.get(page); 511 | this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); 512 | 513 | if (options?.proxyUrl) { 514 | this.logger.info(`Page ${sn}: Using proxy:`, options.proxyUrl); 515 | await page.useProxy(options.proxyUrl); 516 | } 517 | 518 | if (options?.cookies) { 519 | this.logger.info(`Page ${sn}: Attempting to set cookies:`, JSON.stringify(options.cookies, null, 2)); 520 | try { 521 | options.cookies.forEach(validateCookie); 522 | await page.setCookie(...options.cookies); 523 | } catch (error) { 524 | this.logger.error(`Page ${sn}: Error setting cookies:`, error); 525 | this.logger.info(`Page ${sn}: Problematic cookies:`, JSON.stringify(options.cookies, null, 2)); 526 | throw error; 527 | } 528 | } 529 | 530 | if (options?.overrideUserAgent) { 531 | await page.setUserAgent(options.overrideUserAgent); 532 | } 533 | 534 | let nextSnapshotDeferred = Defer(); 535 | const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` })); 536 | this.once('crippled', crippleListener); 537 | nextSnapshotDeferred.promise.finally(() => { 538 | 
this.off('crippled', crippleListener); 539 | }); 540 | let finalized = false; 541 | const hdl = (s: any) => { 542 | if (snapshot === s) { 543 | return; 544 | } 545 | snapshot = s; 546 | if (s?.maxElemDepth && s.maxElemDepth > 256) { 547 | return; 548 | } 549 | if (s?.elemCount && s.elemCount > 10_000) { 550 | return; 551 | } 552 | nextSnapshotDeferred.resolve(s); 553 | nextSnapshotDeferred = Defer(); 554 | this.once('crippled', crippleListener); 555 | nextSnapshotDeferred.promise.finally(() => { 556 | this.off('crippled', crippleListener); 557 | }); 558 | }; 559 | page.on('snapshot', hdl); 560 | page.once('abuse', (event: any) => { 561 | this.emit('abuse', { ...event, url: parsedUrl }); 562 | nextSnapshotDeferred.reject( 563 | new SecurityCompromiseError(`Abuse detected: ${event.reason}`) 564 | ); 565 | }); 566 | 567 | const timeout = options?.timeoutMs || 30_000; 568 | 569 | try { 570 | let waitForPromise: Promise | undefined; 571 | let gotoPromise: Promise; 572 | 573 | gotoPromise = page.goto(url, { 574 | waitUntil: ['load', 'domcontentloaded', 'networkidle0'], 575 | timeout, 576 | }) 577 | .catch((err: any) => { 578 | if (err instanceof TimeoutError || err.message.includes('ERR_NAME_NOT_RESOLVED')) { 579 | this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) }); 580 | return { 581 | title: 'Error: Unable to access page', 582 | href: url, 583 | html: '', 584 | text: `Failed to access the page: ${err.message}`, 585 | error: err.message 586 | } as PageSnapshot; 587 | } 588 | if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) { 589 | this.logger.warn(`Page ${sn}: Invalid domain or TLD for ${url}`, { err: marshalErrorLike(err) }); 590 | return new AssertionFailureError({ 591 | message: `Invalid domain or TLD for ${url}: ${err}`, 592 | cause: err, 593 | }); 594 | } 595 | 596 | this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) }); 597 | return 
Promise.reject(new AssertionFailureError({ 598 | message: `Failed to goto ${url}: ${err}`, 599 | cause: err, 600 | })); 601 | }).then(async (stuff) => { 602 | // This check is necessary because without snapshot, the condition of the page is unclear 603 | // Calling evaluate directly may stall the process. 604 | if (!snapshot) { 605 | if (stuff instanceof Error) { 606 | finalized = true; 607 | throw stuff; 608 | } 609 | } 610 | try { 611 | const pSubFrameSnapshots = this.snapshotChildFrames(page); 612 | snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; 613 | screenshot = await page.screenshot(); 614 | if (snapshot) { 615 | snapshot.childFrames = await pSubFrameSnapshots; 616 | } 617 | } catch (err: any) { 618 | this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) }); 619 | if (stuff instanceof Error) { 620 | finalized = true; 621 | throw stuff; 622 | } 623 | } 624 | if (!snapshot?.html) { 625 | if (stuff instanceof Error) { 626 | finalized = true; 627 | throw stuff; 628 | } 629 | } 630 | try { 631 | if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) { 632 | const salvaged = await this.salvage(url, page); 633 | if (salvaged) { 634 | const pSubFrameSnapshots = this.snapshotChildFrames(page); 635 | snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; 636 | screenshot = await page.screenshot(); 637 | pageshot = await page.screenshot({ fullPage: true }); 638 | if (snapshot) { 639 | snapshot.childFrames = await pSubFrameSnapshots; 640 | } 641 | } 642 | } 643 | } catch (err: any) { 644 | this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) }); 645 | } 646 | 647 | finalized = true; 648 | if (snapshot?.html) { 649 | this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); 650 | this.emit( 651 | 'crawled', 652 | { ...snapshot, screenshot, pageshot }, 653 | { ...options, url: parsedUrl } 
654 | ); 655 | } 656 | }); 657 | 658 | if (options?.waitForSelector) { 659 | console.log('Waiting for selector', options.waitForSelector); 660 | const t0 = Date.now(); 661 | waitForPromise = nextSnapshotDeferred.promise.then(() => { 662 | const t1 = Date.now(); 663 | const elapsed = t1 - t0; 664 | const remaining = timeout - elapsed; 665 | const thisTimeout = remaining > 100 ? remaining : 100; 666 | const p = (Array.isArray(options.waitForSelector) ? 667 | Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) : 668 | page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout })) 669 | .then(async () => { 670 | const pSubFrameSnapshots = this.snapshotChildFrames(page); 671 | snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; 672 | screenshot = await page.screenshot(); 673 | pageshot = await page.screenshot({ fullPage: true }); 674 | if (snapshot) { 675 | snapshot.childFrames = await pSubFrameSnapshots; 676 | } 677 | finalized = true; 678 | }) 679 | .catch((err) => { 680 | this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) }); 681 | waitForPromise = undefined; 682 | }); 683 | return p as any; 684 | }); 685 | } 686 | 687 | try { 688 | let lastHTML = snapshot?.html; 689 | while (true) { 690 | const ckpt = [nextSnapshotDeferred.promise, gotoPromise]; 691 | if (waitForPromise) { 692 | ckpt.push(waitForPromise); 693 | } 694 | if (options?.minIntervalMs) { 695 | ckpt.push(delay(options.minIntervalMs)); 696 | } 697 | let error; 698 | await Promise.race(ckpt).catch((err) => { 699 | if (err.message && (err.message.includes('Invalid TLD') || err.message.includes('ERR_NAME_NOT_RESOLVED'))) { 700 | this.logger.warn(`Invalid domain or TLD encountered: ${err.message}`); 701 | error = new AssertionFailureError({ 702 | message: `Invalid domain or TLD for ${url}: ${err.message}`, 703 | cause: err, 704 | }); 705 | } else { 706 | error = err; 707 | 
} 708 | }); 709 | if (finalized && !error) { 710 | if (!snapshot && !screenshot) { 711 | if (error) { 712 | throw error; 713 | } 714 | throw new AssertionFailureError(`Could not extract any meaningful content from the page`); 715 | } 716 | yield { ...snapshot, screenshot, pageshot } as PageSnapshot; 717 | break; 718 | } 719 | if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { 720 | screenshot = await page.screenshot(); 721 | pageshot = await page.screenshot({ fullPage: true }); 722 | lastHTML = snapshot.html; 723 | } 724 | if (snapshot || screenshot) { 725 | yield { ...snapshot, screenshot, pageshot } as PageSnapshot; 726 | } 727 | if (error) { 728 | if (error instanceof AssertionFailureError && 729 | (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) { 730 | this.logger.warn(`Continuing despite Invalid domain or TLD: ${error.message}`); 731 | yield { 732 | title: '', 733 | href: url, 734 | html: '', 735 | text: '', 736 | screenshot, 737 | pageshot, 738 | error: 'Invalid domain or TLD' 739 | } as PageSnapshot; 740 | break; 741 | } else { 742 | throw error; 743 | } 744 | } 745 | } 746 | } catch (error: any) { 747 | if (error.message && (error.message.includes('Invalid TLD') || error.message.includes('ERR_NAME_NOT_RESOLVED'))) { 748 | this.logger.warn(`Invalid domain or TLD encountered: ${error.message}`); 749 | yield { 750 | title: '', 751 | href: url, 752 | html: '', 753 | text: '', 754 | screenshot, 755 | pageshot, 756 | error: 'Invalid domain or TLD' 757 | } as PageSnapshot; 758 | } else { 759 | throw error; 760 | } 761 | } finally { 762 | if (typeof waitForPromise !== 'undefined' && typeof gotoPromise !== 'undefined') { 763 | Promise.allSettled([gotoPromise, waitForPromise]).finally(() => { 764 | page.off('snapshot', hdl); 765 | this.ditchPage(page); 766 | }); 767 | } else if (typeof gotoPromise !== 'undefined') { 768 | gotoPromise.finally(() => { 769 | page.off('snapshot', hdl); 770 | 
this.ditchPage(page); 771 | }); 772 | } else { 773 | page.off('snapshot', hdl); 774 | this.ditchPage(page); 775 | } 776 | nextSnapshotDeferred.resolve(); 777 | } 778 | } catch (error: any) { 779 | this.logger.error(`Unhandled error in scrap method:`, error); 780 | yield { 781 | title: 'Error: Unhandled exception', 782 | href: url, 783 | html: '', 784 | text: `An unexpected error occurred: ${error.message}`, 785 | error: 'Unhandled exception' 786 | } as PageSnapshot; 787 | } 788 | } 789 | 790 | async salvage(url: string, page: Page) { 791 | this.logger.info(`Salvaging ${url}`); 792 | const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`; 793 | const resp = await fetch(googleArchiveUrl, { 794 | headers: { 795 | 'User-Agent': `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)` 796 | } 797 | }); 798 | resp.body?.cancel().catch(() => void 0); 799 | if (!resp.ok) { 800 | this.logger.warn(`No salvation found for url: ${url}`, { status: resp.status, url }); 801 | return null; 802 | } 803 | 804 | await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => { 805 | this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) }); 806 | }); 807 | 808 | this.logger.info(`Salvation completed.`); 809 | 810 | return true; 811 | } 812 | 813 | async snapshotChildFrames(page: Page): Promise { 814 | const childFrames = page.mainFrame().childFrames(); 815 | const r = await Promise.all(childFrames.map(async (x) => { 816 | const thisUrl = x.url(); 817 | if (!thisUrl || thisUrl === 'about:blank') { 818 | return undefined; 819 | } 820 | try { 821 | await x.evaluate(SCRIPT_TO_INJECT_INTO_FRAME); 822 | 823 | return await x.evaluate(`giveSnapshot()`); 824 | } catch (err) { 825 | this.logger.warn(`Failed to snapshot child frame ${thisUrl}`, { err }); 826 | return undefined; 827 | } 828 | })) as 
PageSnapshot[]; 829 | 830 | return r.filter(Boolean); 831 | } 832 | 833 | } 834 | 835 | const puppeteerControl = container.resolve(PuppeteerControl); 836 | 837 | export default puppeteerControl; 838 | -------------------------------------------------------------------------------- /backend/functions/src/shared/3rd-party/brave-search.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intergalacticalvariable/reader/d5eee9517578c1a31e8beb1fbac1e3a638c940e7/backend/functions/src/shared/3rd-party/brave-search.ts -------------------------------------------------------------------------------- /backend/functions/src/shared/decorators.ts: -------------------------------------------------------------------------------- 1 | export function CloudHTTPv2(config: any): MethodDecorator { 2 | return function (target: any, propertyKey: string | symbol, descriptor: PropertyDescriptor) { 3 | // Simplified implementation 4 | console.log(`CloudHTTPv2 decorator applied to ${String(propertyKey)}`); 5 | return descriptor; 6 | }; 7 | } -------------------------------------------------------------------------------- /backend/functions/src/shared/errors.ts: -------------------------------------------------------------------------------- 1 | export class SecurityCompromiseError extends Error { 2 | constructor(message: string) { 3 | super(message); 4 | this.name = 'SecurityCompromiseError'; 5 | } 6 | } 7 | 8 | export class ServiceCrashedError extends Error { 9 | constructor({ message }: { message: string }) { 10 | super(message); 11 | this.name = 'ServiceCrashedError'; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /backend/functions/src/shared/index.ts: -------------------------------------------------------------------------------- 1 | import { CloudHTTPv2 } from './decorators'; 2 | import { Ctx } from './types'; 3 | import { Logger } from './logger'; 4 | import { 
OutputServerEventStream } from './output-stream';
import { RPCReflect } from './rpc-reflect';
import { injectable } from 'tsyringe';
import * as fs from 'fs';
import * as path from 'path';

/**
 * Minimal per-process key/value store.
 * NOTE(review): despite the name this is a plain Map, not AsyncLocalStorage-
 * backed, so values are shared across concurrent requests — confirm intent.
 */
@injectable()
export class AsyncContext {
    // Type arguments restored — the dump had stripped the generics
    // (`Map` with no arguments does not compile under `"strict": true`).
    private storage: Map<string, any> = new Map();

    /** Stores `value` under `key`, overwriting any previous value. */
    set(key: string, value: any) {
        this.storage.set(key, value);
    }

    /** Returns the value stored under `key`, or `undefined` if absent. */
    get(key: string): any {
        return this.storage.get(key);
    }
}

/** Error raised when an account lacks the balance for an operation. */
export class InsufficientBalanceError extends Error {
    constructor(message: string) {
        super(message);
        this.name = 'InsufficientBalanceError';
    }
}

/**
 * Parameter decorator marking an RPC input parameter.
 * @param name    public name of the parameter
 * @param options validation/metadata options (currently unused)
 */
export function Param(name: string, options?: any): ParameterDecorator {
    return (target: Object, propertyKey: string | symbol | undefined, parameterIndex: number) => {
        // Implementation details would go here
    };
}

/**
 * Drop-in replacement for a Firebase Storage bucket that keeps files on the
 * local filesystem under /app/local-storage (mounted to ./screenshots by
 * docker-compose.yaml).
 */
@injectable()
export class FirebaseStorageBucketControl {
    private localStorageDir: string;

    constructor() {
        this.localStorageDir = path.join('/app', 'local-storage');
        if (!fs.existsSync(this.localStorageDir)) {
            fs.mkdirSync(this.localStorageDir, { recursive: true });
        }
    }

    /** Copies a local file into the storage dir; resolves to a file:// URL. */
    async uploadFile(filePath: string, destination: string): Promise<string> {
        const destPath = path.join(this.localStorageDir, destination);
        await fs.promises.copyFile(filePath, destPath);
        return `file://${destPath}`;
    }

    /** Copies a stored file out of the storage dir to `destination`. */
    async downloadFile(filePath: string, destination: string): Promise<void> {
        const sourcePath = path.join(this.localStorageDir, filePath);
        await fs.promises.copyFile(sourcePath, destination);
    }

    /** Removes a stored file; rejects if the file does not exist. */
    async deleteFile(filePath: string): Promise<void> {
        const fullPath = path.join(this.localStorageDir, filePath);
        await fs.promises.unlink(fullPath);
    }

    /** True when the given path exists inside the storage dir. */
    async fileExists(filePath: string): Promise<boolean> {
        const fullPath = path.join(this.localStorageDir, filePath);
        return fs.existsSync(fullPath);
    }

    /** Writes `content` to a file inside the storage dir (overwrites). */
    async saveFile(filePath: string, content: Buffer, options?: any): Promise<void> {
        const fullPath = path.join(this.localStorageDir, filePath);
        await fs.promises.writeFile(fullPath, content);
    }

    /**
     * Local stand-in for GCS signed URLs: resolves to a plain file:// URL.
     * `expirationTime` is accepted for interface compatibility but ignored.
     */
    async signDownloadUrl(filePath: string, expirationTime: number): Promise<string> {
        const fullPath = path.join(this.localStorageDir, filePath);
        return `file://${fullPath}`;
    }
}

export {
    CloudHTTPv2,
    Ctx,
    Logger,
    OutputServerEventStream,
    RPCReflect,
};

/** Stub for dynamic module discovery; only logs the requested path. */
export const loadModulesDynamically = (path: string) => {
    // Simplified implementation
    console.log(`Loading modules from ${path}`);
};

/** Stub registry matching the upstream cloud-function registry interface. */
export const registry = {
    exportAll: () => ({}),
    exportGrouped: () => ({}),
    allHandsOnDeck: async () => {},
};

// -------- /backend/functions/src/shared/lib/firestore.ts --------

import { Prop } from 'civkit';

/**
 * Base class for Firestore-backed records. All persistence methods here are
 * mock implementations that only log; no Firestore I/O happens.
 */
export class FirestoreRecord {
    // Set by subclasses to name the backing collection.
    static collectionName: string;

    @Prop()
    _id!: string;

    /** Builds an instance of the concrete subclass from a plain object. */
    static from(input: any): FirestoreRecord {
        const instance = new this();
        Object.assign(instance, input);
        return instance;
    }

    /** Mock fetch by id; always resolves to undefined. */
    static async fromFirestore(id: string): Promise<FirestoreRecord | undefined> {
        // Mock implementation
        console.log(`Fetching document with id ${id} from collection ${this.collectionName}`);
        return undefined;
    }

    /** Mock query; always resolves to an empty array. */
    static async fromFirestoreQuery(query: any): Promise<FirestoreRecord[]> {
        // Mock implementation
        console.log(`Executing query on collection ${this.collectionName}`);
        return [];
    }

    /** Mock save; logs and discards `data`. */
    static async save(data: any): Promise<void> {
        // Mock implementation
        console.log(`Saving data to collection ${this.collectionName}`);
    }

    /** Serializes the record for storage; default is a shallow copy. */
    degradeForFireStore(): any {
        return { ...this };
    }

    // Mock of the Firestore collection handle that callers use as
    // `X.COLLECTION.doc(id).set(...)` or `X.COLLECTION.where()...limit()`.
    static COLLECTION = {
        doc: (id: string) => ({
            set: (data: any, options?: any) => {
                console.log(`Setting document ${id} in collection ${this.collectionName}`);
            }
        }),
        where: () => ({
            orderBy: () => ({
                limit: () => ({})
            })
        })
    };
}

// -------- /backend/functions/src/shared/logger.ts --------

import { injectable } from 'tsyringe';

/** Console-backed logger with a per-instance name prefix. */
@injectable()
export class Logger {
    constructor(private name: string) {}

    info(message: string, ...args: any[]) {
        console.log(`[${this.name}] INFO:`, message, ...args);
    }

    warn(message: string, ...args: any[]) {
        console.warn(`[${this.name}] WARN:`, message, ...args);
    }

    error(message: string, ...args: any[]) {
        console.error(`[${this.name}] ERROR:`, message, ...args);
    }

    /** Derives a child logger whose prefix is `parent:service`. */
    child(options: { service: string }) {
        return new Logger(`${this.name}:${options.service}`);
    }
}

// -------- /backend/functions/src/shared/output-stream.ts --------

/** Mock server-sent-event stream; logs instead of streaming. */
export class OutputServerEventStream {
    write(data: any) {
        console.log('OutputServerEventStream write:', data);
    }

    end() {
        console.log('OutputServerEventStream ended');
    }
}

// -------- /backend/functions/src/shared/rpc-reflect.ts --------

/** Parameter decorator stub that only logs where it was applied. */
export function RPCReflect() {
    return function (target: any, propertyKey: string | symbol, parameterIndex: number) {
        console.log(`RPCReflect decorator applied to parameter ${parameterIndex} of ${String(propertyKey)}`);
    };
}
-------------------------------------------------------------------------------- /backend/functions/src/shared/services/canvas.ts: -------------------------------------------------------------------------------- 1 | import { AsyncService } from 'civkit'; 2 | import { singleton } from 'tsyringe'; 3 | import { Logger } from '../logger'; 4 | 5 | @singleton() 6 | export class CanvasService extends AsyncService { 7 | logger = new Logger('CHANGE_LOGGER_NAME') 8 | 9 | constructor() { 10 | super(); 11 | } 12 | 13 | override async init() { 14 | this.logger.info('CanvasService initialized'); 15 | this.emit('ready'); 16 | } 17 | 18 | async loadImage(url: string): Promise { 19 | console.log(`Mock: Loading image from ${url}`); 20 | return { width: 1000, height: 1000 }; 21 | } 22 | 23 | fitImageToSquareBox(img: any, size: number): any { 24 | console.log(`Mock: Fitting image to square box of size ${size}`); 25 | return { width: size, height: size }; 26 | } 27 | 28 | async canvasToBuffer(canvas: any, format: string): Promise { 29 | console.log(`Mock: Converting canvas to buffer with format ${format}`); 30 | return Buffer.from('mock image data'); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /backend/functions/src/shared/services/rate-limit.ts: -------------------------------------------------------------------------------- 1 | import { AsyncService } from 'civkit'; 2 | 3 | export interface RateLimitDesc { 4 | key: string; 5 | limit: number; 6 | window: number; 7 | } 8 | 9 | export class RateLimitControl extends AsyncService { 10 | constructor() { 11 | super(); 12 | } 13 | 14 | override async init() { 15 | // Mock implementation 16 | this.emit('ready'); 17 | } 18 | 19 | async increment(desc: RateLimitDesc): Promise { 20 | // Mock implementation 21 | console.log(`Incrementing rate limit for key: ${desc.key}`); 22 | return true; 23 | } 24 | 25 | async decrement(desc: RateLimitDesc): Promise { 26 | // Mock implementation 27 | 
console.log(`Decrementing rate limit for key: ${desc.key}`); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /backend/functions/src/shared/services/secrets.ts: -------------------------------------------------------------------------------- 1 | import { AsyncService } from 'civkit'; 2 | import { singleton } from 'tsyringe'; 3 | import { Logger } from '../logger'; 4 | 5 | @singleton() 6 | export class SecretExposer extends AsyncService { 7 | logger = new Logger('CHANGE_LOGGER_NAME') 8 | 9 | BRAVE_SEARCH_API_KEY: string = 'mock_brave_search_api_key'; 10 | 11 | constructor() { 12 | super(); 13 | } 14 | 15 | override async init() { 16 | // Mock initialization 17 | this.logger.info('SecretExposer initialized'); 18 | this.emit('ready'); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /backend/functions/src/shared/types.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from 'express'; 2 | 3 | export interface Ctx { 4 | req: Request; 5 | res: Response; 6 | } -------------------------------------------------------------------------------- /backend/functions/src/types.d.ts: -------------------------------------------------------------------------------- 1 | declare module 'langdetect' { 2 | interface DetectionResult { 3 | lang: string; 4 | prob: number; 5 | } 6 | 7 | export function detect(text: string): DetectionResult[]; 8 | export function detectOne(text: string): string | null; 9 | } 10 | 11 | declare module 'jsdom' { 12 | import EventEmitter from 'events'; 13 | export class JSDOM { 14 | constructor(html: string, options?: any); 15 | window: typeof window; 16 | } 17 | export class VirtualConsole extends EventEmitter{ 18 | constructor(); 19 | sendTo(console: any, options?: any); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- 
/backend/functions/src/utils/markdown.ts: -------------------------------------------------------------------------------- 1 | 2 | export function tidyMarkdown(markdown: string): string { 3 | 4 | // Step 1: Handle complex broken links with text and optional images spread across multiple lines 5 | let normalizedMarkdown = markdown.replace(/\[\s*([^\]\n]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => { 6 | // Remove internal new lines and excessive spaces within the text 7 | text = text.replace(/\s+/g, ' ').trim(); 8 | url = url.replace(/\s+/g, '').trim(); 9 | return `[${text}](${url})`; 10 | }); 11 | 12 | normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]\n!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => { 13 | // Normalize by removing excessive spaces and new lines 14 | text = text.replace(/\s+/g, ' ').trim(); 15 | alt = alt ? alt.replace(/\s+/g, ' ').trim() : ''; 16 | imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : ''; 17 | linkUrl = linkUrl.replace(/\s+/g, '').trim(); 18 | if (imgUrl) { 19 | return `[${text} ![${alt}](${imgUrl})](${linkUrl})`; 20 | } else { 21 | return `[${text}](${linkUrl})`; 22 | } 23 | }); 24 | 25 | // Step 2: Normalize regular links that may be broken across lines 26 | normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => { 27 | text = text.replace(/\s+/g, ' ').trim(); 28 | url = url.replace(/\s+/g, '').trim(); 29 | return `[${text}](${url})`; 30 | }); 31 | 32 | // Step 3: Replace more than two consecutive empty lines with exactly two empty lines 33 | normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n'); 34 | 35 | // Step 4: Remove leading spaces from each line 36 | normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, ''); 37 | 38 | return normalizedMarkdown.trim(); 39 | } 40 | -------------------------------------------------------------------------------- 
/backend/functions/src/utils/misc.ts: -------------------------------------------------------------------------------- 1 | export function cleanAttribute(attribute: string) { 2 | return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; 3 | } 4 | -------------------------------------------------------------------------------- /backend/functions/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "node16", 4 | 5 | "noImplicitReturns": true, 6 | "noUnusedLocals": false, 7 | "outDir": "build", 8 | "sourceMap": true, 9 | "strict": true, 10 | "allowJs": true, 11 | "target": "es2022", 12 | "lib": ["es2022"], 13 | "skipLibCheck": true, 14 | "useDefineForClassFields": false, 15 | "experimentalDecorators": true, 16 | "emitDecoratorMetadata": true, 17 | "esModuleInterop": true, 18 | "noImplicitAny": false, 19 | "noImplicitOverride": true, 20 | }, 21 | "compileOnSave": true, 22 | "include": ["src"] 23 | } 24 | -------------------------------------------------------------------------------- /backend/storage.rules: -------------------------------------------------------------------------------- 1 | rules_version = '2'; 2 | service firebase.storage { 3 | match /b/{bucket}/o { 4 | match /{allPaths=**} { 5 | allow read, write: if false; 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | reader: 5 | build: 6 | context: . 
7 | dockerfile: Dockerfile 8 | ports: 9 | - "3000:3000" 10 | volumes: 11 | - ./screenshots:/app/local-storage 12 | environment: 13 | PUPPETEER_SKIP_CHROMIUM_DOWNLOAD: "true" 14 | PUPPETEER_EXECUTABLE_PATH: "/usr/bin/google-chrome-stable" 15 | command: node build/server.js 16 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reader", 3 | "version": "1.0.0", 4 | "description": "Prerequisites: Node v18 (the build fails for Node versions >18), Yarn, and the Firebase CLI (`npm install -g firebase-tools`).", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1", 8 | "start": "cd backend/functions && npx nodemon --watch ./src --exec \"npm run build && node build/server.js\"", 9 | "build": "tsc" 10 | }, 11 | "author": "", 12 | "license": "ISC", 13 | "devDependencies": { 14 | "firebase-tools": "^12.4.2", 15 | "typescript": "^5.1.6", 16 | "@types/express": "^4.17.17", 17 | "ts-node": "^10.9.1", 18 | "nodemon": "^2.0.22" 19 | }, 20 | "dependencies": { 21 | "express": "^4.17.1", 22 | "tsyringe": "^4.7.0" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /screenshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intergalacticalvariable/reader/d5eee9517578c1a31e8beb1fbac1e3a638c940e7/screenshots/.gitkeep --------------------------------------------------------------------------------