├── .env.example ├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .gitmodules ├── .npmignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── jest.config.js ├── package.json ├── src ├── browser.ts ├── converters.ts ├── dev │ ├── regenerateGroundTruth.ts │ ├── runLocalTest.ts │ ├── testBrowserExtraction.ts │ ├── testHtmlToMarkdown.ts │ └── testUsage.ts ├── example.ts ├── extractors.ts ├── index.ts ├── types.ts └── utils │ ├── browserProviders.ts │ └── schemaUtils.ts ├── tests ├── fixtures │ ├── article-with-images.html │ ├── blog-post.html │ └── product-list.html ├── integration │ ├── browser-extraction.test.ts │ ├── extract.test.ts │ ├── html-to-markdown.test.ts │ └── processedContent.test.ts ├── setup.ts └── unit │ ├── browser.test.ts │ ├── browserProviders.test.ts │ ├── converters.test.ts │ ├── extractors.test.ts │ └── schemaUtils.test.ts └── tsconfig.json /.env.example: -------------------------------------------------------------------------------- 1 | # API Keys for testing 2 | GOOGLE_API_KEY=your_google_api_key_here 3 | OPENAI_API_KEY=your_openai_api_key_here 4 | 5 | # Test configuration 6 | TEST_TIMEOUT=30000 7 | LOG_LEVEL=info -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Package to NPM 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Run workflow on version tags, e.g. 
v1.0.0 7 | 8 | permissions: 9 | contents: write 10 | packages: write 11 | 12 | jobs: 13 | build-and-publish: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 # Fetch all history for proper versioning 19 | 20 | - name: Use Node.js 21 | uses: actions/setup-node@v3 22 | with: 23 | node-version: '20.x' 24 | registry-url: 'https://registry.npmjs.org/' 25 | cache: 'npm' 26 | 27 | - name: Install dependencies 28 | run: npm ci 29 | 30 | - name: Build package 31 | run: npm run build 32 | 33 | - name: Run unit tests 34 | run: npm run test:unit 35 | 36 | - name: Update test data submodule 37 | run: npm run test:html2md:update 38 | 39 | - name: Run integration tests 40 | run: npm run test:integration 41 | env: 42 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 43 | GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} 44 | 45 | - name: Generate release notes 46 | id: release 47 | run: | 48 | VERSION=${GITHUB_REF#refs/tags/} 49 | echo "version=$VERSION" >> $GITHUB_OUTPUT 50 | # Extract changes from git log or CHANGELOG if available 51 | CHANGES=$(git log --pretty=format:"* %s (%h)" $(git describe --tags --abbrev=0 HEAD^)..HEAD || echo "Initial release") 52 | echo "CHANGES<<EOF" >> $GITHUB_ENV 53 | echo "$CHANGES" >> $GITHUB_ENV 54 | echo "EOF" >> $GITHUB_ENV 55 | 56 | - name: Create GitHub Release 57 | uses: softprops/action-gh-release@v1 58 | with: 59 | name: Release ${{ steps.release.outputs.version }} 60 | body: | 61 | ## Changes in this release 62 | 63 | ${{ env.CHANGES }} 64 | 65 | For full details, see the [CHANGELOG](https://github.com/lightfeed/extractor/blob/main/CHANGELOG.md).
66 | draft: false 67 | prerelease: false 68 | 69 | - name: Publish to NPM 70 | run: npm publish --access public 71 | env: 72 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | - cron: '0 0 * * 1' # Run weekly on Monday at midnight UTC 10 | 11 | jobs: 12 | unit-tests: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | node-version: [18.x, 20.x] 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | 21 | - name: Use Node.js ${{ matrix.node-version }} 22 | uses: actions/setup-node@v3 23 | with: 24 | node-version: ${{ matrix.node-version }} 25 | cache: 'npm' 26 | 27 | - name: Install dependencies 28 | run: npm ci 29 | 30 | - name: Run unit tests 31 | run: npm run test:unit 32 | 33 | integration-tests: 34 | runs-on: ubuntu-latest 35 | needs: unit-tests 36 | strategy: 37 | matrix: 38 | node-version: [20.x] 39 | 40 | steps: 41 | - uses: actions/checkout@v3 42 | 43 | - name: Use Node.js ${{ matrix.node-version }} 44 | uses: actions/setup-node@v3 45 | with: 46 | node-version: ${{ matrix.node-version }} 47 | cache: 'npm' 48 | 49 | - name: Install dependencies 50 | run: npm ci 51 | 52 | - name: Update test data submodule 53 | run: npm run test:html2md:update 54 | 55 | - name: Run integration tests 56 | run: npm run test:integration 57 | env: 58 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 59 | GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | yarn.lock 4 | 5 | # Build output 6 | dist/ 7 | build/ 8 | lib/ 9 | 10 | # Environment variables 11 | .env 12 | .env.local 
13 | .env.*.local 14 | 15 | # Logs 16 | logs 17 | *.log 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | 22 | # IDE and editors 23 | .idea/ 24 | .vscode/ 25 | *.swp 26 | *.swo 27 | .DS_Store 28 | 29 | # Test coverage 30 | coverage/ 31 | 32 | # Temporary files 33 | tmp/ 34 | temp/ 35 | 36 | # Optionally fetched test data submodule 37 | /test-data/ 38 | 39 | # Dev test output 40 | /dev-output/ 41 | 42 | # Keep the .gitmodules file that defines the submodule 43 | !/.gitmodules -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "test-data"] 2 | path = test-data 3 | url = https://github.com/lightfeed/extractor-test-data.git 4 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # Source 2 | src/ 3 | tests/ 4 | test-data/ 5 | 6 | # Config files 7 | .github/ 8 | .git/ 9 | .gitignore 10 | .gitmodules 11 | .editorconfig 12 | .prettierrc 13 | .eslintrc 14 | .env* 15 | .vscode/ 16 | tsconfig.json 17 | tslint.json 18 | jest.config.js 19 | 20 | # Build artifacts 21 | coverage/ 22 | node_modules/ 23 | 24 | # Development files 25 | *.log 26 | .DS_Store 27 | examples/ 28 | src/dev/ 29 | src/example.ts -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [Unreleased] 9 | 10 | ## [0.2.1] - 2025-09-29 11 | - Included html button into markdown for extraction 12 | 13 | ## [0.2.0] - 2025-08-02 14 | - Added playwright browser 15 | 16 | ## [0.1.9] - 2025-06-28 17 | 18 | ### Added 19 | - Added cleanUrl field in HTMLExtractionOptions - when enabled, it will clean tracking parameters from Amazon product URLs 20 | 21 | ### Changed 22 | - Used Gemini 2.5 flash model instead of the preview version 23 | 24 | ## [0.1.8] - 2025-06-16 25 | 26 | ### Changed 27 | - Use extractionContext to provide additional context (e.g. metadata, not limited to partial data) 28 | 29 | ## [0.1.7] - 2025-06-07 30 | 31 | ### Changed 32 | - Updated README to use @lightfeed/extractor as new npm project 33 | 34 | ## [0.1.6] - 2025-06-07 35 | 36 | ### Changed 37 | - Updated project name to lightfeed/extractor and publish to npm project @lightfeed/extractor 38 | 39 | ## [0.1.5] - 2025-05-14 40 | 41 | ### Fixed 42 | - Improved main html content extraction - preserve option, label and select (can be important for product detail pages) 43 | 44 | ## [0.1.4] - 2025-05-13 45 | 46 | ### Fixed 47 | - Fixed schema conversion bug when input zod schema is from a different zod version 48 | 49 | ## [0.1.3] - 2025-05-13 50 | 51 | ### Added 52 | - Used processedContent instead of markdown in response 53 | - Improved enrich prompt to not remove any fields from the original JSON object 54 | 55 | ## [0.1.2] - 2025-05-12 56 | 57 | ### Added 58 | - Supported enriching data 59 | - Handled nullable instead of optional in schema. 
This is required for schema in OpenAI models 60 | 61 | ## [0.1.1] - 2025-05-11 62 | 63 | ### Added 64 | - Initial release with core functionality 65 | - HTML to Markdown conversion with main content extraction 66 | - Structured data extraction with LLM support 67 | - Support for OpenAI and Google Gemini API 68 | - URL validation and fixing 69 | - Comprehensive test suite 70 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to lightfeed/extractor 2 | 3 | Thank you for considering contributing to lightfeed/extractor! This document outlines the process for contributing to the project and releasing new versions. 4 | 5 | ## Development Workflow 6 | 7 | 1. Fork the repository 8 | 2. Create a feature branch (`git checkout -b feature/amazing-feature`) 9 | 3. Make your changes 10 | 4. Run tests to ensure everything works: 11 | - `npm run test:unit` - Run unit tests 12 | - `npm run test:integration` - Run integration tests (requires API keys) 13 | - `npm run test:html2md` - Run HTML to Markdown tests 14 | 5. Commit your changes (`git commit -m 'Add some amazing feature'`) 15 | 6. Push to the branch (`git push origin feature/amazing-feature`) 16 | 7. Open a Pull Request 17 | 18 | ## CI/CD Pipeline 19 | 20 | This project uses GitHub Actions for continuous integration and deployment: 21 | 22 | ### Testing Workflow 23 | 24 | The testing workflow runs automatically: 25 | - On each push to the `main` branch 26 | - On each pull request to the `main` branch 27 | - Weekly on Monday at midnight UTC 28 | 29 | The workflow includes: 30 | 1. Unit tests - Run across multiple Node.js versions (18.x, 20.x) 31 | 2. Integration tests - Run on Node.js 20.x using provided API secrets 32 | 33 | ### Setting up API keys for CI 34 | 35 | To enable integration tests in CI, add your API keys as secrets in your GitHub repository: 36 | 37 | 1. 
Go to your GitHub repository 38 | 2. Click on "Settings" > "Secrets and variables" > "Actions" 39 | 3. Add the following secrets: 40 | - `OPENAI_API_KEY` - Your OpenAI API key 41 | - `GOOGLE_API_KEY` - Your Google API key 42 | 43 | ## Release Process 44 | 45 | This project uses semantic versioning. To create a new release: 46 | 47 | 1. Update the version in `package.json` 48 | 2. Update the `CHANGELOG.md` with details of the changes 49 | 3. Commit these changes with a message like "Bump version to x.y.z" 50 | 4. Create and push a new tag: 51 | ``` 52 | git tag -a vx.y.z -m "Release version x.y.z" 53 | git push origin vx.y.z 54 | ``` 55 | 56 | When you push a new tag prefixed with "v" (e.g., v1.0.0), GitHub Actions will automatically: 57 | 1. Build the package 58 | 2. Run unit tests, then fetch the test-data submodule and run integration tests (requires the API key secrets) 59 | 3. Create a GitHub Release with notes from your git history 60 | 4. Publish the package to npm 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2025] [Revar Immersive Technology Inc.] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('ts-jest').JestConfigWithTsJest} */ 2 | module.exports = { 3 | preset: "ts-jest", 4 | testEnvironment: "node", 5 | roots: ["/src", "/tests"], 6 | testMatch: ["**/__tests__/**/*.ts?(x)", "**/?(*.)+(spec|test).ts?(x)"], 7 | collectCoverage: true, 8 | coverageDirectory: "coverage", 9 | collectCoverageFrom: [ 10 | "src/**/*.ts", 11 | "!src/dev/**/*.ts", 12 | "!src/**/*.d.ts", 13 | "!src/types.ts", 14 | "!src/example.ts", 15 | "!**/node_modules/**", 16 | "!**/vendor/**", 17 | ], 18 | transform: { 19 | "^.+\\.tsx?$": "ts-jest", 20 | }, 21 | setupFiles: ["/tests/setup.ts"], 22 | watchman: false, 23 | }; 24 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@lightfeed/extractor", 3 | "version": "0.2.1", 4 | "description": "Use LLMs to robustly extract and enrich structured data from 
HTML and markdown", 5 | "main": "dist/index.js", 6 | "types": "dist/index.d.ts", 7 | "files": [ 8 | "dist" 9 | ], 10 | "engines": { 11 | "node": ">=18" 12 | }, 13 | "scripts": { 14 | "build": "tsc", 15 | "clean": "rimraf dist", 16 | "prepare": "npm run clean && npm run build", 17 | "prepublishOnly": "npm run test:unit", 18 | "test": "jest", 19 | "test:unit": "jest tests/unit", 20 | "test:integration": "jest tests/integration", 21 | "test:watch": "jest --watch", 22 | "test:cov": "jest --coverage", 23 | "test:local": "ts-node src/dev/runLocalTest.ts", 24 | "test:usage": "ts-node src/dev/testUsage.ts", 25 | "test:browser": "ts-node src/dev/testBrowserExtraction.ts", 26 | "test:html2md": "jest tests/integration/html-to-markdown.test.ts", 27 | "test:html2md:update": "git submodule update --init --recursive test-data", 28 | "test:html2md:sync": "cd test-data && git pull origin main && cd ..", 29 | "test:html2md:regenerate": "ts-node src/dev/regenerateGroundTruth.ts", 30 | "lint": "tslint -p tsconfig.json", 31 | "dev": "ts-node src/example.ts", 32 | "dev:html2md": "ts-node src/dev/testHtmlToMarkdown.ts" 33 | }, 34 | "repository": { 35 | "type": "git", 36 | "url": "git+https://github.com/lightfeed/extractor.git" 37 | }, 38 | "keywords": [ 39 | "llm", 40 | "extraction", 41 | "web-scraping", 42 | "html", 43 | "markdown", 44 | "structured-data", 45 | "openai", 46 | "gemini" 47 | ], 48 | "author": "Lightfeed", 49 | "license": "Apache-2.0", 50 | "bugs": { 51 | "url": "https://github.com/lightfeed/extractor/issues" 52 | }, 53 | "homepage": "https://github.com/lightfeed/extractor#readme", 54 | "dependencies": { 55 | "@langchain/google-genai": "^0.2.5", 56 | "@langchain/openai": "^0.5.10", 57 | "cheerio": "^1.0.0", 58 | "jsonrepair": "^3.12.0", 59 | "langchain": "^0.3.24", 60 | "playwright": "npm:rebrowser-playwright-core@1.49.1", 61 | "turndown": "^7.2.0", 62 | "xmldom": "^0.6.0", 63 | "xpath": "^0.0.34", 64 | "zod": "^3.24.3" 65 | }, 66 | "devDependencies": { 67 | "@types/jest": 
/**
 * Browser class that provides a clean interface for browser operations.
 * Use this to load web pages and extract HTML content before passing it to
 * the extractor.
 *
 * The underlying browser is obtained from the provider returned by
 * `createBrowserProvider(config)`; this class only tracks whether a browser
 * handle is currently held and forwards page/context creation to it.
 */
export class Browser {
  private browserProvider: any;
  private browser: PlaywrightBrowser | null = null;
  private config: BrowserConfig;

  /**
   * @param config - Browser configuration; defaults to `{ type: "local" }`.
   */
  constructor(config: BrowserConfig = { type: "local" }) {
    this.config = config;
    this.browserProvider = createBrowserProvider(config as any);
  }

  /**
   * Start the browser instance.
   *
   * @throws Error if a browser is already held by this instance.
   */
  async start(): Promise<void> {
    if (this.browser) {
      throw new Error(
        "Browser is already started. Call close() first if you want to restart."
      );
    }
    this.browser = await this.browserProvider.start();
  }

  /**
   * Create a new page in the browser.
   * Browser must be started first.
   *
   * @returns The page created by the underlying Playwright browser.
   * @throws Error if `start()` has not been called.
   */
  async newPage(): Promise<Page> {
    if (!this.browser) {
      throw new Error("Browser not started. Call start() first.");
    }
    return await this.browser.newPage();
  }

  /**
   * Create a new browser context.
   * Browser must be started first.
   * Use a context for advanced operations like setting cookies, headers, etc.
   *
   * @returns The context created by the underlying Playwright browser.
   * @throws Error if `start()` has not been called.
   */
  async newContext(): Promise<BrowserContext> {
    if (!this.browser) {
      throw new Error("Browser not started. Call start() first.");
    }
    return await this.browser.newContext();
  }

  /**
   * Close the browser and clean up resources.
   * Does nothing when no browser is currently held.
   */
  async close(): Promise<void> {
    if (this.browser) {
      // Shutdown is delegated to the provider that created the browser.
      await this.browserProvider.close();
      this.browser = null;
    }
  }

  /**
   * Check if the browser is currently running.
   *
   * @returns `true` while a browser handle is held, otherwise `false`.
   */
  isStarted(): boolean {
    return this.browser !== null;
  }
}
42 | */ 43 | function extractMainHtml(html: string): string { 44 | try { 45 | const bodyDoc = new DOMParser().parseFromString(html, "text/html"); 46 | 47 | [...OVERALL_DISCARD_XPATH, ...PRECISION_DISCARD_XPATH].forEach((xPath) => { 48 | const result = xpath.parse(xPath).select({ node: bodyDoc, isHtml: true }); 49 | 50 | // Ensure result is an array before calling forEach 51 | const nodes = Array.isArray(result) ? result : [result]; 52 | 53 | nodes.forEach((node) => { 54 | if (isNodeLike(node) && node.parentNode) { 55 | node.parentNode.removeChild(node); 56 | } 57 | }); 58 | }); 59 | 60 | const refinedHtml = new XMLSerializer().serializeToString(bodyDoc); 61 | return refinedHtml == "" ? html : refinedHtml; 62 | } catch (error) { 63 | console.error("error extracting main html", error); 64 | return ""; 65 | } 66 | } 67 | 68 | /** 69 | * Convert HTML to Markdown 70 | */ 71 | export function htmlToMarkdown( 72 | html: string, 73 | options?: HTMLExtractionOptions, 74 | sourceUrl?: string 75 | ): string { 76 | // First clean up the html 77 | const tidiedHtml = tidyHtml(html, options?.includeImages ?? 
false); 78 | 79 | // Turndown config 80 | // Reference: https://github.com/jina-ai/reader/blob/1e3bae6aad9cf0005c14f0036b46b49390e63203/backend/functions/src/cloud-functions/crawler.ts#L134 81 | const turnDownService = new TurndownService(); 82 | 83 | // Define elements to remove - conditionally include or exclude images 84 | const elementsToRemove: any[] = [ 85 | "meta", 86 | "style", 87 | "script", 88 | "noscript", 89 | "link", 90 | "textarea", 91 | ]; 92 | 93 | // Only remove image elements if includeImages is not enabled 94 | if (!options?.includeImages) { 95 | elementsToRemove.push("img", "picture", "figure"); 96 | } 97 | 98 | turnDownService.addRule("remove-irrelevant", { 99 | filter: elementsToRemove, 100 | replacement: () => "", 101 | }); 102 | 103 | turnDownService.addRule("truncate-svg", { 104 | filter: "svg" as any, 105 | replacement: () => "", 106 | }); 107 | 108 | turnDownService.addRule("title-as-h1", { 109 | filter: ["title"], 110 | replacement: (innerText: string) => `${innerText}\n===============\n`, 111 | }); 112 | 113 | turnDownService.addRule("improved-paragraph", { 114 | filter: "p", 115 | replacement: (innerText: string) => { 116 | const trimmed = innerText.trim(); 117 | if (!trimmed) { 118 | return ""; 119 | } 120 | 121 | return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`; 122 | }, 123 | }); 124 | 125 | turnDownService.addRule("improved-inline-link", { 126 | filter: function (node: any, options: any) { 127 | return Boolean( 128 | options.linkStyle === "inlined" && 129 | node.nodeName === "A" && 130 | node.getAttribute("href") 131 | ); 132 | }, 133 | 134 | replacement: function (content: string, node: any) { 135 | let href = node.getAttribute("href"); 136 | if (href) { 137 | // Convert relative URLs to absolute if sourceUrl is provided 138 | if ( 139 | sourceUrl && 140 | !href.startsWith("http") && 141 | !href.startsWith("mailto:") 142 | ) { 143 | try { 144 | href = url.resolve(sourceUrl, href); 145 | } catch (error) { 146 | console.warn( 147 
| `Failed to resolve URL ${href} against ${sourceUrl}:`, 148 | error 149 | ); 150 | } 151 | } 152 | 153 | // Clean URL if cleanUrls option is enabled (default false) 154 | if (options?.cleanUrls) { 155 | href = cleanUrl(href); 156 | } 157 | 158 | href = href.replace(/([()])/g, "\\$1"); 159 | } 160 | let title = cleanAttribute(node.getAttribute("title")); 161 | if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; 162 | 163 | const fixedContent = content.replace(/\s+/g, " ").trim(); 164 | const fixedHref = href.replace(/\s+/g, "").trim(); 165 | 166 | return `[${fixedContent}](${fixedHref}${title || ""})`; 167 | }, 168 | }); 169 | 170 | turnDownService.addRule("images", { 171 | filter: "img", 172 | 173 | replacement: function (content: string, node: any) { 174 | let src = node.getAttribute("src"); 175 | if (src) { 176 | // Convert relative URLs to absolute if sourceUrl is provided 177 | if (sourceUrl && !src.startsWith("http") && !src.startsWith("data:")) { 178 | try { 179 | src = url.resolve(sourceUrl, src); 180 | } catch (error) { 181 | console.warn( 182 | `Failed to resolve URL ${src} against ${sourceUrl}:`, 183 | error 184 | ); 185 | } 186 | } 187 | 188 | // Clean URL if cleanUrls option is enabled (default false) 189 | if (options?.cleanUrls) { 190 | src = cleanUrl(src); 191 | } 192 | 193 | src = src.replace(/([()])/g, "\\$1"); 194 | } else { 195 | return ""; // No source, no image 196 | } 197 | 198 | let alt = cleanAttribute(node.getAttribute("alt") || ""); 199 | let title = cleanAttribute(node.getAttribute("title")); 200 | 201 | if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; 202 | 203 | const fixedSrc = src.replace(/\s+/g, "").trim(); 204 | 205 | return `![${alt}](${fixedSrc}${title || ""})`; 206 | }, 207 | }); 208 | 209 | const fullMarkdown = turnDownService.turndown(tidiedHtml).trim(); 210 | if (options?.extractMainHtml) { 211 | const mainHtml = extractMainHtml(tidiedHtml); 212 | const mainMarkdown = 
turnDownService.turndown(mainHtml).trim(); 213 | // Heuristics: 214 | // If main content is empty, or is both less than 20% of full content and too short (under 500 characters), use full content 215 | if ( 216 | mainMarkdown.length == 0 || 217 | (mainMarkdown.length < fullMarkdown.length * 0.2 && 218 | mainMarkdown.length < 500) 219 | ) { 220 | return fullMarkdown; 221 | } else { 222 | return mainMarkdown; 223 | } 224 | } else { 225 | return fullMarkdown; 226 | } 227 | } 228 | 229 | // Clean up the html 230 | function tidyHtml(html: string, includeImages: boolean): string { 231 | const $ = cheerio.load(html); 232 | $("*").each(function (this: any) { 233 | const element = $(this); 234 | const attributes = Object.keys(this.attribs); 235 | 236 | for (let i = 0; i < attributes.length; i++) { 237 | let attr = attributes[i]; 238 | // Check if the attribute value has an odd number of quotes 239 | // If the attribute name has a quote, it might be a broken attribute. Remove it completely. 240 | // (this occurred at dealnews.com) 241 | if (attr.includes('"')) { 242 | element.remove(); 243 | } 244 | } 245 | }); 246 | 247 | // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/settings.py#L55 248 | // Removed (because user might want to extract them): 249 | // - form 250 | // - fieldset 251 | // - footer (might contain company info) 252 | // - img, picture, figure (only when includeImages is true) 253 | // - option, label, select (this can present product options and titles) 254 | // - button (this can present product metadata, e.g. 
number of reviews) 255 | const manuallyCleanedElements = [ 256 | // important 257 | "aside", 258 | "embed", 259 | // "footer", 260 | // "form", 261 | "head", 262 | "iframe", 263 | "menu", 264 | "object", 265 | "script", 266 | // other content 267 | "applet", 268 | "audio", 269 | "canvas", 270 | "map", 271 | "svg", 272 | "video", 273 | // secondary 274 | "area", 275 | "blink", 276 | // "button", 277 | "datalist", 278 | "dialog", 279 | "frame", 280 | "frameset", 281 | // "fieldset", 282 | "link", 283 | "input", 284 | "ins", 285 | // "label", 286 | "legend", 287 | "marquee", 288 | "math", 289 | "menuitem", 290 | "nav", 291 | "noscript", 292 | "optgroup", 293 | // "option", 294 | "output", 295 | "param", 296 | "progress", 297 | "rp", 298 | "rt", 299 | "rtc", 300 | // "select", 301 | "source", 302 | "style", 303 | "track", 304 | "textarea", 305 | "time", 306 | "use", 307 | ]; 308 | 309 | if (!includeImages) { 310 | manuallyCleanedElements.push("img", "picture", "figure"); 311 | } 312 | 313 | // Further clean html 314 | manuallyCleanedElements.forEach((element) => { 315 | $(element).remove(); 316 | }); 317 | return $("body").html(); 318 | } 319 | 320 | function cleanAttribute(attribute: string) { 321 | return attribute ? 
attribute.replace(/(\n+\s*)+/g, "\n") : ""; 322 | } 323 | 324 | // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L100 325 | // Added: 326 | // - Add contains(@id, "filter") to remove filter menus 327 | // - footer 328 | // Removed (because user might want to extract them): 329 | // - Commented out tags 330 | // - Commented out sidebar (sidebar sometimes can be too aggressive and can remove main content) 331 | // - Commented out author 332 | // - Commented out rating 333 | // - Commented out attachment 334 | // - Commented out timestamp 335 | // - Commented out user-info and user-profile 336 | // - Commented out comment or hidden section 337 | // - Not including @data-testid (it can remove dynamic product listings) 338 | // - Commented out options 339 | const OVERALL_DISCARD_XPATH = [ 340 | // navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts 341 | `.//*[(self::div or self::item or self::list 342 | or self::p or self::section or self::span)][ 343 | contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer") 344 | or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or 345 | contains(@id, "viral") or contains(@class, "viral") or 346 | contains(@id, "filter") or 347 | starts-with(@id, "shar") or starts-with(@class, "shar") or 348 | contains(@class, "share-") or 349 | contains(translate(@id, "S", "s"), "share") or 350 | contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or 351 | contains(@id, "syndication") or contains(@class, "syndication") or 352 | starts-with(@id, "jp-") or starts-with(@id, "dpsp-content") or 353 | contains(@class, "embedded") or contains(@class, "embed") 354 | or contains(@id, "newsletter") or contains(@class, "newsletter") 355 | or contains(@class, "subnav") or 356 | contains(@id, "cookie") or contains(@class, "cookie") or ` + 357 | // 
`contains(@id, "tags") or contains(@class, "tags") or ` + 358 | // `contains(@id, "sidebar") or contains(@class, "sidebar") or ` + 359 | `contains(@id, "banner") or contains(@class, "banner") 360 | or contains(@class, "meta") or 361 | contains(@id, "menu") or contains(@class, "menu") or 362 | contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav") 363 | or starts-with(@class, "nav") or contains(translate(@class, "N", "n"), "navigation") or 364 | contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav") 365 | or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or 366 | contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or ` + 367 | // `contains(@id, "author") or contains(@class, "author") or ` + 368 | `contains(@id, "button") or contains(@class, "button") 369 | or contains(translate(@class, "B", "b"), "byline") or ` + 370 | // contains(@class, "rating") or ` + 371 | `starts-with(@class, "widget") or ` + 372 | // contains(@class, "attachment") or contains(@class, "timestamp") or 373 | // contains(@class, "user-info") or contains(@class, "user-profile") or 374 | `contains(@class, "-ad-") or contains(@class, "-icon") 375 | or contains(@class, "article-infos") or 376 | contains(translate(@class, "I", "i"), "infoline") 377 | or contains(@data-component, "MostPopularStories") 378 | or contains(@class, "outbrain") or contains(@class, "taboola") 379 | or contains(@class, "criteo") ` + 380 | // or contains(@class, "options") 381 | `or contains(@class, "consent") or contains(@class, "modal-content") 382 | or contains(@class, "paid-content") or contains(@class, "paidcontent") 383 | or contains(@id, "premium-") or contains(@id, "paywall") 384 | or contains(@class, "obfuscated") or contains(@class, "blurred") 385 | or contains(@class, " ad ") 386 | or contains(@class, "next-post") or contains(@class, "side-stories") 387 | or contains(@class, "related-stories") or contains(@class, 
"most-popular") 388 | or contains(@class, "mol-factbox") or starts-with(@class, "ZendeskForm") 389 | or contains(@class, "message-container") or contains(@id, "message_container") 390 | or contains(@class, "yin") or contains(@class, "zlylin") or 391 | contains(@class, "xg1") or contains(@id, "bmdh") 392 | or @data-lp-replacement-content]`, 393 | ".//footer", 394 | 395 | // comment debris + hidden parts 396 | // `.//*[@class="comments-title" or contains(@class, "comments-title") or 397 | // contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or 398 | // contains(@class, "-reply-") or contains(@class, "message") 399 | // or contains(@id, "akismet") or contains(@class, "akismet") or 400 | // starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden") 401 | // or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint") 402 | // or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true" 403 | // or contains(@class, "notloaded")]`, 404 | ]; 405 | 406 | // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L179 407 | // Removed: 408 | // - contains(@style, "border") 409 | const PRECISION_DISCARD_XPATH = [ 410 | ".//header", 411 | `.//*[(self::div or self::item or self::list 412 | or self::p or self::section or self::span)][ 413 | contains(@id, "bottom") or contains(@class, "bottom") or 414 | contains(@id, "link") or contains(@class, "link") 415 | ]`, 416 | ]; 417 | -------------------------------------------------------------------------------- /src/dev/regenerateGroundTruth.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { htmlToMarkdown } from "../converters"; 4 | import { HTMLExtractionOptions } from "../types"; 5 | import * as cheerio from "cheerio"; 6 | 7 | 
/**
 * Sanitize an HTML document so it can be committed as a test fixture.
 *
 * The following transformations are applied, in order:
 *  1. `<script>` elements and every `on*` event-handler attribute are removed.
 *  2. `<style>` elements and inline `style` attributes are removed.
 *  3. Text of leaf `p`/`h1`-`h5`/`span`/`div` elements is replaced with
 *     lorem-ipsum filler of exactly the same length.
 *  4. Link targets and link text are replaced with generic placeholders that
 *     keep the link's kind (email / external / internal).
 *  5. Image sources are pointed at a public placeholder service and a generic
 *     `alt` text is added when none exists.
 *
 * @param html           Raw HTML to sanitize.
 * @param originalSource Path of the file the HTML came from. Currently unused;
 *                       kept so existing callers keep working.
 * @returns The sanitized document serialized back to an HTML string.
 */
function sanitizeHTML(html: string, originalSource: string): string {
  const $ = cheerio.load(html);

  // Filler text used for placeholder copy. Declared once, outside the
  // per-element callbacks, because it never changes.
  const loremIpsumBase =
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ";

  // Remove scripts and event handlers.
  $("script").remove();
  // Strip every attribute whose name starts with "on" (onclick, onload, ...).
  $("*").each(function () {
    const el = $(this);
    const node = el[0];

    // Skip anything that is not a regular element carrying attributes.
    if (!node || node.type !== "tag" || !("attribs" in node)) return;

    Object.keys(node.attribs)
      .filter((attr) => attr.startsWith("on"))
      .forEach((attr) => el.removeAttr(attr));
  });

  // Remove styles.
  $("style").remove();
  $("[style]").removeAttr("style");

  // Replace text content with a placeholder of identical length.
  $("p, h1, h2, h3, h4, h5, span, div").each(function () {
    const el = $(this);

    // Only leaf nodes are rewritten; replacing the text of a container
    // would drop its child elements.
    if (el.children().length !== 0) return;

    const length = el.text().length;
    if (length === 0) return;

    // Repeat the base text until it is long enough, then cut it to the exact
    // length of the original text. The result depends only on that length.
    const repeats = Math.ceil(length / loremIpsumBase.length);
    el.text(loremIpsumBase.repeat(repeats).substring(0, length));
  });

  // Replace links.
  $("a").each(function () {
    const el = $(this);
    const href = el.attr("href");
    const isEmail = !!href && href.startsWith("mailto:");
    const isExternal =
      !!href && (href.startsWith("http") || href.startsWith("www"));

    // Pick the placeholder target and label that match the link's kind.
    let placeholderHref: string;
    let placeholderText: string;
    if (isEmail) {
      placeholderHref = "mailto:example@example.com";
      placeholderText = "Email Link";
    } else if (isExternal) {
      placeholderHref = "https://example.com/external-link";
      placeholderText = "External Link";
    } else {
      // Internal/relative links, and anchors without an href.
      placeholderHref = "/placeholder-page";
      placeholderText = "Page Link";
    }

    el.attr("href", placeholderHref);

    // Links without visible text (e.g. image-only links) keep their children.
    if (el.text().trim().length > 0) {
      el.text(placeholderText);
    }
  });

  // Replace images with a real placeholder service.
  $("img").each(function () {
    const el = $(this);
    const width = el.attr("width") || 300;
    const height = el.attr("height") || 200;

    el.attr("src", `https://picsum.photos/${width}/${height}`);

    // Add generic alt text if none exists.
    if (!el.attr("alt")) {
      el.attr("alt", "Placeholder image");
    }
  });

  // Serialize the sanitized document.
  return $.html();
}

/**
 * Sanitize one HTML fixture in place, convert it to Markdown and store the
 * Markdown as a ground-truth file.
 *
 * Note that the HTML file at `htmlFilePath` is overwritten with its sanitized
 * version on every call.
 *
 * @param htmlFilePath   Path of the HTML fixture to process.
 * @param groundtruthDir Directory the Markdown file is written to; created if
 *                       it does not exist.
 * @param options        Conversion options forwarded to `htmlToMarkdown`.
 * @param variant        Optional suffix for the output name: `name.<variant>.md`
 *                       when set, `name.md` otherwise.
 * @returns The path of the generated Markdown file.
 * @throws Re-throws any read, write or conversion error after logging it.
 */
async function generateGroundTruth(
  htmlFilePath: string,
  groundtruthDir: string,
  options?: HTMLExtractionOptions,
  variant: string = ""
) {
  try {
    // Read and sanitize the HTML file.
    const originalHtml = fs.readFileSync(htmlFilePath, "utf8");
    const sanitizedHtml = sanitizeHTML(originalHtml, htmlFilePath);

    // Save sanitized HTML back to the original file.
    fs.writeFileSync(htmlFilePath, sanitizedHtml);
    console.log(`✅ Sanitized HTML: ${htmlFilePath}`);

    // Convert to Markdown.
    const markdown = htmlToMarkdown(sanitizedHtml, options);

    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(groundtruthDir, { recursive: true });

    // Generate output filename.
    const baseName = path.basename(htmlFilePath, ".html");
    const outputFilename = variant
      ? `${baseName}.${variant}.md`
      : `${baseName}.md`;
    const outputPath = path.join(groundtruthDir, outputFilename);

    // Save the markdown.
    fs.writeFileSync(outputPath, markdown);
    console.log(`✅ Generated ground truth: ${outputPath}`);

    return outputPath;
  } catch (error) {
    console.error("❌ Error generating ground truth:", error);
    throw error;
  }
}

/**
 * Regenerate every ground-truth file under `<cwd>/test-data`.
 *
 * Each sub-directory of `test-data/html` is treated as a category. For every
 * `.html` file in a category, three Markdown variants are written to
 * `test-data/groundtruth/<category>`: the basic conversion, one with images
 * (`.images.md`) and one with main-content extraction (`.main.md`).
 *
 * Exits the process with status 1 when the `test-data` directory is missing.
 */
async function main() {
  const testDataDir = path.join(process.cwd(), "test-data");

  // Check if test-data directory exists.
  if (!fs.existsSync(testDataDir)) {
    console.error(
      "❌ test-data directory not found. Please run 'npm run test:html2md:update' first."
    );
    process.exit(1);
  }

  const htmlDir = path.join(testDataDir, "html");
  const groundtruthDir = path.join(testDataDir, "groundtruth");

  // Get all categories (subdirectories under html/).
  const categories = fs
    .readdirSync(htmlDir, { withFileTypes: true })
    .filter((dirent) => dirent.isDirectory())
    .map((dirent) => dirent.name);

  console.log("\n🔍 Regenerating ground truth files...\n");

  for (const category of categories) {
    console.log(`\n📁 Processing category: ${category}`);

    const categoryHtmlDir = path.join(htmlDir, category);
    const categoryGroundtruthDir = path.join(groundtruthDir, category);

    // Create the category directory up front so it exists even when the
    // category contains no HTML files.
    fs.mkdirSync(categoryGroundtruthDir, { recursive: true });

    // Keep the real file names. Stripping ".html" with String.replace and
    // appending it again produced a wrong path for names that contain
    // ".html" more than once (e.g. "a.html.bak.html").
    const htmlFiles = fs
      .readdirSync(categoryHtmlDir)
      .filter((file) => file.endsWith(".html"));

    for (const file of htmlFiles) {
      const htmlFilePath = path.join(categoryHtmlDir, file);

      // Generate ground truth files with different options.
      await generateGroundTruth(htmlFilePath, categoryGroundtruthDir); // Basic conversion
      await generateGroundTruth(
        htmlFilePath,
        categoryGroundtruthDir,
        { includeImages: true },
        "images"
      );
      await generateGroundTruth(
        htmlFilePath,
        categoryGroundtruthDir,
        { extractMainHtml: true },
        "main"
      );
    }
  }

  console.log("\n✨ All ground truth files have been regenerated!");
}

// Run the main function. Report the failure and make the process exit with a
// non-zero status so calling scripts and CI can detect it.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
-------------------------------------------------------------------------------- /src/dev/runLocalTest.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { config } from "dotenv"; 4 | import { z } from "zod"; 5 | import { extract, ContentFormat, LLMProvider } from "../index"; 6 | 7 | // Load environment variables from .env file 8 | config({ path: path.resolve(process.cwd(), ".env") }); 9 | 10 | // Helper to load HTML test fixtures 11 | function loadFixture(filename: string): string { 12 | return fs.readFileSync( 13 | path.resolve(__dirname, "../../tests/fixtures", filename), 14 | "utf8" 15 | ); 16 | } 17 | 18 | // Example schemas for different content types 19 | const blogSchema = z.object({ 20 | title: z.string(), 21 | author: z.string().optional(), 22 | date: z.string().optional(), 23 | tags: z 24 | .array(z.string()) 25 | .optional() 26 | .describe("Tags appear after the date. Do not include the # symbol."), 27 | summary: z.string(), 28 | content: z.string().optional(), 29 | }); 30 | 31 | // OpenAI version with nullable instead of optional 32 | const blogSchemaOpenAI = z.object({ 33 | title: z.string(), 34 | author: z.string().nullable(), 35 | date: z.string().nullable(), 36 | tags: z 37 | .array(z.string()) 38 | .nullable() 39 | .describe("Tags appear after the date. 
Do not include the # symbol."), 40 | summary: z.string(), 41 | content: z.string().nullable(), 42 | }); 43 | 44 | const productSchema = z.object({ 45 | products: z.array( 46 | z.object({ 47 | name: z.string(), 48 | price: z.string(), 49 | rating: z.string().optional(), 50 | description: z.string().optional(), 51 | features: z.array(z.string()).optional(), 52 | }) 53 | ), 54 | }); 55 | 56 | // OpenAI version with nullable instead of optional 57 | const productSchemaOpenAI = z.object({ 58 | products: z.array( 59 | z.object({ 60 | name: z.string(), 61 | price: z.string(), 62 | rating: z.string().nullable(), 63 | description: z.string().nullable(), 64 | features: z.array(z.string()).nullable(), 65 | }) 66 | ), 67 | }); 68 | 69 | // Test functions 70 | async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) { 71 | console.log(`Testing blog post extraction with ${provider}...`); 72 | 73 | try { 74 | const html = loadFixture("blog-post.html"); 75 | 76 | // Check for required API key 77 | if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { 78 | console.error("Error: GOOGLE_API_KEY environment variable is required"); 79 | process.exit(1); 80 | } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { 81 | console.error("Error: OPENAI_API_KEY environment variable is required"); 82 | process.exit(1); 83 | } 84 | 85 | const apiKey = 86 | provider === LLMProvider.GOOGLE_GEMINI 87 | ? process.env.GOOGLE_API_KEY 88 | : process.env.OPENAI_API_KEY; 89 | 90 | const result = await extract({ 91 | content: html, 92 | format: ContentFormat.HTML, 93 | schema: 94 | provider === LLMProvider.GOOGLE_GEMINI ? blogSchema : blogSchemaOpenAI, 95 | provider, 96 | googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, 97 | openaiApiKey: provider === LLMProvider.OPENAI ? 
apiKey : undefined, 98 | htmlExtractionOptions: { 99 | extractMainHtml: false, 100 | }, 101 | sourceUrl: "https://www.example.com/blog/blog-post", 102 | }); 103 | 104 | console.log("Extracted data:"); 105 | console.log(JSON.stringify(result.data, null, 2)); 106 | console.log("\nToken usage:"); 107 | console.log(result.usage); 108 | 109 | return result; 110 | } catch (error) { 111 | console.error(`Blog extraction error with ${provider}:`, error); 112 | throw error; 113 | } 114 | } 115 | 116 | async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) { 117 | console.log(`Testing product listing extraction with ${provider}...`); 118 | 119 | try { 120 | const html = loadFixture("product-list.html"); 121 | 122 | // Check for required API key 123 | if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { 124 | console.error("Error: GOOGLE_API_KEY environment variable is required"); 125 | process.exit(1); 126 | } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { 127 | console.error("Error: OPENAI_API_KEY environment variable is required"); 128 | process.exit(1); 129 | } 130 | 131 | const apiKey = 132 | provider === LLMProvider.GOOGLE_GEMINI 133 | ? process.env.GOOGLE_API_KEY 134 | : process.env.OPENAI_API_KEY; 135 | 136 | const result = await extract({ 137 | content: html, 138 | format: ContentFormat.HTML, 139 | schema: 140 | provider === LLMProvider.GOOGLE_GEMINI 141 | ? productSchema 142 | : productSchemaOpenAI, 143 | provider, 144 | googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, 145 | openaiApiKey: provider === LLMProvider.OPENAI ? 
apiKey : undefined, 146 | htmlExtractionOptions: { 147 | extractMainHtml: true, 148 | }, 149 | sourceUrl: "https://www.example.com/product/product-list", 150 | }); 151 | 152 | console.log("Extracted data:"); 153 | console.log(JSON.stringify(result.data, null, 2)); 154 | console.log("\nToken usage:"); 155 | console.log(result.usage); 156 | 157 | return result; 158 | } catch (error) { 159 | console.error(`Product extraction error with ${provider}:`, error); 160 | throw error; 161 | } 162 | } 163 | 164 | // Run tests based on command line arguments 165 | async function main() { 166 | // Parse arguments: content type and provider 167 | const args = process.argv.slice(2); 168 | const contentType = args[0] || "all"; // 'blog', 'product', or 'all' 169 | const provider = 170 | args[1]?.toUpperCase() === "OPENAI" 171 | ? LLMProvider.OPENAI 172 | : args[1]?.toUpperCase() === "GEMINI" 173 | ? LLMProvider.GOOGLE_GEMINI 174 | : "all"; // 'OPENAI', 'GEMINI', or 'all' 175 | 176 | console.log("API Keys available:"); 177 | console.log(`- GOOGLE_API_KEY: ${process.env.GOOGLE_API_KEY ? "Yes" : "No"}`); 178 | console.log(`- OPENAI_API_KEY: ${process.env.OPENAI_API_KEY ? 
"Yes" : "No"}`); 179 | console.log(""); 180 | 181 | // Run blog tests 182 | if (contentType === "blog" || contentType === "all") { 183 | if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { 184 | await testBlogExtraction(LLMProvider.GOOGLE_GEMINI); 185 | } 186 | if (provider === LLMProvider.OPENAI || provider === "all") { 187 | await testBlogExtraction(LLMProvider.OPENAI); 188 | } 189 | } 190 | 191 | // Run product tests 192 | if (contentType === "product" || contentType === "all") { 193 | if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { 194 | await testProductExtraction(LLMProvider.GOOGLE_GEMINI); 195 | } 196 | if (provider === LLMProvider.OPENAI || provider === "all") { 197 | await testProductExtraction(LLMProvider.OPENAI); 198 | } 199 | } 200 | } 201 | 202 | // Only run if directly executed 203 | if (require.main === module) { 204 | console.log("Starting local extraction test..."); 205 | console.log("Make sure you have set up your .env file with API keys."); 206 | console.log("Usage: npm run test:local -- [contentType] [provider]"); 207 | console.log(" contentType: 'blog', 'product', or 'all' (default)"); 208 | console.log(" provider: 'openai', 'gemini', or 'all' (default)"); 209 | 210 | main() 211 | .then(() => { 212 | console.log("All tests completed successfully."); 213 | }) 214 | .catch((error) => { 215 | console.error("Test failed:", error); 216 | process.exit(1); 217 | }); 218 | } 219 | -------------------------------------------------------------------------------- /src/dev/testBrowserExtraction.ts: -------------------------------------------------------------------------------- 1 | import { extract, ContentFormat, LLMProvider, Browser } from "../index"; 2 | import { z } from "zod"; 3 | import * as path from "path"; 4 | import { config } from "dotenv"; 5 | 6 | // Load environment variables from .env file 7 | config({ path: path.resolve(process.cwd(), ".env") }); 8 | 9 | const productCatalogSchema = z.object({ 10 | 
products: z 11 | .array( 12 | z.object({ 13 | name: z.string().describe("Product name or title"), 14 | brand: z.string().optional().describe("Brand name"), 15 | price: z.number().describe("Current price"), 16 | originalPrice: z 17 | .number() 18 | .optional() 19 | .describe("Original price if on sale"), 20 | rating: z.number().optional().describe("Product rating out of 5"), 21 | reviewCount: z.number().optional().describe("Number of reviews"), 22 | productUrl: z.string().url().describe("Link to product detail page"), 23 | imageUrl: z.string().url().optional().describe("Product image URL"), 24 | }) 25 | ) 26 | .describe("List of bread and bakery products"), 27 | }); 28 | 29 | async function testProductCatalogExtraction() { 30 | console.log("🍞 Testing Product Catalog Extraction...\n"); 31 | 32 | const testUrl = 33 | "https://www.walmart.ca/en/browse/grocery/bread-bakery/10019_6000194327359"; 34 | 35 | try { 36 | console.log(`📡 Loading product catalog page: ${testUrl}`); 37 | console.log("🤖 Using Browser class to load the page...\n"); 38 | 39 | // Create browser instance 40 | const browser = new Browser({ 41 | type: "local", 42 | headless: false, 43 | }); 44 | 45 | await browser.start(); 46 | console.log("✅ Browser started successfully"); 47 | 48 | // Create page and load content using direct Playwright API 49 | const page = await browser.newPage(); 50 | await page.goto(testUrl); 51 | 52 | try { 53 | await page.waitForLoadState("networkidle", { timeout: 10000 }); 54 | } catch { 55 | console.log("Network idle timeout, continuing..."); 56 | } 57 | 58 | const html = await page.content(); 59 | console.log(`📄 Loaded ${html.length} characters of HTML`); 60 | 61 | await browser.close(); 62 | console.log("✅ Browser closed"); 63 | 64 | // Now extract product data from the loaded HTML 65 | console.log("\n🧠 Extracting product data using LLM..."); 66 | 67 | const result = await extract({ 68 | content: html, 69 | format: ContentFormat.HTML, 70 | sourceUrl: testUrl, 71 | schema: 
productCatalogSchema, 72 | provider: LLMProvider.GOOGLE_GEMINI, 73 | googleApiKey: process.env.GOOGLE_API_KEY, 74 | htmlExtractionOptions: { 75 | extractMainHtml: true, 76 | includeImages: true, 77 | cleanUrls: true, 78 | }, 79 | }); 80 | 81 | console.log("✅ Extraction successful!"); 82 | 83 | console.log("🍞 EXTRACTED PRODUCT CATALOG DATA:"); 84 | console.log("=".repeat(80)); 85 | console.log(JSON.stringify(result.data, null, 2)); 86 | console.log("=".repeat(80)); 87 | 88 | console.log("\n💰 Token Usage:"); 89 | console.log(`Input tokens: ${result.usage.inputTokens}`); 90 | console.log(`Output tokens: ${result.usage.outputTokens}`); 91 | } catch (error) { 92 | console.error("❌ Error during product catalog extraction:", error); 93 | } 94 | } 95 | 96 | async function main() { 97 | if (!process.env.GOOGLE_API_KEY) { 98 | console.error("❌ Please set GOOGLE_API_KEY environment variable"); 99 | process.exit(1); 100 | } 101 | 102 | console.log("🚀 Starting product catalog extraction\n"); 103 | 104 | await testProductCatalogExtraction(); 105 | 106 | console.log("\n🎉 Extraction completed!"); 107 | } 108 | 109 | if (require.main === module) { 110 | main().catch(console.error); 111 | } 112 | 113 | export { testProductCatalogExtraction }; 114 | -------------------------------------------------------------------------------- /src/dev/testHtmlToMarkdown.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { htmlToMarkdown } from "../converters"; 4 | import { HTMLExtractionOptions } from "../types"; 5 | 6 | // Function to convert HTML to Markdown and save the result 7 | async function testConvertHtmlToMarkdown( 8 | htmlFilePath: string, 9 | outputDir: string, 10 | options?: HTMLExtractionOptions 11 | ) { 12 | try { 13 | // Read the HTML file 14 | const html = fs.readFileSync(htmlFilePath, "utf8"); 15 | 16 | // Convert to Markdown 17 | const markdown = htmlToMarkdown(html, 
options); 18 | 19 | // Create output directory if it doesn't exist 20 | if (!fs.existsSync(outputDir)) { 21 | fs.mkdirSync(outputDir, { recursive: true }); 22 | } 23 | 24 | // Generate output filename 25 | const baseName = path.basename(htmlFilePath, ".html"); 26 | const optionsSuffix = options?.includeImages 27 | ? ".with-images" 28 | : options?.extractMainHtml 29 | ? ".main-content" 30 | : ""; 31 | const outputPath = path.join(outputDir, `${baseName}${optionsSuffix}.md`); 32 | 33 | // Save the markdown 34 | fs.writeFileSync(outputPath, markdown); 35 | console.log(`✅ Converted ${htmlFilePath} to ${outputPath}`); 36 | 37 | return outputPath; 38 | } catch (error) { 39 | console.error("❌ Error converting HTML to Markdown:", error); 40 | throw error; 41 | } 42 | } 43 | 44 | // Main function to run the test 45 | async function main() { 46 | // Get the HTML file path from command line arguments 47 | const htmlFilePath = process.argv[2]; 48 | if (!htmlFilePath) { 49 | console.error("❌ Please provide an HTML file path as an argument"); 50 | console.log("Usage: npm run dev:html2md "); 51 | process.exit(1); 52 | } 53 | 54 | // Create output directory 55 | const outputDir = path.join(process.cwd(), "dev-output", "markdown"); 56 | 57 | // Test different conversion options 58 | console.log( 59 | "\n🔍 Testing HTML to Markdown conversion with different options...\n" 60 | ); 61 | 62 | // 1. Basic conversion 63 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir); 64 | 65 | // 2. Conversion with images 66 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir, { 67 | includeImages: true, 68 | }); 69 | 70 | // 3. Main content extraction 71 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir, { 72 | extractMainHtml: true, 73 | }); 74 | 75 | // 4. Both images and main content 76 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir, { 77 | includeImages: true, 78 | extractMainHtml: true, 79 | }); 80 | 81 | console.log( 82 | "\n✨ All conversions completed! 
Check the output in:", 83 | outputDir 84 | ); 85 | } 86 | 87 | // Run the main function 88 | main().catch(console.error); 89 | -------------------------------------------------------------------------------- /src/dev/testUsage.ts: -------------------------------------------------------------------------------- 1 | import { config } from "dotenv"; 2 | import * as path from "path"; 3 | import { z } from "zod"; 4 | import { extract, ContentFormat, LLMProvider } from "../index"; 5 | 6 | // Load environment variables from .env file 7 | config({ path: path.resolve(process.cwd(), ".env") }); 8 | 9 | // A simple test script to verify usage tracking works 10 | async function testUsageTracking() { 11 | console.log("Testing usage tracking with OpenAI..."); 12 | 13 | // Check if API keys are available 14 | if (!process.env.OPENAI_API_KEY) { 15 | console.error("Error: OPENAI_API_KEY environment variable is required"); 16 | process.exit(1); 17 | } 18 | 19 | // Simple schema to test extraction 20 | const schema = z.object({ 21 | title: z.string(), 22 | description: z.string(), 23 | }); 24 | 25 | // Simple markdown content 26 | const markdown = ` 27 | # Hello World 28 | 29 | This is a test of the usage tracking system. 
30 | `; 31 | 32 | try { 33 | // Run extraction 34 | const result = await extract({ 35 | content: markdown, 36 | format: ContentFormat.MARKDOWN, 37 | schema, 38 | provider: LLMProvider.OPENAI, 39 | openaiApiKey: process.env.OPENAI_API_KEY, 40 | }); 41 | 42 | // Log the results 43 | console.log("\nExtracted data:"); 44 | console.log(JSON.stringify(result.data, null, 2)); 45 | 46 | console.log("\nToken usage:"); 47 | console.log(result.usage); 48 | 49 | // Check if usage was captured 50 | if (result.usage.inputTokens && result.usage.outputTokens) { 51 | console.log("\n✅ Usage tracking is working correctly!"); 52 | } else { 53 | console.log("\n❌ Usage tracking failed!"); 54 | } 55 | } catch (error) { 56 | console.error("Error testing usage tracking:", error); 57 | } 58 | } 59 | 60 | // Run the test if executed directly 61 | if (require.main === module) { 62 | testUsageTracking() 63 | .then(() => console.log("Test completed")) 64 | .catch(console.error); 65 | } 66 | -------------------------------------------------------------------------------- /src/example.ts: -------------------------------------------------------------------------------- 1 | import { extract, ContentFormat, LLMProvider } from "./index"; 2 | import { z } from "zod"; 3 | import { config } from "dotenv"; 4 | import * as path from "path"; 5 | import * as fs from "fs"; 6 | import { htmlToMarkdown } from "./converters"; 7 | 8 | // Load environment variables from .env file 9 | config({ path: path.resolve(process.cwd(), ".env") }); 10 | 11 | async function example() { 12 | try { 13 | // Check if API key is available 14 | if (!process.env.GOOGLE_API_KEY) { 15 | console.error("Error: GOOGLE_API_KEY environment variable is required"); 16 | return; 17 | } 18 | 19 | // Define a schema for blog post extraction 20 | const schema = z.object({ 21 | title: z.string(), 22 | author: z.string().optional(), 23 | date: z.string().optional(), 24 | summary: z.string(), 25 | categories: z.array(z.string()).optional(), 26 | 
}); 27 | 28 | const htmlContent = fs.readFileSync( 29 | path.resolve(__dirname, "../tests/fixtures", "blog-post.html"), 30 | "utf8" 31 | ); 32 | const sourceUrl = "https://www.example.com/blog/async-await"; 33 | 34 | const markdown = htmlToMarkdown( 35 | htmlContent, 36 | { 37 | extractMainHtml: true, 38 | includeImages: true, 39 | }, 40 | sourceUrl 41 | ); 42 | 43 | // fs.writeFileSync("test.md", markdown); 44 | 45 | console.log("Running extraction example..."); 46 | 47 | // Extract data from HTML 48 | const result = await extract({ 49 | content: htmlContent, 50 | format: ContentFormat.HTML, 51 | schema, 52 | // Using Google Gemini by default 53 | openaiApiKey: process.env.OPENAI_API_KEY, 54 | provider: LLMProvider.OPENAI, 55 | sourceUrl, 56 | }); 57 | 58 | console.log("Extracted Data:"); 59 | console.log(JSON.stringify(result.data, null, 2)); 60 | 61 | console.log("\nMarkdown Content:"); 62 | console.log(result.processedContent.slice(0, 1000) + "\n..."); 63 | 64 | console.log("\nToken Usage:"); 65 | console.log(result.usage); 66 | } catch (error) { 67 | console.error("Error in example:", error); 68 | } 69 | } 70 | 71 | // Only run if directly executed 72 | if (require.main === module) { 73 | example(); 74 | } 75 | -------------------------------------------------------------------------------- /src/extractors.ts: -------------------------------------------------------------------------------- 1 | import { ChatOpenAI } from "@langchain/openai"; 2 | import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; 3 | import { z } from "zod"; 4 | import { LLMProvider, Usage, ContentFormat } from "./types"; 5 | import { AIMessage } from "@langchain/core/messages"; 6 | import { 7 | safeSanitizedParser, 8 | transformSchemaForLLM, 9 | fixUrlEscapeSequences, 10 | } from "./utils/schemaUtils"; 11 | import { jsonrepair } from "jsonrepair"; 12 | 13 | // Define LLMResult type here since direct import is problematic 14 | interface TokenUsage { 15 | promptTokens?: number; 16 
| completionTokens?: number; 17 | totalTokens?: number; 18 | } 19 | 20 | interface LLMOutput { 21 | tokenUsage?: TokenUsage; 22 | } 23 | 24 | interface LLMResult { 25 | llmOutput?: LLMOutput; 26 | } 27 | 28 | /** 29 | * Get usage statistics from LLM output 30 | */ 31 | export function getUsage(output: LLMResult): Usage { 32 | const usage: Usage = {}; 33 | 34 | if (output.llmOutput && output.llmOutput.tokenUsage) { 35 | usage.inputTokens = output.llmOutput.tokenUsage.promptTokens; 36 | usage.outputTokens = output.llmOutput.tokenUsage.completionTokens; 37 | } 38 | 39 | return usage; 40 | } 41 | 42 | /** 43 | * Create LLM instance based on provider and configuration 44 | */ 45 | export function createLLM( 46 | provider: LLMProvider, 47 | modelName: string, 48 | apiKey: string, 49 | temperature: number = 0 50 | ) { 51 | switch (provider) { 52 | case LLMProvider.OPENAI: 53 | return new ChatOpenAI({ 54 | apiKey, 55 | modelName, 56 | temperature, 57 | }); 58 | 59 | case LLMProvider.GOOGLE_GEMINI: 60 | return new ChatGoogleGenerativeAI({ 61 | apiKey, 62 | model: modelName, 63 | temperature, 64 | }); 65 | 66 | default: 67 | throw new Error(`Unsupported LLM provider: ${provider}`); 68 | } 69 | } 70 | 71 | interface ExtractionPromptOptions { 72 | format: string; 73 | content: string; 74 | customPrompt?: string; 75 | extractionContext?: Record; 76 | } 77 | 78 | interface TruncateContentOptions extends ExtractionPromptOptions { 79 | maxTokens: number; 80 | } 81 | 82 | /** 83 | * Generate the extraction prompt with or without a custom query 84 | */ 85 | export function generateExtractionPrompt({ 86 | format, 87 | content, 88 | customPrompt, 89 | extractionContext, 90 | }: ExtractionPromptOptions): string { 91 | // Base prompt structure that's shared between default and custom prompts 92 | const extractionTask = customPrompt 93 | ? 
`${customPrompt}` 94 | : "Please extract structured information from the provided content."; 95 | 96 | // If extractionContext is provided, include it in the prompt for additional context 97 | let promptTemplate = `Content information is below: 98 | ------ 99 | Format: ${format} 100 | --- 101 | ${content} 102 | ------ 103 | 104 | `; 105 | 106 | if (extractionContext) { 107 | promptTemplate += `Extraction context: 108 | --- 109 | ${JSON.stringify(extractionContext, null, 2)} 110 | ------ 111 | 112 | You are a data extraction assistant that extracts structured information from the above content and context. 113 | 114 | Your task is: ${extractionTask} 115 | 116 | ## Guidelines: 117 | 1. Extract ONLY information explicitly stated in the content or provided in the extraction context 118 | 2. If the extraction context contains partial data objects, enrich and update them with information from the content, overriding existing values when better information is available 119 | 3. If the extraction context contains metadata (URLs, locations, etc.), use it to enhance your understanding and extraction 120 | 4. Do not make assumptions or infer missing data beyond what's provided 121 | 5. Leave fields empty when information is not present or you are uncertain 122 | 6. Follow the required schema exactly 123 | 124 | `; 125 | } else { 126 | promptTemplate += `You are a data extraction assistant that extracts structured information from the above content. 127 | 128 | Your task is: ${extractionTask} 129 | 130 | ## Guidelines: 131 | 1. Extract ONLY information explicitly stated in the content 132 | 2. Do not make assumptions or infer missing data 133 | 3. Leave fields empty when information is not present or you are uncertain 134 | 4. Do not include information that appears incomplete or truncated 135 | 5. 
Follow the required schema exactly 136 | 137 | `; 138 | } 139 | 140 | promptTemplate += `Return only the structured data in valid JSON format and nothing else.`; 141 | 142 | return promptTemplate; 143 | } 144 | 145 | /** 146 | * Truncate content to fit within token limit 147 | * Uses a rough conversion of 4 characters per token 148 | */ 149 | export function truncateContent({ 150 | format, 151 | content, 152 | customPrompt, 153 | extractionContext, 154 | maxTokens, 155 | }: TruncateContentOptions): string { 156 | const maxChars = maxTokens * 4; 157 | 158 | // First generate the full prompt 159 | const fullPrompt = generateExtractionPrompt({ 160 | format, 161 | content, 162 | customPrompt, 163 | extractionContext, 164 | }); 165 | 166 | // If the full prompt is within limits, return original content 167 | if (fullPrompt.length <= maxChars) { 168 | return content; 169 | } 170 | 171 | // Calculate how much we need to reduce the content 172 | const excessChars = fullPrompt.length - maxChars; 173 | 174 | // Truncate content by the excess amount 175 | return content.slice(0, content.length - excessChars); 176 | } 177 | 178 | /** 179 | * Extract structured data from markdown using an LLM 180 | */ 181 | export async function extractWithLLM( 182 | content: string, 183 | schema: T, 184 | provider: LLMProvider, 185 | modelName: string, 186 | apiKey: string, 187 | temperature: number = 0, 188 | customPrompt?: string, 189 | format: string = ContentFormat.MARKDOWN, 190 | maxInputTokens?: number, 191 | extractionContext?: Record 192 | ): Promise<{ data: z.infer; usage: Usage }> { 193 | const llm = createLLM(provider, modelName, apiKey, temperature); 194 | let usage: Usage = {}; 195 | 196 | // Truncate content if maxInputTokens is specified 197 | const truncatedContent = maxInputTokens 198 | ? 
truncateContent({ 199 | format, 200 | content, 201 | customPrompt, 202 | extractionContext, 203 | maxTokens: maxInputTokens, 204 | }) 205 | : content; 206 | 207 | // Generate the prompt using the unified template function 208 | const prompt = generateExtractionPrompt({ 209 | format, 210 | content: truncatedContent, 211 | customPrompt, 212 | extractionContext, 213 | }); 214 | 215 | try { 216 | // Transform schema to be compatible with LLM output (converting url() to string()) 217 | const llmSchema = transformSchemaForLLM(schema); 218 | 219 | // Extract structured data with a withStructuredOutput chain 220 | const structuredOutputLLM = llm.withStructuredOutput(llmSchema, { 221 | includeRaw: true, 222 | }); 223 | 224 | // Create a callback handler for usage tracking 225 | const callbacks = [ 226 | { 227 | handleLLMEnd: (output: any) => { 228 | usage = getUsage(output); 229 | }, 230 | }, 231 | ]; 232 | 233 | // Invoke the LLM with callbacks to track usage 234 | const response = await structuredOutputLLM.invoke(prompt, { callbacks }); 235 | const raw = response.raw as AIMessage; 236 | 237 | let data = response.parsed; 238 | 239 | // If structured output is not successful, try to parse the raw object. 240 | if (data == null) { 241 | // Note: this only works for OpenAI models. 242 | if (raw.tool_calls && raw.tool_calls.length > 0) { 243 | // This is the raw object in JSON mode before structured output tool call. 244 | const rawObject = raw.tool_calls[0].args; 245 | // Manually sanitize the object and remove any unsafe but optional fields or unsafe items in arrays. 246 | data = safeSanitizedParser(llmSchema, rawObject); 247 | } 248 | 249 | // Note: this only works for Google Gemini models. Only attempted when the tool-call fallback above produced nothing, so it cannot overwrite an already recovered result. 250 | if (data == null && raw.lc_kwargs && raw.lc_kwargs.content) { 251 | // Gemini does not return a JSON object, it returns a string that is a JSON object. 252 | // We use jsonrepair to fix the JSON string and then parse it. 
253 | const rawJson = raw.lc_kwargs.content; 254 | const rawObject = JSON.parse(jsonrepair(rawJson)); 255 | data = safeSanitizedParser(llmSchema, rawObject); 256 | } 257 | if (data == null) { 258 | throw new Error("No valid data was extracted"); 259 | } 260 | } 261 | 262 | // If structured output worked, we still need to fix URL escape sequences 263 | // and validate against the original schema 264 | const fixedData = fixUrlEscapeSequences(data, schema); 265 | const validatedData = safeSanitizedParser(schema, fixedData); 266 | // If validation fails, something went wrong with the URL validation 267 | if (validatedData === null) { 268 | throw new Error( 269 | "Extracted data failed validation against original schema" 270 | ); 271 | } 272 | 273 | data = validatedData; 274 | 275 | // Return the parsed data and usage statistics 276 | return { 277 | data, 278 | usage, 279 | }; 280 | } catch (error) { 281 | console.error("Error during LLM extraction:", error); 282 | throw error; 283 | } 284 | } 285 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { htmlToMarkdown } from "./converters"; 3 | import { extractWithLLM } from "./extractors"; 4 | import { 5 | ContentFormat, 6 | LLMProvider, 7 | ExtractorOptions, 8 | ExtractorResult, 9 | HTMLExtractionOptions, 10 | } from "./types"; 11 | 12 | // Default model names 13 | const DEFAULT_MODELS = { 14 | [LLMProvider.GOOGLE_GEMINI]: "gemini-2.5-flash", 15 | [LLMProvider.OPENAI]: "gpt-4o-mini", 16 | }; 17 | 18 | /** 19 | * Extract structured data from HTML, markdown, or plain text content using an LLM 20 | * 21 | * @param options Configuration options for extraction 22 | * @param options.content HTML, markdown, or plain text content to extract from 23 | * @param options.format Content format (HTML, MARKDOWN, or TXT) 24 | * @param options.schema Zod schema defining the 
structure to extract 25 | * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI) 26 | * @param options.modelName Model name to use (provider-specific) 27 | * @param options.googleApiKey Google API key (if using Google Gemini provider) 28 | * @param options.openaiApiKey OpenAI API key (if using OpenAI provider) 29 | * @param options.temperature Temperature for the LLM (0-1) 30 | * @param options.prompt Custom prompt to guide the extraction process 31 | * @param options.sourceUrl URL of the HTML content (required for HTML format) 32 | * @param options.htmlExtractionOptions HTML-specific options for content extraction 33 | * @param options.maxInputTokens Maximum number of input tokens to send to the LLM 34 | * @param options.extractionContext Extraction context that provides additional information for the extraction process (partial data, metadata, etc.) 35 | * @returns The extracted data, original content, and usage statistics 36 | */ 37 | export async function extract( 38 | options: ExtractorOptions 39 | ): Promise>> { 40 | // Validate required parameters 41 | const provider = options.provider ?? LLMProvider.GOOGLE_GEMINI; 42 | let apiKey: string; 43 | 44 | if (provider === LLMProvider.GOOGLE_GEMINI) { 45 | apiKey = options.googleApiKey ?? process.env.GOOGLE_API_KEY ?? ""; 46 | if (!apiKey) { 47 | throw new Error( 48 | "Google API key is required. Provide googleApiKey option or set GOOGLE_API_KEY environment variable." 49 | ); 50 | } 51 | } else if (provider === LLMProvider.OPENAI) { 52 | apiKey = options.openaiApiKey ?? process.env.OPENAI_API_KEY ?? ""; 53 | if (!apiKey) { 54 | throw new Error( 55 | "OpenAI API key is required. Provide openaiApiKey option or set OPENAI_API_KEY environment variable." 
56 | ); 57 | } 58 | } else { 59 | throw new Error(`Unsupported LLM provider: ${provider}`); 60 | } 61 | 62 | // Validate sourceUrl for HTML format 63 | if (options.format === ContentFormat.HTML && !options.sourceUrl) { 64 | throw new Error( 65 | "sourceUrl is required when format is HTML to properly handle relative URLs" 66 | ); 67 | } 68 | 69 | // Get model name (use defaults if not provided) 70 | const modelName = options.modelName ?? DEFAULT_MODELS[provider]; 71 | 72 | // Convert HTML to markdown if needed 73 | let content = options.content; 74 | let formatToUse = options.format; 75 | 76 | if (options.format === ContentFormat.HTML) { 77 | content = htmlToMarkdown( 78 | options.content, 79 | options.htmlExtractionOptions, 80 | options.sourceUrl 81 | ); 82 | // For the LLM, the content is now markdown 83 | formatToUse = ContentFormat.MARKDOWN; 84 | } 85 | 86 | // Extract structured data using LLM 87 | const { data, usage } = await extractWithLLM( 88 | content, 89 | options.schema, 90 | provider, 91 | modelName, 92 | apiKey, 93 | options.temperature ?? 
0, 94 | options.prompt, 95 | formatToUse.toString(), // Pass the correct format based on actual content 96 | options.maxInputTokens, 97 | options.extractionContext 98 | ); 99 | 100 | // Return the full result 101 | return { 102 | data, 103 | processedContent: content, 104 | usage, 105 | }; 106 | } 107 | 108 | /** 109 | * Convert HTML to markdown 110 | * 111 | * @param html HTML content to convert 112 | * @param options HTML extraction options 113 | * @param sourceUrl Source URL for resolving relative links 114 | * @returns Markdown content 115 | */ 116 | export function convertHtmlToMarkdown( 117 | html: string, 118 | options?: HTMLExtractionOptions, 119 | sourceUrl?: string 120 | ): string { 121 | return htmlToMarkdown(html, options, sourceUrl); 122 | } 123 | 124 | // Re-export types and enums 125 | export * from "./types"; 126 | 127 | // Utils 128 | export { safeSanitizedParser } from "./utils/schemaUtils"; 129 | 130 | // Browser utilities 131 | export { 132 | LocalBrowserProvider, 133 | ServerlessBrowserProvider, 134 | RemoteBrowserProvider, 135 | createBrowserProvider, 136 | } from "./utils/browserProviders"; 137 | 138 | // Browser class for web page loading 139 | export { Browser } from "./browser"; 140 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import type { Browser, LaunchOptions, ConnectOverCDPOptions } from "playwright"; 3 | 4 | /** 5 | * Represents the format of the input content 6 | */ 7 | export enum ContentFormat { 8 | HTML = "html", 9 | MARKDOWN = "markdown", 10 | TXT = "txt", 11 | } 12 | 13 | /** 14 | * Supported LLM providers 15 | */ 16 | export enum LLMProvider { 17 | OPENAI = "openai", 18 | GOOGLE_GEMINI = "google_gemini", 19 | } 20 | 21 | /** 22 | * Proxy configuration for network requests 23 | */ 24 | export interface ProxyConfig { 25 | host: string; 26 | port: number; 27 | 
auth?: { 28 | username: string; 29 | password: string; 30 | }; 31 | } 32 | 33 | /** 34 | * Abstract base class for browser providers 35 | */ 36 | export abstract class BrowserProvider { 37 | abstract start(): Promise; 38 | abstract close(): Promise; 39 | abstract getSession(): T | null; 40 | } 41 | 42 | /** 43 | * Configuration for local browser provider 44 | */ 45 | export interface LocalBrowserConfig { 46 | type: "local"; 47 | options?: Omit, "channel">; 48 | headless?: boolean; 49 | proxy?: ProxyConfig; 50 | } 51 | 52 | /** 53 | * Configuration for serverless browser provider 54 | */ 55 | export interface ServerlessBrowserConfig { 56 | type: "serverless"; 57 | executablePath: string; 58 | options?: Omit< 59 | Omit, "channel">, 60 | "executablePath" 61 | >; 62 | headless?: boolean; 63 | proxy?: ProxyConfig; 64 | } 65 | 66 | /** 67 | * Configuration for remote browser provider 68 | */ 69 | export interface RemoteBrowserConfig { 70 | type: "remote"; 71 | wsEndpoint: string; 72 | options?: Omit; 73 | } 74 | 75 | /** 76 | * Union type for all browser configurations 77 | */ 78 | export type BrowserConfig = 79 | | LocalBrowserConfig 80 | | ServerlessBrowserConfig 81 | | RemoteBrowserConfig; 82 | 83 | /** 84 | * Options for HTML content processing 85 | */ 86 | export interface HTMLExtractionOptions { 87 | /** 88 | * When enabled, attempts to extract the main content from HTML, removing navigation bars, headers, footers, etc. 89 | * This uses heuristics to identify the main content area. 90 | * 91 | * Should be kept off (false) when extracting specific details about a single item, 92 | * as it might remove important contextual elements. 93 | * 94 | * Only applies to HTML format, not markdown. 95 | */ 96 | extractMainHtml?: boolean; 97 | 98 | /** 99 | * When enabled, images in the HTML will be included in the markdown output. 100 | * By default, images are excluded to simplify the extraction process. 
101 | * 102 | * Enable this option when you need to extract image information or URLs. 103 | */ 104 | includeImages?: boolean; 105 | 106 | /** 107 | * When enabled, removes tracking parameters and unnecessary URL components to clean up links. 108 | * Currently supports cleaning Amazon product URLs by removing /ref= parameters and everything after. 109 | * This helps produce cleaner, more readable URLs in the markdown output. 110 | * 111 | * Disabled by default to preserve original URLs. 112 | */ 113 | cleanUrls?: boolean; 114 | } 115 | 116 | /** 117 | * Options for the extractor 118 | */ 119 | export interface ExtractorOptions { 120 | /** Content to extract from (HTML, Markdown, or plain text) */ 121 | content: string; 122 | 123 | /** Format of the content */ 124 | format: ContentFormat; 125 | 126 | /** Schema for structured extraction */ 127 | schema: T; 128 | 129 | /** LLM Provider (OpenAI or Google Gemini) */ 130 | provider?: LLMProvider; 131 | 132 | /** Model name to use */ 133 | modelName?: string; 134 | 135 | /** OpenAI API key */ 136 | openaiApiKey?: string; 137 | 138 | /** Google API key */ 139 | googleApiKey?: string; 140 | 141 | /** Temperature for the LLM (0-1), defaults to 0 */ 142 | temperature?: number; 143 | 144 | /** HTML-specific extraction options (only applies when format is HTML) */ 145 | htmlExtractionOptions?: HTMLExtractionOptions; 146 | 147 | /** Custom prompt for extraction (if not provided, a default prompt will be used) */ 148 | prompt?: string; 149 | 150 | /** URL of the HTML content, required when format is HTML to properly handle relative URLs */ 151 | sourceUrl?: string; 152 | 153 | /** Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. */ 154 | maxInputTokens?: number; 155 | 156 | /** 157 | * Extraction context that provides additional information for the extraction process. 
This can include: 158 | * - Partial data objects to be enriched with information from the content 159 | * - Metadata like website URL, user location, access timestamp 160 | * - Domain-specific knowledge or constraints 161 | * - Any other contextual information relevant to the extraction task 162 | * When provided, the LLM will consider this context alongside the content for more accurate extraction. 163 | */ 164 | extractionContext?: Record; 165 | } 166 | 167 | /** 168 | * Usage statistics for LLM calls 169 | */ 170 | export interface Usage { 171 | inputTokens?: number; 172 | outputTokens?: number; 173 | } 174 | 175 | /** 176 | * Result of the extraction process 177 | */ 178 | export interface ExtractorResult { 179 | /** Extracted data according to the schema */ 180 | data: T; 181 | 182 | /** 183 | * Processed content that was sent to the LLM. 184 | * This will be markdown if the input was HTML (after conversion), 185 | * or the original content if the input was already markdown or plain text. 186 | */ 187 | processedContent: string; 188 | 189 | /** Usage statistics */ 190 | usage: Usage; 191 | } 192 | -------------------------------------------------------------------------------- /src/utils/browserProviders.ts: -------------------------------------------------------------------------------- 1 | import { 2 | chromium, 3 | Browser, 4 | LaunchOptions, 5 | ConnectOverCDPOptions, 6 | } from "playwright"; 7 | import { BrowserProvider, ProxyConfig } from "../types"; 8 | 9 | /** 10 | * Local browser provider that launches a Chrome instance locally 11 | */ 12 | export class LocalBrowserProvider extends BrowserProvider { 13 | options: Omit, "channel"> | undefined; 14 | session: Browser | undefined; 15 | proxy: ProxyConfig | null; 16 | headless: boolean; 17 | 18 | constructor(params: { 19 | options?: Omit, "channel">; 20 | headless?: boolean; 21 | proxy?: ProxyConfig; 22 | }) { 23 | super(); 24 | this.options = params.options; 25 | this.proxy = params.proxy ?? 
null; 26 | this.headless = params.headless ?? true; 27 | } 28 | 29 | async start(): Promise { 30 | const launchArgs = this.options?.args ?? []; 31 | const browser = await chromium.launch({ 32 | ...(this.options ?? {}), 33 | channel: "chrome", 34 | headless: this.headless, 35 | args: ["--disable-blink-features=AutomationControlled", ...launchArgs], 36 | ...(this.proxy == null 37 | ? {} 38 | : { 39 | proxy: { 40 | server: `http://${this.proxy.host}:${this.proxy.port}`, 41 | username: this.proxy.auth?.username, 42 | password: this.proxy.auth?.password, 43 | }, 44 | }), 45 | }); 46 | this.session = browser; 47 | return this.session; 48 | } 49 | 50 | async close(): Promise { 51 | return await this.session?.close(); 52 | } 53 | 54 | public getSession() { 55 | if (!this.session) { 56 | return null; 57 | } 58 | return this.session; 59 | } 60 | } 61 | 62 | /** 63 | * Serverless browser provider for environments like AWS Lambda 64 | */ 65 | export class ServerlessBrowserProvider extends BrowserProvider { 66 | options: 67 | | Omit, "channel">, "executablePath"> 68 | | undefined; 69 | session: Browser | undefined; 70 | executablePath: string; 71 | proxy: ProxyConfig | null; 72 | 73 | constructor(params: { 74 | options?: Omit, "channel">; 75 | executablePath: string; 76 | proxy?: ProxyConfig; 77 | }) { 78 | super(); 79 | this.options = params.options; 80 | this.executablePath = params.executablePath; 81 | this.proxy = params.proxy ?? null; 82 | } 83 | 84 | async start(): Promise { 85 | const launchArgs = this.options?.args ?? []; 86 | const browser = await chromium.launch({ 87 | ...(this.options ?? {}), 88 | headless: true, 89 | executablePath: this.executablePath, 90 | args: ["--disable-blink-features=AutomationControlled", ...launchArgs], 91 | ...(this.proxy == null 92 | ? 
{} 93 | : { 94 | proxy: { 95 | server: `http://${this.proxy.host}:${this.proxy.port}`, 96 | username: this.proxy.auth?.username, 97 | password: this.proxy.auth?.password, 98 | }, 99 | }), 100 | }); 101 | this.session = browser; 102 | return this.session; 103 | } 104 | 105 | async close(): Promise { 106 | return await this.session?.close(); 107 | } 108 | 109 | public getSession() { 110 | if (!this.session) { 111 | return null; 112 | } 113 | return this.session; 114 | } 115 | } 116 | 117 | /** 118 | * Remote browser provider that connects to an existing browser instance 119 | */ 120 | export class RemoteBrowserProvider extends BrowserProvider { 121 | options: Omit | undefined; 122 | session: Browser | undefined; 123 | wsEndpoint: string; 124 | 125 | constructor(params: { 126 | wsEndpoint: string; 127 | options?: Omit; 128 | }) { 129 | super(); 130 | this.wsEndpoint = params.wsEndpoint; 131 | this.options = params.options; 132 | } 133 | 134 | async start(): Promise { 135 | const browser = await chromium.connectOverCDP( 136 | this.wsEndpoint, 137 | this.options 138 | ); 139 | this.session = browser; 140 | return this.session; 141 | } 142 | 143 | async close(): Promise { 144 | return await this.session?.close(); 145 | } 146 | 147 | public getSession() { 148 | if (!this.session) { 149 | return null; 150 | } 151 | return this.session; 152 | } 153 | } 154 | 155 | /** 156 | * Factory function to create a browser provider based on configuration 157 | */ 158 | export function createBrowserProvider(config: { 159 | type: "local"; 160 | options?: Omit, "channel">; 161 | headless?: boolean; 162 | proxy?: ProxyConfig; 163 | }): LocalBrowserProvider; 164 | export function createBrowserProvider(config: { 165 | type: "serverless"; 166 | executablePath: string; 167 | options?: Omit< 168 | Omit, "channel">, 169 | "executablePath" 170 | >; 171 | proxy?: ProxyConfig; 172 | }): ServerlessBrowserProvider; 173 | export function createBrowserProvider(config: { 174 | type: "remote"; 175 | 
wsEndpoint: string; 176 | options?: Omit; 177 | }): RemoteBrowserProvider; 178 | export function createBrowserProvider( 179 | config: any 180 | ): LocalBrowserProvider | ServerlessBrowserProvider | RemoteBrowserProvider { 181 | switch (config.type) { 182 | case "local": 183 | return new LocalBrowserProvider({ 184 | options: config.options, 185 | headless: config.headless, 186 | proxy: config.proxy, 187 | }); 188 | case "serverless": 189 | return new ServerlessBrowserProvider({ 190 | options: config.options, 191 | executablePath: config.executablePath, 192 | proxy: config.proxy, 193 | }); 194 | case "remote": 195 | return new RemoteBrowserProvider({ 196 | wsEndpoint: config.wsEndpoint, 197 | options: config.options, 198 | }); 199 | default: 200 | throw new Error(`Unsupported browser provider type: ${config.type}`); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/utils/schemaUtils.ts: -------------------------------------------------------------------------------- 1 | import { 2 | z, 3 | ZodArray, 4 | ZodObject, 5 | ZodOptional, 6 | ZodTypeAny, 7 | ZodNullable, 8 | ZodFirstPartyTypeKind, 9 | } from "zod"; 10 | 11 | /** 12 | * Checks if a schema is a ZodString with URL validation 13 | */ 14 | export function isUrlSchema(schema: ZodTypeAny): boolean { 15 | if (!isZodType(schema, ZodFirstPartyTypeKind.ZodString)) return false; 16 | 17 | // Check if schema has URL validation by checking for internal checks property 18 | // This is a bit of a hack but necessary since Zod doesn't expose validation info 19 | const checks = (schema as any)._def.checks; 20 | if (!checks || !Array.isArray(checks)) return false; 21 | 22 | return checks.some((check) => check.kind === "url"); 23 | } 24 | 25 | /** 26 | * Helper function to check schema type without using instanceof (can fail due to zod version differences) 27 | */ 28 | function isZodType(schema: ZodTypeAny, type: ZodFirstPartyTypeKind): boolean { 29 | return (schema as 
any)._def.typeName === type; 30 | } 31 | 32 | /** 33 | * Transforms a schema, replacing any URL validations with string validations 34 | * for compatibility with LLM output 35 | */ 36 | export function transformSchemaForLLM( 37 | schema: T 38 | ): ZodTypeAny { 39 | // For URL string schemas, remove the URL check but preserve everything else 40 | if (isUrlSchema(schema)) { 41 | const originalDef = { ...(schema as any)._def }; 42 | 43 | // Filter out only URL checks, keep all other checks 44 | if (originalDef.checks && Array.isArray(originalDef.checks)) { 45 | originalDef.checks = originalDef.checks.filter( 46 | (check: any) => check.kind !== "url" 47 | ); 48 | } 49 | 50 | // Create a new string schema with the modified definition 51 | return new z.ZodString({ 52 | ...originalDef, 53 | typeName: z.ZodFirstPartyTypeKind.ZodString, 54 | }); 55 | } 56 | 57 | // For object schemas, transform each property 58 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodObject)) { 59 | const originalDef = { ...(schema as any)._def }; 60 | const newShape: Record = {}; 61 | 62 | // Transform each property in the shape 63 | for (const [key, propertySchema] of Object.entries((schema as any).shape)) { 64 | newShape[key] = transformSchemaForLLM(propertySchema as ZodTypeAny); 65 | } 66 | 67 | // Create a new object with the same definition but transformed shape 68 | return new z.ZodObject({ 69 | ...originalDef, 70 | shape: () => newShape, 71 | typeName: z.ZodFirstPartyTypeKind.ZodObject, 72 | }); 73 | } 74 | 75 | // For array schemas, transform the element schema 76 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodArray)) { 77 | const originalDef = { ...(schema as any)._def }; 78 | const transformedElement = transformSchemaForLLM( 79 | (schema as any).element as ZodTypeAny 80 | ); 81 | 82 | // Create a new array with the same definition but transformed element 83 | return new z.ZodArray({ 84 | ...originalDef, 85 | type: transformedElement, 86 | typeName: z.ZodFirstPartyTypeKind.ZodArray, 87 
| }); 88 | } 89 | 90 | // For optional schemas, transform the inner schema 91 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { 92 | const originalDef = { ...(schema as any)._def }; 93 | const transformedInner = transformSchemaForLLM( 94 | (schema as any).unwrap() as ZodTypeAny 95 | ); 96 | 97 | // Create a new optional with the same definition but transformed inner type 98 | return new z.ZodOptional({ 99 | ...originalDef, 100 | innerType: transformedInner, 101 | typeName: z.ZodFirstPartyTypeKind.ZodOptional, 102 | }); 103 | } 104 | 105 | // For nullable schemas, transform the inner schema 106 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { 107 | const originalDef = { ...(schema as any)._def }; 108 | const transformedInner = transformSchemaForLLM( 109 | (schema as any).unwrap() as ZodTypeAny 110 | ); 111 | 112 | // Create a new nullable with the same definition but transformed inner type 113 | return new z.ZodNullable({ 114 | ...originalDef, 115 | innerType: transformedInner, 116 | typeName: z.ZodFirstPartyTypeKind.ZodNullable, 117 | }); 118 | } 119 | 120 | // Return the original schema for all other types 121 | return schema; 122 | } 123 | 124 | /** 125 | * Fix URL escape sequences in the object based on the original schema 126 | */ 127 | export function fixUrlEscapeSequences(data: any, schema: ZodTypeAny): any { 128 | if (data === null || data === undefined) return data; 129 | 130 | if (isUrlSchema(schema)) { 131 | if (typeof data === "string") { 132 | // Replace escaped parentheses with unescaped versions 133 | return data.replace(/\\\(/g, "(").replace(/\\\)/g, ")"); 134 | } 135 | return data; 136 | } 137 | 138 | if ( 139 | isZodType(schema, ZodFirstPartyTypeKind.ZodObject) && 140 | typeof data === "object" && 141 | !Array.isArray(data) 142 | ) { 143 | const shape = (schema as any).shape; 144 | const result: Record = {}; 145 | 146 | for (const [key, propertySchema] of Object.entries(shape)) { 147 | if (key in data) { 148 | result[key] = 
fixUrlEscapeSequences( 149 | data[key], 150 | propertySchema as ZodTypeAny 151 | ); 152 | } else { 153 | result[key] = data[key]; 154 | } 155 | } 156 | 157 | return result; 158 | } 159 | 160 | if ( 161 | isZodType(schema, ZodFirstPartyTypeKind.ZodArray) && 162 | Array.isArray(data) 163 | ) { 164 | const elementSchema = (schema as any).element as ZodTypeAny; 165 | return data.map((item) => fixUrlEscapeSequences(item, elementSchema)); 166 | } 167 | 168 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { 169 | const innerSchema = (schema as any).unwrap() as ZodTypeAny; 170 | return fixUrlEscapeSequences(data, innerSchema); 171 | } 172 | 173 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { 174 | const innerSchema = (schema as any).unwrap() as ZodTypeAny; 175 | return fixUrlEscapeSequences(data, innerSchema); 176 | } 177 | 178 | return data; 179 | } 180 | 181 | /** 182 | * Sanitizes an object to conform to a Zod schema by removing invalid optional fields or array items. 183 | * If the object can't be sanitized to match the schema, returns null. 
184 | * 185 | * @param schema The Zod schema to validate against 186 | * @param rawObject The raw object to sanitize 187 | * @returns The sanitized object or null if it can't be sanitized 188 | */ 189 | export function safeSanitizedParser( 190 | schema: T, 191 | rawObject: unknown 192 | ): z.infer | null { 193 | try { 194 | // If the raw object is null or undefined, just validate it directly 195 | if (rawObject === null || rawObject === undefined) { 196 | return schema.parse(rawObject); 197 | } 198 | 199 | // Handle different schema types 200 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodObject)) { 201 | return sanitizeObject(schema as any, rawObject); 202 | } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodArray)) { 203 | return sanitizeArray(schema as any, rawObject); 204 | } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { 205 | return sanitizeOptional(schema as any, rawObject); 206 | } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { 207 | return sanitizeNullable(schema as any, rawObject); 208 | } else { 209 | // For primitive values, try to parse directly 210 | return schema.parse(rawObject); 211 | } 212 | } catch (error) { 213 | // If any error occurs during sanitization, return null 214 | return null; 215 | } 216 | } 217 | 218 | /** 219 | * Sanitizes an object against a Zod object schema 220 | */ 221 | function sanitizeObject(schema: ZodObject, rawObject: unknown): any { 222 | if ( 223 | typeof rawObject !== "object" || 224 | rawObject === null || 225 | Array.isArray(rawObject) 226 | ) { 227 | throw new Error("Expected an object"); 228 | } 229 | 230 | const shape = schema.shape; 231 | const result: Record = {}; 232 | const rawObjectRecord = rawObject as Record; 233 | 234 | // Process each property in the schema 235 | for (const [key, propertySchema] of Object.entries(shape)) { 236 | // Check if the property doesn't exist in the raw object 237 | if (!(key in rawObjectRecord)) { 238 | // For nullable properties, add as 
null if missing 239 | if ( 240 | isZodType( 241 | propertySchema as ZodTypeAny, 242 | ZodFirstPartyTypeKind.ZodNullable 243 | ) 244 | ) { 245 | result[key] = null; 246 | } 247 | // For other types (required or optional), skip missing properties 248 | continue; 249 | } 250 | 251 | // If property is optional, try to sanitize it 252 | if ( 253 | isZodType(propertySchema as ZodTypeAny, ZodFirstPartyTypeKind.ZodOptional) 254 | ) { 255 | const sanitized = safeSanitizedParser( 256 | propertySchema as ZodTypeAny, 257 | rawObjectRecord[key] 258 | ); 259 | if (sanitized !== null) { 260 | result[key] = sanitized; 261 | } 262 | // If sanitization fails, just skip the optional property 263 | } else if ( 264 | isZodType(propertySchema as ZodTypeAny, ZodFirstPartyTypeKind.ZodNullable) 265 | ) { 266 | // For nullable properties, try to sanitize or set to null 267 | try { 268 | const sanitized = safeSanitizedParser( 269 | propertySchema as ZodTypeAny, 270 | rawObjectRecord[key] 271 | ); 272 | result[key] = sanitized; 273 | } catch { 274 | // If sanitization fails, set to null for nullable properties 275 | result[key] = null; 276 | } 277 | } else { 278 | // For required properties, try to sanitize and throw if it fails 279 | const sanitized = safeSanitizedParser( 280 | propertySchema as ZodTypeAny, 281 | rawObjectRecord[key] 282 | ); 283 | if (sanitized === null) { 284 | throw new Error(`Required property ${key} could not be sanitized`); 285 | } 286 | result[key] = sanitized; 287 | } 288 | } 289 | 290 | // Validate the final object to ensure it matches the schema 291 | return schema.parse(result); 292 | } 293 | 294 | /** 295 | * Sanitizes an array against a Zod array schema 296 | */ 297 | function sanitizeArray(schema: ZodArray, rawValue: unknown): any { 298 | if (!Array.isArray(rawValue)) { 299 | throw new Error("Expected an array"); 300 | } 301 | 302 | const elementSchema = schema.element as ZodTypeAny; 303 | const sanitizedArray = []; 304 | 305 | // Process each item in the array 
306 | for (const item of rawValue) { 307 | try { 308 | const sanitizedItem = safeSanitizedParser(elementSchema, item); 309 | if (sanitizedItem !== null) { 310 | sanitizedArray.push(sanitizedItem); 311 | } 312 | // If an item can't be sanitized, just skip it 313 | } catch { 314 | // Skip invalid array items 315 | } 316 | } 317 | 318 | // Validate the final array to ensure it matches the schema 319 | return schema.parse(sanitizedArray); 320 | } 321 | 322 | /** 323 | * Sanitizes a value against an optional Zod schema 324 | */ 325 | function sanitizeOptional(schema: ZodOptional, rawValue: unknown): any { 326 | try { 327 | // Try to sanitize using the inner schema 328 | const innerSchema = schema.unwrap(); 329 | const parsed = safeSanitizedParser(innerSchema, rawValue); 330 | // If the parsed value is not valid, return undefined for optional values 331 | if (parsed === null) { 332 | return undefined; 333 | } 334 | return parsed; 335 | } catch { 336 | // If sanitization fails, return undefined for optional values 337 | return undefined; 338 | } 339 | } 340 | 341 | /** 342 | * Sanitizes a value against a nullable Zod schema 343 | */ 344 | function sanitizeNullable(schema: ZodNullable, rawValue: unknown): any { 345 | // If the value is null, return null directly 346 | if (rawValue === null) { 347 | return null; 348 | } 349 | 350 | try { 351 | // Try to sanitize using the inner schema 352 | const innerSchema = schema.unwrap(); 353 | const sanitized = safeSanitizedParser(innerSchema, rawValue); 354 | 355 | // If sanitization of inner schema fails, return null 356 | if (sanitized === null) { 357 | return null; 358 | } 359 | 360 | return sanitized; 361 | } catch { 362 | // If sanitization fails, return null for nullable values 363 | return null; 364 | } 365 | } 366 | -------------------------------------------------------------------------------- /tests/fixtures/article-with-images.html: -------------------------------------------------------------------------------- 1 | 2 | 3 
| 4 | 5 | 6 | Modern Web Development with React and Node.js 7 | 8 | 9 |
10 | 17 |

Modern Web Development with React and Node.js

18 |
19 | Jane Smith 20 | March 20, 2023 21 |
22 | #React 23 | #Node.js 24 | #JavaScript 25 |
26 |
27 |
28 | 29 |
30 | 34 | 35 |

Building modern web applications requires a solid understanding of both front-end and back-end technologies. React has become the industry standard for building interactive user interfaces, while Node.js powers the server-side of many applications.

36 | 37 |

React: Building User Interfaces

38 |

React is a JavaScript library for building user interfaces, particularly single-page applications. It allows developers to create reusable UI components and manage application state efficiently.

39 | 40 |
41 | React Component Example 42 |

43 | function Welcome(props) {
44 |   return <h1>Hello, {props.name}</h1>;
45 | }
46 |       
47 |
48 | 49 |

Node.js: Server-Side JavaScript

50 |

Node.js allows JavaScript to be used for server-side programming. It uses an event-driven, non-blocking I/O model that makes it lightweight and efficient for data-intensive real-time applications.

51 | 52 |
53 | Node.js Event Loop 54 |
The Node.js event loop enables non-blocking I/O operations
55 |
56 | 57 |

Combining React and Node.js

58 |

When combined, React and Node.js create a powerful full-stack JavaScript environment. The front-end is handled by React components, while the back-end API is managed by Node.js.

59 | 60 |

A typical architecture might look like this:

61 |
    62 |
  1. React components for the user interface
  2. 63 |
  3. Redux or Context API for state management
  4. 64 |
  5. Express.js (Node.js framework) for the API layer
  6. 65 |
  7. MongoDB or another database for data persistence
  8. 66 |
67 | 68 | 85 | 86 |

Conclusion

87 |

The combination of React and Node.js provides a consistent development experience across the stack, as both use JavaScript. This allows for better code reuse and a more streamlined development process.

88 | 89 |

Whether you're building a simple web application or a complex enterprise system, the React and Node.js stack offers flexibility, performance, and scalability.

90 |
91 | 92 |
93 |

© 2023 Web Development Blog

94 | Blog Logo 95 |
96 | 97 | -------------------------------------------------------------------------------- /tests/fixtures/blog-post.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Understanding Async/Await in JavaScript 7 | 8 | 9 |
10 |

Understanding Async/Await in JavaScript

11 |
12 | John Doe 13 | January 15, 2023 14 |
15 | #JavaScript 16 | #Programming 17 |
18 |
19 |
20 | 21 |
22 |

Async/await is a modern way to handle asynchronous operations in JavaScript. It was introduced in ES2017 and has since become the preferred method for handling promises.

23 | 24 |

What is Async/Await?

25 |

The async keyword is used to declare an asynchronous function. An async function automatically returns a promise, and the value returned by the function will be resolved with the returned promise.

26 | 27 |

The await keyword can only be used inside an async function. It pauses the execution of the function until the promise is resolved or rejected.

28 | 29 |

Basic Example

30 |

31 | async function fetchData() {
32 |   const response = await fetch('https://api.example.com/data');
33 |   const data = await response.json();
34 |   return data;
35 | }
36 |     
37 | 38 |

In this example, the function will wait for the fetch operation to complete before moving to the next line. This makes asynchronous code look and behave more like synchronous code, making it easier to understand and maintain.

39 | 40 |

Error Handling

41 |

With async/await, you can use try/catch blocks for error handling, which is more intuitive than promise chaining with .catch().

42 | 43 |

44 | async function fetchData() {
45 |   try {
46 |     const response = await fetch('https://api.example.com/data');
47 |     const data = await response.json();
48 |     return data;
49 |   } catch (error) {
50 |     console.error('Error fetching data:', error);
51 |     throw error;
52 |   }
53 | }
54 |     
55 | 56 |

Conclusion

57 |

Async/await makes asynchronous code more readable and maintainable. It's built on promises, so you can still use all the promise methods when needed, but the syntax is cleaner and more intuitive.

58 | 59 |

For more information, visit our JavaScript Tutorials or check out the MDN documentation.

60 |
61 | 62 |
63 |

© 2023 JavaScript Blog

64 |
65 | 66 | -------------------------------------------------------------------------------- /tests/fixtures/product-list.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Smart Home Products 7 | 8 | 9 |
10 |

Smart Home Products

11 |

Top-rated devices to make your home smarter

12 |
13 | 14 |
15 |
16 |

Smart Speaker Pro

17 |
18 | Smart Speaker Pro 19 |
20 |
21 |
$129.99
22 |
★★★★☆ (4.2/5)
23 |
24 |

Premium smart speaker with built-in voice assistant. Control your smart home, play music, or get answers to your questions.

25 |
    26 |
  • 360° sound with deep bass
  • 27 |
  • Multi-room audio support
  • 28 |
  • Compatible with most smart home devices
  • 29 |
  • Available in black, white, and gray
  • 30 |
31 |
32 | View Details 33 |
34 |
35 | 36 |
37 |

Smart Thermostat

38 |
39 | Smart Thermostat 40 |
41 |
42 |
$89.95
43 |
★★★★★ (4.8/5)
44 |
45 |

Energy-efficient smart thermostat that learns your preferences and helps save on utility bills.

46 |
    47 |
  • Easy installation
  • 48 |
  • Compatible with most HVAC systems
  • 49 |
  • Mobile app control
  • 50 |
  • Energy usage reports
  • 51 |
52 |
53 | View Details 54 |
55 |
56 | 57 |
58 |

Smart Security Camera

59 |
60 | Smart Security Camera 61 |
62 |
63 |
$74.50
64 |
★★★★☆ (4.0/5)
65 |
66 |

HD security camera with motion detection, night vision, and two-way audio.

67 |
    68 |
  • 1080p HD video
  • 69 |
  • Cloud storage available
  • 70 |
  • Weather-resistant
  • 71 |
  • Real-time alerts
  • 72 |
73 |
74 | View Details 75 |
76 |
77 |
78 | 79 |
80 |

Prices and availability may vary. Last updated: June 2023

81 |
82 | 83 | -------------------------------------------------------------------------------- /tests/integration/browser-extraction.test.ts: -------------------------------------------------------------------------------- 1 | import { extract, ContentFormat, LLMProvider, Browser } from "../../src/index"; 2 | import { z } from "zod"; 3 | 4 | const testSchema = z.object({ 5 | title: z.string(), 6 | description: z.string().optional(), 7 | mainContent: z.string().optional(), 8 | }); 9 | 10 | describe("Browser + Extraction Integration Tests", () => { 11 | // Test with a simple, stable website 12 | const testUrl = "https://example.com"; 13 | 14 | describe("Browser Class with Google Gemini", () => { 15 | it("should load page and extract data using Browser class", async () => { 16 | // Load HTML using Browser class with direct Playwright API 17 | const browser = new Browser(); 18 | await browser.start(); 19 | 20 | const page = await browser.newPage(); 21 | 22 | try { 23 | await page.goto(testUrl); 24 | 25 | try { 26 | await page.waitForLoadState("networkidle", { timeout: 10000 }); 27 | } catch { 28 | console.log("Network idle timeout, continuing..."); 29 | } 30 | 31 | const html = await page.content(); 32 | await browser.close(); 33 | 34 | // Extract data from the loaded HTML. NOTE(review): the browser was already closed just above and the finally block calls browser.close() again - confirm Browser.close() is safe to call twice 35 | const result = await extract({ 36 | content: html, 37 | format: ContentFormat.HTML, 38 | sourceUrl: testUrl, 39 | schema: testSchema, 40 | provider: LLMProvider.GOOGLE_GEMINI, 41 | googleApiKey: process.env.GOOGLE_API_KEY, 42 | }); 43 | 44 | expect(result.data).toBeDefined(); 45 | expect(result.data.title).toBeDefined(); 46 | expect(typeof result.data.title).toBe("string"); 47 | expect(result.processedContent).toBeDefined(); 48 | expect(result.usage).toBeDefined(); 49 | 50 | // The processed content should be markdown (converted from HTML) 51 | expect(result.processedContent).toContain("Example Domain"); 52 | } catch (error) { 53 | throw error; // Re-throws every error unchanged (no filtering happens here); cleanup is left to finally 54 | } finally { 55 | 
await browser.close(); 56 | } 57 | }); 58 | }); 59 | 60 | describe("Error Handling", () => { 61 | it("should handle navigation errors", async () => { 62 | const browser = new Browser(); 63 | await browser.start(); 64 | 65 | const page = await browser.newPage(); 66 | 67 | // Use a non-existent domain 68 | const unreachableUrl = "https://this-domain-does-not-exist-12345.com"; 69 | 70 | await expect( 71 | page.goto(unreachableUrl, { timeout: 5000 }) 72 | ).rejects.toThrow(); 73 | 74 | await browser.close(); 75 | }); 76 | 77 | it("should handle browser startup errors gracefully", async () => { 78 | // Test with invalid serverless config 79 | const invalidConfig = { 80 | type: "serverless" as const, 81 | executablePath: "/non/existent/path", 82 | }; 83 | 84 | const browser = new Browser(invalidConfig); 85 | 86 | await expect(browser.start()).rejects.toThrow(); 87 | }); 88 | }); 89 | }); 90 | -------------------------------------------------------------------------------- /tests/integration/extract.test.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { z } from "zod"; 4 | import { 5 | extract, 6 | ContentFormat, 7 | LLMProvider, 8 | ExtractorResult, 9 | } from "../../src"; 10 | import { htmlToMarkdown } from "../../src/converters"; 11 | 12 | // Read the sample HTML files 13 | const blogPostHtml = fs.readFileSync( 14 | path.resolve(__dirname, "../fixtures/blog-post.html"), 15 | "utf8" 16 | ); 17 | // Define schemas that will be reused 18 | const blogSchema = z.object({ 19 | title: z.string(), 20 | author: z.string(), 21 | date: z.string(), 22 | tags: z 23 | .array(z.string()) 24 | .optional() 25 | .describe("Tags appear after the date. 
Do not include the # symbol."), 26 | summary: z.string(), 27 | links: z 28 | .array(z.string().url()) 29 | .optional() 30 | .describe("Extract all URLs from the content"), 31 | }); 32 | 33 | // Define a separate schema for OpenAI tests using nullable instead of optional 34 | const blogSchemaOpenAI = z.object({ 35 | title: z.string(), 36 | author: z.string(), 37 | date: z.string(), 38 | tags: z 39 | .array(z.string()) 40 | .nullable() 41 | .describe("Tags appear after the date. Do not include the # symbol."), 42 | summary: z.string(), 43 | links: z 44 | .array(z.string().url()) 45 | .nullable() 46 | .describe("Extract all URLs from the content"), 47 | }); 48 | 49 | // Helper function to verify blog post extraction results 50 | function verifyBlogPostExtraction(result: ExtractorResult): void { 51 | // Check the data is extracted correctly 52 | expect(result.data).toBeDefined(); 53 | expect(result.data.title).toBe("Understanding Async/Await in JavaScript"); 54 | expect(result.data.author).toBe("John Doe"); 55 | expect(result.data.date).toBe("January 15, 2023"); 56 | expect(typeof result.data.summary).toBe("string"); 57 | expect(result.data.summary.length).toBeGreaterThan(0); 58 | expect(result.data.tags).toEqual(["JavaScript", "Programming"]); 59 | 60 | // Verify URLs are extracted and are absolute 61 | expect(result.data.links).toBeDefined(); 62 | expect(Array.isArray(result.data.links)).toBe(true); 63 | expect(result.data.links).toContain( 64 | "https://example.com/blog/javascript-tutorials" 65 | ); 66 | expect(result.data.links).toContain( 67 | "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/async_function" 68 | ); 69 | 70 | // Verify that usage statistics are returned 71 | expect(result.usage).toBeDefined(); 72 | expect(result.usage.inputTokens).toBeGreaterThan(0); 73 | expect(result.usage.outputTokens).toBeGreaterThan(0); 74 | } 75 | 76 | describe("Extract Integration Tests", () => { 77 | describe("Blog Post Extraction", () => { 78 | 
test("should extract blog post data using Google Gemini default model", async () => { 79 | const result = await extract({ 80 | content: blogPostHtml, 81 | format: ContentFormat.HTML, 82 | schema: blogSchema, 83 | provider: LLMProvider.GOOGLE_GEMINI, 84 | googleApiKey: process.env.GOOGLE_API_KEY, 85 | sourceUrl: "https://example.com/blog/async-await", 86 | }); 87 | 88 | verifyBlogPostExtraction(result); 89 | }); 90 | 91 | test("should extract blog post data using OpenAI default model", async () => { 92 | const result = await extract({ 93 | content: blogPostHtml, 94 | format: ContentFormat.HTML, 95 | schema: blogSchemaOpenAI, 96 | provider: LLMProvider.OPENAI, 97 | openaiApiKey: process.env.OPENAI_API_KEY, 98 | sourceUrl: "https://example.com/blog/async-await", 99 | }); 100 | 101 | verifyBlogPostExtraction(result); 102 | }); 103 | }); 104 | 105 | const productListHtml = fs.readFileSync( 106 | path.resolve(__dirname, "../fixtures/product-list.html"), 107 | "utf8" 108 | ); 109 | 110 | const productSchema = z.object({ 111 | products: z.array( 112 | z.object({ 113 | name: z.string(), 114 | price: z.number(), 115 | rating: z.number().optional(), 116 | description: z.string().optional(), 117 | features: z.array(z.string()).optional(), 118 | imageUrl: z.string().url().optional(), 119 | productUrl: z.string().url().optional(), 120 | }) 121 | ), 122 | }); 123 | 124 | // Define a separate schema for OpenAI tests using nullable instead of optional 125 | const productSchemaOpenAI = z.object({ 126 | products: z.array( 127 | z.object({ 128 | name: z.string(), 129 | price: z.number(), 130 | rating: z.number().nullable(), 131 | description: z.string().nullable(), 132 | features: z.array(z.string()).nullable(), 133 | imageUrl: z.string().url().nullable(), 134 | productUrl: z.string().url().nullable(), 135 | }) 136 | ), 137 | }); 138 | 139 | const groundTruthProductList = [ 140 | { 141 | name: "Smart Speaker Pro", 142 | price: 129.99, 143 | rating: 4.2, 144 | description: 145 | 
"Premium smart speaker with built-in voice assistant. Control your smart home, play music, or get answers to your questions.", 146 | features: [ 147 | "360° sound with deep bass", 148 | "Multi-room audio support", 149 | "Compatible with most smart home devices", 150 | "Available in black, white, and gray", 151 | ], 152 | imageUrl: "https://example.com/images/products/speaker.jpg", 153 | productUrl: "https://example.com/products/smart-speaker-pro", 154 | }, 155 | { 156 | name: "Smart Thermostat", 157 | price: 89.95, 158 | rating: 4.8, 159 | description: 160 | "Energy-efficient smart thermostat that learns your preferences and helps save on utility bills.", 161 | features: [ 162 | "Easy installation", 163 | "Compatible with most HVAC systems", 164 | "Mobile app control", 165 | "Energy usage reports", 166 | ], 167 | imageUrl: "https://example.com/images/products/thermostat.jpg", 168 | productUrl: "https://example.com/products/smart-thermostat", 169 | }, 170 | { 171 | name: "Smart Security Camera", 172 | price: 74.5, 173 | rating: 4, 174 | description: 175 | "HD security camera with motion detection, night vision, and two-way audio.", 176 | features: [ 177 | "1080p HD video", 178 | "Cloud storage available", 179 | "Weather-resistant", 180 | "Real-time alerts", 181 | ], 182 | imageUrl: "https://example.com/images/products/camera.jpg", 183 | productUrl: "https://example.com/products/smart-security-camera", 184 | }, 185 | ]; 186 | 187 | // Helper function to verify product list extraction results 188 | function verifyProductListExtraction(result: ExtractorResult): void { 189 | // Check structure, not exact values 190 | expect(result.data).toBeDefined(); 191 | expect(Array.isArray(result.data.products)).toBe(true); 192 | 193 | // Check parity with ground truth data 194 | expect(result.data.products.length).toBe(groundTruthProductList.length); 195 | 196 | // Verify each extracted product matches the ground truth 197 | for (const product of result.data.products) { 198 | // 
Find matching product in ground truth by name 199 | const groundTruthProduct = groundTruthProductList.find( 200 | (p) => p.name === product.name 201 | ); 202 | 203 | // Ensure the product exists in ground truth 204 | expect(groundTruthProduct).toBeDefined(); 205 | 206 | // Compare all product properties 207 | expect(product.price).toBe(groundTruthProduct!.price); 208 | expect(product.rating).toBe(groundTruthProduct!.rating); 209 | expect(product.description).toBe(groundTruthProduct!.description); 210 | expect(product.features).toEqual(groundTruthProduct!.features); 211 | 212 | // Verify URLs are absolute 213 | expect(product.imageUrl).toBe(groundTruthProduct!.imageUrl); 214 | expect(product.productUrl).toBe(groundTruthProduct!.productUrl); 215 | } 216 | 217 | // Verify that usage statistics are returned 218 | expect(result.usage).toBeDefined(); 219 | expect(result.usage.inputTokens).toBeGreaterThan(0); 220 | expect(result.usage.outputTokens).toBeGreaterThan(0); 221 | } 222 | 223 | describe("Product List Extraction", () => { 224 | test("should extract product list data using Google Gemini", async () => { 225 | const result = await extract({ 226 | content: productListHtml, 227 | format: ContentFormat.HTML, 228 | schema: productSchema, 229 | provider: LLMProvider.GOOGLE_GEMINI, 230 | googleApiKey: process.env.GOOGLE_API_KEY, 231 | sourceUrl: "https://example.com/products", 232 | htmlExtractionOptions: { 233 | extractMainHtml: true, 234 | includeImages: true, 235 | }, 236 | }); 237 | verifyProductListExtraction(result); 238 | }); 239 | 240 | test("should extract product list data using OpenAI", async () => { 241 | const result = await extract({ 242 | content: productListHtml, 243 | format: ContentFormat.HTML, 244 | schema: productSchemaOpenAI, 245 | provider: LLMProvider.OPENAI, 246 | openaiApiKey: process.env.OPENAI_API_KEY, 247 | sourceUrl: "https://example.com/products", 248 | htmlExtractionOptions: { 249 | extractMainHtml: true, 250 | includeImages: true, 251 | }, 
252 | }); 253 | verifyProductListExtraction(result); 254 | }); 255 | }); 256 | 257 | const markdownContent = "Product: Apple, Price: N/A"; 258 | 259 | describe("Handle Structured Output Errors", () => { 260 | test("should handle structured output errors using OpenAI", async () => { 261 | const result = await extract({ 262 | content: markdownContent, 263 | format: ContentFormat.MARKDOWN, 264 | schema: z.object({ 265 | product: z.string(), 266 | // For this test, force the price to be N/A and break the schema so we can test the 267 | // structured output error handling. In real life, this could happen if the LLM returns 268 | // a value that is not expected by the schema. 269 | price: z.number().describe("Use 'N/A' if not available").nullable(), 270 | }), 271 | provider: LLMProvider.OPENAI, 272 | openaiApiKey: process.env.OPENAI_API_KEY, 273 | modelName: "gpt-3.5-turbo", 274 | }); 275 | expect(result.data).toEqual( 276 | expect.objectContaining({ 277 | product: expect.stringMatching(/^Apple(?:, Price: N\/A)?$/), 278 | price: null, 279 | }) 280 | ); 281 | }); 282 | 283 | test("should handle structured output errors using Google Gemini", async () => { 284 | const result = await extract({ 285 | content: blogPostHtml, 286 | format: ContentFormat.HTML, 287 | schema: z.object({ 288 | title: z.string(), 289 | author: z.string().optional(), 290 | date: z.string().optional(), 291 | tags: z 292 | .array(z.string()) 293 | .optional() 294 | .describe( 295 | "Tags appear after the date. Do not include the # symbol." 296 | ), 297 | summary: z.string(), 298 | // For this test, adding an additional content field seems to cause the Google Gemini model 299 | // to fail in some cases to return the structured output. 
300 | content: z.string().optional(), 301 | }), 302 | provider: LLMProvider.GOOGLE_GEMINI, 303 | googleApiKey: process.env.GOOGLE_API_KEY, 304 | sourceUrl: "https://example.com/blog/async-await", 305 | }); 306 | expect(result.data).toBeDefined(); 307 | }); 308 | }); 309 | 310 | describe("Special Character Handling", () => { 311 | test("should extract link with special characters from markdown and validate as URL", async () => { 312 | const markdownContent = 313 | "[Meeting \\[11-12-24\\]](https://example.com/meeting-\\(11-12-24\\))"; 314 | 315 | // Use string().url() validation 316 | const schema = z.object({ 317 | title: z.string(), 318 | link: z.string().url(), // Added URL validation 319 | }); 320 | 321 | const result = await extract({ 322 | content: markdownContent, 323 | format: ContentFormat.MARKDOWN, 324 | schema, 325 | provider: LLMProvider.OPENAI, 326 | openaiApiKey: process.env.OPENAI_API_KEY, 327 | }); 328 | 329 | // Verify the extracted data 330 | expect(result.data.title).toBe("Meeting [11-12-24]"); 331 | expect(result.data.link).toBe("https://example.com/meeting-(11-12-24)"); 332 | }); 333 | 334 | test("should extract an array of URLs with special characters", async () => { 335 | const markdownContent = ` 336 | # Meeting Links 337 | 338 | - [Q4 Planning \\(2023\\)](https://example.com/meetings/q4-planning-\\(2023\\)) 339 | - [Budget Review \\[2024\\]](https://example.com/budget/review-\\[2024\\]) 340 | - [Product Launch (May 2024)](https://example.com/products/launch-(may-2024)) 341 | `; 342 | 343 | // Use array of string().url() validation 344 | const schema = z.object({ 345 | title: z.string(), 346 | links: z.array(z.string().url()), 347 | }); 348 | 349 | const result = await extract({ 350 | content: markdownContent, 351 | format: ContentFormat.MARKDOWN, 352 | schema, 353 | provider: LLMProvider.OPENAI, 354 | openaiApiKey: process.env.OPENAI_API_KEY, 355 | }); 356 | 357 | // Verify the extracted data 358 | expect(result.data.title).toBe("Meeting 
Links"); 359 | expect(result.data.links).toContain( 360 | "https://example.com/meetings/q4-planning-(2023)" 361 | ); 362 | expect(result.data.links).toContain( 363 | "https://example.com/budget/review-[2024]" 364 | ); 365 | expect(result.data.links).toContain( 366 | "https://example.com/products/launch-(may-2024)" 367 | ); 368 | }); 369 | }); 370 | 371 | describe("Data Enrichment", () => { 372 | test("should enrich existing data with blog post content using Google Gemini", async () => { 373 | // Create partial data to be enriched 374 | const partialData = { 375 | title: "A Different Title", 376 | date: "February 1, 2022", // This might be updated based on content 377 | summary: "", 378 | }; 379 | 380 | const result = await extract({ 381 | content: blogPostHtml, 382 | format: ContentFormat.HTML, 383 | schema: blogSchema, 384 | provider: LLMProvider.GOOGLE_GEMINI, 385 | googleApiKey: process.env.GOOGLE_API_KEY, 386 | sourceUrl: "https://example.com/blog/async-await", 387 | extractionContext: partialData, 388 | }); 389 | 390 | // Verify the enriched data has the correct values 391 | verifyBlogPostExtraction(result); 392 | }); 393 | 394 | test("should enrich existing data with blog post content using OpenAI", async () => { 395 | // Create partial data with some existing values 396 | const partialData = { 397 | title: "A Different Title", // This should be updated 398 | date: "February 1, 2022", // This might be updated based on content 399 | summary: "", 400 | }; 401 | 402 | const result = await extract({ 403 | content: blogPostHtml, 404 | format: ContentFormat.HTML, 405 | schema: blogSchemaOpenAI, 406 | provider: LLMProvider.OPENAI, 407 | openaiApiKey: process.env.OPENAI_API_KEY, 408 | sourceUrl: "https://example.com/blog/async-await", 409 | extractionContext: partialData, 410 | }); 411 | 412 | // Verify the enriched data has the correct values 413 | verifyBlogPostExtraction(result); 414 | }); 415 | 416 | test("should enrich product list data with custom prompt using 
Google Gemini", async () => { 417 | // Create partial product data with missing information 418 | const partialData = { 419 | products: [ 420 | { 421 | name: "Smart Speaker Pro", 422 | price: 0, // Missing price 423 | features: [], // Missing features 424 | }, 425 | { 426 | name: "Smart Thermostat", 427 | price: 0, // Missing price 428 | features: [], // Missing features 429 | }, 430 | { 431 | name: "Smart Security Camera", 432 | price: 0, // Missing price 433 | features: [], // Missing features 434 | }, 435 | ], 436 | }; 437 | 438 | const result = await extract({ 439 | content: productListHtml, 440 | format: ContentFormat.HTML, 441 | schema: productSchema, 442 | provider: LLMProvider.GOOGLE_GEMINI, 443 | googleApiKey: process.env.GOOGLE_API_KEY, 444 | sourceUrl: "https://example.com/products", 445 | extractionContext: partialData, 446 | prompt: 447 | "Focus on enriching the product data with accurate prices and feature lists from the context.", 448 | }); 449 | 450 | // Verify that prices and features were enriched correctly 451 | expect(result.data).toBeDefined(); 452 | expect(Array.isArray(result.data.products)).toBe(true); 453 | expect(result.data.products.length).toBe(3); 454 | 455 | // Check prices were updated 456 | expect(result.data.products[0].price).toBe(129.99); 457 | expect(result.data.products[1].price).toBe(89.95); 458 | expect(result.data.products[2].price).toBe(74.5); 459 | 460 | // Check features were populated 461 | expect(result.data.products[0].features?.length).toBeGreaterThan(0); 462 | expect(result.data.products[1].features?.length).toBeGreaterThan(0); 463 | expect(result.data.products[2].features?.length).toBeGreaterThan(0); 464 | 465 | // Verify usage stats 466 | expect(result.usage).toBeDefined(); 467 | expect(result.usage.inputTokens).toBeGreaterThan(0); 468 | expect(result.usage.outputTokens).toBeGreaterThan(0); 469 | }); 470 | }); 471 | }); 472 | 473 | // Read the sample HTML file with images 474 | const articleWithImages = 
fs.readFileSync( 475 | path.resolve(__dirname, "../fixtures/article-with-images.html"), 476 | "utf8" 477 | ); 478 | 479 | // Define a schema that includes image extraction 480 | const articleSchema = z.object({ 481 | title: z.string(), 482 | author: z.string(), 483 | date: z.string(), 484 | tags: z 485 | .array(z.string()) 486 | .optional() 487 | .describe("Tags appear after the date. Do not include the # symbol."), 488 | summary: z.string(), 489 | images: z 490 | .array( 491 | z.object({ 492 | url: z.string().url(), 493 | alt: z.string().optional(), 494 | caption: z.string().optional(), 495 | }) 496 | ) 497 | .optional() 498 | .describe( 499 | "Extract all images from the article with their URLs and alt text" 500 | ), 501 | }); 502 | 503 | // Define a separate schema for OpenAI tests using nullable instead of optional 504 | const articleSchemaOpenAI = z.object({ 505 | title: z.string(), 506 | author: z.string(), 507 | date: z.string(), 508 | tags: z 509 | .array(z.string()) 510 | .nullable() 511 | .describe("Tags appear after the date. 
Do not include the # symbol."), 512 | summary: z.string(), 513 | images: z 514 | .array( 515 | z.object({ 516 | url: z.string().url(), 517 | alt: z.string().nullable(), 518 | caption: z.string().nullable(), 519 | }) 520 | ) 521 | .nullable() 522 | .describe( 523 | "Extract all images from the article with their URLs and alt text" 524 | ), 525 | }); 526 | 527 | // Function to verify that images are correctly extracted 528 | function verifyImageExtraction(result: ExtractorResult): void { 529 | // Check the data is extracted correctly 530 | expect(result.data).toBeDefined(); 531 | expect(result.data.title).toBe( 532 | "Modern Web Development with React and Node.js" 533 | ); 534 | expect(result.data.author).toBe("Jane Smith"); 535 | expect(result.data.date).toBe("March 20, 2023"); 536 | expect(result.data.tags).toContain("React"); 537 | expect(result.data.tags).toContain("Node.js"); 538 | expect(result.data.tags).toContain("JavaScript"); 539 | 540 | // Verify that images are extracted 541 | expect(result.data.images).toBeDefined(); 542 | expect(Array.isArray(result.data.images)).toBe(true); 543 | expect(result.data.images.length).toBeGreaterThan(0); 544 | 545 | // Check for the main architecture image 546 | const architectureImage = result.data.images.find((img: any) => 547 | img.url.includes("react-node-architecture.png") 548 | ); 549 | expect(architectureImage).toBeDefined(); 550 | expect(architectureImage.alt).toBe("React and Node.js Architecture"); 551 | 552 | // Check for the event loop image 553 | const eventLoopImage = result.data.images.find((img: any) => 554 | img.url.includes("nodejs-event-loop.jpg") 555 | ); 556 | expect(eventLoopImage).toBeDefined(); 557 | expect(eventLoopImage.alt).toBe("Node.js Event Loop"); 558 | 559 | // Check for the webpack image 560 | const webpackImage = result.data.images.find((img: any) => 561 | img.url.includes("webpack-logo.png") 562 | ); 563 | expect(webpackImage).toBeDefined(); 564 | expect(webpackImage.alt).toBe("Webpack 
Logo"); 565 | expect(webpackImage.caption).toBe("Webpack for module bundling"); 566 | 567 | // Verify that usage statistics are returned 568 | expect(result.usage).toBeDefined(); 569 | expect(result.usage.inputTokens).toBeGreaterThan(0); 570 | expect(result.usage.outputTokens).toBeGreaterThan(0); 571 | } 572 | 573 | describe("Image Extraction Integration Tests", () => { 574 | // Test that the low level htmlToMarkdown function correctly handles images 575 | test("should include images in markdown when includeImages is true", () => { 576 | const markdownWithImages = htmlToMarkdown(articleWithImages, { 577 | includeImages: true, 578 | }); 579 | const markdownWithoutImages = htmlToMarkdown(articleWithImages); 580 | 581 | // With includeImages: true, markdown should contain image references 582 | expect(markdownWithImages).toContain( 583 | "![React and Node.js Architecture](https://example.com/images/react-node-architecture.png)" 584 | ); 585 | expect(markdownWithImages).toContain( 586 | "![Node.js Event Loop](https://example.com/images/nodejs-event-loop.jpg)" 587 | ); 588 | 589 | // Without includeImages, markdown should not contain image references 590 | expect(markdownWithoutImages).not.toContain( 591 | "![React and Node.js Architecture]" 592 | ); 593 | expect(markdownWithoutImages).not.toContain("![Node.js Event Loop]"); 594 | }); 595 | 596 | // Test with OpenAI 597 | test("should extract images using OpenAI when includeImages is true", async () => { 598 | const result = await extract({ 599 | content: articleWithImages, 600 | format: ContentFormat.HTML, 601 | schema: articleSchemaOpenAI, 602 | provider: LLMProvider.OPENAI, 603 | openaiApiKey: process.env.OPENAI_API_KEY, 604 | htmlExtractionOptions: { 605 | includeImages: true, 606 | }, 607 | sourceUrl: "https://example.com/blog/async-await", 608 | }); 609 | 610 | verifyImageExtraction(result); 611 | }); 612 | 613 | // Test with Google Gemini 614 | test("should extract images using Google Gemini when includeImages is 
true", async () => { 615 | const result = await extract({ 616 | content: articleWithImages, 617 | format: ContentFormat.HTML, 618 | schema: articleSchema, 619 | provider: LLMProvider.GOOGLE_GEMINI, 620 | googleApiKey: process.env.GOOGLE_API_KEY, 621 | htmlExtractionOptions: { 622 | includeImages: true, 623 | }, 624 | sourceUrl: "https://example.com/blog/async-await", 625 | }); 626 | 627 | verifyImageExtraction(result); 628 | }); 629 | }); 630 | -------------------------------------------------------------------------------- /tests/integration/html-to-markdown.test.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { htmlToMarkdown } from "../../src/converters"; 4 | import { HTMLExtractionOptions } from "../../src/types"; 5 | 6 | // Flag to check if the test-data submodule exists 7 | const testDataExists = fs.existsSync(path.join(__dirname, "../../test-data")); 8 | 9 | // Skip all tests if the test-data submodule is not available 10 | const testOrSkip = testDataExists ? 
test : test.skip; 11 | 12 | describe("HTML to Markdown Integration Tests", () => { 13 | // Function to test a specific HTML file against its groundtruth markdown 14 | function testConversion( 15 | category: string, 16 | filename: string, 17 | options?: HTMLExtractionOptions, 18 | variant: string = "" 19 | ) { 20 | // Construct file paths 21 | const htmlFilePath = path.join( 22 | __dirname, 23 | "../../test-data/html", 24 | category, 25 | `${filename}.html` 26 | ); 27 | 28 | // Determine the groundtruth file path based on variant 29 | let groundtruthFilename = `${filename}`; 30 | if (variant === "main") { 31 | groundtruthFilename += ".main"; 32 | } else if (variant === "images") { 33 | groundtruthFilename += ".images"; 34 | } 35 | 36 | const markdownFilePath = path.join( 37 | __dirname, 38 | "../../test-data/groundtruth", 39 | category, 40 | `${groundtruthFilename}.md` 41 | ); 42 | 43 | // Skip if files don't exist 44 | if (!fs.existsSync(htmlFilePath) || !fs.existsSync(markdownFilePath)) { 45 | console.warn( 46 | `Skipping test: Missing files for ${category}/${filename}: ${htmlFilePath} or ${markdownFilePath} not found` 47 | ); 48 | return; 49 | } 50 | 51 | // Read files 52 | const html = fs.readFileSync(htmlFilePath, "utf8"); 53 | const expectedMarkdown = fs.readFileSync(markdownFilePath, "utf8"); 54 | 55 | // Convert HTML to Markdown 56 | const actualMarkdown = htmlToMarkdown(html, options); 57 | 58 | // Compare 59 | expect(actualMarkdown).toBe(expectedMarkdown); 60 | } 61 | 62 | // Dynamic test generation - automatically test all files in the test-data directory 63 | if (testDataExists) { 64 | describe("Auto-discovered Tests", () => { 65 | // Get all categories (subdirectories under html/) 66 | const testDataDir = path.join(__dirname, "../../test-data"); 67 | const htmlDir = path.join(testDataDir, "html"); 68 | const categories = fs 69 | .readdirSync(htmlDir, { withFileTypes: true }) 70 | .filter((dirent) => dirent.isDirectory()) 71 | .map((dirent) => 
dirent.name); 72 | 73 | // For each category, get all HTML files and create tests 74 | categories.forEach((category) => { 75 | const categoryDir = path.join(htmlDir, category); 76 | const htmlFiles = fs 77 | .readdirSync(categoryDir) 78 | .filter((file) => file.endsWith(".html")) 79 | .map((file) => file.replace(".html", "")); 80 | 81 | htmlFiles.forEach((filename) => { 82 | // Check which groundtruth files exist for this file 83 | const groundtruthDir = path.join( 84 | testDataDir, 85 | "groundtruth", 86 | category 87 | ); 88 | 89 | // Basic conversion 90 | if (fs.existsSync(path.join(groundtruthDir, `${filename}.md`))) { 91 | testOrSkip( 92 | `should convert ${category}/${filename} to markdown`, 93 | () => { 94 | testConversion(category, filename); 95 | } 96 | ); 97 | } 98 | 99 | // Main content extraction 100 | if (fs.existsSync(path.join(groundtruthDir, `${filename}.main.md`))) { 101 | testOrSkip( 102 | `should extract main content from ${category}/${filename}`, 103 | () => { 104 | testConversion( 105 | category, 106 | filename, 107 | { extractMainHtml: true }, 108 | "main" 109 | ); 110 | } 111 | ); 112 | } 113 | 114 | // Conversion with images 115 | if ( 116 | fs.existsSync(path.join(groundtruthDir, `${filename}.images.md`)) 117 | ) { 118 | testOrSkip( 119 | `should convert ${category}/${filename} with images`, 120 | () => { 121 | testConversion( 122 | category, 123 | filename, 124 | { includeImages: true }, 125 | "images" 126 | ); 127 | } 128 | ); 129 | } 130 | }); 131 | }); 132 | }); 133 | } 134 | }); 135 | -------------------------------------------------------------------------------- /tests/integration/processedContent.test.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { extract, ContentFormat, LLMProvider } from "../../src"; 3 | 4 | describe("ProcessedContent Integration Tests", () => { 5 | const simpleSchema = z.object({ 6 | title: z.string(), 7 | content: 
z.string().nullable(), 8 | }); 9 | 10 | // Skip tests if API keys are not available 11 | const skipIfNoKeys = () => { 12 | if (!process.env.OPENAI_API_KEY) { 13 | return true; 14 | } 15 | return false; 16 | }; 17 | 18 | it("should return original content as processedContent for TXT format", async () => { 19 | if (skipIfNoKeys()) { 20 | console.log("Skipping test: No API keys available"); 21 | return; 22 | } 23 | 24 | const plainTextContent = 25 | "Title: Simple Test\n\nThis is a test of plain text extraction."; 26 | 27 | const result = await extract({ 28 | content: plainTextContent, 29 | format: ContentFormat.TXT, 30 | schema: simpleSchema, 31 | provider: LLMProvider.OPENAI, 32 | openaiApiKey: process.env.OPENAI_API_KEY, 33 | }); 34 | 35 | // Verify the processedContent is the same as the original content 36 | expect(result.processedContent).toBe(plainTextContent); 37 | }, 60000); 38 | 39 | it("should return original content as processedContent for MARKDOWN format", async () => { 40 | if (skipIfNoKeys()) { 41 | console.log("Skipping test: No API keys available"); 42 | return; 43 | } 44 | 45 | const markdownContent = 46 | "# Simple Test\n\nThis is a test of markdown extraction."; 47 | 48 | const result = await extract({ 49 | content: markdownContent, 50 | format: ContentFormat.MARKDOWN, 51 | schema: simpleSchema, 52 | provider: LLMProvider.OPENAI, 53 | openaiApiKey: process.env.OPENAI_API_KEY, 54 | }); 55 | 56 | // Verify the processedContent is the same as the original content 57 | expect(result.processedContent).toBe(markdownContent); 58 | }, 60000); 59 | 60 | it("should return converted markdown as processedContent for HTML format", async () => { 61 | if (skipIfNoKeys()) { 62 | console.log("Skipping test: No API keys available"); 63 | return; 64 | } 65 | 66 | const htmlContent = 67 | "

Simple Test

This is a test of HTML extraction.

"; 68 | 69 | const result = await extract({ 70 | content: htmlContent, 71 | format: ContentFormat.HTML, 72 | schema: simpleSchema, 73 | provider: LLMProvider.OPENAI, 74 | openaiApiKey: process.env.OPENAI_API_KEY, 75 | sourceUrl: "https://example.com", 76 | }); 77 | 78 | // For HTML, processedContent should be the converted markdown 79 | expect(result.processedContent).toContain("Simple Test"); 80 | expect(result.processedContent).toContain( 81 | "This is a test of HTML extraction." 82 | ); 83 | expect(result.processedContent).not.toContain("

"); 84 | expect(result.processedContent).not.toContain("

"); 85 | }, 60000); 86 | }); 87 | -------------------------------------------------------------------------------- /tests/setup.ts: -------------------------------------------------------------------------------- 1 | import { config } from "dotenv"; 2 | import * as path from "path"; 3 | 4 | // Load environment variables from .env file 5 | config({ path: path.resolve(process.cwd(), ".env") }); 6 | 7 | // Set default timeout for tests (useful for tests involving LLM API calls) 8 | jest.setTimeout(60000); 9 | -------------------------------------------------------------------------------- /tests/unit/browser.test.ts: -------------------------------------------------------------------------------- 1 | import { Browser } from "../../src/browser"; 2 | 3 | // Mock browser providers 4 | jest.mock("../../src/utils/browserProviders"); 5 | 6 | describe("Browser Class", () => { 7 | let mockBrowser: any; 8 | let mockPage: any; 9 | let mockContext: any; 10 | let mockProvider: any; 11 | 12 | beforeEach(() => { 13 | jest.clearAllMocks(); 14 | 15 | // Mock page 16 | mockPage = { 17 | goto: jest.fn(), 18 | waitForTimeout: jest.fn(), 19 | waitForLoadState: jest.fn(), 20 | content: jest.fn(), 21 | close: jest.fn(), 22 | title: jest.fn(), 23 | }; 24 | 25 | // Mock context 26 | mockContext = { 27 | newPage: jest.fn().mockResolvedValue(mockPage), 28 | close: jest.fn(), 29 | }; 30 | 31 | // Mock browser 32 | mockBrowser = { 33 | newPage: jest.fn().mockResolvedValue(mockPage), 34 | newContext: jest.fn().mockResolvedValue(mockContext), 35 | close: jest.fn(), 36 | }; 37 | 38 | // Mock provider 39 | mockProvider = { 40 | start: jest.fn().mockResolvedValue(mockBrowser), 41 | close: jest.fn(), 42 | }; 43 | 44 | // Mock createBrowserProvider 45 | const { 46 | createBrowserProvider, 47 | } = require("../../src/utils/browserProviders"); 48 | createBrowserProvider.mockReturnValue(mockProvider); 49 | }); 50 | 51 | describe("Browser instantiation and lifecycle", () => { 52 | it("should create browser 
with default config", () => { 53 | const browser = new Browser(); 54 | expect(browser.isStarted()).toBe(false); 55 | }); 56 | 57 | it("should create browser with custom config", () => { 58 | const config = { 59 | type: "local" as const, 60 | options: { args: ["--disable-dev-shm-usage"] }, 61 | }; 62 | const browser = new Browser(config); 63 | expect(browser.isStarted()).toBe(false); 64 | }); 65 | 66 | it("should start browser successfully", async () => { 67 | const browser = new Browser(); 68 | await browser.start(); 69 | 70 | expect(browser.isStarted()).toBe(true); 71 | expect(mockProvider.start).toHaveBeenCalled(); 72 | 73 | await browser.close(); 74 | }); 75 | 76 | it("should throw error when starting already started browser", async () => { 77 | const browser = new Browser(); 78 | await browser.start(); 79 | 80 | await expect(browser.start()).rejects.toThrow( 81 | "Browser is already started. Call close() first if you want to restart." 82 | ); 83 | 84 | await browser.close(); 85 | }); 86 | 87 | it("should close browser successfully", async () => { 88 | const browser = new Browser(); 89 | await browser.start(); 90 | await browser.close(); 91 | 92 | expect(browser.isStarted()).toBe(false); 93 | expect(mockProvider.close).toHaveBeenCalled(); 94 | }); 95 | 96 | it("should handle closing non-started browser gracefully", async () => { 97 | const browser = new Browser(); 98 | await browser.close(); 99 | 100 | expect(browser.isStarted()).toBe(false); 101 | }); 102 | }); 103 | 104 | describe("Page operations", () => { 105 | it("should create new page when browser is started", async () => { 106 | const browser = new Browser(); 107 | await browser.start(); 108 | 109 | const page = await browser.newPage(); 110 | 111 | expect(page).toBe(mockPage); 112 | expect(mockBrowser.newPage).toHaveBeenCalled(); 113 | 114 | await browser.close(); 115 | }); 116 | 117 | it("should throw error when creating page with non-started browser", async () => { 118 | const browser = new Browser(); 
119 | 120 | await expect(browser.newPage()).rejects.toThrow( 121 | "Browser not started. Call start() first." 122 | ); 123 | }); 124 | 125 | it("should create new browser context", async () => { 126 | const browser = new Browser(); 127 | 128 | // Should fail if browser not started 129 | await expect(browser.newContext()).rejects.toThrow( 130 | "Browser not started. Call start() first." 131 | ); 132 | 133 | await browser.start(); 134 | const context = await browser.newContext(); 135 | expect(context).toBe(mockContext); 136 | expect(mockBrowser.newContext).toHaveBeenCalled(); 137 | 138 | await browser.close(); 139 | }); 140 | 141 | it("should allow direct page operations with Playwright API", async () => { 142 | const browser = new Browser(); 143 | await browser.start(); 144 | 145 | const page = await browser.newPage(); 146 | const url = "https://example.com"; 147 | const htmlContent = "

Test

"; 148 | 149 | mockPage.content.mockResolvedValue(htmlContent); 150 | mockPage.title.mockResolvedValue("Test Title"); 151 | 152 | // Use direct Playwright API 153 | await page.goto(url); 154 | await page.waitForLoadState("networkidle", { timeout: 10000 }); 155 | const html = await page.content(); 156 | const title = await page.title(); 157 | 158 | expect(mockPage.goto).toHaveBeenCalledWith(url); 159 | expect(mockPage.waitForLoadState).toHaveBeenCalledWith("networkidle", { 160 | timeout: 10000, 161 | }); 162 | expect(mockPage.content).toHaveBeenCalled(); 163 | expect(mockPage.title).toHaveBeenCalled(); 164 | expect(html).toBe(htmlContent); 165 | expect(title).toBe("Test Title"); 166 | 167 | await browser.close(); 168 | }); 169 | 170 | it("should handle page errors gracefully", async () => { 171 | const browser = new Browser(); 172 | await browser.start(); 173 | 174 | const page = await browser.newPage(); 175 | const error = new Error("Navigation failed"); 176 | 177 | mockPage.goto.mockRejectedValue(error); 178 | 179 | await expect(page.goto("https://example.com")).rejects.toThrow( 180 | "Navigation failed" 181 | ); 182 | 183 | await browser.close(); 184 | }); 185 | 186 | it("should support multiple pages", async () => { 187 | const browser = new Browser(); 188 | await browser.start(); 189 | 190 | const page1 = await browser.newPage(); 191 | const page2 = await browser.newPage(); 192 | 193 | expect(page1).toBe(mockPage); 194 | expect(page2).toBe(mockPage); 195 | expect(mockBrowser.newPage).toHaveBeenCalledTimes(2); 196 | 197 | await browser.close(); 198 | }); 199 | }); 200 | }); 201 | -------------------------------------------------------------------------------- /tests/unit/browserProviders.test.ts: -------------------------------------------------------------------------------- 1 | import { 2 | LocalBrowserProvider, 3 | ServerlessBrowserProvider, 4 | RemoteBrowserProvider, 5 | createBrowserProvider, 6 | } from "../../src/utils/browserProviders"; 7 | import { 
Browser } from "playwright"; 8 | 9 | // Mock playwright 10 | jest.mock("playwright", () => ({ 11 | chromium: { 12 | launch: jest.fn(), 13 | connectOverCDP: jest.fn(), 14 | }, 15 | })); 16 | 17 | const { chromium } = require("playwright"); 18 | 19 | describe("Browser Providers", () => { 20 | beforeEach(() => { 21 | jest.clearAllMocks(); 22 | }); 23 | 24 | describe("LocalBrowserProvider", () => { 25 | it("should create instance with default options", () => { 26 | const provider = new LocalBrowserProvider({}); 27 | expect(provider).toBeInstanceOf(LocalBrowserProvider); 28 | expect(provider.options).toBeUndefined(); 29 | expect(provider.proxy).toBeNull(); 30 | }); 31 | 32 | it("should create instance with custom options and proxy", () => { 33 | const options = { args: ["--no-sandbox"] }; 34 | const proxy = { host: "proxy.example.com", port: 8080 }; 35 | 36 | const provider = new LocalBrowserProvider({ options, proxy }); 37 | expect(provider.options).toEqual(options); 38 | expect(provider.proxy).toEqual(proxy); 39 | }); 40 | 41 | it("should start browser with correct configuration", async () => { 42 | const mockBrowser = { close: jest.fn() } as unknown as Browser; 43 | chromium.launch.mockResolvedValue(mockBrowser); 44 | 45 | const provider = new LocalBrowserProvider({ 46 | options: { args: ["--test-arg"] }, 47 | proxy: { host: "proxy.test", port: 3128 }, 48 | }); 49 | 50 | const browser = await provider.start(); 51 | 52 | expect(chromium.launch).toHaveBeenCalledWith({ 53 | channel: "chrome", 54 | headless: true, 55 | args: ["--disable-blink-features=AutomationControlled", "--test-arg"], 56 | proxy: { 57 | server: "http://proxy.test:3128", 58 | username: undefined, 59 | password: undefined, 60 | }, 61 | }); 62 | expect(browser).toBe(mockBrowser); 63 | expect(provider.getSession()).toBe(mockBrowser); 64 | }); 65 | 66 | it("should start browser without proxy when not provided", async () => { 67 | const mockBrowser = { close: jest.fn() } as unknown as Browser; 68 | 
chromium.launch.mockResolvedValue(mockBrowser); 69 | 70 | const provider = new LocalBrowserProvider({}); 71 | await provider.start(); 72 | 73 | expect(chromium.launch).toHaveBeenCalledWith({ 74 | channel: "chrome", 75 | headless: true, 76 | args: ["--disable-blink-features=AutomationControlled"], 77 | }); 78 | }); 79 | 80 | it("should close browser", async () => { 81 | const mockBrowser = { close: jest.fn() } as unknown as Browser; 82 | chromium.launch.mockResolvedValue(mockBrowser); 83 | 84 | const provider = new LocalBrowserProvider({}); 85 | await provider.start(); 86 | await provider.close(); 87 | 88 | expect(mockBrowser.close).toHaveBeenCalled(); 89 | }); 90 | 91 | it("should return null when no session", () => { 92 | const provider = new LocalBrowserProvider({}); 93 | expect(provider.getSession()).toBeNull(); 94 | }); 95 | }); 96 | 97 | describe("ServerlessBrowserProvider", () => { 98 | it("should create instance with required parameters", () => { 99 | const executablePath = "/usr/bin/chromium"; 100 | const provider = new ServerlessBrowserProvider({ executablePath }); 101 | 102 | expect(provider).toBeInstanceOf(ServerlessBrowserProvider); 103 | expect(provider.executablePath).toBe(executablePath); 104 | expect(provider.proxy).toBeNull(); 105 | }); 106 | 107 | it("should start browser with executable path", async () => { 108 | const mockBrowser = { close: jest.fn() } as unknown as Browser; 109 | chromium.launch.mockResolvedValue(mockBrowser); 110 | 111 | const executablePath = "/usr/bin/chromium"; 112 | const provider = new ServerlessBrowserProvider({ executablePath }); 113 | 114 | await provider.start(); 115 | 116 | expect(chromium.launch).toHaveBeenCalledWith({ 117 | headless: true, 118 | executablePath, 119 | args: ["--disable-blink-features=AutomationControlled"], 120 | }); 121 | }); 122 | 123 | it("should start browser with proxy configuration", async () => { 124 | const mockBrowser = { close: jest.fn() } as unknown as Browser; 125 | 
chromium.launch.mockResolvedValue(mockBrowser); 126 | 127 | const executablePath = "/usr/bin/chromium"; 128 | const proxy = { 129 | host: "proxy.test", 130 | port: 8080, 131 | auth: { username: "user", password: "pass" }, 132 | }; 133 | 134 | const provider = new ServerlessBrowserProvider({ executablePath, proxy }); 135 | await provider.start(); 136 | 137 | expect(chromium.launch).toHaveBeenCalledWith({ 138 | headless: true, 139 | executablePath, 140 | args: ["--disable-blink-features=AutomationControlled"], 141 | proxy: { 142 | server: "http://proxy.test:8080", 143 | username: "user", 144 | password: "pass", 145 | }, 146 | }); 147 | }); 148 | }); 149 | 150 | describe("RemoteBrowserProvider", () => { 151 | it("should create instance with WebSocket endpoint", () => { 152 | const wsEndpoint = "ws://localhost:9222"; 153 | const provider = new RemoteBrowserProvider({ wsEndpoint }); 154 | 155 | expect(provider).toBeInstanceOf(RemoteBrowserProvider); 156 | expect(provider.wsEndpoint).toBe(wsEndpoint); 157 | }); 158 | 159 | it("should connect to remote browser", async () => { 160 | const mockBrowser = { close: jest.fn() } as unknown as Browser; 161 | chromium.connectOverCDP.mockResolvedValue(mockBrowser); 162 | 163 | const wsEndpoint = "ws://localhost:9222"; 164 | const options = { timeout: 30000 }; 165 | const provider = new RemoteBrowserProvider({ wsEndpoint, options }); 166 | 167 | const browser = await provider.start(); 168 | 169 | expect(chromium.connectOverCDP).toHaveBeenCalledWith(wsEndpoint, options); 170 | expect(browser).toBe(mockBrowser); 171 | }); 172 | }); 173 | 174 | describe("createBrowserProvider", () => { 175 | it("should create LocalBrowserProvider", () => { 176 | const config = { type: "local" as const }; 177 | const provider = createBrowserProvider(config); 178 | expect(provider).toBeInstanceOf(LocalBrowserProvider); 179 | }); 180 | 181 | it("should create ServerlessBrowserProvider", () => { 182 | const config = { 183 | type: "serverless" as const, 184 
| executablePath: "/usr/bin/chromium", 185 | }; 186 | const provider = createBrowserProvider(config); 187 | expect(provider).toBeInstanceOf(ServerlessBrowserProvider); 188 | }); 189 | 190 | it("should create RemoteBrowserProvider", () => { 191 | const config = { 192 | type: "remote" as const, 193 | wsEndpoint: "ws://localhost:9222", 194 | }; 195 | const provider = createBrowserProvider(config); 196 | expect(provider).toBeInstanceOf(RemoteBrowserProvider); 197 | }); 198 | 199 | it("should throw error for unsupported type", () => { 200 | const config = { type: "unsupported" as any }; 201 | expect(() => createBrowserProvider(config)).toThrow( 202 | "Unsupported browser provider type: unsupported" 203 | ); 204 | }); 205 | }); 206 | }); 207 | -------------------------------------------------------------------------------- /tests/unit/converters.test.ts: -------------------------------------------------------------------------------- 1 | import { htmlToMarkdown } from "../../src/converters"; 2 | import { convertHtmlToMarkdown } from "../../src/index"; 3 | 4 | describe("HTML to Markdown converter", () => { 5 | test("should convert simple HTML to markdown", () => { 6 | const html = "

Hello World

This is a test

"; 7 | const markdown = htmlToMarkdown(html); 8 | 9 | expect(markdown).toEqual("Hello World\n===========\n\nThis is a test"); 10 | expect(markdown).toContain("Hello World"); 11 | expect(markdown).toContain("This is a test"); 12 | }); 13 | 14 | test("should handle HTML with attributes", () => { 15 | const html = 16 | '

Title

Paragraph

'; 17 | const markdown = htmlToMarkdown(html); 18 | 19 | expect(markdown).toContain("Title"); 20 | expect(markdown).toContain("Paragraph"); 21 | }); 22 | 23 | // TODO: Add test for end-to-end extraction 24 | test("should escape markdown characters", () => { 25 | const html = 26 | 'Meeting [11-12-24]'; 27 | const markdown = htmlToMarkdown(html); 28 | 29 | expect(markdown).toBe( 30 | "[Meeting \\[11-12-24\\]](https://example.com/meeting-\\(11-12-24\\))" 31 | ); 32 | }); 33 | 34 | test("should convert links correctly", () => { 35 | const html = 'Example'; 36 | const markdown = htmlToMarkdown(html); 37 | 38 | expect(markdown).toBe("[Example](https://example.com)"); 39 | }); 40 | 41 | test("should discard images by default", () => { 42 | const html = 'An image'; 43 | const markdown = htmlToMarkdown(html); 44 | expect(markdown).toBe(""); 45 | }); 46 | 47 | test("should discard images when includeImages is false", () => { 48 | const html = 'An image'; 49 | const markdown = htmlToMarkdown(html, { includeImages: false }); 50 | expect(markdown).toBe(""); 51 | }); 52 | 53 | test("should include images when includeImages is true", () => { 54 | const html = 55 | '

Text with an image: Example image

'; 56 | const markdownWithImages = htmlToMarkdown(html, { includeImages: true }); 57 | const markdownWithoutImages = htmlToMarkdown(html); 58 | 59 | // With includeImages, the image should be converted to markdown format 60 | expect(markdownWithImages).toContain("Text with an image:"); 61 | expect(markdownWithImages).toContain( 62 | "![Example image](https://example.com/image.jpg)" 63 | ); 64 | 65 | // Without includeImages, the image should be removed 66 | expect(markdownWithoutImages).toContain("Text with an image:"); 67 | expect(markdownWithoutImages).not.toContain("![Example image]"); 68 | expect(markdownWithoutImages).not.toContain( 69 | "https://example.com/image.jpg" 70 | ); 71 | }); 72 | 73 | test("should handle complex HTML with multiple images", () => { 74 | const html = ` 75 |
76 |

Test Article

77 |

First paragraph with First image embedded.

78 |
79 | Second image 80 |
Figure caption
81 |
82 | 83 | 84 | 85 | Third image 86 | 87 |

Final paragraph.

88 |
89 | `; 90 | 91 | const markdownWithImages = htmlToMarkdown(html, { includeImages: true }); 92 | 93 | // Check that both images are included 94 | expect(markdownWithImages).toContain("![First image](image1.jpg)"); 95 | expect(markdownWithImages).toContain("![Second image](image2.jpg)"); 96 | expect(markdownWithImages).toContain("![Third image](image3.jpg)"); 97 | expect(markdownWithImages).toContain("Figure caption"); 98 | 99 | // Verify the basic structure is preserved 100 | expect(markdownWithImages).toContain("Test Article"); 101 | expect(markdownWithImages).toContain("First paragraph"); 102 | expect(markdownWithImages).toContain("Final paragraph"); 103 | 104 | // Check without images 105 | const markdownWithoutImages = htmlToMarkdown(html); 106 | expect(markdownWithoutImages).not.toContain("![First image]"); 107 | expect(markdownWithoutImages).not.toContain("![Second image]"); 108 | expect(markdownWithoutImages).not.toContain("![Third image]"); 109 | }); 110 | 111 | test("should extract main content when extractMainHtml is true", () => { 112 | const html = ` 113 | 114 | 115 |
Header content
116 |
117 |

Main Content

118 |

This is the main content

119 |
120 |
Footer content
121 | 122 | 123 | `; 124 | 125 | const markdownWithExtraction = htmlToMarkdown(html, { 126 | extractMainHtml: true, 127 | }); 128 | const markdownWithoutExtraction = htmlToMarkdown(html); 129 | 130 | // With extraction, only the article content should be included 131 | expect(markdownWithExtraction).toContain("Main Content"); 132 | expect(markdownWithExtraction).toContain("This is the main content"); 133 | expect(markdownWithExtraction).not.toContain("Header content"); 134 | expect(markdownWithExtraction).not.toContain("Footer content"); 135 | 136 | // Without extraction, the entire HTML should be converted 137 | expect(markdownWithoutExtraction).toContain("Header content"); 138 | expect(markdownWithoutExtraction).toContain("Main Content"); 139 | expect(markdownWithoutExtraction).toContain("Footer content"); 140 | }); 141 | 142 | describe("URL handling", () => { 143 | test("should convert relative URLs to absolute URLs when sourceUrl is provided", () => { 144 | const html = ` 145 | About Us 146 | Product 147 | Blog Post 148 | Logo 149 | Photo 150 | `; 151 | const sourceUrl = "https://example.com/company/"; 152 | const markdown = htmlToMarkdown(html, { includeImages: true }, sourceUrl); 153 | 154 | // Check that relative URLs are converted to absolute 155 | expect(markdown).toContain("[About Us](https://example.com/about)"); 156 | expect(markdown).toContain( 157 | "[Product](https://example.com/company/products/item.html)" 158 | ); 159 | expect(markdown).toContain( 160 | "[Blog Post](https://example.com/blog/post.html)" 161 | ); 162 | expect(markdown).toContain( 163 | "![Logo](https://example.com/images/logo.png)" 164 | ); 165 | expect(markdown).toContain( 166 | "![Photo](https://example.com/company/assets/photo.jpg)" 167 | ); 168 | }); 169 | 170 | test("should not modify absolute URLs when sourceUrl is provided", () => { 171 | const html = ` 172 | External Link 173 | Email 174 | CDN Image 175 | `; 176 | const sourceUrl = "https://example.com/"; 177 | const markdown 
= htmlToMarkdown(html, { includeImages: true }, sourceUrl); 178 | 179 | // Check that absolute URLs remain unchanged 180 | expect(markdown).toContain( 181 | "[External Link](https://other-site.com/page)" 182 | ); 183 | expect(markdown).toContain("[Email](mailto:user@example.com)"); 184 | expect(markdown).toContain( 185 | "![CDN Image](https://cdn.example.com/image.jpg)" 186 | ); 187 | }); 188 | 189 | test("should handle relative URLs without sourceUrl", () => { 190 | const html = ` 191 | About Us 192 | Logo 193 | `; 194 | const markdown = htmlToMarkdown(html, { includeImages: true }); 195 | 196 | // Check that relative URLs remain unchanged when no sourceUrl is provided 197 | expect(markdown).toContain("[About Us](/about)"); 198 | expect(markdown).toContain("![Logo](/images/logo.png)"); 199 | }); 200 | 201 | test("should handle invalid URLs gracefully", () => { 202 | const html = ` 203 | Invalid Link 204 | Invalid Image 205 | `; 206 | const sourceUrl = "https://example.com/"; 207 | const markdown = htmlToMarkdown(html, { includeImages: true }, sourceUrl); 208 | 209 | // Check that invalid URLs are preserved as-is 210 | expect(markdown).toContain("[Invalid Link](invalid:url)"); 211 | expect(markdown).toContain("![Invalid Image](invalid:url)"); 212 | }); 213 | 214 | describe("URL cleaning", () => { 215 | test("should clean Amazon URLs by removing tracking parameters when cleanUrls is enabled", () => { 216 | const html = ` 217 | Amazon Product 218 | Amazon CA Product 219 | `; 220 | const markdown = htmlToMarkdown(html, { cleanUrls: true }); 221 | 222 | // Check that Amazon URLs are cleaned 223 | expect(markdown).toContain( 224 | "[Amazon Product](https://www.amazon.com/Product-Name-Here/dp/ABCDE01234)" 225 | ); 226 | expect(markdown).toContain( 227 | "[Amazon CA Product](https://amazon.ca/Item-Name/dp/B12345)" 228 | ); 229 | 230 | // Ensure tracking parameters are removed 231 | expect(markdown).not.toContain("/ref="); 232 | expect(markdown).not.toContain("dib="); 233 
| expect(markdown).not.toContain("qid="); 234 | }); 235 | 236 | test("should not clean Amazon URLs by default", () => { 237 | const html = ` 238 | Amazon Product 239 | `; 240 | const markdown = htmlToMarkdown(html); 241 | 242 | // Check that Amazon URLs are NOT cleaned by default 243 | expect(markdown).toContain( 244 | "[Amazon Product](https://www.amazon.com/Product-Name-Here/dp/ABCDE01234/ref=sr_1_47?dib=abc123&qid=1640995200)" 245 | ); 246 | }); 247 | 248 | test("should not clean Amazon URLs when cleanUrls is false", () => { 249 | const html = ` 250 | Amazon Product 251 | `; 252 | const markdown = htmlToMarkdown(html, { cleanUrls: false }); 253 | 254 | // Check that Amazon URLs are NOT cleaned when option is disabled 255 | expect(markdown).toContain( 256 | "[Amazon Product](https://www.amazon.com/Product-Name-Here/dp/ABCDE01234/ref=sr_1_47?dib=abc123&qid=1640995200)" 257 | ); 258 | }); 259 | 260 | test("should not modify non-Amazon URLs", () => { 261 | const html = ` 262 | Regular Link 263 | Shop Link 264 | Image 265 | `; 266 | const markdown = htmlToMarkdown(html, { 267 | includeImages: true, 268 | cleanUrls: true, 269 | }); 270 | 271 | // Check that non-Amazon URLs remain unchanged even with cleanUrls enabled 272 | expect(markdown).toContain( 273 | "[Regular Link](https://example.com/product?utm_source=test&ref=something)" 274 | ); 275 | expect(markdown).toContain( 276 | "[Shop Link](https://shop.example.com/item/ref=special)" 277 | ); 278 | expect(markdown).toContain( 279 | "![Image](https://cdn.example.com/image.jpg?v=123&ref=cache)" 280 | ); 281 | }); 282 | }); 283 | }); 284 | }); 285 | 286 | describe("convertHtmlToMarkdown", () => { 287 | it("should convert HTML to markdown", () => { 288 | const html = "

Hello World

This is a test

"; 289 | const markdown = convertHtmlToMarkdown(html); 290 | expect(markdown).toContain("Hello World"); 291 | expect(markdown).toContain("This is a test"); 292 | }); 293 | 294 | it("should handle HTML extraction options", () => { 295 | const html = ` 296 | 297 |

Main Content

Important text

298 |
Footer
299 | `; 300 | const markdown = convertHtmlToMarkdown(html, { extractMainHtml: true }); 301 | expect(markdown).toContain("Main Content"); 302 | expect(markdown).toContain("Important text"); 303 | // Navigation and footer might be removed by extractMainHtml 304 | }); 305 | 306 | it("should process images when includeImages is true", () => { 307 | const html = '
Test Image
'; 308 | const markdown = convertHtmlToMarkdown(html, { includeImages: true }); 309 | expect(markdown).toContain("![Test Image]"); 310 | }); 311 | 312 | it("should handle source URL for relative links", () => { 313 | const html = 'About'; 314 | const markdown = convertHtmlToMarkdown( 315 | html, 316 | undefined, 317 | "https://example.com" 318 | ); 319 | expect(markdown).toContain("https://example.com/about"); 320 | }); 321 | }); 322 | -------------------------------------------------------------------------------- /tests/unit/extractors.test.ts: -------------------------------------------------------------------------------- 1 | import { 2 | getUsage, 3 | createLLM, 4 | extractWithLLM, 5 | truncateContent, 6 | generateExtractionPrompt, 7 | } from "../../src/extractors"; 8 | import { LLMProvider, ContentFormat } from "../../src/types"; 9 | import { z } from "zod"; 10 | 11 | // Mock the LLM providers 12 | jest.mock("@langchain/openai", () => ({ 13 | ChatOpenAI: jest.fn().mockImplementation(() => ({ 14 | constructor: { name: "ChatOpenAI" }, 15 | withStructuredOutput: jest.fn().mockImplementation(() => ({ 16 | invoke: jest.fn().mockResolvedValue({ 17 | parsed: { title: "Test Title", content: "Test Content" }, 18 | raw: { 19 | tool_calls: [ 20 | { 21 | args: { title: "Test Title", content: "Test Content" }, 22 | }, 23 | ], 24 | }, 25 | }), 26 | })), 27 | })), 28 | })); 29 | 30 | jest.mock("@langchain/google-genai", () => ({ 31 | ChatGoogleGenerativeAI: jest.fn().mockImplementation(() => ({ 32 | constructor: { name: "ChatGoogleGenerativeAI" }, 33 | withStructuredOutput: jest.fn().mockImplementation(() => ({ 34 | invoke: jest.fn().mockResolvedValue({ 35 | parsed: { title: "Test Title", content: "Test Content" }, 36 | raw: { 37 | lc_kwargs: { 38 | content: '{"title":"Test Title","content":"Test Content"}', 39 | }, 40 | }, 41 | }), 42 | })), 43 | })), 44 | })); 45 | 46 | describe("extractors", () => { 47 | const mockSchema = z.object({ 48 | title: z.string(), 49 | content: 
z.string(), 50 | }); 51 | 52 | const mockContent = "Test content"; 53 | const mockApiKey = "test-api-key"; 54 | 55 | beforeEach(() => { 56 | jest.clearAllMocks(); 57 | }); 58 | 59 | describe("getUsage", () => { 60 | it("should extract usage statistics from LLM output", () => { 61 | const mockOutput = { 62 | llmOutput: { 63 | tokenUsage: { 64 | promptTokens: 100, 65 | completionTokens: 50, 66 | totalTokens: 150, 67 | }, 68 | }, 69 | }; 70 | 71 | const usage = getUsage(mockOutput); 72 | 73 | expect(usage.inputTokens).toBe(100); 74 | expect(usage.outputTokens).toBe(50); 75 | }); 76 | 77 | it("should handle missing token usage", () => { 78 | const mockOutput = { 79 | llmOutput: {}, 80 | }; 81 | 82 | const usage = getUsage(mockOutput); 83 | 84 | expect(usage.inputTokens).toBeUndefined(); 85 | expect(usage.outputTokens).toBeUndefined(); 86 | }); 87 | 88 | it("should handle missing llmOutput", () => { 89 | const mockOutput = {}; 90 | 91 | const usage = getUsage(mockOutput); 92 | 93 | expect(usage.inputTokens).toBeUndefined(); 94 | expect(usage.outputTokens).toBeUndefined(); 95 | }); 96 | }); 97 | 98 | describe("createLLM", () => { 99 | it("should create ChatOpenAI instance for OPENAI provider", () => { 100 | const llm = createLLM( 101 | LLMProvider.OPENAI, 102 | "gpt-4o-mini", 103 | "fake-api-key", 104 | 0 105 | ); 106 | 107 | expect(llm).toBeDefined(); 108 | expect(llm.constructor.name).toBe("ChatOpenAI"); 109 | }); 110 | 111 | it("should create ChatGoogleGenerativeAI instance for GOOGLE_GEMINI provider", () => { 112 | const llm = createLLM( 113 | LLMProvider.GOOGLE_GEMINI, 114 | "gemini-2.5-flash", 115 | "fake-api-key", 116 | 0 117 | ); 118 | 119 | expect(llm).toBeDefined(); 120 | expect(llm.constructor.name).toBe("ChatGoogleGenerativeAI"); 121 | }); 122 | 123 | it("should throw error for unsupported provider", () => { 124 | expect(() => { 125 | // @ts-ignore - Testing invalid provider 126 | createLLM("unsupported-provider", "model", "api-key", 0); 127 | 
}).toThrow("Unsupported LLM provider"); 128 | }); 129 | }); 130 | 131 | describe("extractWithLLM", () => { 132 | it("should extract data using OpenAI", async () => { 133 | const result = await extractWithLLM( 134 | mockContent, 135 | mockSchema, 136 | LLMProvider.OPENAI, 137 | "gpt-4o-mini", 138 | mockApiKey 139 | ); 140 | 141 | expect(result.data).toEqual({ 142 | title: "Test Title", 143 | content: "Test Content", 144 | }); 145 | }); 146 | 147 | it("should extract data using Google Gemini", async () => { 148 | const result = await extractWithLLM( 149 | mockContent, 150 | mockSchema, 151 | LLMProvider.GOOGLE_GEMINI, 152 | "gemini-2.5-flash", 153 | mockApiKey 154 | ); 155 | 156 | expect(result.data).toEqual({ 157 | title: "Test Title", 158 | content: "Test Content", 159 | }); 160 | }); 161 | 162 | it("should handle custom prompts", async () => { 163 | const customPrompt = "Extract the main topic and summary"; 164 | const result = await extractWithLLM( 165 | mockContent, 166 | mockSchema, 167 | LLMProvider.OPENAI, 168 | "gpt-4o-mini", 169 | mockApiKey, 170 | 0, 171 | customPrompt 172 | ); 173 | 174 | expect(result.data).toEqual({ 175 | title: "Test Title", 176 | content: "Test Content", 177 | }); 178 | }); 179 | 180 | it("should handle different content formats", async () => { 181 | const result = await extractWithLLM( 182 | mockContent, 183 | mockSchema, 184 | LLMProvider.OPENAI, 185 | "gpt-4o-mini", 186 | mockApiKey, 187 | 0, 188 | undefined, 189 | ContentFormat.TXT 190 | ); 191 | 192 | expect(result.data).toEqual({ 193 | title: "Test Title", 194 | content: "Test Content", 195 | }); 196 | }); 197 | 198 | it("should handle extraction context", async () => { 199 | const extractionContext = { 200 | title: "Existing Title", 201 | content: "", // Empty field that should be filled 202 | }; 203 | 204 | const result = await extractWithLLM( 205 | mockContent, 206 | mockSchema, 207 | LLMProvider.OPENAI, 208 | "gpt-4o-mini", 209 | mockApiKey, 210 | 0, 211 | undefined, 212 | 
ContentFormat.TXT, 213 | undefined, 214 | extractionContext 215 | ); 216 | 217 | expect(result.data).toEqual({ 218 | title: "Test Title", 219 | content: "Test Content", 220 | }); 221 | }); 222 | }); 223 | 224 | describe("truncateContent", () => { 225 | it("should not truncate content when full prompt is within limit", () => { 226 | const prompt = generateExtractionPrompt({ 227 | format: ContentFormat.TXT, 228 | content: "", 229 | }); 230 | const content = "This is a short test content."; 231 | const result = truncateContent({ 232 | content, 233 | maxTokens: (prompt.length + content.length) / 4, 234 | format: ContentFormat.TXT, 235 | }); 236 | expect(result).toBe(content); 237 | }); 238 | 239 | it("should truncate content by excess amount", () => { 240 | const prompt = generateExtractionPrompt({ 241 | format: ContentFormat.TXT, 242 | content: "", 243 | }); 244 | // Create a content that will make the full prompt exceed the limit 245 | const content = "This is a longer test content that should be truncated."; 246 | const result = truncateContent({ 247 | content, 248 | maxTokens: (prompt.length + content.length) / 4 - 1, 249 | format: ContentFormat.TXT, 250 | }); 251 | expect(result.length).toBe(content.length - 4); 252 | }); 253 | 254 | it("should account for extractionContext in prompt size calculation", () => { 255 | const prompt = generateExtractionPrompt({ 256 | format: ContentFormat.TXT, 257 | content: "", 258 | extractionContext: { a: 1, b: 2 }, 259 | }); 260 | 261 | const content = "This is a test content for enrichment."; 262 | const result = truncateContent({ 263 | content, 264 | maxTokens: (prompt.length + content.length) / 4 - 1, 265 | format: ContentFormat.TXT, 266 | extractionContext: { a: 1, b: 2 }, 267 | }); 268 | 269 | expect(result.length).toBe(content.length - 4); 270 | }); 271 | }); 272 | 273 | describe("generateExtractionPrompt", () => { 274 | it("should generate a basic extraction prompt without extractionContext", () => { 275 | const prompt = 
generateExtractionPrompt({ 276 | format: ContentFormat.TXT, 277 | content: "Some test content", 278 | }); 279 | 280 | expect(prompt).toContain("Content information is below:"); 281 | expect(prompt).toContain("Format: txt"); 282 | expect(prompt).toContain("Some test content"); 283 | expect(prompt).toContain("You are a data extraction assistant"); 284 | expect(prompt).toContain( 285 | "Extract ONLY information explicitly stated in the content" 286 | ); 287 | expect(prompt).not.toContain("Extraction context"); 288 | expect(prompt).toContain( 289 | "Return only the structured data in valid JSON format" 290 | ); 291 | }); 292 | 293 | it("should generate a context-aware prompt with extractionContext", () => { 294 | const extractionContext = { 295 | title: "Existing Title", 296 | author: "", 297 | tags: ["existing"], 298 | }; 299 | 300 | const prompt = generateExtractionPrompt({ 301 | format: ContentFormat.MARKDOWN, 302 | content: "Some markdown content", 303 | extractionContext, 304 | }); 305 | 306 | expect(prompt).toContain("Content information is below:"); 307 | expect(prompt).toContain("Format: markdown"); 308 | expect(prompt).toContain("Some markdown content"); 309 | expect(prompt).toContain("Extraction context"); 310 | expect(prompt).toContain(JSON.stringify(extractionContext, null, 2)); 311 | expect(prompt).toContain( 312 | "You are a data extraction assistant that extracts structured information from the above content and context" 313 | ); 314 | expect(prompt).toContain( 315 | "If the extraction context contains partial data objects, enrich and update them with information from the content" 316 | ); 317 | expect(prompt).toContain( 318 | "Return only the structured data in valid JSON format" 319 | ); 320 | }); 321 | 322 | it("should include custom prompt in the instructions", () => { 323 | const customPrompt = "Extract only product information and prices"; 324 | const extractionContext = { products: [] }; 325 | 326 | const prompt = generateExtractionPrompt({ 327 | 
format: ContentFormat.HTML, 328 | content: "
Product content
", 329 | customPrompt, 330 | extractionContext, 331 | }); 332 | 333 | expect(prompt).toContain(customPrompt); 334 | expect(prompt).toContain("Extraction context"); 335 | expect(prompt).toContain(JSON.stringify(extractionContext, null, 2)); 336 | }); 337 | }); 338 | }); 339 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig to read more about this file */ 4 | 5 | /* Projects */ 6 | // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ 7 | // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ 8 | // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ 9 | // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ 10 | // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ 11 | // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ 12 | 13 | /* Language and Environment */ 14 | "target": "es2018", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ 15 | // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ 16 | // "jsx": "preserve", /* Specify what JSX code is generated. */ 17 | // "libReplacement": true, /* Enable lib replacement. */ 18 | // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ 19 | // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. 
*/ 20 | // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ 21 | // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ 22 | // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ 23 | // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ 24 | // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ 25 | // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ 26 | // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ 27 | 28 | /* Modules */ 29 | "module": "commonjs", /* Specify what module code is generated. */ 30 | "rootDir": "./src", 31 | "moduleResolution": "node", 32 | // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ 33 | // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ 34 | // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ 35 | // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ 36 | // "types": [], /* Specify type package names to be included without being referenced in a source file. */ 37 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 38 | // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ 39 | // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. 
*/ 40 | // "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */ 41 | // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ 42 | // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ 43 | // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ 44 | // "noUncheckedSideEffectImports": true, /* Check side effect imports. */ 45 | // "resolveJsonModule": true, /* Enable importing .json files. */ 46 | // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ 47 | // "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */ 48 | 49 | /* JavaScript Support */ 50 | // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ 51 | // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ 52 | // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ 53 | 54 | /* Emit */ 55 | "declaration": true, 56 | // "declarationMap": true, /* Create sourcemaps for d.ts files. */ 57 | // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ 58 | "sourceMap": true, 59 | // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ 60 | // "noEmit": true, /* Disable emitting files from a compilation. */ 61 | // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. 
*/ 62 | "outDir": "./dist", 63 | // "removeComments": true, /* Disable emitting comments. */ 64 | // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ 65 | // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ 66 | // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ 67 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 68 | // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ 69 | // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ 70 | // "newLine": "crlf", /* Set the newline character for emitting files. */ 71 | // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ 72 | // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ 73 | // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ 74 | // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ 75 | // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ 76 | 77 | /* Interop Constraints */ 78 | // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ 79 | // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ 80 | // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */ 81 | // "erasableSyntaxOnly": true, /* Do not allow runtime constructs that are not part of ECMAScript. 
*/ 82 | // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ 83 | "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ 84 | // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ 85 | "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ 86 | 87 | /* Type Checking */ 88 | "strict": true, /* Enable all strict type-checking options. */ 89 | // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ 90 | // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ 91 | // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ 92 | // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ 93 | // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ 94 | // "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */ 95 | // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ 96 | // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ 97 | // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ 98 | // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ 99 | // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ 100 | // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. 
*/ 101 | // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ 102 | // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ 103 | // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ 104 | // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ 105 | // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ 106 | // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ 107 | // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ 108 | 109 | /* Completeness */ 110 | // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ 111 | "skipLibCheck": true /* Skip type checking all .d.ts files. */ 112 | }, 113 | "include": ["src/**/*"], 114 | "exclude": ["node_modules", "dist", "**/*.test.ts"] 115 | } 116 | --------------------------------------------------------------------------------