├── .env.example ├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .gitmodules ├── .npmignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── jest.config.js ├── package-lock.json ├── package.json ├── src ├── converters.ts ├── dev │ ├── regenerateGroundTruth.ts │ ├── runLocalTest.ts │ ├── testHtmlToMarkdown.ts │ └── testUsage.ts ├── example.ts ├── extractors.ts ├── index.ts ├── types.ts └── utils │ └── schemaUtils.ts ├── tests ├── fixtures │ ├── article-with-images.html │ ├── blog-post.html │ └── product-list.html ├── integration │ ├── extract.test.ts │ ├── html-to-markdown.test.ts │ └── processedContent.test.ts ├── setup.ts └── unit │ ├── converters.test.ts │ ├── extractors.test.ts │ └── schemaUtils.test.ts └── tsconfig.json /.env.example: -------------------------------------------------------------------------------- 1 | # API Keys for testing 2 | GOOGLE_API_KEY=your_google_api_key_here 3 | OPENAI_API_KEY=your_openai_api_key_here 4 | 5 | # Test configuration 6 | TEST_TIMEOUT=30000 7 | LOG_LEVEL=info -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Package to NPM 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Run workflow on version tags, e.g. 
v1.0.0 7 | 8 | permissions: 9 | contents: write 10 | packages: write 11 | 12 | jobs: 13 | build-and-publish: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 # Fetch all history for proper versioning 19 | 20 | - name: Use Node.js 21 | uses: actions/setup-node@v3 22 | with: 23 | node-version: '20.x' 24 | registry-url: 'https://registry.npmjs.org/' 25 | cache: 'npm' 26 | 27 | - name: Install dependencies 28 | run: npm ci 29 | 30 | - name: Build package 31 | run: npm run build 32 | 33 | - name: Run unit tests 34 | run: npm run test:unit 35 | 36 | - name: Update test data submodule 37 | run: npm run test:html2md:update 38 | 39 | - name: Run integration tests 40 | run: npm run test:integration 41 | env: 42 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 43 | GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} 44 | 45 | - name: Generate release notes 46 | id: release 47 | run: | 48 | VERSION=${GITHUB_REF#refs/tags/} 49 | echo "version=$VERSION" >> $GITHUB_OUTPUT 50 | # Extract changes from git log or CHANGELOG if available 51 | CHANGES=$(git log --pretty=format:"* %s (%h)" $(git describe --tags --abbrev=0 HEAD^)..HEAD || echo "Initial release") 52 | echo "CHANGES<<EOF" >> $GITHUB_ENV 53 | echo "$CHANGES" >> $GITHUB_ENV 54 | echo "EOF" >> $GITHUB_ENV 55 | 56 | - name: Create GitHub Release 57 | uses: softprops/action-gh-release@v1 58 | with: 59 | name: Release ${{ steps.release.outputs.version }} 60 | body: | 61 | ## Changes in this release 62 | 63 | ${{ env.CHANGES }} 64 | 65 | For full details, see the [CHANGELOG](https://github.com/lightfeed/extractor/blob/main/CHANGELOG.md). 
66 | draft: false 67 | prerelease: false 68 | 69 | - name: Publish to NPM 70 | run: npm publish --access public 71 | env: 72 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | - cron: '0 0 * * 1' # Run weekly on Monday at midnight UTC 10 | 11 | jobs: 12 | unit-tests: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | node-version: [18.x, 20.x] 17 | 18 | steps: 19 | - uses: actions/checkout@v3 20 | 21 | - name: Use Node.js ${{ matrix.node-version }} 22 | uses: actions/setup-node@v3 23 | with: 24 | node-version: ${{ matrix.node-version }} 25 | cache: 'npm' 26 | 27 | - name: Install dependencies 28 | run: npm ci 29 | 30 | - name: Run unit tests 31 | run: npm run test:unit 32 | 33 | integration-tests: 34 | runs-on: ubuntu-latest 35 | needs: unit-tests 36 | strategy: 37 | matrix: 38 | node-version: [20.x] 39 | 40 | steps: 41 | - uses: actions/checkout@v3 42 | 43 | - name: Use Node.js ${{ matrix.node-version }} 44 | uses: actions/setup-node@v3 45 | with: 46 | node-version: ${{ matrix.node-version }} 47 | cache: 'npm' 48 | 49 | - name: Install dependencies 50 | run: npm ci 51 | 52 | - name: Update test data submodule 53 | run: npm run test:html2md:update 54 | 55 | - name: Run integration tests 56 | run: npm run test:integration 57 | env: 58 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 59 | GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | yarn.lock 4 | 5 | # Build output 6 | dist/ 7 | build/ 8 | lib/ 9 | 10 | # Environment variables 11 | .env 12 | .env.local 
13 | .env.*.local 14 | 15 | # Logs 16 | logs 17 | *.log 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | 22 | # IDE and editors 23 | .idea/ 24 | .vscode/ 25 | *.swp 26 | *.swo 27 | .DS_Store 28 | 29 | # Test coverage 30 | coverage/ 31 | 32 | # Temporary files 33 | tmp/ 34 | temp/ 35 | 36 | # Optionally fetched test data submodule 37 | /test-data/ 38 | 39 | # Dev test output 40 | /dev-output/ 41 | 42 | # Keep the .gitmodules file that defines the submodule 43 | !/.gitmodules -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "test-data"] 2 | path = test-data 3 | url = https://github.com/lightfeed/extractor-test-data.git 4 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # Source 2 | src/ 3 | tests/ 4 | test-data/ 5 | 6 | # Config files 7 | .github/ 8 | .git/ 9 | .gitignore 10 | .gitmodules 11 | .editorconfig 12 | .prettierrc 13 | .eslintrc 14 | .env* 15 | .vscode/ 16 | tsconfig.json 17 | tslint.json 18 | jest.config.js 19 | 20 | # Build artifacts 21 | coverage/ 22 | node_modules/ 23 | 24 | # Development files 25 | *.log 26 | .DS_Store 27 | examples/ 28 | src/dev/ 29 | src/example.ts -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
 7 | 8 | ## [Unreleased] 9 | 10 | ## [0.1.7] - 2025-06-07 11 | 12 | ### Changed 13 | - Update README to use @lightfeed/extractor as new npm project 14 | 15 | ## [0.1.6] - 2025-06-07 16 | 17 | ### Changed 18 | - Update project name to lightfeed/extractor and publish to npm project @lightfeed/extractor 19 | 20 | ## [0.1.5] - 2025-05-14 21 | 22 | ### Fixed 23 | - Improve main html content extraction - preserve option, label and select (can be important for product detail pages) 24 | 25 | ## [0.1.4] - 2025-05-13 26 | 27 | ### Fixed 28 | - Fixed schema conversion bug when input zod schema is from a different zod version 29 | 30 | ## [0.1.3] - 2025-05-13 31 | 32 | ### Added 33 | - Use processedContent instead of markdown in response 34 | - Improve enrich prompt to not remove any fields from the original JSON object 35 | 36 | ## [0.1.2] - 2025-05-12 37 | 38 | ### Added 39 | - Support enriching data 40 | - Handle nullable instead of optional in schema. This is required for schema in OpenAI models 41 | 42 | ## [0.1.1] - 2025-05-11 43 | 44 | ### Added 45 | - Initial release with core functionality 46 | - HTML to Markdown conversion with main content extraction 47 | - Structured data extraction with LLM support 48 | - Support for OpenAI and Google Gemini API 49 | - URL validation and fixing 50 | - Comprehensive test suite 51 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to lightfeed/extractor 2 | 3 | Thank you for considering contributing to lightfeed/extractor! This document outlines the process for contributing to the project and releasing new versions. 4 | 5 | ## Development Workflow 6 | 7 | 1. Fork the repository 8 | 2. Create a feature branch (`git checkout -b feature/amazing-feature`) 9 | 3. Make your changes 10 | 4. 
Run tests to ensure everything works: 11 | - `npm run test:unit` - Run unit tests 12 | - `npm run test:integration` - Run integration tests (requires API keys) 13 | - `npm run test:html2md` - Run HTML to Markdown tests 14 | 5. Commit your changes (`git commit -m 'Add some amazing feature'`) 15 | 6. Push to the branch (`git push origin feature/amazing-feature`) 16 | 7. Open a Pull Request 17 | 18 | ## CI/CD Pipeline 19 | 20 | This project uses GitHub Actions for continuous integration and deployment: 21 | 22 | ### Testing Workflow 23 | 24 | The testing workflow runs automatically: 25 | - On each push to the `main` branch 26 | - On each pull request to the `main` branch 27 | - Weekly on Monday at midnight UTC 28 | 29 | The workflow includes: 30 | 1. Unit tests - Run across multiple Node.js versions (18.x, 20.x) 31 | 2. Integration tests - Run on Node.js 20.x using provided API secrets 32 | 33 | ### Setting up API keys for CI 34 | 35 | To enable integration tests in CI, add your API keys as secrets in your GitHub repository: 36 | 37 | 1. Go to your GitHub repository 38 | 2. Click on "Settings" > "Secrets and variables" > "Actions" 39 | 3. Add the following secrets: 40 | - `OPENAI_API_KEY` - Your OpenAI API key 41 | - `GOOGLE_API_KEY` - Your Google API key 42 | 43 | ## Release Process 44 | 45 | This project uses semantic versioning. To create a new release: 46 | 47 | 1. Update the version in `package.json` 48 | 2. Update the `CHANGELOG.md` with details of the changes 49 | 3. Commit these changes with a message like "Bump version to x.y.z" 50 | 4. Create and push a new tag: 51 | ``` 52 | git tag -a vx.y.z -m "Release version x.y.z" 53 | git push origin vx.y.z 54 | ``` 55 | 56 | When you push a new tag prefixed with "v" (e.g., v1.0.0), GitHub Actions will automatically: 57 | 1. Build the package 58 | 2. Run unit tests 59 | 3. Create a GitHub Release with notes from your git history 60 | 4. 
Publish the package to npm 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2025] [Revar Immersive Technology Inc.] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Lightfeed Logo 3 | Lightfeed Extractor 4 |

5 | 6 |

7 | Use LLMs to robustly extract structured data from HTML and markdown 8 |

9 | 10 |
11 | 12 | npm 13 | 14 | Test status (main branch) 16 | 17 | License 18 |
19 |
20 |

21 | 22 | Lightfeed Documentation 23 | 24 | Discord 25 | 26 | Follow on LinkedIn 27 | 28 | Follow on X 29 |

30 |
31 | 32 | ## How It Works 33 | 34 | 1. **HTML to Markdown Conversion**: If the input is HTML, it's first converted to clean, LLM-friendly markdown. This step can optionally extract only the main content and include images. See [HTML to Markdown Conversion](#html-to-markdown-conversion) section for details. The `convertHtmlToMarkdown` function can also be used standalone. 35 | 36 | 2. **LLM Processing**: The markdown is sent to an LLM in JSON mode (Google Gemini 2.5 flash or OpenAI GPT-4o mini by default) with a prompt to extract structured data according to your Zod schema or enrich existing data objects. You can set a maximum input token limit to control costs or avoid exceeding the model's context window, and the function will return token usage metrics for each LLM call. 37 | 38 | 3. **JSON Sanitization**: If the LLM structured output fails or doesn't fully match your schema, a sanitization process attempts to recover and fix the data. This makes complex schema extraction much more robust, especially with deeply nested objects and arrays. See [JSON Sanitization](#json-sanitization) for details. 39 | 40 | 4. **URL Validation**: All extracted URLs are validated - handling relative URLs, removing invalid ones, and repairing markdown-escaped links. See [URL Validation](#url-validation) section for details. 41 | 42 | ## Why use an LLM extractor? 
43 | 💡 Understands natural language criteria and context to extract the data you need, not just raw content as displayed 44 | 45 | ⚡️ No need to manually create custom scraper code for each site 46 | 47 | 🔁 Resilient to website changes, e.g., HTML structure, CSS selectors, or page layout 48 | 49 | ✅ LLMs are becoming more accurate and cost-effective 50 | 51 | ## Installation 52 | 53 | ```bash 54 | npm install @lightfeed/extractor 55 | ``` 56 | 57 | ## Hosted Version 58 | 59 | While this library provides a robust foundation for data extraction, you might want to consider [lightfeed.ai](https://lightfeed.ai) if you need: 60 | 61 | - **Persistent Searchable Databases**: Automatically store and manage extracted data in a production-ready vector database 62 | - **Scheduled Runs, Deduplication and Tracking**: Smart detection and handling of duplicate content across your sources, with automated change tracking 63 | - **Deep Link Extraction**: Follow links to collect complete data from connected pages 64 | - **Real-time API and Integration**: Query your extracted data through robust API endpoints and integrations 65 | - **Research Portal**: Explore and analyze your data through an intuitive interface 66 | 67 | ## Usage 68 | 69 | ### Basic Example 70 | 71 | ```typescript 72 | import { extract, ContentFormat, LLMProvider } from "@lightfeed/extractor"; 73 | import { z } from "zod"; 74 | 75 | async function main() { 76 | // Define your schema. 
We will run one more sanitization process to recover imperfect, failed, or partial LLM outputs into this schema 77 | const schema = z.object({ 78 | title: z.string(), 79 | author: z.string().optional(), 80 | date: z.string().optional(), 81 | tags: z.array(z.string()), 82 | summary: z.string().describe("A brief summary of the article content within 500 characters"), 83 | // Use .url() to fix and validate URL field 84 | links: z.array(z.string().url()).describe("All URLs mentioned in the article") 85 | }); 86 | 87 | // Extract from HTML 88 | const result = await extract({ 89 | content: ` 90 |
91 |

Understanding Async/Await in JavaScript

92 |
93 | John Doe | 94 | January 15, 2023 | 95 | #JavaScript #Programming 96 |
97 |

This article explains how async/await works in modern JavaScript.

98 |

Learn more at MDN 99 | or check our tutorials.

100 |
101 | `, 102 | format: ContentFormat.HTML, 103 | schema, 104 | sourceUrl: "https://example.com/blog/async-await", // Required for HTML format to handle relative URLs 105 | googleApiKey: "your-google-gemini-api-key", 106 | }); 107 | 108 | console.log("Extracted Data:", result.data); 109 | console.log("Token Usage:", result.usage); 110 | } 111 | 112 | main().catch(console.error); 113 | ``` 114 | 115 | ### Extracting from Markdown or Plain Text 116 | 117 | You can also extract structured data directly from Markdown string: 118 | 119 | ```typescript 120 | const result = await extract({ 121 | content: markdownContent, 122 | // Specify that content is Markdown. In addition to HTML and Markdown, you can also extract plain text by ContentFormat.TXT 123 | format: ContentFormat.MARKDOWN, 124 | schema: mySchema, 125 | googleApiKey: "your-google-gemini-api-key", 126 | }); 127 | ``` 128 | 129 | ### Custom Extraction Prompts 130 | 131 | You can provide a custom prompt to guide the extraction process: 132 | 133 | ```typescript 134 | const result = await extract({ 135 | content: htmlContent, 136 | format: ContentFormat.HTML, 137 | schema: mySchema, 138 | sourceUrl: "https://example.com/products", 139 | // In custom prompt, defined what data should be retrieved 140 | prompt: "Extract ONLY products that are on sale or have special discounts. Include their original prices, discounted prices, and product URL.", 141 | googleApiKey: "your-google-gemini-api-key", 142 | }); 143 | ``` 144 | 145 | If no prompt is provided, a default extraction prompt will be used. 146 | 147 | ### Data Enrichment 148 | 149 | You can use the `dataToEnrich` option to provide an existing data object that will be enriched with additional information from the content. 
This is particularly useful for: 150 | 151 | - Updating incomplete records with missing information 152 | - Enhancing existing data with new details from content 153 | - Merging data from multiple sources 154 | 155 | The LLM will be instructed to enrich the provided object rather than creating a completely new one: 156 | 157 | ```typescript 158 | // Example of enriching a product record with missing information 159 | const productToEnrich = { 160 | productUrl: "https://example.com/products/smart-security-camera", 161 | name: "", 162 | price: 0, 163 | reviews: [], 164 | }; 165 | 166 | const result = await extract({ 167 | content: htmlContent, 168 | format: ContentFormat.HTML, 169 | schema: productSchema, 170 | sourceUrl: "https://example.com/products/smart-security-camera", 171 | prompt: "Enrich the product data with complete details from the product page.", 172 | dataToEnrich: productToEnrich, 173 | googleApiKey: "your-google-gemini-api-key", 174 | }); 175 | 176 | // Result will contain the original data enriched with information from the content 177 | console.log(result.data); 178 | // { 179 | // productUrl: "https://example.com/products/smart-security-camera" // Preserved from original object 180 | // name: "Smart Security Camera", // Enriched from the product page 181 | // price: 74.50, // Enriched from the product page 182 | // reviews: ["I really like this camera", ...] 
// Reviews enriched from the product page 183 | // } 184 | ``` 185 | 186 | ### Customizing LLM Provider and Managing Token Limits 187 | 188 | You can customize LLM and manage token limits to control costs and ensure your content fits within the model's maximum context window: 189 | 190 | ```typescript 191 | // Extract from Markdown with token limit 192 | const result = await extract({ 193 | content: markdownContent, 194 | format: ContentFormat.MARKDOWN, 195 | schema, 196 | // Provide model provider and model name 197 | provider: LLMProvider.OPENAI, 198 | modelName: "gpt-4o-mini", 199 | openaiApiKey: "your-openai-api-key", 200 | // Limit to roughly 128K tokens (max input for gpt-4o-mini) 201 | maxInputTokens: 128000, 202 | }); 203 | ``` 204 | 205 | > [!WARNING] 206 | > For OpenAI models, optional schema is not supported. You need to change `.optional()` to `.nullable()`. 207 | 208 | ### Extracting from Main HTML 209 | 210 | For blog posts or articles with lots of navigation elements, headers, and footers, you can use the `extractMainHtml` option to focus on just the main content: 211 | 212 | ```typescript 213 | const result = await extract({ 214 | content: htmlContent, 215 | format: ContentFormat.HTML, 216 | schema: mySchema, 217 | htmlExtractionOptions: { 218 | extractMainHtml: true // Uses heuristics to remove navigation, headers, footers, etc. 219 | }, 220 | sourceUrl, 221 | }); 222 | ``` 223 | 224 | > [!NOTE] 225 | > The `extractMainHtml` option only applies to HTML format. It uses heuristics to identify and extract what appears to be the main content area (like article or main tags). It's recommended to keep this option off (false) when extracting details about a single item (like detail page for a product) as it might remove important contextual elements. 226 | 227 | ### Extracting Images from HTML 228 | 229 | By default, images are excluded from the HTML extraction process to simplify the output. 
If you need to extract image URLs or references, you can enable the `includeImages` option: 230 | 231 | ```typescript 232 | // Define a schema that includes product images 233 | const productListSchema = z.object({ 234 | products: z.array( 235 | z.object({ 236 | name: z.string(), 237 | price: z.number(), 238 | description: z.string().optional(), 239 | // Include an array of images for each product 240 | image: z.object({ 241 | url: z.string().url(), 242 | alt: z.string().optional(), 243 | }).optional(), 244 | }) 245 | ), 246 | }); 247 | 248 | const result = await extract({ 249 | content: htmlContent, 250 | format: ContentFormat.HTML, 251 | schema: mySchema, 252 | htmlExtractionOptions: { 253 | includeImages: true // Includes images in the generated markdown 254 | }, 255 | sourceUrl: sourceUrl, 256 | }); 257 | ``` 258 | 259 | ## API Keys 260 | 261 | The library will check for API keys in the following order: 262 | 263 | 1. Directly provided API key parameter (`googleApiKey` or `openaiApiKey`) 264 | 2. Environment variables (`GOOGLE_API_KEY` or `OPENAI_API_KEY`) 265 | 266 | While the library can use environment variables, it's recommended to explicitly provide API keys in production code for better control and transparency. 267 | 268 | ## API Reference 269 | 270 | ### `extract(options: ExtractorOptions): Promise>` 271 | 272 | Main function to extract structured data from content. 
273 | 274 | #### Options 275 | 276 | | Option | Type | Description | Default | 277 | |--------|------|-------------|---------| 278 | | `content` | `string` | HTML, markdown, or plain text content to extract from | Required | 279 | | `format` | `ContentFormat` | Content format (HTML, MARKDOWN, or TXT) | Required | 280 | | `schema` | `z.ZodTypeAny` | Zod schema defining the structure to extract | Required | 281 | | `prompt` | `string` | Custom prompt to guide the extraction process | Internal default prompt | 282 | | `provider` | `LLMProvider` | LLM provider (GOOGLE_GEMINI or OPENAI) | `LLMProvider.GOOGLE_GEMINI` | 283 | | `modelName` | `string` | Model name to use | Provider-specific default, Google Gemini 2.5 flash or OpenAI GPT-4o mini | 284 | | `googleApiKey` | `string` | Google Gemini API key (if using Google Gemini provider) | From env `GOOGLE_API_KEY` | 285 | | `openaiApiKey` | `string` | OpenAI API key (if using OpenAI provider) | From env `OPENAI_API_KEY` | 286 | | `temperature` | `number` | Temperature for the LLM (0-1) | `0` | 287 | | `htmlExtractionOptions` | `HTMLExtractionOptions` | HTML-specific options for content extraction (see below) | `{}` | 288 | | `sourceUrl` | `string` | URL of the HTML content, required when format is HTML to properly handle relative URLs | Required for HTML format | 289 | | `maxInputTokens` | `number` | Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. When specified, content will be truncated if the total prompt size exceeds this limit. | `undefined` | 290 | | `dataToEnrich` | `Record` | Original data object to enrich with information from the content. When provided, the LLM will be instructed to update this object rather than creating a new one from scratch. 
| `undefined` | 291 | 292 | #### HTML Extraction Options 293 | 294 | | Option | Type | Description | Default | 295 | |--------|------|-------------|---------| 296 | | `extractMainHtml` | `boolean` | When enabled for HTML content, attempts to extract the main content area, removing navigation bars, headers, footers, sidebars etc. using heuristics. Should be kept off when extracting details about a single item. | `false` | 297 | | `includeImages` | `boolean` | When enabled, images in the HTML will be included in the markdown output. Enable this when you need to extract image URLs or related content. | `false` | 298 | 299 | #### Return Value 300 | 301 | The function returns a Promise that resolves to an `ExtractorResult` object: 302 | 303 | ```typescript 304 | interface ExtractorResult { 305 | data: T; // Extracted structured data 306 | processedContent: string; // Processed content that was sent to the LLM. Markdown if the input was HTM (after conversion) 307 | usage: { // Token usage statistics 308 | inputTokens?: number; 309 | outputTokens?: number; 310 | }; 311 | } 312 | ``` 313 | 314 | ### HTML to Markdown Conversion 315 | 316 | The `convertHtmlToMarkdown` utility function allows you to convert HTML content to markdown without performing extraction. 
317 | 318 | **Function signature:** 319 | ```typescript 320 | convertHtmlToMarkdown(html: string, options?: HTMLExtractionOptions, sourceUrl?: string): string 321 | ``` 322 | 323 | #### Parameters 324 | 325 | | Parameter | Type | Description | Default | 326 | |-----------|------|-------------|---------| 327 | | `html` | `string` | HTML content to convert to markdown | Required | 328 | | `options` | `HTMLExtractionOptions` | See [HTML Extraction Options](#html-extraction-options) | `undefined` | 329 | | `sourceUrl` | `string` | URL of the HTML content, used to properly convert relative URLs to absolute URLs | `undefined` | 330 | 331 | #### Return Value 332 | 333 | The function returns a string containing the markdown conversion of the HTML content. 334 | 335 | #### Example 336 | 337 | ```typescript 338 | import { convertHtmlToMarkdown, HTMLExtractionOptions } from "@lightfeed/extractor"; 339 | 340 | // Basic conversion 341 | const markdown = convertHtmlToMarkdown("

<h1>Hello World</h1><p>This is a test</p>
"); 342 | console.log(markdown); 343 | // Output: "Hello World\n===========\n\nThis is a test" 344 | 345 | // With options to extract main content and include images 346 | const options: HTMLExtractionOptions = { 347 | extractMainHtml: true, 348 | includeImages: true 349 | }; 350 | 351 | // With source URL to handle relative links 352 | const markdownWithOptions = convertHtmlToMarkdown( 353 | ` 354 | 355 |
<header>Header</header>
<div>
  <img src="/images/logo.png" alt="Logo">
  <a href="/about">About</a>
</div>
<footer>Footer content</footer>
362 | `, 363 | options, 364 | "https://example.com" 365 | ); 366 | console.log(markdownWithOptions); 367 | // Output: "![Logo](https://example.com/images/logo.png)[About](https://example.com/about)" 368 | ``` 369 | 370 | ### JSON Sanitization 371 | 372 | The `safeSanitizedParser` utility function helps sanitize and recover partial data from LLM outputs that may not perfectly conform to your schema. 373 | 374 | **Function signature:** 375 | ```typescript 376 | safeSanitizedParser(schema: ZodTypeAny, rawObject: unknown): z.infer | null 377 | ``` 378 | 379 | ```typescript 380 | import { safeSanitizedParser } from "@lightfeed/extractor"; 381 | import { z } from "zod"; 382 | 383 | // Define a product catalog schema 384 | const productSchema = z.object({ 385 | products: z.array( 386 | z.object({ 387 | id: z.number(), 388 | name: z.string(), // Required field 389 | price: z.number().optional(), // Optional number 390 | inStock: z.boolean().optional(), 391 | category: z.string().optional(), 392 | }) 393 | ), 394 | storeInfo: z.object({ 395 | name: z.string(), 396 | location: z.string().optional(), 397 | rating: z.number().optional(), 398 | }) 399 | }); 400 | 401 | // Example LLM output with realistic validation issues 402 | const rawLLMOutput = { 403 | products: [ 404 | { 405 | id: 1, 406 | name: "Laptop", 407 | price: 999, 408 | inStock: true, 409 | }, // Valid product 410 | { 411 | id: 2, 412 | name: "Headphones", 413 | price: "N/A", // Non-convertible string for optional number 414 | inStock: true, 415 | category: "Audio", 416 | }, 417 | { 418 | id: 3, 419 | // Missing required "name" field 420 | price: 45.99, 421 | inStock: false 422 | }, 423 | { 424 | id: 4, 425 | name: "Keyboard", 426 | price: 59.99, 427 | inStock: true 428 | } // Valid product 429 | ], 430 | storeInfo: { 431 | name: "TechStore", 432 | location: "123 Main St", 433 | rating: "N/A" // Invalid: rating is not a number 434 | } 435 | }; 436 | 437 | // Sanitize the data to recover what's valid 438 | const 
sanitizedData = safeSanitizedParser(productSchema, rawLLMOutput); 439 | 440 | // Result: 441 | // { 442 | // products: [ 443 | // { 444 | // id: 1, 445 | // name: "Laptop", 446 | // price: 999, 447 | // inStock: true, 448 | // }, 449 | // { 450 | // id: 2, 451 | // name: "Headphones", 452 | // inStock: true, 453 | // category: "Audio", 454 | // }, 455 | // { 456 | // id: 4, 457 | // name: "Keyboard", 458 | // price: 59.99, 459 | // inStock: true, 460 | // } 461 | // ], 462 | // storeInfo: { 463 | // name: "TechStore", 464 | // location: "123 Main St", 465 | // } 466 | // } 467 | ``` 468 | 469 | This utility is especially useful when: 470 | - LLMs return non-convertible data for optional fields (like "N/A" for numbers) 471 | - Some objects in arrays are missing required fields 472 | - Objects contain invalid values that don't match constraints 473 | - You want to recover as much valid data as possible while safely removing problematic parts 474 | 475 | ### URL Validation 476 | 477 | The library provides robust URL validation and handling through Zod's `z.string().url()` validator: 478 | 479 | ```typescript 480 | const schema = z.object({ 481 | title: z.string(), 482 | link: z.string().url(), // Full URL validation works! 483 | sources: z.array(z.string().url()) // Also works with arrays of URLs 484 | }); 485 | 486 | const result = await extract({ 487 | content: markdownContent, 488 | format: ContentFormat.MARKDOWN, 489 | schema, 490 | // ... other options 491 | }); 492 | ``` 493 | 494 | #### How URL Validation Works 495 | 496 | Our URL validation system provides several key benefits: 497 | 498 | 1. **Validation**: Uses Zod's built-in `url()` validator to ensure URLs are properly formatted 499 | 2. **Special Character Handling**: Automatically fixes URLs with escaped special characters in markdown (e.g., `https://example.com/meeting-\(2023\)` becomes `https://example.com/meeting-(2023)`) 500 | 3. 
**Relative URL Resolution**: Converts relative URLs to absolute URLs when `sourceUrl` is provided 501 | 4. **Invalid URL Handling**: Skips invalid URLs rather than failing the entire extraction using our `safeSanitizedParser` 502 | 503 | This approach ensures reliable URL extraction while maintaining the full power of Zod's schema validation. 504 | 505 | ## Development 506 | 507 | ### Setup 508 | 509 | 1. Clone the repository 510 | 2. Install dependencies with `npm install` 511 | 3. Create a `.env` file in the root directory with your API keys (see `.env.example`) 512 | 513 | ### Scripts 514 | 515 | - `npm run build` - Build the library 516 | - `npm run clean` - Remove build artifacts 517 | - `npm run test` - Run all tests (requires API keys for integration tests) 518 | - `npm run dev` - Run the example file 519 | 520 | ### Running Local Tests 521 | 522 | You can test the library with real API calls and sample HTML files: 523 | 524 | ```bash 525 | # Run all local tests with both providers 526 | npm run test:local 527 | 528 | # Run specific test type with both providers 529 | npm run test:local -- blog 530 | npm run test:local -- product 531 | 532 | # Run tests with a specific provider 533 | npm run test:local -- blog openai # Test blog extraction with OpenAI 534 | npm run test:local -- product gemini # Test product extraction with Google Gemini 535 | ``` 536 | 537 | ### Testing 538 | 539 | The library includes both unit tests and integration tests: 540 | 541 | - **Unit tests**: Test individual components without making API calls 542 | - **Integration tests**: Test full extraction pipeline with real API calls 543 | 544 | Integration tests require valid API keys to be provided in your `.env` file or environment variables. Tests will fail if required API keys are not available. 545 | 546 | Each integration test runs with both Google Gemini and OpenAI to ensure compatibility across providers. 
547 | 548 | #### HTML to Markdown Integration Tests 549 | 550 | This project includes comprehensive integration tests for the HTML to Markdown converter using real-world HTML samples. The tests validate three conversion types: 551 | 552 | 1. Basic conversion (no images) 553 | 2. Main content extraction (no images) 554 | 3. Conversion with images included 555 | 556 | These tests use a Git submodule with HTML files and groundtruth markdown files. The submodule is not downloaded by default to keep the repository lightweight. To run these tests: 557 | 558 | ```bash 559 | # First time: Initialize and download the test data submodule 560 | npm run test:html2md:update 561 | 562 | # Run the HTML to Markdown integration tests 563 | npm run test:html2md 564 | 565 | # Update test data if new test files are available 566 | npm run test:html2md:sync 567 | ``` 568 | 569 | The test suite automatically discovers all available test files and creates test cases for each conversion type that has a corresponding groundtruth file. 570 | 571 | #### Running Specific Tests 572 | 573 | You can run individual tests by using the `-t` flag with a pattern that matches the test description: 574 | 575 | ```bash 576 | # Run a specific test by exact description 577 | npm run test -- -t "should extract blog post data using Google Gemini default model" 578 | 579 | # Run all tests that include a specific keyword 580 | npm run test -- -t "blog post" 581 | 582 | # Run all tests for a specific provider 583 | npm run test -- -t "OpenAI" 584 | 585 | # Run all unit tests for a specific utility 586 | npm run test -- -t "safeSanitizedParser" 587 | 588 | # Run specific HTML to Markdown tests 589 | npm run test -- -t "should convert forum/tech-0 to markdown" 590 | ``` 591 | 592 | The `-t` flag uses pattern matching, so you can be as specific or general as needed to select the tests you want to run. 
593 | 594 | ## Support 595 | 596 | If you need direct assistance with your implementation: 597 | - Email us at support@lightfeed.ai 598 | - Open an issue in this repository 599 | - Post your question in our [Discord community](https://discord.gg/txZ2s4pgQJ) 600 | 601 | ## License 602 | 603 | Apache 2.0 604 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('ts-jest').JestConfigWithTsJest} */ 2 | module.exports = { 3 | preset: "ts-jest", 4 | testEnvironment: "node", 5 | roots: ["/src", "/tests"], 6 | testMatch: ["**/__tests__/**/*.ts?(x)", "**/?(*.)+(spec|test).ts?(x)"], 7 | collectCoverage: true, 8 | coverageDirectory: "coverage", 9 | collectCoverageFrom: [ 10 | "src/**/*.ts", 11 | "!src/dev/**/*.ts", 12 | "!src/**/*.d.ts", 13 | "!src/types.ts", 14 | "!src/example.ts", 15 | "!**/node_modules/**", 16 | "!**/vendor/**", 17 | ], 18 | transform: { 19 | "^.+\\.tsx?$": "ts-jest", 20 | }, 21 | setupFiles: ["/tests/setup.ts"], 22 | watchman: false, 23 | }; 24 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@lightfeed/extractor", 3 | "version": "0.1.7", 4 | "description": "Use LLMs to robustly extract and enrich structured data from HTML and markdown", 5 | "main": "dist/index.js", 6 | "types": "dist/index.d.ts", 7 | "files": [ 8 | "dist" 9 | ], 10 | "engines": { 11 | "node": ">=18" 12 | }, 13 | "scripts": { 14 | "build": "tsc", 15 | "clean": "rimraf dist", 16 | "prepare": "npm run clean && npm run build", 17 | "prepublishOnly": "npm run test:unit", 18 | "test": "jest", 19 | "test:unit": "jest tests/unit", 20 | "test:integration": "jest tests/integration", 21 | "test:watch": "jest --watch", 22 | "test:cov": "jest --coverage", 23 | "test:local": "ts-node src/dev/runLocalTest.ts", 
24 | "test:usage": "ts-node src/dev/testUsage.ts", 25 | "test:html2md": "jest tests/integration/html-to-markdown.test.ts", 26 | "test:html2md:update": "git submodule update --init --recursive test-data", 27 | "test:html2md:sync": "cd test-data && git pull origin main && cd ..", 28 | "test:html2md:regenerate": "ts-node src/dev/regenerateGroundTruth.ts", 29 | "lint": "tslint -p tsconfig.json", 30 | "dev": "ts-node src/example.ts", 31 | "dev:html2md": "ts-node src/dev/testHtmlToMarkdown.ts" 32 | }, 33 | "repository": { 34 | "type": "git", 35 | "url": "git+https://github.com/lightfeed/extractor.git" 36 | }, 37 | "keywords": [ 38 | "llm", 39 | "extraction", 40 | "web-scraping", 41 | "html", 42 | "markdown", 43 | "structured-data", 44 | "openai", 45 | "gemini" 46 | ], 47 | "author": "Lightfeed", 48 | "license": "Apache-2.0", 49 | "bugs": { 50 | "url": "https://github.com/lightfeed/extractor/issues" 51 | }, 52 | "homepage": "https://github.com/lightfeed/extractor#readme", 53 | "dependencies": { 54 | "@langchain/google-genai": "^0.2.5", 55 | "@langchain/openai": "^0.5.10", 56 | "cheerio": "^1.0.0", 57 | "jsonrepair": "^3.12.0", 58 | "langchain": "^0.3.24", 59 | "turndown": "^7.2.0", 60 | "xmldom": "^0.6.0", 61 | "xpath": "^0.0.34", 62 | "zod": "^3.24.3" 63 | }, 64 | "devDependencies": { 65 | "@types/jest": "^29.5.12", 66 | "@types/node": "^22.15.3", 67 | "@types/turndown": "^5.0.5", 68 | "@types/xmldom": "^0.1.34", 69 | "dotenv": "^16.3.1", 70 | "jest": "^29.7.0", 71 | "rimraf": "^5.0.10", 72 | "ts-jest": "^29.1.2", 73 | "ts-node": "^10.9.2", 74 | "typescript": "^5.8.3" 75 | }, 76 | "publishConfig": { 77 | "access": "public" 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/converters.ts: -------------------------------------------------------------------------------- 1 | import TurndownService from "turndown"; 2 | import { HTMLExtractionOptions } from "./types"; 3 | import { DOMParser, XMLSerializer } from "xmldom"; 4 
| import { isNodeLike } from "xpath"; 5 | import * as url from "url"; 6 | 7 | var xpath = require("xpath"); 8 | const cheerio = require("cheerio"); 9 | 10 | /** 11 | * Extract the main content from an HTML string if requested 12 | */ 13 | function extractMainHtml(html: string): string { 14 | try { 15 | const bodyDoc = new DOMParser().parseFromString(html, "text/html"); 16 | 17 | [...OVERALL_DISCARD_XPATH, ...PRECISION_DISCARD_XPATH].forEach((xPath) => { 18 | const result = xpath.parse(xPath).select({ node: bodyDoc, isHtml: true }); 19 | 20 | // Ensure result is an array before calling forEach 21 | const nodes = Array.isArray(result) ? result : [result]; 22 | 23 | nodes.forEach((node) => { 24 | if (isNodeLike(node) && node.parentNode) { 25 | node.parentNode.removeChild(node); 26 | } 27 | }); 28 | }); 29 | 30 | const refinedHtml = new XMLSerializer().serializeToString(bodyDoc); 31 | return refinedHtml == "" ? html : refinedHtml; 32 | } catch (error) { 33 | console.error("error extracting main html", error); 34 | return ""; 35 | } 36 | } 37 | 38 | /** 39 | * Convert HTML to Markdown 40 | */ 41 | export function htmlToMarkdown( 42 | html: string, 43 | options?: HTMLExtractionOptions, 44 | sourceUrl?: string 45 | ): string { 46 | // First clean up the html 47 | const tidiedHtml = tidyHtml(html, options?.includeImages ?? 
false); 48 | 49 | // Turndown config 50 | // Reference: https://github.com/jina-ai/reader/blob/1e3bae6aad9cf0005c14f0036b46b49390e63203/backend/functions/src/cloud-functions/crawler.ts#L134 51 | const turnDownService = new TurndownService(); 52 | 53 | // Define elements to remove - conditionally include or exclude images 54 | const elementsToRemove: any[] = [ 55 | "meta", 56 | "style", 57 | "script", 58 | "noscript", 59 | "link", 60 | "textarea", 61 | ]; 62 | 63 | // Only remove image elements if includeImages is not enabled 64 | if (!options?.includeImages) { 65 | elementsToRemove.push("img", "picture", "figure"); 66 | } 67 | 68 | turnDownService.addRule("remove-irrelevant", { 69 | filter: elementsToRemove, 70 | replacement: () => "", 71 | }); 72 | 73 | turnDownService.addRule("truncate-svg", { 74 | filter: "svg" as any, 75 | replacement: () => "", 76 | }); 77 | 78 | turnDownService.addRule("title-as-h1", { 79 | filter: ["title"], 80 | replacement: (innerText: string) => `${innerText}\n===============\n`, 81 | }); 82 | 83 | turnDownService.addRule("improved-paragraph", { 84 | filter: "p", 85 | replacement: (innerText: string) => { 86 | const trimmed = innerText.trim(); 87 | if (!trimmed) { 88 | return ""; 89 | } 90 | 91 | return `${trimmed.replace(/\n{3,}/g, "\n\n")}\n\n`; 92 | }, 93 | }); 94 | 95 | turnDownService.addRule("improved-inline-link", { 96 | filter: function (node: any, options: any) { 97 | return Boolean( 98 | options.linkStyle === "inlined" && 99 | node.nodeName === "A" && 100 | node.getAttribute("href") 101 | ); 102 | }, 103 | 104 | replacement: function (content: string, node: any) { 105 | let href = node.getAttribute("href"); 106 | if (href) { 107 | // Convert relative URLs to absolute if sourceUrl is provided 108 | if ( 109 | sourceUrl && 110 | !href.startsWith("http") && 111 | !href.startsWith("mailto:") 112 | ) { 113 | try { 114 | href = url.resolve(sourceUrl, href); 115 | } catch (error) { 116 | console.warn( 117 | `Failed to resolve URL 
${href} against ${sourceUrl}:`, 118 | error 119 | ); 120 | } 121 | } 122 | href = href.replace(/([()])/g, "\\$1"); 123 | } 124 | let title = cleanAttribute(node.getAttribute("title")); 125 | if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; 126 | 127 | const fixedContent = content.replace(/\s+/g, " ").trim(); 128 | const fixedHref = href.replace(/\s+/g, "").trim(); 129 | 130 | return `[${fixedContent}](${fixedHref}${title || ""})`; 131 | }, 132 | }); 133 | 134 | turnDownService.addRule("images", { 135 | filter: "img", 136 | 137 | replacement: function (content: string, node: any) { 138 | let src = node.getAttribute("src"); 139 | if (src) { 140 | // Convert relative URLs to absolute if sourceUrl is provided 141 | if (sourceUrl && !src.startsWith("http") && !src.startsWith("data:")) { 142 | try { 143 | src = url.resolve(sourceUrl, src); 144 | } catch (error) { 145 | console.warn( 146 | `Failed to resolve URL ${src} against ${sourceUrl}:`, 147 | error 148 | ); 149 | } 150 | } 151 | src = src.replace(/([()])/g, "\\$1"); 152 | } else { 153 | return ""; // No source, no image 154 | } 155 | 156 | let alt = cleanAttribute(node.getAttribute("alt") || ""); 157 | let title = cleanAttribute(node.getAttribute("title")); 158 | 159 | if (title) title = ' "' + title.replace(/"/g, '\\"') + '"'; 160 | 161 | const fixedSrc = src.replace(/\s+/g, "").trim(); 162 | 163 | return `![${alt}](${fixedSrc}${title || ""})`; 164 | }, 165 | }); 166 | 167 | const fullMarkdown = turnDownService.turndown(tidiedHtml).trim(); 168 | if (options?.extractMainHtml) { 169 | const mainHtml = extractMainHtml(tidiedHtml); 170 | const mainMarkdown = turnDownService.turndown(mainHtml).trim(); 171 | // Heristics: 172 | // If main content is empty or is less than 20% of full content and not too short, use full content 173 | if ( 174 | mainMarkdown.length == 0 || 175 | (mainMarkdown.length < fullMarkdown.length * 0.2 && 176 | mainMarkdown.length < 500) 177 | ) { 178 | return fullMarkdown; 179 | } else { 
180 | return mainMarkdown; 181 | } 182 | } else { 183 | return fullMarkdown; 184 | } 185 | } 186 | 187 | // Clean up the html 188 | function tidyHtml(html: string, includeImages: boolean): string { 189 | const $ = cheerio.load(html); 190 | $("*").each(function (this: any) { 191 | const element = $(this); 192 | const attributes = Object.keys(this.attribs); 193 | 194 | for (let i = 0; i < attributes.length; i++) { 195 | let attr = attributes[i]; 196 | // Check if the attribute value has an odd number of quotes 197 | // If the attribute name has a quote, it might be a broken attribute. Remove it completely. 198 | // (this occured at dealnews.com) 199 | if (attr.includes('"')) { 200 | element.remove(); 201 | } 202 | } 203 | }); 204 | 205 | // Adatpted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/settings.py#L55 206 | // Removed (because user might want to extract them): 207 | // - form 208 | // - fieldset 209 | // - footer (might contain company info) 210 | // - img, picture, figure (if includeImages is false) 211 | // - option, label, select (this can present product options and titles) 212 | const manuallyCleanedElements = [ 213 | // important 214 | "aside", 215 | "embed", 216 | // "footer", 217 | // "form", 218 | "head", 219 | "iframe", 220 | "menu", 221 | "object", 222 | "script", 223 | // other content 224 | "applet", 225 | "audio", 226 | "canvas", 227 | "map", 228 | "svg", 229 | "video", 230 | // secondary 231 | "area", 232 | "blink", 233 | "button", 234 | "datalist", 235 | "dialog", 236 | "frame", 237 | "frameset", 238 | // "fieldset", 239 | "link", 240 | "input", 241 | "ins", 242 | // "label", 243 | "legend", 244 | "marquee", 245 | "math", 246 | "menuitem", 247 | "nav", 248 | "noscript", 249 | "optgroup", 250 | // "option", 251 | "output", 252 | "param", 253 | "progress", 254 | "rp", 255 | "rt", 256 | "rtc", 257 | // "select", 258 | "source", 259 | "style", 260 | "track", 261 | "textarea", 262 | "time", 263 
| "use", 264 | ]; 265 | 266 | if (!includeImages) { 267 | manuallyCleanedElements.push("img", "picture", "figure"); 268 | } 269 | 270 | // Further clean html 271 | manuallyCleanedElements.forEach((element) => { 272 | $(element).remove(); 273 | }); 274 | return $("body").html(); 275 | } 276 | 277 | function cleanAttribute(attribute: string) { 278 | return attribute ? attribute.replace(/(\n+\s*)+/g, "\n") : ""; 279 | } 280 | 281 | // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L100 282 | // Added: 283 | // - Add contains(@id, "filter") to remove filter menus 284 | // - footer 285 | // Removed (because user might want to extract them): 286 | // - Commented out tags 287 | // - Commented out sidebar (sidebar sometimes can be too aggressive and can remove main content) 288 | // - Commented out author 289 | // - Commented out rating 290 | // - Commented out attachment 291 | // - Commented out timestamp 292 | // - Commented out user-info and user-profile 293 | // - Commented out comment or hidden section 294 | // - Not including @data-testid (it can remove dynamic product listings) 295 | // - Commented out options 296 | const OVERALL_DISCARD_XPATH = [ 297 | // navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts 298 | `.//*[(self::div or self::item or self::list 299 | or self::p or self::section or self::span)][ 300 | contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer") 301 | or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or 302 | contains(@id, "viral") or contains(@class, "viral") or 303 | contains(@id, "filter") or 304 | starts-with(@id, "shar") or starts-with(@class, "shar") or 305 | contains(@class, "share-") or 306 | contains(translate(@id, "S", "s"), "share") or 307 | contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or 308 | contains(@id, 
"syndication") or contains(@class, "syndication") or 309 | starts-with(@id, "jp-") or starts-with(@id, "dpsp-content") or 310 | contains(@class, "embedded") or contains(@class, "embed") 311 | or contains(@id, "newsletter") or contains(@class, "newsletter") 312 | or contains(@class, "subnav") or 313 | contains(@id, "cookie") or contains(@class, "cookie") or ` + 314 | // `contains(@id, "tags") or contains(@class, "tags") or ` + 315 | // `contains(@id, "sidebar") or contains(@class, "sidebar") or ` + 316 | `contains(@id, "banner") or contains(@class, "banner") 317 | or contains(@class, "meta") or 318 | contains(@id, "menu") or contains(@class, "menu") or 319 | contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav") 320 | or starts-with(@class, "nav") or contains(translate(@class, "N", "n"), "navigation") or 321 | contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav") 322 | or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or 323 | contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or ` + 324 | // `contains(@id, "author") or contains(@class, "author") or ` + 325 | `contains(@id, "button") or contains(@class, "button") 326 | or contains(translate(@class, "B", "b"), "byline") or ` + 327 | // contains(@class, "rating") or ` + 328 | `starts-with(@class, "widget") or ` + 329 | // contains(@class, "attachment") or contains(@class, "timestamp") or 330 | // contains(@class, "user-info") or contains(@class, "user-profile") or 331 | `contains(@class, "-ad-") or contains(@class, "-icon") 332 | or contains(@class, "article-infos") or 333 | contains(translate(@class, "I", "i"), "infoline") 334 | or contains(@data-component, "MostPopularStories") 335 | or contains(@class, "outbrain") or contains(@class, "taboola") 336 | or contains(@class, "criteo") ` + 337 | // or contains(@class, "options") 338 | `or contains(@class, "consent") or contains(@class, "modal-content") 339 | or 
contains(@class, "paid-content") or contains(@class, "paidcontent") 340 | or contains(@id, "premium-") or contains(@id, "paywall") 341 | or contains(@class, "obfuscated") or contains(@class, "blurred") 342 | or contains(@class, " ad ") 343 | or contains(@class, "next-post") or contains(@class, "side-stories") 344 | or contains(@class, "related-stories") or contains(@class, "most-popular") 345 | or contains(@class, "mol-factbox") or starts-with(@class, "ZendeskForm") 346 | or contains(@class, "message-container") or contains(@id, "message_container") 347 | or contains(@class, "yin") or contains(@class, "zlylin") or 348 | contains(@class, "xg1") or contains(@id, "bmdh") 349 | or @data-lp-replacement-content]`, 350 | ".//footer", 351 | 352 | // comment debris + hidden parts 353 | // `.//*[@class="comments-title" or contains(@class, "comments-title") or 354 | // contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or 355 | // contains(@class, "-reply-") or contains(@class, "message") 356 | // or contains(@id, "akismet") or contains(@class, "akismet") or 357 | // starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden") 358 | // or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint") 359 | // or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true" 360 | // or contains(@class, "notloaded")]`, 361 | ]; 362 | 363 | // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L179 364 | // Removed: 365 | // - contains(@style, "border") 366 | const PRECISION_DISCARD_XPATH = [ 367 | ".//header", 368 | `.//*[(self::div or self::item or self::list 369 | or self::p or self::section or self::span)][ 370 | contains(@id, "bottom") or contains(@class, "bottom") or 371 | contains(@id, "link") or contains(@class, "link") 372 | ]`, 373 | ]; 374 | 
--------------------------------------------------------------------------------
/src/dev/regenerateGroundTruth.ts:
--------------------------------------------------------------------------------
import * as fs from "fs";
import * as path from "path";
import { htmlToMarkdown } from "../converters";
import { HTMLExtractionOptions } from "../types";
import * as cheerio from "cheerio";

/**
 * Sanitize an HTML document so it can be committed as shared test data.
 *
 * Removes scripts, inline `on*` event handlers and styles, replaces visible
 * text with deterministic lorem-ipsum placeholders of identical length,
 * rewrites links to placeholder targets, and points images at a placeholder
 * image service while preserving their dimensions.
 *
 * @param html           Raw HTML to sanitize.
 * @param originalSource Path of the source file; currently unused, kept only
 *                       for interface compatibility with existing callers.
 * @returns The sanitized HTML string.
 */
function sanitizeHTML(html: string, originalSource: string): string {
  const $ = cheerio.load(html);

  // Remove scripts and the two most common event-handler attributes up front
  $("script").remove();
  $("[onclick]").removeAttr("onclick");
  $("[onload]").removeAttr("onload");
  // Strip every remaining attribute that starts with "on" (onerror, onmouseover, ...)
  $("*").each(function () {
    const el = $(this);
    const node = el[0];

    // Skip if not an element node or it has no attributes
    if (!node || node.type !== "tag" || !("attribs" in node)) return;

    Object.keys(node.attribs)
      .filter((attr) => attr.startsWith("on"))
      .forEach((attr) => el.removeAttr(attr));
  });

  // Remove styles
  $("style").remove();
  $("[style]").removeAttr("style");

  // Replace text content with a placeholder of exactly the same length, so
  // length-sensitive conversion behavior stays reproducible.
  $("p, h1, h2, h3, h4, h5, span, div").each(function () {
    const el = $(this);
    if (el.children().length === 0) {
      // Only replace text in leaf nodes
      const originalText = el.text();
      const length = originalText.length;

      if (length > 0) {
        const loremIpsumBase =
          "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. ";

        // Repeat the base text until it is at least as long as the original,
        // then trim to the exact original length (deterministic output).
        let placeholder = "";
        while (placeholder.length < length) {
          placeholder += loremIpsumBase;
        }
        placeholder = placeholder.substring(0, length);

        el.text(placeholder);
      }
    }
  });

  // Replace links with type-appropriate placeholder targets and labels
  $("a").each(function () {
    const el = $(this);
    const href = el.attr("href");
    const isEmail = !!href && href.startsWith("mailto:");
    const isExternal =
      !!href && (href.startsWith("http") || href.startsWith("www"));

    if (isEmail) {
      // Replace email links
      el.attr("href", "mailto:example@example.com");
    } else if (isExternal) {
      // Replace external links
      el.attr("href", "https://example.com/external-link");
    } else {
      // Replace internal/relative links
      el.attr("href", "/placeholder-page");
    }

    // Replace non-empty link text with a generic label for the link type
    const originalLinkText = el.text().trim();
    if (originalLinkText.length > 0) {
      let placeholderBase = "Link Text";
      if (isEmail) {
        placeholderBase = "Email Link";
      } else if (isExternal) {
        placeholderBase = "External Link";
      } else {
        placeholderBase = "Page Link";
      }
      el.text(placeholderBase);
    }
  });

  // Replace images with a real placeholder service, preserving dimensions
  $("img").each(function () {
    const el = $(this);
    const width = el.attr("width") || 300;
    const height = el.attr("height") || 200;

    el.attr("src", `https://picsum.photos/${width}/${height}`);

    // Add generic alt text if none exists
    if (!el.attr("alt")) {
      el.attr("alt", "Placeholder image");
    }
  });

  // NOTE(review): an earlier comment claimed an attribution header is added
  // here, but none is — the sanitized document is returned as-is.
  return $.html();
}

/**
 * Sanitize one HTML fixture in place, convert it to Markdown, and save the
 * result as a ground-truth file.
 *
 * @param htmlFilePath   Path to the HTML fixture; it is OVERWRITTEN with its
 *                       sanitized version as a side effect.
 * @param groundtruthDir Directory to write the `.md` ground-truth file into
 *                       (created if missing).
 * @param options        Optional HTML extraction options passed to
 *                       `htmlToMarkdown`.
 * @param variant        Optional suffix inserted into the output filename
 *                       (`<base>.<variant>.md`); empty yields `<base>.md`.
 * @returns The path of the written ground-truth Markdown file.
 * @throws Re-throws any read/convert/write error after logging it.
 */
async function generateGroundTruth(
  htmlFilePath: string,
  groundtruthDir: string,
  options?: HTMLExtractionOptions,
  variant: string = ""
) {
  try {
    // Read and sanitize the HTML file
    const originalHtml = fs.readFileSync(htmlFilePath, "utf8");
    const sanitizedHtml = sanitizeHTML(originalHtml, htmlFilePath);

    // Save sanitized HTML back to the original file
    fs.writeFileSync(htmlFilePath, sanitizedHtml);
    console.log(`✅ Sanitized HTML: ${htmlFilePath}`);

    // Convert to Markdown
    const markdown = htmlToMarkdown(sanitizedHtml, options);

    // Create groundtruth directory if it doesn't exist
    if (!fs.existsSync(groundtruthDir)) {
      fs.mkdirSync(groundtruthDir, { recursive: true });
    }

    // Generate output filename
    const baseName = path.basename(htmlFilePath, ".html");
    const outputFilename = variant
      ? `${baseName}.${variant}.md`
      : `${baseName}.md`;
    const outputPath = path.join(groundtruthDir, outputFilename);

    // Save the markdown
    fs.writeFileSync(outputPath, markdown);
    console.log(`✅ Generated ground truth: ${outputPath}`);

    return outputPath;
  } catch (error) {
    console.error("❌ Error generating ground truth:", error);
    throw error;
  }
}

/**
 * Regenerate every ground-truth file: walk each category directory under
 * `test-data/html/`, sanitize each fixture, and emit three Markdown variants
 * (basic, with images, main-content-only) under `test-data/groundtruth/`.
 */
async function main() {
  const testDataDir = path.join(process.cwd(), "test-data");

  // Check if test-data directory exists
  if (!fs.existsSync(testDataDir)) {
    console.error(
      "❌ test-data directory not found. Please run 'npm run test:html2md:update' first."
    );
    process.exit(1);
  }

  const htmlDir = path.join(testDataDir, "html");
  const groundtruthDir = path.join(testDataDir, "groundtruth");

  // Get all categories (subdirectories under html/)
  const categories = fs
    .readdirSync(htmlDir, { withFileTypes: true })
    .filter((dirent) => dirent.isDirectory())
    .map((dirent) => dirent.name);

  console.log("\n🔍 Regenerating ground truth files...\n");

  // Process each category
  for (const category of categories) {
    console.log(`\n📁 Processing category: ${category}`);

    const categoryHtmlDir = path.join(htmlDir, category);
    const categoryGroundtruthDir = path.join(groundtruthDir, category);

    // Create category directory in groundtruth if it doesn't exist
    if (!fs.existsSync(categoryGroundtruthDir)) {
      fs.mkdirSync(categoryGroundtruthDir, { recursive: true });
    }

    // Get all HTML file base names (extension stripped) in this category
    const htmlFiles = fs
      .readdirSync(categoryHtmlDir)
      .filter((file) => file.endsWith(".html"))
      .map((file) => file.replace(".html", ""));

    // Process each HTML file
    for (const filename of htmlFiles) {
      // BUG FIX: previously built a literal "$(unknown).html" path that can
      // never match a real fixture; interpolate the actual base filename.
      const htmlFilePath = path.join(categoryHtmlDir, `${filename}.html`);

      // Generate ground truth files with different options
      await generateGroundTruth(htmlFilePath, categoryGroundtruthDir); // Basic conversion
      await generateGroundTruth(
        htmlFilePath,
        categoryGroundtruthDir,
        { includeImages: true },
        "images"
      );
      await generateGroundTruth(
        htmlFilePath,
        categoryGroundtruthDir,
        { extractMainHtml: true },
        "main"
      );
    }
  }

  console.log("\n✨ All ground truth files have been regenerated!");
}

// Run the main function
main().catch(console.error);
-------------------------------------------------------------------------------- /src/dev/runLocalTest.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { config } from "dotenv"; 4 | import { z } from "zod"; 5 | import { extract, ContentFormat, LLMProvider } from "../index"; 6 | 7 | // Load environment variables from .env file 8 | config({ path: path.resolve(process.cwd(), ".env") }); 9 | 10 | // Helper to load HTML test fixtures 11 | function loadFixture(filename: string): string { 12 | return fs.readFileSync( 13 | path.resolve(__dirname, "../../tests/fixtures", filename), 14 | "utf8" 15 | ); 16 | } 17 | 18 | // Example schemas for different content types 19 | const blogSchema = z.object({ 20 | title: z.string(), 21 | author: z.string().optional(), 22 | date: z.string().optional(), 23 | tags: z 24 | .array(z.string()) 25 | .optional() 26 | .describe("Tags appear after the date. Do not include the # symbol."), 27 | summary: z.string(), 28 | content: z.string().optional(), 29 | }); 30 | 31 | // OpenAI version with nullable instead of optional 32 | const blogSchemaOpenAI = z.object({ 33 | title: z.string(), 34 | author: z.string().nullable(), 35 | date: z.string().nullable(), 36 | tags: z 37 | .array(z.string()) 38 | .nullable() 39 | .describe("Tags appear after the date. 
Do not include the # symbol."), 40 | summary: z.string(), 41 | content: z.string().nullable(), 42 | }); 43 | 44 | const productSchema = z.object({ 45 | products: z.array( 46 | z.object({ 47 | name: z.string(), 48 | price: z.string(), 49 | rating: z.string().optional(), 50 | description: z.string().optional(), 51 | features: z.array(z.string()).optional(), 52 | }) 53 | ), 54 | }); 55 | 56 | // OpenAI version with nullable instead of optional 57 | const productSchemaOpenAI = z.object({ 58 | products: z.array( 59 | z.object({ 60 | name: z.string(), 61 | price: z.string(), 62 | rating: z.string().nullable(), 63 | description: z.string().nullable(), 64 | features: z.array(z.string()).nullable(), 65 | }) 66 | ), 67 | }); 68 | 69 | // Test functions 70 | async function testBlogExtraction(provider = LLMProvider.GOOGLE_GEMINI) { 71 | console.log(`Testing blog post extraction with ${provider}...`); 72 | 73 | try { 74 | const html = loadFixture("blog-post.html"); 75 | 76 | // Check for required API key 77 | if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { 78 | console.error("Error: GOOGLE_API_KEY environment variable is required"); 79 | process.exit(1); 80 | } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { 81 | console.error("Error: OPENAI_API_KEY environment variable is required"); 82 | process.exit(1); 83 | } 84 | 85 | const apiKey = 86 | provider === LLMProvider.GOOGLE_GEMINI 87 | ? process.env.GOOGLE_API_KEY 88 | : process.env.OPENAI_API_KEY; 89 | 90 | const result = await extract({ 91 | content: html, 92 | format: ContentFormat.HTML, 93 | schema: 94 | provider === LLMProvider.GOOGLE_GEMINI ? blogSchema : blogSchemaOpenAI, 95 | provider, 96 | googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, 97 | openaiApiKey: provider === LLMProvider.OPENAI ? 
apiKey : undefined, 98 | htmlExtractionOptions: { 99 | extractMainHtml: false, 100 | }, 101 | sourceUrl: "https://www.example.com/blog/blog-post", 102 | }); 103 | 104 | console.log("Extracted data:"); 105 | console.log(JSON.stringify(result.data, null, 2)); 106 | console.log("\nToken usage:"); 107 | console.log(result.usage); 108 | 109 | return result; 110 | } catch (error) { 111 | console.error(`Blog extraction error with ${provider}:`, error); 112 | throw error; 113 | } 114 | } 115 | 116 | async function testProductExtraction(provider = LLMProvider.GOOGLE_GEMINI) { 117 | console.log(`Testing product listing extraction with ${provider}...`); 118 | 119 | try { 120 | const html = loadFixture("product-list.html"); 121 | 122 | // Check for required API key 123 | if (provider === LLMProvider.GOOGLE_GEMINI && !process.env.GOOGLE_API_KEY) { 124 | console.error("Error: GOOGLE_API_KEY environment variable is required"); 125 | process.exit(1); 126 | } else if (provider === LLMProvider.OPENAI && !process.env.OPENAI_API_KEY) { 127 | console.error("Error: OPENAI_API_KEY environment variable is required"); 128 | process.exit(1); 129 | } 130 | 131 | const apiKey = 132 | provider === LLMProvider.GOOGLE_GEMINI 133 | ? process.env.GOOGLE_API_KEY 134 | : process.env.OPENAI_API_KEY; 135 | 136 | const result = await extract({ 137 | content: html, 138 | format: ContentFormat.HTML, 139 | schema: 140 | provider === LLMProvider.GOOGLE_GEMINI 141 | ? productSchema 142 | : productSchemaOpenAI, 143 | provider, 144 | googleApiKey: provider === LLMProvider.GOOGLE_GEMINI ? apiKey : undefined, 145 | openaiApiKey: provider === LLMProvider.OPENAI ? 
apiKey : undefined, 146 | htmlExtractionOptions: { 147 | extractMainHtml: true, 148 | }, 149 | sourceUrl: "https://www.example.com/product/product-list", 150 | }); 151 | 152 | console.log("Extracted data:"); 153 | console.log(JSON.stringify(result.data, null, 2)); 154 | console.log("\nToken usage:"); 155 | console.log(result.usage); 156 | 157 | return result; 158 | } catch (error) { 159 | console.error(`Product extraction error with ${provider}:`, error); 160 | throw error; 161 | } 162 | } 163 | 164 | // Run tests based on command line arguments 165 | async function main() { 166 | // Parse arguments: content type and provider 167 | const args = process.argv.slice(2); 168 | const contentType = args[0] || "all"; // 'blog', 'product', or 'all' 169 | const provider = 170 | args[1]?.toUpperCase() === "OPENAI" 171 | ? LLMProvider.OPENAI 172 | : args[1]?.toUpperCase() === "GEMINI" 173 | ? LLMProvider.GOOGLE_GEMINI 174 | : "all"; // 'OPENAI', 'GEMINI', or 'all' 175 | 176 | console.log("API Keys available:"); 177 | console.log(`- GOOGLE_API_KEY: ${process.env.GOOGLE_API_KEY ? "Yes" : "No"}`); 178 | console.log(`- OPENAI_API_KEY: ${process.env.OPENAI_API_KEY ? 
"Yes" : "No"}`); 179 | console.log(""); 180 | 181 | // Run blog tests 182 | if (contentType === "blog" || contentType === "all") { 183 | if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { 184 | await testBlogExtraction(LLMProvider.GOOGLE_GEMINI); 185 | } 186 | if (provider === LLMProvider.OPENAI || provider === "all") { 187 | await testBlogExtraction(LLMProvider.OPENAI); 188 | } 189 | } 190 | 191 | // Run product tests 192 | if (contentType === "product" || contentType === "all") { 193 | if (provider === LLMProvider.GOOGLE_GEMINI || provider === "all") { 194 | await testProductExtraction(LLMProvider.GOOGLE_GEMINI); 195 | } 196 | if (provider === LLMProvider.OPENAI || provider === "all") { 197 | await testProductExtraction(LLMProvider.OPENAI); 198 | } 199 | } 200 | } 201 | 202 | // Only run if directly executed 203 | if (require.main === module) { 204 | console.log("Starting local extraction test..."); 205 | console.log("Make sure you have set up your .env file with API keys."); 206 | console.log("Usage: npm run test:local -- [contentType] [provider]"); 207 | console.log(" contentType: 'blog', 'product', or 'all' (default)"); 208 | console.log(" provider: 'openai', 'gemini', or 'all' (default)"); 209 | 210 | main() 211 | .then(() => { 212 | console.log("All tests completed successfully."); 213 | }) 214 | .catch((error) => { 215 | console.error("Test failed:", error); 216 | process.exit(1); 217 | }); 218 | } 219 | -------------------------------------------------------------------------------- /src/dev/testHtmlToMarkdown.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { htmlToMarkdown } from "../converters"; 4 | import { HTMLExtractionOptions } from "../types"; 5 | 6 | // Function to convert HTML to Markdown and save the result 7 | async function testConvertHtmlToMarkdown( 8 | htmlFilePath: string, 9 | outputDir: string, 10 | options?: 
HTMLExtractionOptions 11 | ) { 12 | try { 13 | // Read the HTML file 14 | const html = fs.readFileSync(htmlFilePath, "utf8"); 15 | 16 | // Convert to Markdown 17 | const markdown = htmlToMarkdown(html, options); 18 | 19 | // Create output directory if it doesn't exist 20 | if (!fs.existsSync(outputDir)) { 21 | fs.mkdirSync(outputDir, { recursive: true }); 22 | } 23 | 24 | // Generate output filename 25 | const baseName = path.basename(htmlFilePath, ".html"); 26 | const optionsSuffix = options?.includeImages 27 | ? ".with-images" 28 | : options?.extractMainHtml 29 | ? ".main-content" 30 | : ""; 31 | const outputPath = path.join(outputDir, `${baseName}${optionsSuffix}.md`); 32 | 33 | // Save the markdown 34 | fs.writeFileSync(outputPath, markdown); 35 | console.log(`✅ Converted ${htmlFilePath} to ${outputPath}`); 36 | 37 | return outputPath; 38 | } catch (error) { 39 | console.error("❌ Error converting HTML to Markdown:", error); 40 | throw error; 41 | } 42 | } 43 | 44 | // Main function to run the test 45 | async function main() { 46 | // Get the HTML file path from command line arguments 47 | const htmlFilePath = process.argv[2]; 48 | if (!htmlFilePath) { 49 | console.error("❌ Please provide an HTML file path as an argument"); 50 | console.log("Usage: npm run dev:html2md "); 51 | process.exit(1); 52 | } 53 | 54 | // Create output directory 55 | const outputDir = path.join(process.cwd(), "dev-output", "markdown"); 56 | 57 | // Test different conversion options 58 | console.log( 59 | "\n🔍 Testing HTML to Markdown conversion with different options...\n" 60 | ); 61 | 62 | // 1. Basic conversion 63 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir); 64 | 65 | // 2. Conversion with images 66 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir, { 67 | includeImages: true, 68 | }); 69 | 70 | // 3. Main content extraction 71 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir, { 72 | extractMainHtml: true, 73 | }); 74 | 75 | // 4. 
Both images and main content 76 | await testConvertHtmlToMarkdown(htmlFilePath, outputDir, { 77 | includeImages: true, 78 | extractMainHtml: true, 79 | }); 80 | 81 | console.log( 82 | "\n✨ All conversions completed! Check the output in:", 83 | outputDir 84 | ); 85 | } 86 | 87 | // Run the main function 88 | main().catch(console.error); 89 | -------------------------------------------------------------------------------- /src/dev/testUsage.ts: -------------------------------------------------------------------------------- 1 | import { config } from "dotenv"; 2 | import * as path from "path"; 3 | import { z } from "zod"; 4 | import { extract, ContentFormat, LLMProvider } from "../index"; 5 | 6 | // Load environment variables from .env file 7 | config({ path: path.resolve(process.cwd(), ".env") }); 8 | 9 | // A simple test script to verify usage tracking works 10 | async function testUsageTracking() { 11 | console.log("Testing usage tracking with OpenAI..."); 12 | 13 | // Check if API keys are available 14 | if (!process.env.OPENAI_API_KEY) { 15 | console.error("Error: OPENAI_API_KEY environment variable is required"); 16 | process.exit(1); 17 | } 18 | 19 | // Simple schema to test extraction 20 | const schema = z.object({ 21 | title: z.string(), 22 | description: z.string(), 23 | }); 24 | 25 | // Simple markdown content 26 | const markdown = ` 27 | # Hello World 28 | 29 | This is a test of the usage tracking system. 
30 | `; 31 | 32 | try { 33 | // Run extraction 34 | const result = await extract({ 35 | content: markdown, 36 | format: ContentFormat.MARKDOWN, 37 | schema, 38 | provider: LLMProvider.OPENAI, 39 | openaiApiKey: process.env.OPENAI_API_KEY, 40 | }); 41 | 42 | // Log the results 43 | console.log("\nExtracted data:"); 44 | console.log(JSON.stringify(result.data, null, 2)); 45 | 46 | console.log("\nToken usage:"); 47 | console.log(result.usage); 48 | 49 | // Check if usage was captured 50 | if (result.usage.inputTokens && result.usage.outputTokens) { 51 | console.log("\n✅ Usage tracking is working correctly!"); 52 | } else { 53 | console.log("\n❌ Usage tracking failed!"); 54 | } 55 | } catch (error) { 56 | console.error("Error testing usage tracking:", error); 57 | } 58 | } 59 | 60 | // Run the test if executed directly 61 | if (require.main === module) { 62 | testUsageTracking() 63 | .then(() => console.log("Test completed")) 64 | .catch(console.error); 65 | } 66 | -------------------------------------------------------------------------------- /src/example.ts: -------------------------------------------------------------------------------- 1 | import { extract, ContentFormat, LLMProvider } from "./index"; 2 | import { z } from "zod"; 3 | import { config } from "dotenv"; 4 | import * as path from "path"; 5 | import * as fs from "fs"; 6 | import { htmlToMarkdown } from "./converters"; 7 | 8 | // Load environment variables from .env file 9 | config({ path: path.resolve(process.cwd(), ".env") }); 10 | 11 | async function example() { 12 | try { 13 | // Check if API key is available 14 | if (!process.env.GOOGLE_API_KEY) { 15 | console.error("Error: GOOGLE_API_KEY environment variable is required"); 16 | return; 17 | } 18 | 19 | // Define a schema for blog post extraction 20 | const schema = z.object({ 21 | title: z.string(), 22 | author: z.string().optional(), 23 | date: z.string().optional(), 24 | summary: z.string(), 25 | categories: z.array(z.string()).optional(), 26 | 
}); 27 | 28 | const htmlContent = fs.readFileSync( 29 | path.resolve(__dirname, "../tests/fixtures", "blog-post.html"), 30 | "utf8" 31 | ); 32 | const sourceUrl = "https://www.example.com/blog/async-await"; 33 | 34 | const markdown = htmlToMarkdown( 35 | htmlContent, 36 | { 37 | extractMainHtml: true, 38 | includeImages: true, 39 | }, 40 | sourceUrl 41 | ); 42 | 43 | // fs.writeFileSync("test.md", markdown); 44 | 45 | console.log("Running extraction example..."); 46 | 47 | // Extract data from HTML 48 | const result = await extract({ 49 | content: htmlContent, 50 | format: ContentFormat.HTML, 51 | schema, 52 | // Using Google Gemini by default 53 | openaiApiKey: process.env.OPENAI_API_KEY, 54 | provider: LLMProvider.OPENAI, 55 | sourceUrl, 56 | }); 57 | 58 | console.log("Extracted Data:"); 59 | console.log(JSON.stringify(result.data, null, 2)); 60 | 61 | console.log("\nMarkdown Content:"); 62 | console.log(result.processedContent.slice(0, 1000) + "\n..."); 63 | 64 | console.log("\nToken Usage:"); 65 | console.log(result.usage); 66 | } catch (error) { 67 | console.error("Error in example:", error); 68 | } 69 | } 70 | 71 | // Only run if directly executed 72 | if (require.main === module) { 73 | example(); 74 | } 75 | -------------------------------------------------------------------------------- /src/extractors.ts: -------------------------------------------------------------------------------- 1 | import { ChatOpenAI } from "@langchain/openai"; 2 | import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; 3 | import { z } from "zod"; 4 | import { LLMProvider, Usage, ContentFormat } from "./types"; 5 | import { AIMessage } from "@langchain/core/messages"; 6 | import { 7 | safeSanitizedParser, 8 | transformSchemaForLLM, 9 | fixUrlEscapeSequences, 10 | } from "./utils/schemaUtils"; 11 | import { jsonrepair } from "jsonrepair"; 12 | 13 | // Define LLMResult type here since direct import is problematic 14 | interface TokenUsage { 15 | promptTokens?: number; 16 
| completionTokens?: number; 17 | totalTokens?: number; 18 | } 19 | 20 | interface LLMOutput { 21 | tokenUsage?: TokenUsage; 22 | } 23 | 24 | interface LLMResult { 25 | llmOutput?: LLMOutput; 26 | } 27 | 28 | /** 29 | * Get usage statistics from LLM output 30 | */ 31 | export function getUsage(output: LLMResult): Usage { 32 | const usage: Usage = {}; 33 | 34 | if (output.llmOutput && output.llmOutput.tokenUsage) { 35 | usage.inputTokens = output.llmOutput.tokenUsage.promptTokens; 36 | usage.outputTokens = output.llmOutput.tokenUsage.completionTokens; 37 | } 38 | 39 | return usage; 40 | } 41 | 42 | /** 43 | * Create LLM instance based on provider and configuration 44 | */ 45 | export function createLLM( 46 | provider: LLMProvider, 47 | modelName: string, 48 | apiKey: string, 49 | temperature: number = 0 50 | ) { 51 | switch (provider) { 52 | case LLMProvider.OPENAI: 53 | return new ChatOpenAI({ 54 | apiKey, 55 | modelName, 56 | temperature, 57 | }); 58 | 59 | case LLMProvider.GOOGLE_GEMINI: 60 | return new ChatGoogleGenerativeAI({ 61 | apiKey, 62 | model: modelName, 63 | temperature, 64 | }); 65 | 66 | default: 67 | throw new Error(`Unsupported LLM provider: ${provider}`); 68 | } 69 | } 70 | 71 | interface ExtractionPromptOptions { 72 | format: string; 73 | content: string; 74 | customPrompt?: string; 75 | dataToEnrich?: Record; 76 | } 77 | 78 | interface TruncateContentOptions extends ExtractionPromptOptions { 79 | maxTokens: number; 80 | } 81 | 82 | /** 83 | * Generate the extraction prompt with or without a custom query 84 | */ 85 | export function generateExtractionPrompt({ 86 | format, 87 | content, 88 | customPrompt, 89 | dataToEnrich, 90 | }: ExtractionPromptOptions): string { 91 | // Base prompt structure that's shared between default and custom prompts 92 | const extractionTask = customPrompt 93 | ? 
`${customPrompt}` 94 | : "Please extract structured information from the provided context."; 95 | 96 | // If dataToEnrich is provided, include it in the prompt for enrichment 97 | let promptTemplate = `Context information is below: 98 | ------ 99 | Format: ${format} 100 | --- 101 | ${content} 102 | ------ 103 | 104 | `; 105 | 106 | if (dataToEnrich) { 107 | promptTemplate += `Original JSON object: 108 | --- 109 | ${JSON.stringify(dataToEnrich, null, 2)} 110 | ------ 111 | 112 | You are a data extraction assistant that extracts structured information from the above context in ${format} and JSON. 113 | 114 | Your task is: ${extractionTask} 115 | 116 | ## Guidelines: 117 | 1. Extract ONLY information explicitly stated in the context 118 | 2. Enrich the original JSON object with information from the context 119 | 3. Do not remove any fields from the original JSON object 120 | 4. Only update existing fields and fill in additional fields if new and relevant information is available in the context 121 | 5. Do not make assumptions or infer missing data 122 | 6. Do not include information that appears incomplete or truncated 123 | 124 | `; 125 | } else { 126 | promptTemplate += `You are a data extraction assistant that extracts structured information from the above context. 127 | 128 | Your task is: ${extractionTask} 129 | 130 | ## Guidelines: 131 | 1. Extract ONLY information explicitly stated in the context 132 | 2. Do not make assumptions or infer missing data 133 | 3. Leave fields empty when information is not present or you are uncertain 134 | 4. Do not include information that appears incomplete or truncated 135 | 5. 
Follow the required schema exactly 136 | 137 | `; 138 | } 139 | 140 | promptTemplate += `Return only the structured data in valid JSON format and nothing else.`; 141 | 142 | return promptTemplate; 143 | } 144 | 145 | /** 146 | * Truncate content to fit within token limit 147 | * Uses a rough conversion of 4 characters per token 148 | */ 149 | export function truncateContent({ 150 | format, 151 | content, 152 | customPrompt, 153 | dataToEnrich, 154 | maxTokens, 155 | }: TruncateContentOptions): string { 156 | const maxChars = maxTokens * 4; 157 | 158 | // First generate the full prompt 159 | const fullPrompt = generateExtractionPrompt({ 160 | format, 161 | content, 162 | customPrompt, 163 | dataToEnrich, 164 | }); 165 | 166 | // If the full prompt is within limits, return original content 167 | if (fullPrompt.length <= maxChars) { 168 | return content; 169 | } 170 | 171 | // Calculate how much we need to reduce the content 172 | const excessChars = fullPrompt.length - maxChars; 173 | 174 | // Truncate content by the excess amount 175 | return content.slice(0, content.length - excessChars); 176 | } 177 | 178 | /** 179 | * Extract structured data from markdown using an LLM 180 | */ 181 | export async function extractWithLLM( 182 | content: string, 183 | schema: T, 184 | provider: LLMProvider, 185 | modelName: string, 186 | apiKey: string, 187 | temperature: number = 0, 188 | customPrompt?: string, 189 | format: string = ContentFormat.MARKDOWN, 190 | maxInputTokens?: number, 191 | dataToEnrich?: Record 192 | ): Promise<{ data: z.infer; usage: Usage }> { 193 | const llm = createLLM(provider, modelName, apiKey, temperature); 194 | let usage: Usage = {}; 195 | 196 | // Truncate content if maxInputTokens is specified 197 | const truncatedContent = maxInputTokens 198 | ? 
truncateContent({ 199 | format, 200 | content, 201 | customPrompt, 202 | dataToEnrich, 203 | maxTokens: maxInputTokens, 204 | }) 205 | : content; 206 | 207 | // Generate the prompt using the unified template function 208 | const prompt = generateExtractionPrompt({ 209 | format, 210 | content: truncatedContent, 211 | customPrompt, 212 | dataToEnrich, 213 | }); 214 | 215 | try { 216 | // Transform schema to be compatible with LLM output (converting url() to string()) 217 | const llmSchema = transformSchemaForLLM(schema); 218 | 219 | // Extract structured data with a withStructuredOutput chain 220 | const structuredOutputLLM = llm.withStructuredOutput(llmSchema, { 221 | includeRaw: true, 222 | }); 223 | 224 | // Create a callback handler for usage tracking 225 | const callbacks = [ 226 | { 227 | handleLLMEnd: (output: any) => { 228 | usage = getUsage(output); 229 | }, 230 | }, 231 | ]; 232 | 233 | // Invoke the LLM with callbacks to track usage 234 | const response = await structuredOutputLLM.invoke(prompt, { callbacks }); 235 | const raw = response.raw as AIMessage; 236 | 237 | let data = response.parsed; 238 | 239 | // If structured output is not successful, try to parse the raw object. 240 | if (data == null) { 241 | // Note: this only works for OpenAI models. 242 | if (raw.tool_calls && raw.tool_calls.length > 0) { 243 | // This is the raw object in JSON mode before structured output tool call. 244 | const rawObject = raw.tool_calls[0].args; 245 | // Manually sanitize the object and remove any unsafe but optional fields or unsafe items in arrays. 246 | data = safeSanitizedParser(llmSchema, rawObject); 247 | } 248 | 249 | // Note: this only works for Google Gemini models. 250 | if (raw.lc_kwargs && raw.lc_kwargs.content) { 251 | // Gemini does not return a JSON object, it returns a string that is a JSON object. 252 | // We use jsonrepair to fix the JSON string and then parse it. 
253 | const rawJson = raw.lc_kwargs.content; 254 | const rawObject = JSON.parse(jsonrepair(rawJson)); 255 | data = safeSanitizedParser(llmSchema, rawObject); 256 | } 257 | if (data == null) { 258 | throw new Error("No valid data was extracted"); 259 | } 260 | } 261 | 262 | // If structured output worked, we still need to fix URL escape sequences 263 | // and validate against the original schema 264 | const fixedData = fixUrlEscapeSequences(data, schema); 265 | const validatedData = safeSanitizedParser(schema, fixedData); 266 | // If validation fails, something went wrong with the URL validation 267 | if (validatedData === null) { 268 | throw new Error( 269 | "Extracted data failed validation against original schema" 270 | ); 271 | } 272 | 273 | data = validatedData; 274 | 275 | // Return the parsed data and usage statistics 276 | return { 277 | data, 278 | usage, 279 | }; 280 | } catch (error) { 281 | console.error("Error during LLM extraction:", error); 282 | throw error; 283 | } 284 | } 285 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { htmlToMarkdown } from "./converters"; 3 | import { extractWithLLM } from "./extractors"; 4 | import { 5 | ContentFormat, 6 | LLMProvider, 7 | ExtractorOptions, 8 | ExtractorResult, 9 | HTMLExtractionOptions, 10 | } from "./types"; 11 | 12 | // Default model names 13 | const DEFAULT_MODELS = { 14 | [LLMProvider.GOOGLE_GEMINI]: "gemini-2.5-flash-preview-04-17", 15 | [LLMProvider.OPENAI]: "gpt-4o-mini", 16 | }; 17 | 18 | /** 19 | * Extract structured data from HTML, markdown, or plain text content using an LLM 20 | * 21 | * @param options Configuration options for extraction 22 | * @param options.content HTML, markdown, or plain text content to extract from 23 | * @param options.format Content format (HTML, MARKDOWN, or TXT) 24 | * @param options.schema Zod 
schema defining the structure to extract 25 | * @param options.provider LLM provider (GOOGLE_GEMINI or OPENAI) 26 | * @param options.modelName Model name to use (provider-specific) 27 | * @param options.googleApiKey Google API key (if using Google Gemini provider) 28 | * @param options.openaiApiKey OpenAI API key (if using OpenAI provider) 29 | * @param options.temperature Temperature for the LLM (0-1) 30 | * @param options.prompt Custom prompt to guide the extraction process 31 | * @param options.sourceUrl URL of the HTML content (required for HTML format) 32 | * @param options.htmlExtractionOptions HTML-specific options for content extraction 33 | * @param options.maxInputTokens Maximum number of input tokens to send to the LLM 34 | * @param options.dataToEnrich Original data object to enrich with information from the content 35 | * @returns The extracted data, original content, and usage statistics 36 | */ 37 | export async function extract( 38 | options: ExtractorOptions 39 | ): Promise>> { 40 | // Validate required parameters 41 | const provider = options.provider ?? LLMProvider.GOOGLE_GEMINI; 42 | let apiKey: string; 43 | 44 | if (provider === LLMProvider.GOOGLE_GEMINI) { 45 | apiKey = options.googleApiKey ?? process.env.GOOGLE_API_KEY ?? ""; 46 | if (!apiKey) { 47 | throw new Error( 48 | "Google API key is required. Provide googleApiKey option or set GOOGLE_API_KEY environment variable." 49 | ); 50 | } 51 | } else if (provider === LLMProvider.OPENAI) { 52 | apiKey = options.openaiApiKey ?? process.env.OPENAI_API_KEY ?? ""; 53 | if (!apiKey) { 54 | throw new Error( 55 | "OpenAI API key is required. Provide openaiApiKey option or set OPENAI_API_KEY environment variable." 
56 | ); 57 | } 58 | } else { 59 | throw new Error(`Unsupported LLM provider: ${provider}`); 60 | } 61 | 62 | // Validate sourceUrl for HTML format 63 | if (options.format === ContentFormat.HTML && !options.sourceUrl) { 64 | throw new Error( 65 | "sourceUrl is required when format is HTML to properly handle relative URLs" 66 | ); 67 | } 68 | 69 | // Get model name (use defaults if not provided) 70 | const modelName = options.modelName ?? DEFAULT_MODELS[provider]; 71 | 72 | // Convert HTML to markdown if needed 73 | let content = options.content; 74 | let formatToUse = options.format; 75 | 76 | if (options.format === ContentFormat.HTML) { 77 | content = htmlToMarkdown( 78 | options.content, 79 | options.htmlExtractionOptions, 80 | options.sourceUrl 81 | ); 82 | // For the LLM, the content is now markdown 83 | formatToUse = ContentFormat.MARKDOWN; 84 | } 85 | 86 | // Extract structured data using LLM 87 | const { data, usage } = await extractWithLLM( 88 | content, 89 | options.schema, 90 | provider, 91 | modelName, 92 | apiKey, 93 | options.temperature ?? 
0, 94 | options.prompt, 95 | formatToUse.toString(), // Pass the correct format based on actual content 96 | options.maxInputTokens, 97 | options.dataToEnrich 98 | ); 99 | 100 | // Return the full result 101 | return { 102 | data, 103 | processedContent: content, 104 | usage, 105 | }; 106 | } 107 | 108 | /** 109 | * Convert HTML to markdown 110 | * 111 | * @param html HTML content to convert 112 | * @param options HTML extraction options 113 | * @param sourceUrl Source URL for resolving relative links 114 | * @returns Markdown content 115 | */ 116 | export function convertHtmlToMarkdown( 117 | html: string, 118 | options?: HTMLExtractionOptions, 119 | sourceUrl?: string 120 | ): string { 121 | return htmlToMarkdown(html, options, sourceUrl); 122 | } 123 | 124 | // Re-export types and enums 125 | export * from "./types"; 126 | 127 | // Utils 128 | export { safeSanitizedParser } from "./utils/schemaUtils"; 129 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | /** 4 | * Represents the format of the input content 5 | */ 6 | export enum ContentFormat { 7 | HTML = "html", 8 | MARKDOWN = "markdown", 9 | TXT = "txt", 10 | } 11 | 12 | /** 13 | * Supported LLM providers 14 | */ 15 | export enum LLMProvider { 16 | OPENAI = "openai", 17 | GOOGLE_GEMINI = "google_gemini", 18 | } 19 | 20 | /** 21 | * Options for HTML content processing 22 | */ 23 | export interface HTMLExtractionOptions { 24 | /** 25 | * When enabled, attempts to extract the main content from HTML, removing navigation bars, headers, footers, etc. 26 | * This uses heuristics to identify the main content area. 27 | * 28 | * Should be kept off (false) when extracting specific details about a single item, 29 | * as it might remove important contextual elements. 30 | * 31 | * Only applies to HTML format, not markdown. 
32 | */ 33 | extractMainHtml?: boolean; 34 | 35 | /** 36 | * When enabled, images in the HTML will be included in the markdown output. 37 | * By default, images are excluded to simplify the extraction process. 38 | * 39 | * Enable this option when you need to extract image information or URLs. 40 | */ 41 | includeImages?: boolean; 42 | } 43 | 44 | /** 45 | * Options for the extractor 46 | */ 47 | export interface ExtractorOptions { 48 | /** Content to extract from (HTML, Markdown, or plain text) */ 49 | content: string; 50 | 51 | /** Format of the content */ 52 | format: ContentFormat; 53 | 54 | /** Schema for structured extraction */ 55 | schema: T; 56 | 57 | /** LLM Provider (OpenAI or Google Gemini) */ 58 | provider?: LLMProvider; 59 | 60 | /** Model name to use */ 61 | modelName?: string; 62 | 63 | /** OpenAI API key */ 64 | openaiApiKey?: string; 65 | 66 | /** Google API key */ 67 | googleApiKey?: string; 68 | 69 | /** Temperature for the LLM (0-1), defaults to 0 */ 70 | temperature?: number; 71 | 72 | /** HTML-specific extraction options (only applies when format is HTML) */ 73 | htmlExtractionOptions?: HTMLExtractionOptions; 74 | 75 | /** Custom prompt for extraction (if not provided, a default prompt will be used) */ 76 | prompt?: string; 77 | 78 | /** URL of the HTML content, required only for HTML format */ 79 | sourceUrl?: string; 80 | 81 | /** Maximum number of input tokens to send to the LLM. Uses a rough conversion of 4 characters per token. */ 82 | maxInputTokens?: number; 83 | 84 | /** Original data object to enrich with extracted information. When provided, the LLM will be instructed to enrich this object with additional information from the content. 
*/ 85 | dataToEnrich?: Record; 86 | } 87 | 88 | /** 89 | * Usage statistics for LLM calls 90 | */ 91 | export interface Usage { 92 | inputTokens?: number; 93 | outputTokens?: number; 94 | } 95 | 96 | /** 97 | * Result of the extraction process 98 | */ 99 | export interface ExtractorResult { 100 | /** Extracted data according to the schema */ 101 | data: T; 102 | 103 | /** 104 | * Processed content that was sent to the LLM. 105 | * This will be markdown if the input was HTML (after conversion), 106 | * or the original content if the input was already markdown or plain text. 107 | */ 108 | processedContent: string; 109 | 110 | /** Usage statistics */ 111 | usage: Usage; 112 | } 113 | -------------------------------------------------------------------------------- /src/utils/schemaUtils.ts: -------------------------------------------------------------------------------- 1 | import { 2 | z, 3 | ZodArray, 4 | ZodObject, 5 | ZodOptional, 6 | ZodTypeAny, 7 | ZodNullable, 8 | ZodFirstPartyTypeKind, 9 | } from "zod"; 10 | 11 | /** 12 | * Checks if a schema is a ZodString with URL validation 13 | */ 14 | export function isUrlSchema(schema: ZodTypeAny): boolean { 15 | if (!isZodType(schema, ZodFirstPartyTypeKind.ZodString)) return false; 16 | 17 | // Check if schema has URL validation by checking for internal checks property 18 | // This is a bit of a hack but necessary since Zod doesn't expose validation info 19 | const checks = (schema as any)._def.checks; 20 | if (!checks || !Array.isArray(checks)) return false; 21 | 22 | return checks.some((check) => check.kind === "url"); 23 | } 24 | 25 | /** 26 | * Helper function to check schema type without using instanceof (can fail due to zod version differences) 27 | */ 28 | function isZodType(schema: ZodTypeAny, type: ZodFirstPartyTypeKind): boolean { 29 | return (schema as any)._def.typeName === type; 30 | } 31 | 32 | /** 33 | * Transforms a schema, replacing any URL validations with string validations 34 | * for compatibility 
with LLM output 35 | */ 36 | export function transformSchemaForLLM( 37 | schema: T 38 | ): ZodTypeAny { 39 | // For URL string schemas, remove the URL check but preserve everything else 40 | if (isUrlSchema(schema)) { 41 | const originalDef = { ...(schema as any)._def }; 42 | 43 | // Filter out only URL checks, keep all other checks 44 | if (originalDef.checks && Array.isArray(originalDef.checks)) { 45 | originalDef.checks = originalDef.checks.filter( 46 | (check: any) => check.kind !== "url" 47 | ); 48 | } 49 | 50 | // Create a new string schema with the modified definition 51 | return new z.ZodString({ 52 | ...originalDef, 53 | typeName: z.ZodFirstPartyTypeKind.ZodString, 54 | }); 55 | } 56 | 57 | // For object schemas, transform each property 58 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodObject)) { 59 | const originalDef = { ...(schema as any)._def }; 60 | const newShape: Record = {}; 61 | 62 | // Transform each property in the shape 63 | for (const [key, propertySchema] of Object.entries((schema as any).shape)) { 64 | newShape[key] = transformSchemaForLLM(propertySchema as ZodTypeAny); 65 | } 66 | 67 | // Create a new object with the same definition but transformed shape 68 | return new z.ZodObject({ 69 | ...originalDef, 70 | shape: () => newShape, 71 | typeName: z.ZodFirstPartyTypeKind.ZodObject, 72 | }); 73 | } 74 | 75 | // For array schemas, transform the element schema 76 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodArray)) { 77 | const originalDef = { ...(schema as any)._def }; 78 | const transformedElement = transformSchemaForLLM( 79 | (schema as any).element as ZodTypeAny 80 | ); 81 | 82 | // Create a new array with the same definition but transformed element 83 | return new z.ZodArray({ 84 | ...originalDef, 85 | type: transformedElement, 86 | typeName: z.ZodFirstPartyTypeKind.ZodArray, 87 | }); 88 | } 89 | 90 | // For optional schemas, transform the inner schema 91 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { 92 | const 
originalDef = { ...(schema as any)._def }; 93 | const transformedInner = transformSchemaForLLM( 94 | (schema as any).unwrap() as ZodTypeAny 95 | ); 96 | 97 | // Create a new optional with the same definition but transformed inner type 98 | return new z.ZodOptional({ 99 | ...originalDef, 100 | innerType: transformedInner, 101 | typeName: z.ZodFirstPartyTypeKind.ZodOptional, 102 | }); 103 | } 104 | 105 | // For nullable schemas, transform the inner schema 106 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { 107 | const originalDef = { ...(schema as any)._def }; 108 | const transformedInner = transformSchemaForLLM( 109 | (schema as any).unwrap() as ZodTypeAny 110 | ); 111 | 112 | // Create a new nullable with the same definition but transformed inner type 113 | return new z.ZodNullable({ 114 | ...originalDef, 115 | innerType: transformedInner, 116 | typeName: z.ZodFirstPartyTypeKind.ZodNullable, 117 | }); 118 | } 119 | 120 | // Return the original schema for all other types 121 | return schema; 122 | } 123 | 124 | /** 125 | * Fix URL escape sequences in the object based on the original schema 126 | */ 127 | export function fixUrlEscapeSequences(data: any, schema: ZodTypeAny): any { 128 | if (data === null || data === undefined) return data; 129 | 130 | if (isUrlSchema(schema)) { 131 | if (typeof data === "string") { 132 | // Replace escaped parentheses with unescaped versions 133 | return data.replace(/\\\(/g, "(").replace(/\\\)/g, ")"); 134 | } 135 | return data; 136 | } 137 | 138 | if ( 139 | isZodType(schema, ZodFirstPartyTypeKind.ZodObject) && 140 | typeof data === "object" && 141 | !Array.isArray(data) 142 | ) { 143 | const shape = (schema as any).shape; 144 | const result: Record = {}; 145 | 146 | for (const [key, propertySchema] of Object.entries(shape)) { 147 | if (key in data) { 148 | result[key] = fixUrlEscapeSequences( 149 | data[key], 150 | propertySchema as ZodTypeAny 151 | ); 152 | } else { 153 | result[key] = data[key]; 154 | } 155 | } 156 
| 157 | return result; 158 | } 159 | 160 | if ( 161 | isZodType(schema, ZodFirstPartyTypeKind.ZodArray) && 162 | Array.isArray(data) 163 | ) { 164 | const elementSchema = (schema as any).element as ZodTypeAny; 165 | return data.map((item) => fixUrlEscapeSequences(item, elementSchema)); 166 | } 167 | 168 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { 169 | const innerSchema = (schema as any).unwrap() as ZodTypeAny; 170 | return fixUrlEscapeSequences(data, innerSchema); 171 | } 172 | 173 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { 174 | const innerSchema = (schema as any).unwrap() as ZodTypeAny; 175 | return fixUrlEscapeSequences(data, innerSchema); 176 | } 177 | 178 | return data; 179 | } 180 | 181 | /** 182 | * Sanitizes an object to conform to a Zod schema by removing invalid optional fields or array items. 183 | * If the object can't be sanitized to match the schema, returns null. 184 | * 185 | * @param schema The Zod schema to validate against 186 | * @param rawObject The raw object to sanitize 187 | * @returns The sanitized object or null if it can't be sanitized 188 | */ 189 | export function safeSanitizedParser( 190 | schema: T, 191 | rawObject: unknown 192 | ): z.infer | null { 193 | try { 194 | // If the raw object is null or undefined, just validate it directly 195 | if (rawObject === null || rawObject === undefined) { 196 | return schema.parse(rawObject); 197 | } 198 | 199 | // Handle different schema types 200 | if (isZodType(schema, ZodFirstPartyTypeKind.ZodObject)) { 201 | return sanitizeObject(schema as any, rawObject); 202 | } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodArray)) { 203 | return sanitizeArray(schema as any, rawObject); 204 | } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodOptional)) { 205 | return sanitizeOptional(schema as any, rawObject); 206 | } else if (isZodType(schema, ZodFirstPartyTypeKind.ZodNullable)) { 207 | return sanitizeNullable(schema as any, rawObject); 208 | } else { 209 | 
// For primitive values, try to parse directly 210 | return schema.parse(rawObject); 211 | } 212 | } catch (error) { 213 | // If any error occurs during sanitization, return null 214 | return null; 215 | } 216 | } 217 | 218 | /** 219 | * Sanitizes an object against a Zod object schema 220 | */ 221 | function sanitizeObject(schema: ZodObject, rawObject: unknown): any { 222 | if ( 223 | typeof rawObject !== "object" || 224 | rawObject === null || 225 | Array.isArray(rawObject) 226 | ) { 227 | throw new Error("Expected an object"); 228 | } 229 | 230 | const shape = schema.shape; 231 | const result: Record = {}; 232 | const rawObjectRecord = rawObject as Record; 233 | 234 | // Process each property in the schema 235 | for (const [key, propertySchema] of Object.entries(shape)) { 236 | // Skip if the property doesn't exist in the raw object 237 | if (!(key in rawObjectRecord)) { 238 | continue; 239 | } 240 | 241 | // If property is optional, try to sanitize it 242 | if ( 243 | isZodType(propertySchema as ZodTypeAny, ZodFirstPartyTypeKind.ZodOptional) 244 | ) { 245 | const sanitized = safeSanitizedParser( 246 | propertySchema as ZodTypeAny, 247 | rawObjectRecord[key] 248 | ); 249 | if (sanitized !== null) { 250 | result[key] = sanitized; 251 | } 252 | // If sanitization fails, just skip the optional property 253 | } else if ( 254 | isZodType(propertySchema as ZodTypeAny, ZodFirstPartyTypeKind.ZodNullable) 255 | ) { 256 | // For nullable properties, try to sanitize or set to null 257 | try { 258 | const sanitized = safeSanitizedParser( 259 | propertySchema as ZodTypeAny, 260 | rawObjectRecord[key] 261 | ); 262 | result[key] = sanitized; 263 | } catch { 264 | // If sanitization fails, set to null for nullable properties 265 | result[key] = null; 266 | } 267 | } else { 268 | // For required properties, try to sanitize and throw if it fails 269 | const sanitized = safeSanitizedParser( 270 | propertySchema as ZodTypeAny, 271 | rawObjectRecord[key] 272 | ); 273 | if (sanitized 
=== null) { 274 | throw new Error(`Required property ${key} could not be sanitized`); 275 | } 276 | result[key] = sanitized; 277 | } 278 | } 279 | 280 | // Validate the final object to ensure it matches the schema 281 | return schema.parse(result); 282 | } 283 | 284 | /** 285 | * Sanitizes an array against a Zod array schema 286 | */ 287 | function sanitizeArray(schema: ZodArray, rawValue: unknown): any { 288 | if (!Array.isArray(rawValue)) { 289 | throw new Error("Expected an array"); 290 | } 291 | 292 | const elementSchema = schema.element as ZodTypeAny; 293 | const sanitizedArray = []; 294 | 295 | // Process each item in the array 296 | for (const item of rawValue) { 297 | try { 298 | const sanitizedItem = safeSanitizedParser(elementSchema, item); 299 | if (sanitizedItem !== null) { 300 | sanitizedArray.push(sanitizedItem); 301 | } 302 | // If an item can't be sanitized, just skip it 303 | } catch { 304 | // Skip invalid array items 305 | } 306 | } 307 | 308 | // Validate the final array to ensure it matches the schema 309 | return schema.parse(sanitizedArray); 310 | } 311 | 312 | /** 313 | * Sanitizes a value against an optional Zod schema 314 | */ 315 | function sanitizeOptional(schema: ZodOptional, rawValue: unknown): any { 316 | try { 317 | // Try to sanitize using the inner schema 318 | const innerSchema = schema.unwrap(); 319 | const parsed = safeSanitizedParser(innerSchema, rawValue); 320 | // If the parsed value is not valid, return undefined for optional values 321 | if (parsed === null) { 322 | return undefined; 323 | } 324 | return parsed; 325 | } catch { 326 | // If sanitization fails, return undefined for optional values 327 | return undefined; 328 | } 329 | } 330 | 331 | /** 332 | * Sanitizes a value against a nullable Zod schema 333 | */ 334 | function sanitizeNullable(schema: ZodNullable, rawValue: unknown): any { 335 | // If the value is null, return null directly 336 | if (rawValue === null) { 337 | return null; 338 | } 339 | 340 | try { 341 | 
// Try to sanitize using the inner schema 342 | const innerSchema = schema.unwrap(); 343 | const sanitized = safeSanitizedParser(innerSchema, rawValue); 344 | 345 | // If sanitization of inner schema fails, return null 346 | if (sanitized === null) { 347 | return null; 348 | } 349 | 350 | return sanitized; 351 | } catch { 352 | // If sanitization fails, return null for nullable values 353 | return null; 354 | } 355 | } 356 | -------------------------------------------------------------------------------- /tests/fixtures/article-with-images.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Modern Web Development with React and Node.js 7 | 8 | 9 |
10 | 17 |

Modern Web Development with React and Node.js

18 |
19 | Jane Smith 20 | March 20, 2023 21 |
22 | #React 23 | #Node.js 24 | #JavaScript 25 |
26 |
27 |
28 | 29 |
30 | 34 | 35 |

Building modern web applications requires a solid understanding of both front-end and back-end technologies. React has become the industry standard for building interactive user interfaces, while Node.js powers the server-side of many applications.

36 | 37 |

React: Building User Interfaces

38 |

React is a JavaScript library for building user interfaces, particularly single-page applications. It allows developers to create reusable UI components and manage application state efficiently.

39 | 40 |
41 | React Component Example 42 |

43 | function Welcome(props) {
44 |   return <h1>Hello, {props.name}</h1>;
45 | }
46 |       
47 |
48 | 49 |

Node.js: Server-Side JavaScript

50 |

Node.js allows JavaScript to be used for server-side programming. It uses an event-driven, non-blocking I/O model that makes it lightweight and efficient for data-intensive real-time applications.

51 | 52 |
53 | Node.js Event Loop 54 |
The Node.js event loop enables non-blocking I/O operations
55 |
56 | 57 |

Combining React and Node.js

58 |

When combined, React and Node.js create a powerful full-stack JavaScript environment. The front-end is handled by React components, while the back-end API is managed by Node.js.

59 | 60 |

A typical architecture might look like this:

61 |
    62 |
  1. React components for the user interface
  2. 63 |
  3. Redux or Context API for state management
  4. 64 |
  5. Express.js (Node.js framework) for the API layer
  6. 65 |
  7. MongoDB or another database for data persistence
  8. 66 |
67 | 68 | 85 | 86 |

Conclusion

87 |

The combination of React and Node.js provides a consistent development experience across the stack, as both use JavaScript. This allows for better code reuse and a more streamlined development process.

88 | 89 |

Whether you're building a simple web application or a complex enterprise system, the React and Node.js stack offers flexibility, performance, and scalability.

90 |
91 | 92 |
93 |

© 2023 Web Development Blog

94 | Blog Logo 95 |
96 | 97 | -------------------------------------------------------------------------------- /tests/fixtures/blog-post.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Understanding Async/Await in JavaScript 7 | 8 | 9 |
10 |

Understanding Async/Await in JavaScript

11 |
12 | John Doe 13 | January 15, 2023 14 |
15 | #JavaScript 16 | #Programming 17 |
18 |
19 |
20 | 21 |
22 |

Async/await is a modern way to handle asynchronous operations in JavaScript. It was introduced in ES2017 and has since become the preferred method for handling promises.

23 | 24 |

What is Async/Await?

25 |

The async keyword is used to declare an asynchronous function. An async function automatically returns a promise, and the value returned by the function will be resolved with the returned promise.

26 | 27 |

The await keyword can only be used inside an async function. It pauses the execution of the function until the promise is resolved or rejected.

28 | 29 |

Basic Example

30 |

31 | async function fetchData() {
32 |   const response = await fetch('https://api.example.com/data');
33 |   const data = await response.json();
34 |   return data;
35 | }
36 |     
37 | 38 |

In this example, the function will wait for the fetch operation to complete before moving to the next line. This makes asynchronous code look and behave more like synchronous code, making it easier to understand and maintain.

39 | 40 |

Error Handling

41 |

With async/await, you can use try/catch blocks for error handling, which is more intuitive than promise chaining with .catch().

42 | 43 |

44 | async function fetchData() {
45 |   try {
46 |     const response = await fetch('https://api.example.com/data');
47 |     const data = await response.json();
48 |     return data;
49 |   } catch (error) {
50 |     console.error('Error fetching data:', error);
51 |     throw error;
52 |   }
53 | }
54 |     
55 | 56 |

Conclusion

57 |

Async/await makes asynchronous code more readable and maintainable. It's built on promises, so you can still use all the promise methods when needed, but the syntax is cleaner and more intuitive.

58 | 59 |

For more information, visit our JavaScript Tutorials or check out the MDN documentation.

60 |
61 | 62 |
63 |

© 2023 JavaScript Blog

64 |
65 | 66 | -------------------------------------------------------------------------------- /tests/fixtures/product-list.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Smart Home Products 7 | 8 | 9 |
10 |

Smart Home Products

11 |

Top-rated devices to make your home smarter

12 |
13 | 14 |
15 |
16 |

Smart Speaker Pro

17 |
18 | Smart Speaker Pro 19 |
20 |
21 |
$129.99
22 |
★★★★☆ (4.2/5)
23 |
24 |

Premium smart speaker with built-in voice assistant. Control your smart home, play music, or get answers to your questions.

25 |
    26 |
  • 360° sound with deep bass
  • 27 |
  • Multi-room audio support
  • 28 |
  • Compatible with most smart home devices
  • 29 |
  • Available in black, white, and gray
  • 30 |
31 |
32 | View Details 33 |
34 |
35 | 36 |
37 |

Smart Thermostat

38 |
39 | Smart Thermostat 40 |
41 |
42 |
$89.95
43 |
★★★★★ (4.8/5)
44 |
45 |

Energy-efficient smart thermostat that learns your preferences and helps save on utility bills.

46 |
    47 |
  • Easy installation
  • 48 |
  • Compatible with most HVAC systems
  • 49 |
  • Mobile app control
  • 50 |
  • Energy usage reports
  • 51 |
52 |
53 | View Details 54 |
55 |
56 | 57 |
58 |

Smart Security Camera

59 |
60 | Smart Security Camera 61 |
62 |
63 |
$74.50
64 |
★★★★☆ (4.0/5)
65 |
66 |

HD security camera with motion detection, night vision, and two-way audio.

67 |
    68 |
  • 1080p HD video
  • 69 |
  • Cloud storage available
  • 70 |
  • Weather-resistant
  • 71 |
  • Real-time alerts
  • 72 |
73 |
74 | View Details 75 |
76 |
77 |
78 | 79 |
80 |

Prices and availability may vary. Last updated: June 2023

81 |
82 | 83 | -------------------------------------------------------------------------------- /tests/integration/extract.test.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { z } from "zod"; 4 | import { 5 | extract, 6 | ContentFormat, 7 | LLMProvider, 8 | ExtractorResult, 9 | } from "../../src"; 10 | import { htmlToMarkdown } from "../../src/converters"; 11 | 12 | // Read the sample HTML files 13 | const blogPostHtml = fs.readFileSync( 14 | path.resolve(__dirname, "../fixtures/blog-post.html"), 15 | "utf8" 16 | ); 17 | // Define schemas that will be reused 18 | const blogSchema = z.object({ 19 | title: z.string(), 20 | author: z.string(), 21 | date: z.string(), 22 | tags: z 23 | .array(z.string()) 24 | .optional() 25 | .describe("Tags appear after the date. Do not include the # symbol."), 26 | summary: z.string(), 27 | links: z 28 | .array(z.string().url()) 29 | .optional() 30 | .describe("Extract all URLs from the content"), 31 | }); 32 | 33 | // Define a separate schema for OpenAI tests using nullable instead of optional 34 | const blogSchemaOpenAI = z.object({ 35 | title: z.string(), 36 | author: z.string(), 37 | date: z.string(), 38 | tags: z 39 | .array(z.string()) 40 | .nullable() 41 | .describe("Tags appear after the date. 
Do not include the # symbol."), 42 | summary: z.string(), 43 | links: z 44 | .array(z.string().url()) 45 | .nullable() 46 | .describe("Extract all URLs from the content"), 47 | }); 48 | 49 | // Helper function to verify blog post extraction results 50 | function verifyBlogPostExtraction(result: ExtractorResult): void { 51 | // Check the data is extracted correctly 52 | expect(result.data).toBeDefined(); 53 | expect(result.data.title).toBe("Understanding Async/Await in JavaScript"); 54 | expect(result.data.author).toBe("John Doe"); 55 | expect(result.data.date).toBe("January 15, 2023"); 56 | expect(typeof result.data.summary).toBe("string"); 57 | expect(result.data.summary.length).toBeGreaterThan(0); 58 | expect(result.data.tags).toEqual(["JavaScript", "Programming"]); 59 | 60 | // Verify URLs are extracted and are absolute 61 | expect(result.data.links).toBeDefined(); 62 | expect(Array.isArray(result.data.links)).toBe(true); 63 | expect(result.data.links).toContain( 64 | "https://example.com/blog/javascript-tutorials" 65 | ); 66 | expect(result.data.links).toContain( 67 | "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/async_function" 68 | ); 69 | expect(result.data.links).toContain("https://api.example.com/data"); 70 | 71 | // Verify that usage statistics are returned 72 | expect(result.usage).toBeDefined(); 73 | expect(result.usage.inputTokens).toBeGreaterThan(0); 74 | expect(result.usage.outputTokens).toBeGreaterThan(0); 75 | } 76 | 77 | describe("Extract Integration Tests", () => { 78 | describe("Blog Post Extraction", () => { 79 | test("should extract blog post data using Google Gemini default model", async () => { 80 | const result = await extract({ 81 | content: blogPostHtml, 82 | format: ContentFormat.HTML, 83 | schema: blogSchema, 84 | provider: LLMProvider.GOOGLE_GEMINI, 85 | googleApiKey: process.env.GOOGLE_API_KEY, 86 | sourceUrl: "https://example.com/blog/async-await", 87 | }); 88 | 89 | verifyBlogPostExtraction(result); 
90 | }); 91 | 92 | test("should extract blog post data using OpenAI default model", async () => { 93 | const result = await extract({ 94 | content: blogPostHtml, 95 | format: ContentFormat.HTML, 96 | schema: blogSchemaOpenAI, 97 | provider: LLMProvider.OPENAI, 98 | openaiApiKey: process.env.OPENAI_API_KEY, 99 | sourceUrl: "https://example.com/blog/async-await", 100 | }); 101 | 102 | verifyBlogPostExtraction(result); 103 | }); 104 | }); 105 | 106 | const productListHtml = fs.readFileSync( 107 | path.resolve(__dirname, "../fixtures/product-list.html"), 108 | "utf8" 109 | ); 110 | 111 | const productSchema = z.object({ 112 | products: z.array( 113 | z.object({ 114 | name: z.string(), 115 | price: z.number(), 116 | rating: z.number().optional(), 117 | description: z.string().optional(), 118 | features: z.array(z.string()).optional(), 119 | imageUrl: z.string().url().optional(), 120 | productUrl: z.string().url().optional(), 121 | }) 122 | ), 123 | }); 124 | 125 | // Define a separate schema for OpenAI tests using nullable instead of optional 126 | const productSchemaOpenAI = z.object({ 127 | products: z.array( 128 | z.object({ 129 | name: z.string(), 130 | price: z.number(), 131 | rating: z.number().nullable(), 132 | description: z.string().nullable(), 133 | features: z.array(z.string()).nullable(), 134 | imageUrl: z.string().url().nullable(), 135 | productUrl: z.string().url().nullable(), 136 | }) 137 | ), 138 | }); 139 | 140 | const groundTruthProductList = [ 141 | { 142 | name: "Smart Speaker Pro", 143 | price: 129.99, 144 | rating: 4.2, 145 | description: 146 | "Premium smart speaker with built-in voice assistant. 
Control your smart home, play music, or get answers to your questions.", 147 | features: [ 148 | "360° sound with deep bass", 149 | "Multi-room audio support", 150 | "Compatible with most smart home devices", 151 | "Available in black, white, and gray", 152 | ], 153 | imageUrl: "https://example.com/images/products/speaker.jpg", 154 | productUrl: "https://example.com/products/smart-speaker-pro", 155 | }, 156 | { 157 | name: "Smart Thermostat", 158 | price: 89.95, 159 | rating: 4.8, 160 | description: 161 | "Energy-efficient smart thermostat that learns your preferences and helps save on utility bills.", 162 | features: [ 163 | "Easy installation", 164 | "Compatible with most HVAC systems", 165 | "Mobile app control", 166 | "Energy usage reports", 167 | ], 168 | imageUrl: "https://example.com/images/products/thermostat.jpg", 169 | productUrl: "https://example.com/products/smart-thermostat", 170 | }, 171 | { 172 | name: "Smart Security Camera", 173 | price: 74.5, 174 | rating: 4, 175 | description: 176 | "HD security camera with motion detection, night vision, and two-way audio.", 177 | features: [ 178 | "1080p HD video", 179 | "Cloud storage available", 180 | "Weather-resistant", 181 | "Real-time alerts", 182 | ], 183 | imageUrl: "https://example.com/images/products/camera.jpg", 184 | productUrl: "https://example.com/products/smart-security-camera", 185 | }, 186 | ]; 187 | 188 | // Helper function to verify product list extraction results 189 | function verifyProductListExtraction(result: ExtractorResult): void { 190 | // Check structure, not exact values 191 | expect(result.data).toBeDefined(); 192 | expect(Array.isArray(result.data.products)).toBe(true); 193 | 194 | // Check parity with ground truth data 195 | expect(result.data.products.length).toBe(groundTruthProductList.length); 196 | 197 | // Verify each extracted product matches the ground truth 198 | for (const product of result.data.products) { 199 | // Find matching product in ground truth by name 200 | 
const groundTruthProduct = groundTruthProductList.find( 201 | (p) => p.name === product.name 202 | ); 203 | 204 | // Ensure the product exists in ground truth 205 | expect(groundTruthProduct).toBeDefined(); 206 | 207 | // Compare all product properties 208 | expect(product.price).toBe(groundTruthProduct!.price); 209 | expect(product.rating).toBe(groundTruthProduct!.rating); 210 | expect(product.description).toBe(groundTruthProduct!.description); 211 | expect(product.features).toEqual(groundTruthProduct!.features); 212 | 213 | // Verify URLs are absolute 214 | expect(product.imageUrl).toBe(groundTruthProduct!.imageUrl); 215 | expect(product.productUrl).toBe(groundTruthProduct!.productUrl); 216 | } 217 | 218 | // Verify that usage statistics are returned 219 | expect(result.usage).toBeDefined(); 220 | expect(result.usage.inputTokens).toBeGreaterThan(0); 221 | expect(result.usage.outputTokens).toBeGreaterThan(0); 222 | } 223 | 224 | describe("Product List Extraction", () => { 225 | test("should extract product list data using Google Gemini", async () => { 226 | const result = await extract({ 227 | content: productListHtml, 228 | format: ContentFormat.HTML, 229 | schema: productSchema, 230 | provider: LLMProvider.GOOGLE_GEMINI, 231 | googleApiKey: process.env.GOOGLE_API_KEY, 232 | sourceUrl: "https://example.com/products", 233 | htmlExtractionOptions: { 234 | extractMainHtml: true, 235 | includeImages: true, 236 | }, 237 | }); 238 | verifyProductListExtraction(result); 239 | }); 240 | 241 | test("should extract product list data using OpenAI", async () => { 242 | const result = await extract({ 243 | content: productListHtml, 244 | format: ContentFormat.HTML, 245 | schema: productSchemaOpenAI, 246 | provider: LLMProvider.OPENAI, 247 | openaiApiKey: process.env.OPENAI_API_KEY, 248 | sourceUrl: "https://example.com/products", 249 | htmlExtractionOptions: { 250 | extractMainHtml: true, 251 | includeImages: true, 252 | }, 253 | }); 254 | verifyProductListExtraction(result); 
255 | }); 256 | }); 257 | 258 | const markdownContent = "Product: Apple, Price: N/A"; 259 | 260 | describe("Handle Structured Output Errors", () => { 261 | test("should handle structured output errors using OpenAI", async () => { 262 | const result = await extract({ 263 | content: markdownContent, 264 | format: ContentFormat.MARKDOWN, 265 | schema: z.object({ 266 | product: z.string(), 267 | // For this test, force the price to be N/A and break the schema so we can test the 268 | // structured output error handling. In real life, this could happen if the LLM returns 269 | // a value that is not expected by the schema. 270 | price: z.number().describe("Use 'N/A' if not available").nullable(), 271 | }), 272 | provider: LLMProvider.OPENAI, 273 | openaiApiKey: process.env.OPENAI_API_KEY, 274 | modelName: "gpt-3.5-turbo", 275 | }); 276 | expect(result.data).toEqual({ product: "Apple", price: null }); 277 | }); 278 | 279 | test("should handle structured output errors using Google Gemini", async () => { 280 | const result = await extract({ 281 | content: blogPostHtml, 282 | format: ContentFormat.HTML, 283 | schema: z.object({ 284 | title: z.string(), 285 | author: z.string().optional(), 286 | date: z.string().optional(), 287 | tags: z 288 | .array(z.string()) 289 | .optional() 290 | .describe( 291 | "Tags appear after the date. Do not include the # symbol." 292 | ), 293 | summary: z.string(), 294 | // For this test, adding an additional content field seems to cause the Google Gemini model 295 | // to fail in some cases to return the structured output. 
296 | content: z.string().optional(), 297 | }), 298 | provider: LLMProvider.GOOGLE_GEMINI, 299 | googleApiKey: process.env.GOOGLE_API_KEY, 300 | sourceUrl: "https://example.com/blog/async-await", 301 | }); 302 | expect(result.data).toBeDefined(); 303 | }); 304 | }); 305 | 306 | describe("Special Character Handling", () => { 307 | test("should extract link with special characters from markdown and validate as URL", async () => { 308 | const markdownContent = 309 | "[Meeting \\[11-12-24\\]](https://example.com/meeting-\\(11-12-24\\))"; 310 | 311 | // Use string().url() validation 312 | const schema = z.object({ 313 | title: z.string(), 314 | link: z.string().url(), // Added URL validation 315 | }); 316 | 317 | const result = await extract({ 318 | content: markdownContent, 319 | format: ContentFormat.MARKDOWN, 320 | schema, 321 | provider: LLMProvider.OPENAI, 322 | openaiApiKey: process.env.OPENAI_API_KEY, 323 | }); 324 | 325 | // Verify the extracted data 326 | expect(result.data.title).toBe("Meeting [11-12-24]"); 327 | expect(result.data.link).toBe("https://example.com/meeting-(11-12-24)"); 328 | }); 329 | 330 | test("should extract an array of URLs with special characters", async () => { 331 | const markdownContent = ` 332 | # Meeting Links 333 | 334 | - [Q4 Planning \\(2023\\)](https://example.com/meetings/q4-planning-\\(2023\\)) 335 | - [Budget Review \\[2024\\]](https://example.com/budget/review-\\[2024\\]) 336 | - [Product Launch (May 2024)](https://example.com/products/launch-(may-2024)) 337 | `; 338 | 339 | // Use array of string().url() validation 340 | const schema = z.object({ 341 | title: z.string(), 342 | links: z.array(z.string().url()), 343 | }); 344 | 345 | const result = await extract({ 346 | content: markdownContent, 347 | format: ContentFormat.MARKDOWN, 348 | schema, 349 | provider: LLMProvider.OPENAI, 350 | openaiApiKey: process.env.OPENAI_API_KEY, 351 | }); 352 | 353 | // Verify the extracted data 354 | expect(result.data.title).toBe("Meeting 
Links"); 355 | expect(result.data.links).toContain( 356 | "https://example.com/meetings/q4-planning-(2023)" 357 | ); 358 | expect(result.data.links).toContain( 359 | "https://example.com/budget/review-[2024]" 360 | ); 361 | expect(result.data.links).toContain( 362 | "https://example.com/products/launch-(may-2024)" 363 | ); 364 | }); 365 | }); 366 | 367 | describe("Data Enrichment", () => { 368 | test("should enrich existing data with blog post content using Google Gemini", async () => { 369 | // Create partial data to be enriched 370 | const partialData = { 371 | title: "A Different Title", 372 | date: "February 1, 2022", // This might be updated based on content 373 | summary: "", 374 | }; 375 | 376 | const result = await extract({ 377 | content: blogPostHtml, 378 | format: ContentFormat.HTML, 379 | schema: blogSchema, 380 | provider: LLMProvider.GOOGLE_GEMINI, 381 | googleApiKey: process.env.GOOGLE_API_KEY, 382 | sourceUrl: "https://example.com/blog/async-await", 383 | dataToEnrich: partialData, 384 | }); 385 | 386 | // Verify the enriched data has the correct values 387 | verifyBlogPostExtraction(result); 388 | }); 389 | 390 | test("should enrich existing data with blog post content using OpenAI", async () => { 391 | // Create partial data with some existing values 392 | const partialData = { 393 | title: "A Different Title", // This should be updated 394 | date: "February 1, 2022", // This might be updated based on content 395 | summary: "", 396 | }; 397 | 398 | const result = await extract({ 399 | content: blogPostHtml, 400 | format: ContentFormat.HTML, 401 | schema: blogSchemaOpenAI, 402 | provider: LLMProvider.OPENAI, 403 | openaiApiKey: process.env.OPENAI_API_KEY, 404 | sourceUrl: "https://example.com/blog/async-await", 405 | dataToEnrich: partialData, 406 | }); 407 | 408 | // Verify the enriched data has the correct values 409 | verifyBlogPostExtraction(result); 410 | }); 411 | 412 | test("should enrich product list data with custom prompt using Google 
Gemini", async () => { 413 | // Create partial product data with missing information 414 | const partialData = { 415 | products: [ 416 | { 417 | name: "Smart Speaker Pro", 418 | price: 0, // Missing price 419 | features: [], // Missing features 420 | }, 421 | { 422 | name: "Smart Thermostat", 423 | price: 0, // Missing price 424 | features: [], // Missing features 425 | }, 426 | { 427 | name: "Smart Security Camera", 428 | price: 0, // Missing price 429 | features: [], // Missing features 430 | }, 431 | ], 432 | }; 433 | 434 | const result = await extract({ 435 | content: productListHtml, 436 | format: ContentFormat.HTML, 437 | schema: productSchema, 438 | provider: LLMProvider.GOOGLE_GEMINI, 439 | googleApiKey: process.env.GOOGLE_API_KEY, 440 | sourceUrl: "https://example.com/products", 441 | dataToEnrich: partialData, 442 | prompt: 443 | "Focus on enriching the product data with accurate prices and feature lists from the context.", 444 | }); 445 | 446 | // Verify that prices and features were enriched correctly 447 | expect(result.data).toBeDefined(); 448 | expect(Array.isArray(result.data.products)).toBe(true); 449 | expect(result.data.products.length).toBe(3); 450 | 451 | // Check prices were updated 452 | expect(result.data.products[0].price).toBe(129.99); 453 | expect(result.data.products[1].price).toBe(89.95); 454 | expect(result.data.products[2].price).toBe(74.5); 455 | 456 | // Check features were populated 457 | expect(result.data.products[0].features?.length).toBeGreaterThan(0); 458 | expect(result.data.products[1].features?.length).toBeGreaterThan(0); 459 | expect(result.data.products[2].features?.length).toBeGreaterThan(0); 460 | 461 | // Verify usage stats 462 | expect(result.usage).toBeDefined(); 463 | expect(result.usage.inputTokens).toBeGreaterThan(0); 464 | expect(result.usage.outputTokens).toBeGreaterThan(0); 465 | }); 466 | }); 467 | }); 468 | 469 | // Read the sample HTML file with images 470 | const articleWithImages = fs.readFileSync( 471 | 
path.resolve(__dirname, "../fixtures/article-with-images.html"), 472 | "utf8" 473 | ); 474 | 475 | // Define a schema that includes image extraction 476 | const articleSchema = z.object({ 477 | title: z.string(), 478 | author: z.string(), 479 | date: z.string(), 480 | tags: z 481 | .array(z.string()) 482 | .optional() 483 | .describe("Tags appear after the date. Do not include the # symbol."), 484 | summary: z.string(), 485 | images: z 486 | .array( 487 | z.object({ 488 | url: z.string().url(), 489 | alt: z.string().optional(), 490 | caption: z.string().optional(), 491 | }) 492 | ) 493 | .optional() 494 | .describe( 495 | "Extract all images from the article with their URLs and alt text" 496 | ), 497 | }); 498 | 499 | // Define a separate schema for OpenAI tests using nullable instead of optional 500 | const articleSchemaOpenAI = z.object({ 501 | title: z.string(), 502 | author: z.string(), 503 | date: z.string(), 504 | tags: z 505 | .array(z.string()) 506 | .nullable() 507 | .describe("Tags appear after the date. 
Do not include the # symbol."), 508 | summary: z.string(), 509 | images: z 510 | .array( 511 | z.object({ 512 | url: z.string().url(), 513 | alt: z.string().nullable(), 514 | caption: z.string().nullable(), 515 | }) 516 | ) 517 | .nullable() 518 | .describe( 519 | "Extract all images from the article with their URLs and alt text" 520 | ), 521 | }); 522 | 523 | // Function to verify that images are correctly extracted 524 | function verifyImageExtraction(result: ExtractorResult): void { 525 | // Check the data is extracted correctly 526 | expect(result.data).toBeDefined(); 527 | expect(result.data.title).toBe( 528 | "Modern Web Development with React and Node.js" 529 | ); 530 | expect(result.data.author).toBe("Jane Smith"); 531 | expect(result.data.date).toBe("March 20, 2023"); 532 | expect(result.data.tags).toContain("React"); 533 | expect(result.data.tags).toContain("Node.js"); 534 | expect(result.data.tags).toContain("JavaScript"); 535 | 536 | // Verify that images are extracted 537 | expect(result.data.images).toBeDefined(); 538 | expect(Array.isArray(result.data.images)).toBe(true); 539 | expect(result.data.images.length).toBeGreaterThan(0); 540 | 541 | // Check for the main architecture image 542 | const architectureImage = result.data.images.find((img: any) => 543 | img.url.includes("react-node-architecture.png") 544 | ); 545 | expect(architectureImage).toBeDefined(); 546 | expect(architectureImage.alt).toBe("React and Node.js Architecture"); 547 | 548 | // Check for the event loop image 549 | const eventLoopImage = result.data.images.find((img: any) => 550 | img.url.includes("nodejs-event-loop.jpg") 551 | ); 552 | expect(eventLoopImage).toBeDefined(); 553 | expect(eventLoopImage.alt).toBe("Node.js Event Loop"); 554 | 555 | // Check for the webpack image 556 | const webpackImage = result.data.images.find((img: any) => 557 | img.url.includes("webpack-logo.png") 558 | ); 559 | expect(webpackImage).toBeDefined(); 560 | expect(webpackImage.alt).toBe("Webpack 
Logo"); 561 | expect(webpackImage.caption).toBe("Webpack for module bundling"); 562 | 563 | // Verify that usage statistics are returned 564 | expect(result.usage).toBeDefined(); 565 | expect(result.usage.inputTokens).toBeGreaterThan(0); 566 | expect(result.usage.outputTokens).toBeGreaterThan(0); 567 | } 568 | 569 | describe("Image Extraction Integration Tests", () => { 570 | // Test that the low level htmlToMarkdown function correctly handles images 571 | test("should include images in markdown when includeImages is true", () => { 572 | const markdownWithImages = htmlToMarkdown(articleWithImages, { 573 | includeImages: true, 574 | }); 575 | const markdownWithoutImages = htmlToMarkdown(articleWithImages); 576 | 577 | // With includeImages: true, markdown should contain image references 578 | expect(markdownWithImages).toContain( 579 | "![React and Node.js Architecture](https://example.com/images/react-node-architecture.png)" 580 | ); 581 | expect(markdownWithImages).toContain( 582 | "![Node.js Event Loop](https://example.com/images/nodejs-event-loop.jpg)" 583 | ); 584 | 585 | // Without includeImages, markdown should not contain image references 586 | expect(markdownWithoutImages).not.toContain( 587 | "![React and Node.js Architecture]" 588 | ); 589 | expect(markdownWithoutImages).not.toContain("![Node.js Event Loop]"); 590 | }); 591 | 592 | // Test with OpenAI 593 | test("should extract images using OpenAI when includeImages is true", async () => { 594 | const result = await extract({ 595 | content: articleWithImages, 596 | format: ContentFormat.HTML, 597 | schema: articleSchemaOpenAI, 598 | provider: LLMProvider.OPENAI, 599 | openaiApiKey: process.env.OPENAI_API_KEY, 600 | htmlExtractionOptions: { 601 | includeImages: true, 602 | }, 603 | sourceUrl: "https://example.com/blog/async-await", 604 | }); 605 | 606 | verifyImageExtraction(result); 607 | }); 608 | 609 | // Test with Google Gemini 610 | test("should extract images using Google Gemini when includeImages is 
true", async () => { 611 | const result = await extract({ 612 | content: articleWithImages, 613 | format: ContentFormat.HTML, 614 | schema: articleSchema, 615 | provider: LLMProvider.GOOGLE_GEMINI, 616 | googleApiKey: process.env.GOOGLE_API_KEY, 617 | htmlExtractionOptions: { 618 | includeImages: true, 619 | }, 620 | sourceUrl: "https://example.com/blog/async-await", 621 | }); 622 | 623 | verifyImageExtraction(result); 624 | }); 625 | }); 626 | -------------------------------------------------------------------------------- /tests/integration/html-to-markdown.test.ts: -------------------------------------------------------------------------------- 1 | import * as fs from "fs"; 2 | import * as path from "path"; 3 | import { htmlToMarkdown } from "../../src/converters"; 4 | import { HTMLExtractionOptions } from "../../src/types"; 5 | 6 | // Flag to check if the test-data submodule exists 7 | const testDataExists = fs.existsSync(path.join(__dirname, "../../test-data")); 8 | 9 | // Skip all tests if the test-data submodule is not available 10 | const testOrSkip = testDataExists ? 
test : test.skip; 11 | 12 | describe("HTML to Markdown Integration Tests", () => { 13 | // Function to test a specific HTML file against its groundtruth markdown 14 | function testConversion( 15 | category: string, 16 | filename: string, 17 | options?: HTMLExtractionOptions, 18 | variant: string = "" 19 | ) { 20 | // Construct file paths 21 | const htmlFilePath = path.join( 22 | __dirname, 23 | "../../test-data/html", 24 | category, 25 | `${filename}.html` 26 | ); 27 | 28 | // Determine the groundtruth file path based on variant 29 | let groundtruthFilename = `${filename}`; 30 | if (variant === "main") { 31 | groundtruthFilename += ".main"; 32 | } else if (variant === "images") { 33 | groundtruthFilename += ".images"; 34 | } 35 | 36 | const markdownFilePath = path.join( 37 | __dirname, 38 | "../../test-data/groundtruth", 39 | category, 40 | `${groundtruthFilename}.md` 41 | ); 42 | 43 | // Skip if files don't exist 44 | if (!fs.existsSync(htmlFilePath) || !fs.existsSync(markdownFilePath)) { 45 | console.warn( 46 | `Skipping test: Missing files for ${category}/${filename}: ${htmlFilePath} or ${markdownFilePath} not found` 47 | ); 48 | return; 49 | } 50 | 51 | // Read files 52 | const html = fs.readFileSync(htmlFilePath, "utf8"); 53 | const expectedMarkdown = fs.readFileSync(markdownFilePath, "utf8"); 54 | 55 | // Convert HTML to Markdown 56 | const actualMarkdown = htmlToMarkdown(html, options); 57 | 58 | // Compare 59 | expect(actualMarkdown).toBe(expectedMarkdown); 60 | } 61 | 62 | // Dynamic test generation - automatically test all files in the test-data directory 63 | if (testDataExists) { 64 | describe("Auto-discovered Tests", () => { 65 | // Get all categories (subdirectories under html/) 66 | const testDataDir = path.join(__dirname, "../../test-data"); 67 | const htmlDir = path.join(testDataDir, "html"); 68 | const categories = fs 69 | .readdirSync(htmlDir, { withFileTypes: true }) 70 | .filter((dirent) => dirent.isDirectory()) 71 | .map((dirent) =>
dirent.name); 72 | 73 | // For each category, get all HTML files and create tests 74 | categories.forEach((category) => { 75 | const categoryDir = path.join(htmlDir, category); 76 | const htmlFiles = fs 77 | .readdirSync(categoryDir) 78 | .filter((file) => file.endsWith(".html")) 79 | .map((file) => file.replace(".html", "")); 80 | 81 | htmlFiles.forEach((filename) => { 82 | // Check which groundtruth files exist for this file 83 | const groundtruthDir = path.join( 84 | testDataDir, 85 | "groundtruth", 86 | category 87 | ); 88 | 89 | // Basic conversion 90 | if (fs.existsSync(path.join(groundtruthDir, `${filename}.md`))) { 91 | testOrSkip( 92 | `should convert ${category}/${filename} to markdown`, 93 | () => { 94 | testConversion(category, filename); 95 | } 96 | ); 97 | } 98 | 99 | // Main content extraction 100 | if (fs.existsSync(path.join(groundtruthDir, `${filename}.main.md`))) { 101 | testOrSkip( 102 | `should extract main content from ${category}/${filename}`, 103 | () => { 104 | testConversion( 105 | category, 106 | filename, 107 | { extractMainHtml: true }, 108 | "main" 109 | ); 110 | } 111 | ); 112 | } 113 | 114 | // Conversion with images 115 | if ( 116 | fs.existsSync(path.join(groundtruthDir, `${filename}.images.md`)) 117 | ) { 118 | testOrSkip( 119 | `should convert ${category}/${filename} with images`, 120 | () => { 121 | testConversion( 122 | category, 123 | filename, 124 | { includeImages: true }, 125 | "images" 126 | ); 127 | } 128 | ); 129 | } 130 | }); 131 | }); 132 | }); 133 | } 134 | }); 135 | -------------------------------------------------------------------------------- /tests/integration/processedContent.test.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { extract, ContentFormat, LLMProvider } from "../../src"; 3 | 4 | describe("ProcessedContent Integration Tests", () => { 5 | const simpleSchema = z.object({ 6 | title: z.string(), 7 | content:
z.string().nullable(), 8 | }); 9 | 10 | // Skip tests if API keys are not available 11 | const skipIfNoKeys = () => { 12 | if (!process.env.OPENAI_API_KEY) { 13 | return true; 14 | } 15 | return false; 16 | }; 17 | 18 | it("should return original content as processedContent for TXT format", async () => { 19 | if (skipIfNoKeys()) { 20 | console.log("Skipping test: No API keys available"); 21 | return; 22 | } 23 | 24 | const plainTextContent = 25 | "Title: Simple Test\n\nThis is a test of plain text extraction."; 26 | 27 | const result = await extract({ 28 | content: plainTextContent, 29 | format: ContentFormat.TXT, 30 | schema: simpleSchema, 31 | provider: LLMProvider.OPENAI, 32 | openaiApiKey: process.env.OPENAI_API_KEY, 33 | }); 34 | 35 | // Verify the processedContent is the same as the original content 36 | expect(result.processedContent).toBe(plainTextContent); 37 | }, 60000); 38 | 39 | it("should return original content as processedContent for MARKDOWN format", async () => { 40 | if (skipIfNoKeys()) { 41 | console.log("Skipping test: No API keys available"); 42 | return; 43 | } 44 | 45 | const markdownContent = 46 | "# Simple Test\n\nThis is a test of markdown extraction."; 47 | 48 | const result = await extract({ 49 | content: markdownContent, 50 | format: ContentFormat.MARKDOWN, 51 | schema: simpleSchema, 52 | provider: LLMProvider.OPENAI, 53 | openaiApiKey: process.env.OPENAI_API_KEY, 54 | }); 55 | 56 | // Verify the processedContent is the same as the original content 57 | expect(result.processedContent).toBe(markdownContent); 58 | }, 60000); 59 | 60 | it("should return converted markdown as processedContent for HTML format", async () => { 61 | if (skipIfNoKeys()) { 62 | console.log("Skipping test: No API keys available"); 63 | return; 64 | } 65 | 66 | const htmlContent = 67 | "

Simple Test

This is a test of HTML extraction.

"; 68 | 69 | const result = await extract({ 70 | content: htmlContent, 71 | format: ContentFormat.HTML, 72 | schema: simpleSchema, 73 | provider: LLMProvider.OPENAI, 74 | openaiApiKey: process.env.OPENAI_API_KEY, 75 | sourceUrl: "https://example.com", 76 | }); 77 | 78 | // For HTML, processedContent should be the converted markdown 79 | expect(result.processedContent).toContain("Simple Test"); 80 | expect(result.processedContent).toContain( 81 | "This is a test of HTML extraction." 82 | ); 83 | expect(result.processedContent).not.toContain("

"); 84 | expect(result.processedContent).not.toContain("

"); 85 | }, 60000); 86 | }); 87 | -------------------------------------------------------------------------------- /tests/setup.ts: -------------------------------------------------------------------------------- 1 | import { config } from "dotenv"; 2 | import * as path from "path"; 3 | 4 | // Load environment variables from .env file 5 | config({ path: path.resolve(process.cwd(), ".env") }); 6 | 7 | // Set default timeout for tests (useful for tests involving LLM API calls) 8 | jest.setTimeout(30000); 9 | -------------------------------------------------------------------------------- /tests/unit/converters.test.ts: -------------------------------------------------------------------------------- 1 | import { htmlToMarkdown } from "../../src/converters"; 2 | import { convertHtmlToMarkdown } from "../../src/index"; 3 | 4 | describe("HTML to Markdown converter", () => { 5 | test("should convert simple HTML to markdown", () => { 6 | const html = "

Hello World

This is a test

"; 7 | const markdown = htmlToMarkdown(html); 8 | 9 | expect(markdown).toEqual("Hello World\n===========\n\nThis is a test"); 10 | expect(markdown).toContain("Hello World"); 11 | expect(markdown).toContain("This is a test"); 12 | }); 13 | 14 | test("should handle HTML with attributes", () => { 15 | const html = 16 | '

Title

Paragraph

'; 17 | const markdown = htmlToMarkdown(html); 18 | 19 | expect(markdown).toContain("Title"); 20 | expect(markdown).toContain("Paragraph"); 21 | }); 22 | 23 | // TODO: Add test for end-to-end extraction 24 | test("should escape markdown characters", () => { 25 | const html = 26 | 'Meeting [11-12-24]'; 27 | const markdown = htmlToMarkdown(html); 28 | 29 | expect(markdown).toBe( 30 | "[Meeting \\[11-12-24\\]](https://example.com/meeting-\\(11-12-24\\))" 31 | ); 32 | }); 33 | 34 | test("should convert links correctly", () => { 35 | const html = 'Example'; 36 | const markdown = htmlToMarkdown(html); 37 | 38 | expect(markdown).toBe("[Example](https://example.com)"); 39 | }); 40 | 41 | test("should discard images by default", () => { 42 | const html = 'An image'; 43 | const markdown = htmlToMarkdown(html); 44 | expect(markdown).toBe(""); 45 | }); 46 | 47 | test("should discard images when includeImages is false", () => { 48 | const html = 'An image'; 49 | const markdown = htmlToMarkdown(html, { includeImages: false }); 50 | expect(markdown).toBe(""); 51 | }); 52 | 53 | test("should include images when includeImages is true", () => { 54 | const html = 55 | '

Text with an image: Example image

'; 56 | const markdownWithImages = htmlToMarkdown(html, { includeImages: true }); 57 | const markdownWithoutImages = htmlToMarkdown(html); 58 | 59 | // With includeImages, the image should be converted to markdown format 60 | expect(markdownWithImages).toContain("Text with an image:"); 61 | expect(markdownWithImages).toContain( 62 | "![Example image](https://example.com/image.jpg)" 63 | ); 64 | 65 | // Without includeImages, the image should be removed 66 | expect(markdownWithoutImages).toContain("Text with an image:"); 67 | expect(markdownWithoutImages).not.toContain("![Example image]"); 68 | expect(markdownWithoutImages).not.toContain( 69 | "https://example.com/image.jpg" 70 | ); 71 | }); 72 | 73 | test("should handle complex HTML with multiple images", () => { 74 | const html = ` 75 |
76 |

Test Article

77 |

First paragraph with First image embedded.

78 |
79 | Second image 80 |
Figure caption
81 |
82 | 83 | 84 | 85 | Third image 86 | 87 |

Final paragraph.

88 |
89 | `; 90 | 91 | const markdownWithImages = htmlToMarkdown(html, { includeImages: true }); 92 | 93 | // Check that both images are included 94 | expect(markdownWithImages).toContain("![First image](image1.jpg)"); 95 | expect(markdownWithImages).toContain("![Second image](image2.jpg)"); 96 | expect(markdownWithImages).toContain("![Third image](image3.jpg)"); 97 | expect(markdownWithImages).toContain("Figure caption"); 98 | 99 | // Verify the basic structure is preserved 100 | expect(markdownWithImages).toContain("Test Article"); 101 | expect(markdownWithImages).toContain("First paragraph"); 102 | expect(markdownWithImages).toContain("Final paragraph"); 103 | 104 | // Check without images 105 | const markdownWithoutImages = htmlToMarkdown(html); 106 | expect(markdownWithoutImages).not.toContain("![First image]"); 107 | expect(markdownWithoutImages).not.toContain("![Second image]"); 108 | expect(markdownWithoutImages).not.toContain("![Third image]"); 109 | }); 110 | 111 | test("should extract main content when extractMainHtml is true", () => { 112 | const html = ` 113 | 114 | 115 |
Header content
116 |
117 |

Main Content

118 |

This is the main content

119 |
120 |
Footer content
121 | 122 | 123 | `; 124 | 125 | const markdownWithExtraction = htmlToMarkdown(html, { 126 | extractMainHtml: true, 127 | }); 128 | const markdownWithoutExtraction = htmlToMarkdown(html); 129 | 130 | // With extraction, only the article content should be included 131 | expect(markdownWithExtraction).toContain("Main Content"); 132 | expect(markdownWithExtraction).toContain("This is the main content"); 133 | expect(markdownWithExtraction).not.toContain("Header content"); 134 | expect(markdownWithExtraction).not.toContain("Footer content"); 135 | 136 | // Without extraction, the entire HTML should be converted 137 | expect(markdownWithoutExtraction).toContain("Header content"); 138 | expect(markdownWithoutExtraction).toContain("Main Content"); 139 | expect(markdownWithoutExtraction).toContain("Footer content"); 140 | }); 141 | 142 | describe("URL handling", () => { 143 | test("should convert relative URLs to absolute URLs when sourceUrl is provided", () => { 144 | const html = ` 145 | About Us 146 | Product 147 | Blog Post 148 | Logo 149 | Photo 150 | `; 151 | const sourceUrl = "https://example.com/company/"; 152 | const markdown = htmlToMarkdown(html, { includeImages: true }, sourceUrl); 153 | 154 | // Check that relative URLs are converted to absolute 155 | expect(markdown).toContain("[About Us](https://example.com/about)"); 156 | expect(markdown).toContain( 157 | "[Product](https://example.com/company/products/item.html)" 158 | ); 159 | expect(markdown).toContain( 160 | "[Blog Post](https://example.com/blog/post.html)" 161 | ); 162 | expect(markdown).toContain( 163 | "![Logo](https://example.com/images/logo.png)" 164 | ); 165 | expect(markdown).toContain( 166 | "![Photo](https://example.com/company/assets/photo.jpg)" 167 | ); 168 | }); 169 | 170 | test("should not modify absolute URLs when sourceUrl is provided", () => { 171 | const html = ` 172 | External Link 173 | Email 174 | CDN Image 175 | `; 176 | const sourceUrl = "https://example.com/"; 177 | const markdown 
= htmlToMarkdown(html, { includeImages: true }, sourceUrl); 178 | 179 | // Check that absolute URLs remain unchanged 180 | expect(markdown).toContain( 181 | "[External Link](https://other-site.com/page)" 182 | ); 183 | expect(markdown).toContain("[Email](mailto:user@example.com)"); 184 | expect(markdown).toContain( 185 | "![CDN Image](https://cdn.example.com/image.jpg)" 186 | ); 187 | }); 188 | 189 | test("should handle relative URLs without sourceUrl", () => { 190 | const html = ` 191 | About Us 192 | Logo 193 | `; 194 | const markdown = htmlToMarkdown(html, { includeImages: true }); 195 | 196 | // Check that relative URLs remain unchanged when no sourceUrl is provided 197 | expect(markdown).toContain("[About Us](/about)"); 198 | expect(markdown).toContain("![Logo](/images/logo.png)"); 199 | }); 200 | 201 | test("should handle invalid URLs gracefully", () => { 202 | const html = ` 203 | Invalid Link 204 | Invalid Image 205 | `; 206 | const sourceUrl = "https://example.com/"; 207 | const markdown = htmlToMarkdown(html, { includeImages: true }, sourceUrl); 208 | 209 | // Check that invalid URLs are preserved as-is 210 | expect(markdown).toContain("[Invalid Link](invalid:url)"); 211 | expect(markdown).toContain("![Invalid Image](invalid:url)"); 212 | }); 213 | }); 214 | }); 215 | 216 | describe("convertHtmlToMarkdown", () => { 217 | it("should convert HTML to markdown", () => { 218 | const html = "

Hello World

This is a test

"; 219 | const markdown = convertHtmlToMarkdown(html); 220 | expect(markdown).toContain("Hello World"); 221 | expect(markdown).toContain("This is a test"); 222 | }); 223 | 224 | it("should handle HTML extraction options", () => { 225 | const html = ` 226 | 227 |

Main Content

Important text

228 |
Footer
229 | `; 230 | const markdown = convertHtmlToMarkdown(html, { extractMainHtml: true }); 231 | expect(markdown).toContain("Main Content"); 232 | expect(markdown).toContain("Important text"); 233 | // Navigation and footer might be removed by extractMainHtml 234 | }); 235 | 236 | it("should process images when includeImages is true", () => { 237 | const html = '
Test Image
'; 238 | const markdown = convertHtmlToMarkdown(html, { includeImages: true }); 239 | expect(markdown).toContain("![Test Image]"); 240 | }); 241 | 242 | it("should handle source URL for relative links", () => { 243 | const html = 'About'; 244 | const markdown = convertHtmlToMarkdown( 245 | html, 246 | undefined, 247 | "https://example.com" 248 | ); 249 | expect(markdown).toContain("https://example.com/about"); 250 | }); 251 | }); 252 | -------------------------------------------------------------------------------- /tests/unit/extractors.test.ts: -------------------------------------------------------------------------------- 1 | import { 2 | getUsage, 3 | createLLM, 4 | extractWithLLM, 5 | truncateContent, 6 | generateExtractionPrompt, 7 | } from "../../src/extractors"; 8 | import { LLMProvider, ContentFormat } from "../../src/types"; 9 | import { z } from "zod"; 10 | 11 | // Mock the LLM providers 12 | jest.mock("@langchain/openai", () => ({ 13 | ChatOpenAI: jest.fn().mockImplementation(() => ({ 14 | constructor: { name: "ChatOpenAI" }, 15 | withStructuredOutput: jest.fn().mockImplementation(() => ({ 16 | invoke: jest.fn().mockResolvedValue({ 17 | parsed: { title: "Test Title", content: "Test Content" }, 18 | raw: { 19 | tool_calls: [ 20 | { 21 | args: { title: "Test Title", content: "Test Content" }, 22 | }, 23 | ], 24 | }, 25 | }), 26 | })), 27 | })), 28 | })); 29 | 30 | jest.mock("@langchain/google-genai", () => ({ 31 | ChatGoogleGenerativeAI: jest.fn().mockImplementation(() => ({ 32 | constructor: { name: "ChatGoogleGenerativeAI" }, 33 | withStructuredOutput: jest.fn().mockImplementation(() => ({ 34 | invoke: jest.fn().mockResolvedValue({ 35 | parsed: { title: "Test Title", content: "Test Content" }, 36 | raw: { 37 | lc_kwargs: { 38 | content: '{"title":"Test Title","content":"Test Content"}', 39 | }, 40 | }, 41 | }), 42 | })), 43 | })), 44 | })); 45 | 46 | describe("extractors", () => { 47 | const mockSchema = z.object({ 48 | title: z.string(), 49 | content: 
z.string(), 50 | }); 51 | 52 | const mockContent = "Test content"; 53 | const mockApiKey = "test-api-key"; 54 | 55 | beforeEach(() => { 56 | jest.clearAllMocks(); 57 | }); 58 | 59 | describe("getUsage", () => { 60 | it("should extract usage statistics from LLM output", () => { 61 | const mockOutput = { 62 | llmOutput: { 63 | tokenUsage: { 64 | promptTokens: 100, 65 | completionTokens: 50, 66 | totalTokens: 150, 67 | }, 68 | }, 69 | }; 70 | 71 | const usage = getUsage(mockOutput); 72 | 73 | expect(usage.inputTokens).toBe(100); 74 | expect(usage.outputTokens).toBe(50); 75 | }); 76 | 77 | it("should handle missing token usage", () => { 78 | const mockOutput = { 79 | llmOutput: {}, 80 | }; 81 | 82 | const usage = getUsage(mockOutput); 83 | 84 | expect(usage.inputTokens).toBeUndefined(); 85 | expect(usage.outputTokens).toBeUndefined(); 86 | }); 87 | 88 | it("should handle missing llmOutput", () => { 89 | const mockOutput = {}; 90 | 91 | const usage = getUsage(mockOutput); 92 | 93 | expect(usage.inputTokens).toBeUndefined(); 94 | expect(usage.outputTokens).toBeUndefined(); 95 | }); 96 | }); 97 | 98 | describe("createLLM", () => { 99 | it("should create ChatOpenAI instance for OPENAI provider", () => { 100 | const llm = createLLM( 101 | LLMProvider.OPENAI, 102 | "gpt-4o-mini", 103 | "fake-api-key", 104 | 0 105 | ); 106 | 107 | expect(llm).toBeDefined(); 108 | expect(llm.constructor.name).toBe("ChatOpenAI"); 109 | }); 110 | 111 | it("should create ChatGoogleGenerativeAI instance for GOOGLE_GEMINI provider", () => { 112 | const llm = createLLM( 113 | LLMProvider.GOOGLE_GEMINI, 114 | "gemini-2.5-flash-preview-04-17", 115 | "fake-api-key", 116 | 0 117 | ); 118 | 119 | expect(llm).toBeDefined(); 120 | expect(llm.constructor.name).toBe("ChatGoogleGenerativeAI"); 121 | }); 122 | 123 | it("should throw error for unsupported provider", () => { 124 | expect(() => { 125 | // @ts-ignore - Testing invalid provider 126 | createLLM("unsupported-provider", "model", "api-key", 0); 127 | 
}).toThrow("Unsupported LLM provider"); 128 | }); 129 | }); 130 | 131 | describe("extractWithLLM", () => { 132 | it("should extract data using OpenAI", async () => { 133 | const result = await extractWithLLM( 134 | mockContent, 135 | mockSchema, 136 | LLMProvider.OPENAI, 137 | "gpt-4o-mini", 138 | mockApiKey 139 | ); 140 | 141 | expect(result.data).toEqual({ 142 | title: "Test Title", 143 | content: "Test Content", 144 | }); 145 | }); 146 | 147 | it("should extract data using Google Gemini", async () => { 148 | const result = await extractWithLLM( 149 | mockContent, 150 | mockSchema, 151 | LLMProvider.GOOGLE_GEMINI, 152 | "gemini-2.5-flash-preview-04-17", 153 | mockApiKey 154 | ); 155 | 156 | expect(result.data).toEqual({ 157 | title: "Test Title", 158 | content: "Test Content", 159 | }); 160 | }); 161 | 162 | it("should handle custom prompts", async () => { 163 | const customPrompt = "Extract the main topic and summary"; 164 | const result = await extractWithLLM( 165 | mockContent, 166 | mockSchema, 167 | LLMProvider.OPENAI, 168 | "gpt-4o-mini", 169 | mockApiKey, 170 | 0, 171 | customPrompt 172 | ); 173 | 174 | expect(result.data).toEqual({ 175 | title: "Test Title", 176 | content: "Test Content", 177 | }); 178 | }); 179 | 180 | it("should handle different content formats", async () => { 181 | const result = await extractWithLLM( 182 | mockContent, 183 | mockSchema, 184 | LLMProvider.OPENAI, 185 | "gpt-4o-mini", 186 | mockApiKey, 187 | 0, 188 | undefined, 189 | ContentFormat.TXT 190 | ); 191 | 192 | expect(result.data).toEqual({ 193 | title: "Test Title", 194 | content: "Test Content", 195 | }); 196 | }); 197 | 198 | it("should handle data enrichment", async () => { 199 | const dataToEnrich = { 200 | title: "Existing Title", 201 | content: "", // Empty field that should be filled 202 | }; 203 | 204 | const result = await extractWithLLM( 205 | mockContent, 206 | mockSchema, 207 | LLMProvider.OPENAI, 208 | "gpt-4o-mini", 209 | mockApiKey, 210 | 0, 211 | undefined, 
212 | ContentFormat.TXT, 213 | undefined, 214 | dataToEnrich 215 | ); 216 | 217 | expect(result.data).toEqual({ 218 | title: "Test Title", 219 | content: "Test Content", 220 | }); 221 | }); 222 | }); 223 | 224 | describe("truncateContent", () => { 225 | it("should not truncate content when full prompt is within limit", () => { 226 | const prompt = generateExtractionPrompt({ 227 | format: ContentFormat.TXT, 228 | content: "", 229 | }); 230 | const content = "This is a short test content."; 231 | const result = truncateContent({ 232 | content, 233 | maxTokens: (prompt.length + content.length) / 4, 234 | format: ContentFormat.TXT, 235 | }); 236 | expect(result).toBe(content); 237 | }); 238 | 239 | it("should truncate content by excess amount", () => { 240 | const prompt = generateExtractionPrompt({ 241 | format: ContentFormat.TXT, 242 | content: "", 243 | }); 244 | // Create a content that will make the full prompt exceed the limit 245 | const content = "This is a longer test content that should be truncated."; 246 | const result = truncateContent({ 247 | content, 248 | maxTokens: (prompt.length + content.length) / 4 - 1, 249 | format: ContentFormat.TXT, 250 | }); 251 | expect(result.length).toBe(content.length - 4); 252 | }); 253 | 254 | it("should account for dataToEnrich in prompt size calculation", () => { 255 | const prompt = generateExtractionPrompt({ 256 | format: ContentFormat.TXT, 257 | content: "", 258 | dataToEnrich: { a: 1, b: 2 }, 259 | }); 260 | 261 | const content = "This is a test content for enrichment."; 262 | const result = truncateContent({ 263 | content, 264 | maxTokens: (prompt.length + content.length) / 4 - 1, 265 | format: ContentFormat.TXT, 266 | dataToEnrich: { a: 1, b: 2 }, 267 | }); 268 | 269 | expect(result.length).toBe(content.length - 4); 270 | }); 271 | }); 272 | 273 | describe("generateExtractionPrompt", () => { 274 | it("should generate a basic extraction prompt without dataToEnrich", () => { 275 | const prompt = 
generateExtractionPrompt({ 276 | format: ContentFormat.TXT, 277 | content: "Some test content", 278 | }); 279 | 280 | expect(prompt).toContain("Context information is below:"); 281 | expect(prompt).toContain("Format: txt"); 282 | expect(prompt).toContain("Some test content"); 283 | expect(prompt).toContain("You are a data extraction assistant"); 284 | expect(prompt).toContain( 285 | "Extract ONLY information explicitly stated in the context" 286 | ); 287 | expect(prompt).not.toContain("Enrich the original JSON object"); 288 | expect(prompt).toContain( 289 | "Return only the structured data in valid JSON format" 290 | ); 291 | }); 292 | 293 | it("should generate an enrichment prompt with dataToEnrich", () => { 294 | const dataToEnrich = { 295 | title: "Existing Title", 296 | author: "", 297 | tags: ["existing"], 298 | }; 299 | 300 | const prompt = generateExtractionPrompt({ 301 | format: ContentFormat.MARKDOWN, 302 | content: "Some markdown content", 303 | dataToEnrich, 304 | }); 305 | 306 | expect(prompt).toContain("Context information is below:"); 307 | expect(prompt).toContain("Format: markdown"); 308 | expect(prompt).toContain("Some markdown content"); 309 | expect(prompt).toContain("Original JSON object"); 310 | expect(prompt).toContain(JSON.stringify(dataToEnrich, null, 2)); 311 | expect(prompt).toContain( 312 | "You are a data extraction assistant that extracts structured information from the above context in markdown and JSON" 313 | ); 314 | expect(prompt).toContain( 315 | "Enrich the original JSON object with information from the context" 316 | ); 317 | expect(prompt).toContain( 318 | "Only update existing fields and fill in additional fields if new and relevant information is available in the context" 319 | ); 320 | expect(prompt).toContain( 321 | "Return only the structured data in valid JSON format" 322 | ); 323 | }); 324 | 325 | it("should include custom prompt in the instructions", () => { 326 | const customPrompt = "Extract only product information 
and prices"; 327 | const dataToEnrich = { products: [] }; 328 | 329 | const prompt = generateExtractionPrompt({ 330 | format: ContentFormat.HTML, 331 | content: "
<div>Product content</div>
", 332 | customPrompt, 333 | dataToEnrich, 334 | }); 335 | 336 | expect(prompt).toContain(customPrompt); 337 | expect(prompt).toContain("Enrich the original JSON object"); 338 | expect(prompt).toContain(JSON.stringify(dataToEnrich, null, 2)); 339 | }); 340 | }); 341 | }); 342 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig to read more about this file */ 4 | 5 | /* Projects */ 6 | // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ 7 | // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ 8 | // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ 9 | // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ 10 | // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */ 11 | // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ 12 | 13 | /* Language and Environment */ 14 | "target": "es2018", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ 15 | // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ 16 | // "jsx": "preserve", /* Specify what JSX code is generated. */ 17 | // "libReplacement": true, /* Enable lib replacement. */ 18 | // "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */ 19 | // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. 
*/ 20 | // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ 21 | // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ 22 | // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ 23 | // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ 24 | // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ 25 | // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ 26 | // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ 27 | 28 | /* Modules */ 29 | "module": "commonjs", /* Specify what module code is generated. */ 30 | "rootDir": "./src", 31 | "moduleResolution": "node", 32 | // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ 33 | // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ 34 | // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ 35 | // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ 36 | // "types": [], /* Specify type package names to be included without being referenced in a source file. */ 37 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 38 | // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ 39 | // "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. 
*/ 40 | // "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */ 41 | // "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */ 42 | // "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */ 43 | // "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */ 44 | // "noUncheckedSideEffectImports": true, /* Check side effect imports. */ 45 | // "resolveJsonModule": true, /* Enable importing .json files. */ 46 | // "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */ 47 | // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ 48 | 49 | /* JavaScript Support */ 50 | // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ 51 | // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ 52 | // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ 53 | 54 | /* Emit */ 55 | "declaration": true, 56 | // "declarationMap": true, /* Create sourcemaps for d.ts files. */ 57 | // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */ 58 | "sourceMap": true, 59 | // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ 60 | // "noEmit": true, /* Disable emitting files from a compilation. */ 61 | // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. 
*/ 62 | "outDir": "./dist", 63 | // "removeComments": true, /* Disable emitting comments. */ 64 | // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ 65 | // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ 66 | // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ 67 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 68 | // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ 69 | // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ 70 | // "newLine": "crlf", /* Set the newline character for emitting files. */ 71 | // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ 72 | // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ 73 | // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ 74 | // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ 75 | // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ 76 | 77 | /* Interop Constraints */ 78 | // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ 79 | // "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */ 80 | // "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */ 81 | // "erasableSyntaxOnly": true, /* Do not allow runtime constructs that are not part of ECMAScript. 
*/ 82 | // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ 83 | "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ 84 | // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ 85 | "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ 86 | 87 | /* Type Checking */ 88 | "strict": true, /* Enable all strict type-checking options. */ 89 | // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ 90 | // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ 91 | // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ 92 | // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ 93 | // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */ 94 | // "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */ 95 | // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ 96 | // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ 97 | // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ 98 | // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ 99 | // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ 100 | // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. 
*/ 101 | // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ 102 | // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ 103 | // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ 104 | // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ 105 | // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ 106 | // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ 107 | // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ 108 | 109 | /* Completeness */ 110 | // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ 111 | "skipLibCheck": true /* Skip type checking all .d.ts files. */ 112 | }, 113 | "include": ["src/**/*"], 114 | "exclude": ["node_modules", "dist", "**/*.test.ts"] 115 | } 116 | --------------------------------------------------------------------------------