├── .changeset ├── README.md └── config.json ├── .clinerules ├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ ├── ci.yaml │ └── release.yaml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── biome.json ├── examples ├── ai-learning │ ├── .env.example │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ ├── auto-optimize.ts │ │ ├── datasets.ts │ │ ├── index.ts │ │ ├── manual.ts │ │ └── utils.ts │ └── tsconfig.json ├── bench │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json ├── scraping │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json ├── simple │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json ├── translate │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json └── worker │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ └── index.ts │ ├── tsconfig.json │ └── wrangler.toml ├── images ├── logo.webp └── voice-genius.png ├── package.json ├── packages └── webforai │ ├── CHANGELOG.md │ ├── README.md │ ├── build.ts │ ├── package.cjs.json │ ├── package.json │ ├── src │ ├── cli │ │ ├── bin.ts │ │ ├── commands │ │ │ └── webforai │ │ │ │ ├── index.ts │ │ │ │ └── loadHtml.ts │ │ ├── constants.ts │ │ ├── helpers │ │ │ ├── assertContinue.ts │ │ │ ├── inputOutputPath.ts │ │ │ ├── inputSourcePath.ts │ │ │ ├── selectExtractMode.ts │ │ │ └── selectLoader.ts │ │ └── utils.ts │ ├── constants.ts │ ├── extract-mdast.ts │ ├── extractors │ │ ├── index.ts │ │ ├── pipeExtractors.ts │ │ ├── presets │ │ │ ├── minimal-filter.ts │ │ │ ├── takumi.ts │ │ │ └── utils.ts │ │ └── types.ts │ ├── html-to-markdown.test.ts │ ├── html-to-markdown.ts │ ├── html-to-mdast.ts │ ├── index.ts │ ├── link-replacer.test.ts │ ├── link-replacer.ts │ ├── loaders │ │ ├── cf-puppeteer.ts │ │ ├── fetch.test.ts │ │ ├── fetch.ts │ │ ├── playwright.test.ts │ │ ├── playwright.ts │ │ ├── puppeteer.test.ts │ │ └── puppeteer.ts │ ├── md-splitter.ts │ ├── mdast-handlers │ │ ├── custom-a-handler.ts │ │ ├── custom-br-handler.ts │ │ ├── custom-code-handler.ts │ │ ├── custom-div-handler.ts │ │ ├── custom-img-handler.ts │ │ ├── custom-table-handler.ts │ │ ├── empty-handler.ts │ │ └── math-handler.ts │ ├── mdast-to-markdown.ts │ └── utils │ │ ├── common.ts │ │ ├── detect-code-lang.ts │ │ ├── hast-utils.ts │ │ └── mdast-utils.ts │ ├── tsconfig.build.json │ └── tsconfig.json ├── pnpm-lock.yaml ├── pnpm-workspace.yaml ├── site ├── CHANGELOG.md ├── README.md ├── docs │ ├── footer.tsx │ ├── pages │ │ ├── cookbook │ │ │ ├── cf-workers.mdx │ │ │ ├── custom-extractor.mdx │ │ │ ├── index.mdx │ │ │ ├── simple.mdx │ │ │ ├── structured-output.mdx │ │ │ └── translation.mdx │ │ ├── docs │ │ │ ├── html-to-markdown.mdx │ │ │ ├── html-to-mdast.mdx │ │ │ ├── loaders.mdx │ │ │ └── mdast-to-markdown.mdx │ │ ├── getting-started.mdx │ │ ├── how-it-works.mdx │ │ ├── index.mdx │ │ └── installation.mdx │ ├── public │ │ └── images │ │ │ ├── how-it-works.svg │ │ │ ├── logo-dark.png │ │ │ ├── logo-full-dark.svg │ │ │ ├── logo-full-light.svg │ │ │ ├── logo-full-pad-dark.svg │ │ │ ├── logo-full-pad-light.svg │ │ │ └── logo-light.png │ └── styles.css ├── package.json ├── postcss.config.js ├── tailwind.config.js ├── tsconfig.json ├── vocs.config.ts ├── worker-configuration.d.ts ├── workers │ └── index.tsx └── wrangler.toml └── vitest.config.ts /.changeset/README.md: 
-------------------------------------------------------------------------------- 1 | # Changesets 2 | 3 | Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works 4 | with multi-package repos, or single-package repos to help you version and publish your code. You can 5 | find the full documentation for it [in our repository](https://github.com/changesets/changesets) 6 | 7 | We have a quick list of common questions to get you started engaging with this project in 8 | [our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md) 9 | -------------------------------------------------------------------------------- /.changeset/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://unpkg.com/@changesets/config@3.0.1/schema.json", 3 | "changelog": ["@changesets/changelog-github", { "repo": "inaridiy/webforai" }], 4 | "commit": false, 5 | "fixed": [], 6 | "linked": [], 7 | "access": "public", 8 | "baseBranch": "main", 9 | "updateInternalDependencies": "patch", 10 | "ignore": [] 11 | } 12 | -------------------------------------------------------------------------------- /.clinerules: -------------------------------------------------------------------------------- 1 | { 2 | "project": { 3 | "name": "WebForAI", 4 | "description": "A library that converts HTML to Markdown with various loaders and extractors for AI consumption", 5 | "repository": "https://github.com/inaridiy/webforai", 6 | "homepage": "https://webforai.dev/" 7 | }, 8 | "structure": { 9 | "monorepo": true, 10 | "packageManager": "pnpm", 11 | "mainPackage": "packages/webforai", 12 | "directories": { 13 | "packages": { 14 | "description": "Contains the main WebForAI package", 15 | "patterns": ["packages/**"] 16 | }, 17 | "examples": { 18 | "description": "Example projects demonstrating WebForAI usage", 19 | "patterns": ["examples/**"] 20 | }, 21 | "site": { 22 | "description": "Documentation website", 23 | "patterns": ["site/**"] 24 | }, 25 | "apps": { 26 | "description": "Application implementations", 27 | "patterns": ["apps/**"] 28 | } 29 | } 30 | }, 31 | "capabilities": { 32 | "core": [ 33 | "HTML to Markdown conversion", 34 | "HTML to MDAST conversion", 35 | "MDAST to Markdown conversion", 36 | "Web content loading via various methods" 37 | ], 38 | "loaders": ["Playwright", "Puppeteer", "Cloudflare Puppeteer", "Fetch API"], 39 | "extractors": ["Content extraction presets", "Custom extraction pipelines"] 40 | }, 41 | "algorithms": { 42 | "htmlToMarkdown": { 43 | "description": "Main conversion pipeline that transforms HTML to Markdown", 44 | "flow": "HTML → HAST → MDAST → Markdown", 45 | "steps": [ 46 | "Parse HTML into HAST (HTML Abstract Syntax Tree)", 47 | "Apply content extractors to clean and focus on main content", 48 | "Transform HAST to MDAST (Markdown Abstract Syntax Tree)", 49 | "Convert MDAST to Markdown text with formatting options" 50 | ] 51 | }, 52 | "contentExtraction": { 53 | "description": "Intelligent algorithms to extract the main content from web pages", 54 | "implementations": [ 55 | { 56 | "name": "takumiExtractor", 57 | "description": "Advanced content extractor inspired by Mozilla Readability", 58 | "techniques": [ 59 | "Metadata filtering to remove scripts, styles, and other non-content elements", 60 | "Universal element filtering to remove navigation, asides, and hidden content", 61 | "Content selection using common article selectors", 62 | "Link density 
analysis to identify content-rich areas", 63 | "Language-specific content length thresholds" 64 | ] 65 | } 66 | ] 67 | }, 68 | "mdastHandlers": { 69 | "description": "Custom handlers for transforming specific HTML elements to Markdown", 70 | "handlers": [ 71 | "customAHandler: Enhanced link handling with text-only option", 72 | "customCodeHandler: Code block handling with language detection", 73 | "customDivHandler: Special div element processing", 74 | "customImgHandler: Image handling with hide option", 75 | "customTableHandler: Table processing with text-only option", 76 | "mathHandler: Mathematical notation conversion" 77 | ] 78 | }, 79 | "linkProcessing": { 80 | "description": "Utilities for handling and transforming links", 81 | "features": ["Relative to absolute URL conversion", "Base URL integration", "Link text extraction"] 82 | } 83 | }, 84 | "development": { 85 | "nodeVersion": ">=18.0.0", 86 | "commands": { 87 | "build": "pnpm run --r --filter \"./packages/**\" build", 88 | "test": "vitest", 89 | "format": "biome format .", 90 | "lint": "biome check ." 91 | }, 92 | "tools": ["TypeScript", "Biome", "Vitest", "Changesets"] 93 | }, 94 | "customModes": [ 95 | { 96 | "slug": "webforai-dev", 97 | "name": "WebForAI Developer", 98 | "roleDefinition": "You are Roo, a specialized developer for the WebForAI library. You understand HTML parsing, Markdown generation, and web content extraction techniques. You're familiar with the project's architecture including loaders, extractors, and MDAST/HAST transformations.", 99 | "groups": ["read", "edit", "browser", "command", "mcp"] 100 | }, 101 | { 102 | "slug": "webforai-docs", 103 | "name": "WebForAI Documentation", 104 | "roleDefinition": "You are Roo, a documentation specialist for the WebForAI library. You excel at creating clear, concise documentation with practical examples. 
You understand the library's capabilities and can explain complex concepts in an accessible way.", 105 | "groups": [ 106 | "read", 107 | ["edit", { "fileRegex": "\\.(md|mdx)$", "description": "Markdown and MDX files only" }], 108 | "browser", 109 | "command", 110 | "mcp" 111 | ] 112 | } 113 | ] 114 | } 115 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: inaridiy -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | push: 4 | branches: [main, develop] 5 | pull_request: 6 | branches: ['*'] 7 | paths-ignore: 8 | - 'docs/**' 9 | - '.vscode/**' 10 | - 'README.md' 11 | - '.gitignore' 12 | - 'LICENSE' 13 | 14 | jobs: 15 | lint: 16 | name: 'Lint' 17 | runs-on: ubuntu-22.04 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: pnpm/action-setup@v2 21 | name: Install pnpm 22 | id: pnpm-install 23 | with: 24 | version: 9.1.4 25 | run_install: true 26 | - run: pnpm format 27 | - run: pnpm lint 28 | - run: pnpm lint:repo 29 | - run: pnpm build 30 | test: 31 | name: 'Test' 32 | runs-on: ubuntu-22.04 33 | steps: 34 | - uses: actions/checkout@v4 35 | - uses: pnpm/action-setup@v2 36 | name: Install pnpm 37 | id: pnpm-install 38 | with: 39 | version: 9.1.4 40 | run_install: true 41 | - name: Install Playwright Browsers 42 | run: | 43 | pnpm install -w playwright 44 | pnpm exec playwright install chromium 45 | pnpm exec playwright install-deps 46 | - name: Run tests 47 | run: pnpm test -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Changesets 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | version: 13 | timeout-minutes: 15 14 | runs-on: ubuntu-latest 15 | permissions: 16 | contents: write 17 | id-token: write 18 | pull-requests: write 19 | steps: 20 | - uses: actions/checkout@v4 21 | - run: | 22 | echo "SKIP_SIMPLE_GIT_HOOKS=1" >> $GITHUB_ENV 23 | - uses: pnpm/action-setup@v2 24 | name: Install pnpm 25 | id: pnpm-install 26 | with: 27 | version: 9.1.4 28 | run_install: true 29 | - name: Setup npmrc 30 | run: echo "//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}" > .npmrc 31 | - name: create and publish versions 32 | uses: changesets/action@v1 33 | with: 34 | version: pnpm ci:version 35 | publish: pnpm ci:publish 36 | title: 'chore: version packages' 37 | commit: 'chore: version packages' 38 | env: 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 40 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | .npmrc 4 | .DS_Store 5 | .wrangler -------------------------------------------------------------------------------- /.vscode/settings.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "editor.defaultFormatter": "biomejs.biome", 3 | "editor.codeActionsOnSave": { 4 | "source.organizeImports.biome": "explicit" 5 | }, 6 | "[typescript]": { 7 | "editor.defaultFormatter": "biomejs.biome" 8 | }, 9 | "[json]": { 10 | "editor.defaultFormatter": "biomejs.biome" 11 | }, 12 | "markdown.preview.breaks": true, 13 | "[markdown]": { 14 | "editor.defaultFormatter": "esbenp.prettier-vscode" 15 | }, 16 | "[jsonc]": { 17 | "editor.defaultFormatter": "biomejs.biome" 18 | }, 19 | "[html]": { 20 | "editor.defaultFormatter": "vscode.html-language-features" 21 | }, 22 | "[javascript]": { 23 | "editor.defaultFormatter": "biomejs.biome" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | packages/webforai/README.md -------------------------------------------------------------------------------- /biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.5.3/schema.json", 3 | "files": { 4 | "ignore": ["worker-configuration.d.ts", "package.json"] 5 | }, 6 | "vcs": { 7 | "enabled": true, 8 | "clientKind": "git", 9 | "useIgnoreFile": true 10 | }, 11 | "organizeImports": { 12 | "enabled": true 13 | }, 14 | "formatter": { 15 | "enabled": true, 16 | "lineWidth": 120 17 | }, 18 | "linter": { 19 | "enabled": true, 20 | "rules": { 21 | "all": true, 22 | "style": { 23 | "useNamingConvention": { 24 | "level": "warn", 25 | "options": { 26 | "strictCase": false 27 | } 28 | } 29 | }, 30 | "correctness": { 31 | "noUndeclaredVariables": "off" 32 | }, 33 | "complexity": { 34 | "noExcessiveCognitiveComplexity": { 35 | "level": "error", 36 | "options": { 37 | "maxAllowedComplexity": 20 38 | } 39 | } 40 | } 41 | } 42 | }, 43 | 44 | "overrides": [ 45 | { 46 | "include": ["examples/**"], 47 | "ignore": ["**/*.json"], 48 | "linter": { 49 | "rules": { 50 | "recommended": true, 51 | "style": { 52 | "useNamingConvention": "off" 53 | }, 54 | "correctness": { 55 | "noUndeclaredVariables": "off" 56 | } 57 | } 58 | } 59 | } 60 | ] 61 | } 62 | -------------------------------------------------------------------------------- /examples/ai-learning/.env.example: -------------------------------------------------------------------------------- 1 | GOOGLE_GENERATIVE_AI_API_KEY -------------------------------------------------------------------------------- /examples/ai-learning/.gitignore: -------------------------------------------------------------------------------- 1 | .output 2 | .env 3 | .cache 4 | .output -------------------------------------------------------------------------------- /examples/ai-learning/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # bench 2 | 3 | ## 1.1.1 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.1.0 11 | 12 | ### Minor Changes 13 | 14 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Add minimal filter extractor 15 | 16 | ### Patch Changes 17 | 18 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 19 | - webforai@2.1.0 20 | 21 | ## 1.0.17 22 | 23 | ### Patch Changes 24 | 25 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 26 | - webforai@2.0.1 27 | 28 | ## 1.0.16 29 | 30 | ### Patch Changes 31 | 32 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 33 | 34 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 35 | - webforai@2.0.0 36 | 37 | ## 1.0.15 38 | 39 | ### Patch Changes 40 | 41 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 42 | - webforai@1.6.3 43 | 44 | ## 1.0.14 45 | 46 | ### Patch Changes 47 | 48 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 49 | - webforai@1.6.2 50 | 51 | ## 1.0.13 52 | 53 | ### Patch Changes 54 | 55 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 56 | - webforai@1.6.1 57 | 58 | ## 1.0.12 59 | 60 | ### Patch Changes 61 | 62 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 63 | 64 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 65 | - webforai@1.6.0 66 | 67 | ## 1.0.11 68 | 69 | ### Patch Changes 70 | 71 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 72 | - webforai@1.5.1 73 | 74 | ## 1.0.10 75 | 76 | ### Patch Changes 77 | 78 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 79 | - webforai@1.5.0 80 | 81 | ## 1.0.9 82 | 83 | ### Patch Changes 84 | 85 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 86 | - webforai@1.4.1 87 | 88 | ## 1.0.8 89 | 90 | ### Patch Changes 91 | 92 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 93 | - webforai@1.4.0 94 | 95 | ## 1.0.7 96 | 97 | ### Patch Changes 98 | 99 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 100 | - webforai@1.3.3 101 | 102 | ## 1.0.6 103 | 104 | ### Patch Changes 105 | 106 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 107 | - webforai@1.3.2 108 | 109 | ## 1.0.5 110 | 111 | ### Patch Changes 112 | 113 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 114 | - webforai@1.3.1 115 | 116 | ## 1.0.4 117 | 118 | ### Patch Changes 119 | 120 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 121 | 122 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 123 | - webforai@1.3.0 124 | 125 | ## 1.0.3 126 | 127 | ### Patch Changes 128 | 129 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 130 | 131 | ## 1.0.2 132 | 133 | ### Patch Changes 134 | 135 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 136 | 137 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 138 | 139 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 140 | - webforai@1.2.3 141 | 142 | ## 1.0.1 143 | 144 | ### Patch Changes 145 | 146 | - 920f310: Update Linter and Workflows 147 | - Updated dependencies [920f310] 148 | - webforai@1.2.2 149 | -------------------------------------------------------------------------------- /examples/ai-learning/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ai-learning", 3 | "version": "1.1.1", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "@ai-sdk/google": "^0.0.48", 16 | "ai": "^3.4.7", 17 | "arg": "^5.0.2", 18 | "dotenv": "^16.4.5", 19 | "hast-util-from-html": "^2.0.3", 20 | "hast-util-select": "^6.0.2", 21 | "hast-util-to-html": "^9.0.3", 22 | "hast-util-to-string": "^3.0.0", 23 | "playwright": "^1.40.1", 24 | "tsx": "^4.19.1", 25 | "unist-util-filter": "^5.0.1", 26 | "webforai": "workspace:^", 27 | "zod": "^3.23.8" 28 | }, 29 | "devDependencies": { 30 | "@tsconfig/recommended": "^1.0.3", 31 | "@types/hast": "^3.0.2", 32 | "@types/node": "^20.14.10", 33 | "typescript": "^5.4.5" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /examples/ai-learning/src/auto-optimize.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import { google } from "@ai-sdk/google"; 3 | import { generateObject } from "ai"; 4 | import dotenv from "dotenv"; 5 | import type { Element } from "hast"; 6 | import { fromHtml } from "hast-util-from-html"; 7 | import { toHtml } from "hast-util-to-html"; 8 | import { tsImport } from "tsx/esm/api"; 9 | import { filter } from "unist-util-filter"; 10 | import { htmlToMarkdown } from "webforai"; 11 | import { z } from "zod"; 12 | import { persitCachedLoadHtml } from "./utils.js"; 13 | 14 | dotenv.config(); 15 | 16 | const target = "https://github.com/wevm/viem/issues/2658"; 17 | const html = await persitCachedLoadHtml(target); 18 | 19 | const htmlToMarkdownWithGenerated = async (html: string, generatedPath: string, parentPath: string) => { 20 | try { 21 | const { extractor: generatedExtractor } = await tsImport(generatedPath, parentPath); 22 | return htmlToMarkdown(html, { baseUrl: target, extractors: [generatedExtractor] }); 23 | } catch (e) { 24 | console.info("Failed to load generated extractor, using default extractor", e); 25 | return htmlToMarkdown(html, { baseUrl: target, extractors: false }); 26 | } 27 | }; 28 | 29 | const generateExtractor = async ( 30 | html: string, 31 | rawMarkdown: string, 32 | userRequirements: string, 33 | genericAlgorithm: string, 34 | ) => { 35 | const result = await generateObject({ 36 | model: google("gemini-1.5-pro-latest"), 37 | schema: z.object({ code: z.string() }), 38 | prompt: `You are tasked with implementing an algorithm to extract the main content from HTML during the process of converting HTML to Markdown. Your goal is to create a TypeScript function that takes in HTML and other parameters, and returns a filtered HTML Abstract Syntax Tree (HAST) containing only the main content. 
39 | 40 | You will be working with the following inputs: 41 | 42 | 1. HTML content: 43 | 44 | ${html} 45 | 46 | 47 | 2. Raw Markdown converted from the HTML without content extraction: 48 | 49 | ${rawMarkdown} 50 | 51 | 52 | 3. User requirements for content extraction: 53 | 54 | ${userRequirements} 55 | 56 | 57 | 4. A generic content extraction algorithm for reference: 58 | 59 | ${genericAlgorithm} 60 | 61 | 62 | You may use the following libraries in your implementation: 63 | - unist-util-filter 64 | - hast-util-to-string 65 | - hast-util-select 66 | 67 | Your task is to implement the following function: 68 | 69 | \`\`\`typescript 70 | type ExtractParams = { hast: Hast; lang?: string; url?: string }; 71 | 72 | export const extractor = (params: ExtractParams): Hast => { 73 | // Your implementation here 74 | } 75 | \`\`\` 76 | 77 | Throughout your implementation, use comments to explain your reasoning and approach. Consider edge cases and potential issues that may arise with different types of HTML structures. 78 | 79 | After implementing the extractor function, provide a brief explanation of how to test and refine the algorithm using sample HTML inputs and user requirements. 80 | 81 | Write your complete TypeScript implementation, including imports, helper functions, and the main extractor function.`, 82 | }); 83 | 84 | return result.object.code; 85 | }; 86 | 87 | const rawContent = htmlToMarkdown(html, { baseUrl: target, extractors: false }); 88 | const simpleExtractedHtml = toHtml( 89 | filter(fromHtml(rawContent), (node) => { 90 | return !( 91 | ["comment", "doctype"].includes(node.type) || 92 | (node.type === "element" && 93 | ["script", "style", "link", "meta", "noscript", "svg", "title"].includes((node as Element).tagName)) 94 | ); 95 | }) ?? 
[], 96 | ); 97 | 98 | const userRequirements = "Issueの議論のみ抽出してください"; 99 | const exampleCode = await fs.readFile("./.output/example-extractor.ts", "utf-8"); 100 | 101 | const extractor = await generateExtractor(simpleExtractedHtml, rawContent, userRequirements, exampleCode); 102 | 103 | await fs.writeFile("./.output/generated-extractors.ts", extractor); 104 | 105 | const extractedContent = await htmlToMarkdownWithGenerated(html, "../.output/generated-extractors.ts", import.meta.url); 106 | 107 | await fs.writeFile("./.output/extracted-content.md", extractedContent); 108 | await fs.writeFile("./.output/raw-content.md", rawContent); 109 | -------------------------------------------------------------------------------- /examples/ai-learning/src/datasets.ts: -------------------------------------------------------------------------------- 1 | export const TECH_DOCUMENTS = [ 2 | "https://react.dev/", 3 | "https://react.dev/learn", 4 | "https://nextjs.org/", 5 | "https://nextjs.org/showcase", 6 | "https://nextjs.org/docs", 7 | "https://nextjs.org/docs/app/building-your-application/routing/dynamic-routes", 8 | "https://docs.expo.dev/", 9 | "https://docs.expo.dev/tutorial/introduction/", 10 | "https://vuejs.org/", 11 | "https://vuejs.org/guide/introduction.html", 12 | "https://hono.dev/", 13 | "https://hono.dev/docs/", 14 | "https://esbuild.github.io/getting-started/", 15 | "https://vitejs.dev/config/", 16 | "https://tailwindcss.com/", 17 | "https://tailwindcss.com/docs/installation", 18 | "https://ui.shadcn.com/docs", 19 | "https://ui.shadcn.com/docs/components/select", 20 | "https://orm.drizzle.team/docs/overview", 21 | "https://orm.drizzle.team/docs/rqb", 22 | "https://developers.cloudflare.com/pages/framework-guides/deploy-a-hono-site/", 23 | "https://emotion.sh/docs/introduction", 24 | "https://jotai.org/", 25 | "https://clerk.com/docs/quickstarts/nextjs", 26 | "https://www.prisma.io/docs/orm/overview/introduction/what-is-prisma", 27 | "https://www.npmjs.com/package/webforai", 28 | ]; 29 | 30 | export const ARTICLES = [ 31 | "https://blog.cloudflare.com/", 32 | "https://blog.cloudflare.com/more-npm-packages-on-cloudflare-workers-combining-polyfills-and-native-code/", 33 | "https://gigazine.net/", 34 | "https://gigazine.net/news/20240917-synchron-brain-computer-interface-alexa/", 35 | "https://dev.classmethod.jp/", 36 | "https://dev.classmethod.jp/articles/gha-volta-error-could-not-unpack-node/", 37 | "https://zenn.dev/", 38 | "https://zenn.dev/inaridiy/articles/f1ed9e73cb182b", 39 | "https://ics.media/", 40 | "https://ics.media/entry/231120/", 41 | "https://saruwakakun.com/html-css/basic", 42 | "https://saruwakakun.com/html-css/basic/tools", 43 | "https://qiita.com/", 44 | "https://qiita.com/Tadataka_Takahashi/items/556e0277017677cef68a", 45 | "https://www.wikipedia.org/", 46 | "https://ja.wikipedia.org/wiki/%E6%9C%A8%E6%9D%91%E6%8B%93%E5%93%89", 47 | ]; 48 | 49 | export const NEWS = [ 50 | "https://www.nytimes.com/international/", 51 | "https://www.wsj.com/", 52 | "https://www.cnn.co.jp/usa/35223960.html", 53 | "https://www.bbc.com/news", 54 | "https://www.bbc.com/news/articles/cx2kdd3n7yqo", 55 | "https://www3.nhk.or.jp/news/html/20240329/k10014405791000.html", 56 | ]; 57 | 58 | export const EC_SITE = [ 59 | "https://www.amazon.co.jp/dp/B08ZSHSFXQ", 60 | "https://store.shopping.yahoo.co.jp/gimi1225/p2750.html?sc_i=shopping-pc-web-top--pm_mod-itm_1", 61 | ]; 62 | -------------------------------------------------------------------------------- /examples/ai-learning/src/index.ts: 
-------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import dotenv from "dotenv"; 3 | import { htmlToMarkdown } from "webforai"; 4 | import { ARTICLES, EC_SITE, NEWS, TECH_DOCUMENTS } from "./datasets.js"; 5 | import { persitCachedLoadHtml, scoreMarkdown } from "./utils.js"; 6 | 7 | dotenv.config(); 8 | 9 | const TARGETS = [...ARTICLES, ...EC_SITE, ...NEWS, ...TECH_DOCUMENTS]; 10 | 11 | const contents: { url: string; html: string; extractedContent: string; rawContent: string }[] = []; 12 | 13 | for (const url of TARGETS) { 14 | const html = await persitCachedLoadHtml(url); 15 | 16 | const extractedContent = htmlToMarkdown(html, { baseUrl: url }); 17 | const rawContent = htmlToMarkdown(html, { baseUrl: url, extractors: false }); 18 | 19 | contents.push({ url, html, extractedContent, rawContent }); 20 | } 21 | 22 | const scores: { url: string; score: number; issues: string[] }[] = []; 23 | 24 | for (const content of contents) { 25 | const result = await scoreMarkdown(content); 26 | 27 | console.info(`${content.url} - ${result.object.score}`); 28 | scores.push({ url: content.url, score: result.object.score, issues: result.object.issues }); 29 | } 30 | 31 | console.info(scores); 32 | await fs.mkdirSync("./output", { recursive: true }); 33 | await fs.writeFileSync("./output/scores.json", JSON.stringify(scores, null, 2)); 34 | console.info(`Avg Score: ${scores.reduce((acc, curr) => acc + curr.score, 0) / scores.length}`); 35 | -------------------------------------------------------------------------------- /examples/ai-learning/src/manual.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import dotenv from "dotenv"; 3 | import { chromium } from "playwright"; 4 | import { htmlToMarkdown } from "webforai"; 5 | 6 | dotenv.config(); 7 | 8 | const url = "https://ui.shadcn.com/docs"; 9 | const loadHtml = async (url: string) => { 10 | const browser = await chromium.launch({ headless: true }); 11 | const context = await browser.newContext({ 12 | userAgent: 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 14 | viewport: { width: 1920, height: 1080 }, 15 | deviceScaleFactor: 1, 16 | hasTouch: false, 17 | isMobile: false, 18 | javaScriptEnabled: true, 19 | locale: "en-US", 20 | timezoneId: "America/New_York", 21 | }); 22 | 23 | // Hide WebDriver characteristics 24 | await context.addInitScript(() => { 25 | Object.defineProperty(navigator, "webdriver", { get: () => undefined }); 26 | Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] }); 27 | Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3, 4, 5] }); 28 | }); 29 | 30 | const page = await context.newPage(); 31 | await page.route("**/*.js", (route) => { 32 | if (route.request().url().includes("captcha-delivery")) { 33 | return route.abort(); 34 | } 35 | return route.continue(); 36 | }); 37 | 38 | await page.goto(url, { waitUntil: "networkidle", timeout: 10_000 }).catch(() => { 39 | /** */ 40 | }); 41 | const html = await page.content(); 42 | await page.close(); 43 | await browser.close(); 44 | 45 | return html; 46 | }; 47 | 48 | const html = await loadHtml(url); 49 | 50 | await fs.mkdirSync(".output", { recursive: true }); 51 | 52 | await fs.writeFileSync(".output/html.html", html); 53 | 54 | const rawContent = await htmlToMarkdown(html, { baseUrl: url, extractors: false }); 55 | const cleanedContent = await htmlToMarkdown(html, {
baseUrl: url }); 56 | 57 | await fs.writeFileSync(".output/raw.md", rawContent); 58 | await fs.writeFileSync(".output/cleaned.md", cleanedContent); 59 | -------------------------------------------------------------------------------- /examples/ai-learning/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import path from "node:path"; 3 | import { google } from "@ai-sdk/google"; 4 | import { generateObject } from "ai"; 5 | import { tsImport } from "tsx/esm/api"; 6 | import { htmlToMarkdown } from "webforai"; 7 | import { loadHtml } from "webforai/loaders/playwright"; 8 | import { z } from "zod"; 9 | 10 | export const persitCachedLoadHtml = async (url: string) => { 11 | const cacheDir = ".cache"; 12 | await fs.mkdir(cacheDir, { recursive: true }); 13 | const cachePath = path.join(cacheDir, `${url.replace(/[^a-zA-Z0-9]/g, "_")}.txt`); 14 | if (await fs.stat(cachePath).catch(() => false)) { 15 | return fs.readFile(cachePath, "utf-8"); 16 | } 17 | const html = await loadHtml(url, { superBypassMode: true }); 18 | await fs.writeFile(cachePath, html); 19 | return html; 20 | }; 21 | 22 | export const htmlToMarkdownWithGenerated = async ( 23 | url: string, 24 | html: string, 25 | generatedPath: string, 26 | parentPath: string, 27 | ) => { 28 | try { 29 | const { extractor: generatedExtractor } = await tsImport(generatedPath, parentPath); 30 | return htmlToMarkdown(html, { baseUrl: url, extractors: [generatedExtractor] }); 31 | } catch { 32 | return htmlToMarkdown(html, { baseUrl: url, extractors: false }); 33 | } 34 | }; 35 | 36 | export const scoreMarkdown = async (content: { rawContent: string; extractedContent: string }) => { 37 | const result = await generateObject({ 38 | model: google("gemini-1.5-flash-latest"), 39 | temperature: 0, 40 | schema: z.object({ 41 | analysis: z.string().describe("Detailed analysis of the cleaning process. 400 characters max."), 42 | issues: z.array(z.string()).describe("List of issues found in the cleaned Markdown. 12 issues max."), 43 | score: z.number().min(0).max(100), 44 | }), 45 | prompt: ` 46 | You are tasked with evaluating the effectiveness of an algorithm that extracts the main content from a website's HTML and converts it to Markdown format. Your goal is to compare the original Markdown output (which includes all content) with a cleaned version that attempts to remove unnecessary elements like advertisements and navigation. 47 | 48 | First, you will be presented with the original Markdown content: 49 | 50 | 51 | ${content.rawContent} 52 | 53 | 54 | Next, you will see the cleaned Markdown content: 55 | 56 | 57 | ${content.extractedContent} 58 | 59 | 60 | Compare these two versions carefully. Your task is to evaluate how accurately the cleaning process has extracted only the main content, removing unnecessary elements while preserving the essential information. 61 | 62 | When evaluating, consider the following criteria: 63 | 1. Removal of advertisements 64 | 2. Removal of navigation elements 65 | 3. Removal of sidebars or other non-essential sections 66 | 4. Preservation of the main article or content 67 | 5. Preservation of important headings and subheadings 68 | 6. Preservation of relevant images or media 69 | 7. 
Maintenance of the content's logical flow and structure 70 | 71 | Based on these criteria, assign a score from 0 to 100, where 100 represents perfect extraction of only the main content, and 0 represents no improvement or significant loss of important content. 72 | 73 | In addition to the score, identify any problems or issues you notice in the cleaned version. List these problems in bullet points, adjusting the granularity to provide a maximum of 12 points.`, 74 | }).catch((err) => { 75 | console.error(err); 76 | return { object: { score: -1, issues: [], analysis: "" } }; 77 | }); 78 | 79 | return result; 80 | }; 81 | -------------------------------------------------------------------------------- /examples/ai-learning/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/bench/.gitignore: -------------------------------------------------------------------------------- 1 | .output -------------------------------------------------------------------------------- /examples/bench/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # bench 2 | 3 | ## 1.0.19 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.0.18 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 1.0.17 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 1.0.16 25 | 26 | ### Patch Changes 27 | 28 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 29 | 30 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 31 | - webforai@2.0.0 32 | 33 | ## 1.0.15 34 | 35 | ### Patch Changes 36 | 37 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 38 | - webforai@1.6.3 39 | 40 | ## 1.0.14 41 | 42 | ### Patch Changes 43 | 44 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 45 | - webforai@1.6.2 46 | 47 | ## 1.0.13 48 | 49 | ### Patch Changes 50 | 51 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 52 | - webforai@1.6.1 53 | 54 | ## 1.0.12 55 | 56 | ### Patch Changes 57 | 58 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 59 | 60 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 61 | - webforai@1.6.0 62 | 63 | ## 1.0.11 64 | 65 | ### Patch Changes 66 | 67 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 68 | - webforai@1.5.1 69 | 70 | ## 1.0.10 71 | 72 | ### Patch Changes 73 | 74 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 75 | - webforai@1.5.0 76 | 77 | ## 1.0.9 78 | 79 | ### Patch Changes 80 | 81 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 82 | - webforai@1.4.1 83 | 84 | ## 1.0.8 85 | 86 | ### Patch Changes 87 | 88 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 89 | - webforai@1.4.0 90 | 91 | ## 1.0.7 92 | 93 | ### Patch Changes 94 | 95 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 96 | - webforai@1.3.3 97 | 98 | ## 1.0.6 99 | 100 | ### Patch Changes 101 | 102 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 103 | - webforai@1.3.2 104 | 105 | ## 1.0.5 106 | 107 | ### Patch Changes 108 | 109 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 110 | - webforai@1.3.1 111 | 112 | ## 1.0.4 113 | 114 | ### Patch Changes 115 | 116 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 117 | 118 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 119 | - webforai@1.3.0 120 | 121 | ## 1.0.3 122 | 123 | ### Patch Changes 124 | 125 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 126 | 127 | ## 1.0.2 128 | 129 | ### Patch Changes 130 | 131 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 132 | 133 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 134 | 135 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 136 | - webforai@1.2.3 137 | 138 | ## 1.0.1 139 | 140 | ### Patch Changes 141 | 142 | - 920f310: Update Linter and Workflows 143 | - Updated dependencies [920f310] 144 | - webforai@1.2.2 145 | -------------------------------------------------------------------------------- /examples/bench/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bench", 3 | "version": "1.0.19", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "arg": "^5.0.2", 16 | "playwright": "^1.40.1", 17 | "tsx": "^4.19.1", 18 | "webforai": "workspace:^" 19 | }, 20 | "devDependencies": { 21 | "@tsconfig/recommended": "^1.0.3", 22 | "@types/node": "^20.14.10", 23 | "typescript": "^5.4.5" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /examples/bench/src/index.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import { htmlToMarkdown } from "webforai"; 3 | import { loadHtml } from "webforai/loaders/playwright"; 4 | 5 | await fs.mkdir(".output", { recursive: true }); 6 | 7 | const id = Date.now(); 8 | await fs.mkdir(`.output/${id}`, { recursive: true }); 9 | 10 | const targets = [ 11 | "https://nextjs.org/docs/app/building-your-application/routing/pages-and-layouts", 12 | "https://ja.wikipedia.org/wiki/%E6%9C%A8%E6%9D%91%E6%8B%93%E5%93%89", 13 | "https://zenn.dev/frontendflat/articles/9d15b1b7abd524", 14 | "https://zenn.dev/dmmdata/articles/694e32c34dbd4c", 15 | "https://www3.nhk.or.jp/news/html/20240329/k10014405791000.html", 16 | "https://gigazine.net/", 17 | "https://www.npmjs.com/package/webforai", 18 | "https://developers.cloudflare.com/browser-rendering/get-started/reuse-sessions/", 19 | "https://news.livedoor.com/topics/detail/26152830", 20 | "https://viem.sh/docs/actions/public/getLogs.html", 21 | "https://www.google.com/search?q=%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF&oq=%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQLhhA0gEIMTM0OGowajSoAgCwAgE&sourceid=chrome&ie=UTF-8", 22 | "https://www.amazon.co.jp/Hold-On-Holdon-Q1J-%E3%83%8A%E3%82%A4%E3%83%88%E3%83%96%E3%83%AB%E3%83%BC/dp/B0872VRY3K/?_encoding=UTF8&ref_=pd_gw_ci_mcx_mr_hp_atf_m", 23 | ]; 24 | 25 | for (const url of targets) { 26 | const html = await loadHtml(url); 27 | await fs.writeFile(`.output/${id}/${url.split("/").slice(-1)[0]}.html`, html); 28 | 29 | const markdown = htmlToMarkdown(html, { 30 | baseUrl: url, 31 | extractors: "takumi", 32 | linkAsText: true, 33 | tableAsText: true, 34 | hideImage: true, 35 | }); 36 | 37 | await fs.writeFile(`.output/${id}/${url.split("/").slice(-1)[0]}.md`, markdown); 38 | } 39 | -------------------------------------------------------------------------------- /examples/bench/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | 
-------------------------------------------------------------------------------- /examples/scraping/.gitignore: -------------------------------------------------------------------------------- 1 | .output 2 | .env -------------------------------------------------------------------------------- /examples/scraping/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # scraping 2 | 3 | ## 1.0.19 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.0.18 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 1.0.17 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 1.0.16 25 | 26 | ### Patch Changes 27 | 28 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 29 | - webforai@2.0.0 30 | 31 | ## 1.0.15 32 | 33 | ### Patch Changes 34 | 35 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 36 | - webforai@1.6.3 37 | 38 | ## 1.0.14 39 | 40 | ### Patch Changes 41 | 42 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 43 | - webforai@1.6.2 44 | 45 | ## 1.0.13 46 | 47 | ### Patch Changes 48 | 49 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 50 | - webforai@1.6.1 51 | 52 | ## 1.0.12 53 | 54 | ### Patch Changes 55 | 56 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 57 | 58 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 59 | - webforai@1.6.0 60 | 61 | ## 1.0.11 62 | 63 | ### Patch Changes 64 | 65 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 66 | - webforai@1.5.1 67 | 68 | ## 1.0.10 69 | 70 | ### Patch Changes 71 | 72 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 73 | - webforai@1.5.0 74 | 75 | ## 1.0.9 76 | 77 | ### Patch Changes 78 | 79 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 80 | - webforai@1.4.1 81 | 82 | ## 1.0.8 83 | 84 | ### Patch Changes 85 | 86 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 87 | - webforai@1.4.0 88 | 89 | ## 1.0.7 90 | 91 | ### Patch Changes 92 | 93 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 94 | - webforai@1.3.3 95 | 96 | ## 1.0.6 97 | 98 | ### Patch Changes 99 | 100 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 101 | - webforai@1.3.2 102 | 103 | ## 1.0.5 104 | 105 | ### Patch Changes 106 | 107 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 108 | - webforai@1.3.1 109 | 110 | ## 1.0.4 111 | 112 | ### Patch Changes 113 | 114 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 115 | 116 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 117 | - webforai@1.3.0 118 | 119 | ## 1.0.3 120 | 121 | ### Patch Changes 122 | 123 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 124 | 125 | ## 1.0.2 126 | 127 | ### Patch Changes 128 | 129 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 130 | 131 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 132 | 133 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 134 | - webforai@1.2.3 135 | 136 | ## 1.0.1 137 | 138 | ### Patch Changes 139 | 140 | - 920f310: Update Linter and Workflows 141 | - Updated dependencies [920f310] 142 | - webforai@1.2.2 143 | -------------------------------------------------------------------------------- /examples/scraping/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scraping", 3 | "version": "1.0.19", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "arg": "^5.0.2", 16 | "dotenv": "^16.4.5", 17 | "openai": "^4.29.1", 18 | "playwright": "^1.40.1", 19 | "webforai": "workspace:^" 20 | }, 21 | "devDependencies": { 22 | "@tsconfig/recommended": "^1.0.3", 23 | "@types/node": "^20.14.10", 24 | "tsx": "^4.19.1", 25 | "typescript": "^5.4.5" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /examples/scraping/src/index.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import dotenv from "dotenv"; 3 | import { OpenAI } from "openai"; 4 | import { htmlToMarkdown } from "webforai"; 5 | import { loadHtml } from "webforai/loaders/playwright"; 6 | 7 | dotenv.config(); 8 | 9 | const openai = new OpenAI({ 10 | apiKey: process.env.OPENAI_API_KEY, 11 | }); 12 | 13 | await fs.mkdir(".output", { recursive: true }); 14 | 15 | const packages = [ 16 | "https://www.npmjs.com/package/webforai", 17 | "https://crates.io/crates/openai", 18 | "https://github.com/openai/openai-python", 19 | ]; 20 | 21 | const scrapedPackages = []; 22 | for (const packageUrl of packages) { 23 | const html = await loadHtml(packageUrl); 24 | const markdown = htmlToMarkdown(html, { baseUrl: packageUrl }); 25 | 26 | const prompt = `Extract the JSON information from the package's Markdown documentation according to the schema below. 27 | 28 | \`\`\`json 29 | { 30 | "name": "package-name", 31 | "description": "package-description", 32 | "language": "package-language", 33 | "license": "package-license", 34 | } 35 | \`\`\` 36 | 37 | --- 38 | ${markdown} 39 | `; 40 | 41 | const response = await openai.chat.completions.create({ 42 | model: "gpt-3.5-turbo-0125", 43 | response_format: { type: "json_object" }, 44 | messages: [{ role: "user", content: prompt }], 45 | }); 46 | const json = JSON.parse(response.choices[0].message.content ?? 
""); 47 | scrapedPackages.push(json); 48 | } 49 | 50 | await fs.writeFile(".output/scraped-packages.json", JSON.stringify(scrapedPackages, null, 2)); 51 | -------------------------------------------------------------------------------- /examples/scraping/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/simple/.gitignore: -------------------------------------------------------------------------------- 1 | .output -------------------------------------------------------------------------------- /examples/simple/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # simple 2 | 3 | ## 1.1.1 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.1.0 11 | 12 | ### Minor Changes 13 | 14 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! - Add minimal filter extractor 15 | 16 | ### Patch Changes 17 | 18 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 19 | - webforai@2.1.0 20 | 21 | ## 1.0.17 22 | 23 | ### Patch Changes 24 | 25 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 26 | - webforai@2.0.1 27 | 28 | ## 1.0.16 29 | 30 | ### Patch Changes 31 | 32 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 33 | - webforai@2.0.0 34 | 35 | ## 1.0.15 36 | 37 | ### Patch Changes 38 | 39 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 40 | - webforai@1.6.3 41 | 42 | ## 1.0.14 43 | 44 | ### Patch Changes 45 | 46 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 47 | - webforai@1.6.2 48 | 49 | ## 1.0.13 50 | 51 | ### Patch Changes 52 | 53 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 54 | - webforai@1.6.1 55 | 56 | ## 1.0.12 57 | 58 | ### Patch Changes 59 | 60 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 61 | 62 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 63 | - webforai@1.6.0 64 | 65 | ## 1.0.11 66 | 67 | ### Patch Changes 68 | 69 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 70 | - webforai@1.5.1 71 | 72 | ## 1.0.10 73 | 74 | ### Patch Changes 75 | 76 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 77 | - webforai@1.5.0 78 | 79 | ## 1.0.9 80 | 81 | ### Patch Changes 82 | 83 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 84 | - webforai@1.4.1 85 | 86 | ## 1.0.8 87 | 88 | ### Patch Changes 89 | 90 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 91 | - webforai@1.4.0 92 | 93 | ## 1.0.7 94 | 95 | ### Patch Changes 96 | 97 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 98 | - webforai@1.3.3 99 | 100 | ## 1.0.6 101 | 102 | ### Patch Changes 103 | 104 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 105 | - webforai@1.3.2 106 | 107 | ## 1.0.5 108 | 109 | ### Patch Changes 110 | 111 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 112 | - webforai@1.3.1 113 | 114 | ## 1.0.4 115 | 116 | ### Patch Changes 117 | 118 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 119 | 120 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 121 | - webforai@1.3.0 122 | 123 | ## 1.0.3 124 | 125 | ### Patch Changes 126 | 127 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 128 | 129 | ## 1.0.2 130 | 131 | ### Patch Changes 132 | 133 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 134 | 135 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 136 | 137 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 138 | - webforai@1.2.3 139 | 140 | ## 1.0.1 141 | 142 | ### Patch Changes 143 | 144 | - 920f310: Update Linter and Workflows 145 | - Updated dependencies [920f310] 146 | - webforai@1.2.2 147 | -------------------------------------------------------------------------------- /examples/simple/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simple", 3 | "version": "1.1.1", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "arg": "^5.0.2", 16 | "playwright": "^1.40.1", 17 | "webforai": "workspace:^" 18 | }, 19 | "devDependencies": { 20 | "@tsconfig/recommended": "^1.0.3", 21 | "@types/node": "^20.14.10", 22 | "tsx": "^4.19.1", 23 | "typescript": "^5.4.5" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /examples/simple/src/index.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import arg from "arg"; 3 | import { htmlToMarkdown } from "webforai"; 4 | import { loadHtml } from "webforai/loaders/playwright"; 5 | 6 | await fs.mkdir(".output", { recursive: true }); 7 | 8 | const args = arg({ "--url": String }); 9 | 10 | const url = args["--url"] ?? "https://webforai.dev/"; 11 | 12 | const html = await loadHtml(url); 13 | 14 | await fs.writeFile(".output/output.html", html); 15 | 16 | const rawMarkdown = htmlToMarkdown(html, { baseUrl: url, extractors: false }); 17 | 18 | await fs.writeFile(".output/output.raw.md", rawMarkdown); 19 | 20 | const markdown = htmlToMarkdown(html, { baseUrl: url }); 21 | 22 | await fs.writeFile(".output/output.md", markdown); 23 | -------------------------------------------------------------------------------- /examples/simple/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/translate/.gitignore: -------------------------------------------------------------------------------- 1 | .output 2 | .env -------------------------------------------------------------------------------- /examples/translate/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # translate 2 | 3 | ## 1.0.20 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.0.19 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 1.0.18 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 1.0.17 25 | 26 | ### 
Patch Changes 27 | 28 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 29 | 30 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 31 | - webforai@2.0.0 32 | 33 | ## 1.0.16 34 | 35 | ### Patch Changes 36 | 37 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 38 | - webforai@1.6.3 39 | 40 | ## 1.0.15 41 | 42 | ### Patch Changes 43 | 44 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 45 | - webforai@1.6.2 46 | 47 | ## 1.0.14 48 | 49 | ### Patch Changes 50 | 51 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 52 | - webforai@1.6.1 53 | 54 | ## 1.0.13 55 | 56 | ### Patch Changes 57 | 58 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! - improve seido 59 | 60 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 61 | - webforai@1.6.0 62 | 63 | ## 1.0.12 64 | 65 | ### Patch Changes 66 | 67 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 68 | - webforai@1.5.1 69 | 70 | ## 1.0.11 71 | 72 | ### Patch Changes 73 | 74 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 75 | - webforai@1.5.0 76 | 77 | ## 1.0.10 78 | 79 | ### Patch Changes 80 | 81 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 82 | - webforai@1.4.1 83 | 84 | ## 1.0.9 85 | 86 | ### Patch Changes 87 | 88 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 89 | - webforai@1.4.0 90 | 91 | ## 1.0.8 92 | 93 | ### Patch Changes 94 | 95 | - [#23](https://github.com/inaridiy/webforai/pull/23) [`2513931`](https://github.com/inaridiy/webforai/commit/25139317b242a28df6c2833646a43b42c633e681) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Add Gemini Translate example 96 | 97 | ## 1.0.7 98 | 99 | ### Patch Changes 100 | 101 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 102 | - webforai@1.3.3 103 | 104 | ## 1.0.6 105 | 106 | ### Patch Changes 107 | 108 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 109 | - webforai@1.3.2 110 | 111 | ## 1.0.5 112 | 113 | ### Patch Changes 114 | 115 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 116 | - webforai@1.3.1 117 | 118 | ## 1.0.4 119 | 120 | ### Patch Changes 121 | 122 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 123 | 124 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 125 | - webforai@1.3.0 126 | 127 | ## 1.0.3 128 | 129 | ### Patch Changes 130 | 131 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 132 | 133 | ## 1.0.2 134 | 135 | ### Patch Changes 136 | 137 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 138 | 139 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 140 | 141 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 142 | - webforai@1.2.3 143 | 144 | ## 1.0.1 145 | 146 | ### Patch Changes 147 | 148 | - 920f310: Update Linter and Workflows 149 | - Updated dependencies [920f310] 150 | - webforai@1.2.2 151 | -------------------------------------------------------------------------------- /examples/translate/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "translate", 3 | "version": "1.0.20", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "@ai-sdk/google": "^0.0.48", 16 | "@anthropic-ai/sdk": "^0.18.0", 17 | "@google/generative-ai": "^0.12.0", 18 | "ai": "^3.4.7", 19 | "arg": "^5.0.2", 20 | "dotenv": "^16.4.5", 21 | "playwright": "^1.40.1", 22 | "webforai": "workspace:^" 23 | }, 24 | "devDependencies": { 25 | "@tsconfig/recommended": "^1.0.3", 26 | "@types/node": "^20.14.10", 27 | "tsx": "^4.19.1", 28 | "typescript": "^5.4.5" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /examples/translate/src/index.ts: -------------------------------------------------------------------------------- 1 | import { google } from "@ai-sdk/google"; 2 | import { generateText } from "ai"; 3 | import dotevn from "dotenv"; 4 | import { htmlToMarkdown } from "webforai"; 5 | import { loadHtml } from "webforai/loaders/playwright"; 6 | 7 | dotevn.config(); 8 | 9 | const url = "https://blog.cloudflare.com/the-story-of-web-framework-hono-from-the-creator-of-hono/"; 10 | const targetLanguage = "ja"; 11 | 12 | const html = await loadHtml(url, { superBypassMode: true }); 13 | const markdown = htmlToMarkdown(html); 14 | 15 | const prompt = `Translate mechanically converted HTML-based Markdown into ${targetLanguage}, while refining and correcting the content for clarity and coherence. 16 | 17 | The Markdown provided may contain redundant or unnecessary information and errors due to mechanical conversion. Your task is to translate the text into Japanese, fixing these issues and improving the overall quality of the Markdown document. 
18 | 19 | 20 | ${markdown} 21 | `; 22 | 23 | const response = await generateText({ 24 | model: google("gemini-1.5-flash-latest"), 25 | temperature: 0, 26 | prompt, 27 | maxSteps: 10, 28 | experimental_continueSteps: true, 29 | }); 30 | 31 | console.info(response.text); 32 | -------------------------------------------------------------------------------- /examples/translate/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/worker/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | 3 | logs 4 | _.log 5 | npm-debug.log_ 6 | yarn-debug.log* 7 | yarn-error.log* 8 | lerna-debug.log* 9 | .pnpm-debug.log* 10 | 11 | # Diagnostic reports (https://nodejs.org/api/report.html) 12 | 13 | report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json 14 | 15 | # Runtime data 16 | 17 | pids 18 | _.pid 19 | _.seed 20 | \*.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | 24 | lib-cov 25 | 26 | # Coverage directory used by tools like istanbul 27 | 28 | coverage 29 | \*.lcov 30 | 31 | # nyc test coverage 32 | 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 36 | 37 | .grunt 38 | 39 | # Bower dependency directory (https://bower.io/) 40 | 41 | bower_components 42 | 43 | # node-waf configuration 44 | 45 | .lock-wscript 46 | 47 | # Compiled binary addons (https://nodejs.org/api/addons.html) 48 | 49 | build/Release 50 | 51 | # Dependency directories 52 | 53 | node_modules/ 54 | jspm_packages/ 55 | 56 | # Snowpack dependency directory (https://snowpack.dev/) 57 | 58 | web_modules/ 59 | 60 | # TypeScript cache 61 | 62 | \*.tsbuildinfo 63 | 64 | # Optional npm cache directory 65 | 66 | .npm 67 | 68 | # Optional eslint cache 69 | 70 | .eslintcache 71 | 72 | # Optional stylelint cache 73 | 74 | .stylelintcache 75 | 76 | # Microbundle cache 77 | 78 | .rpt2_cache/ 79 | .rts2_cache_cjs/ 80 | .rts2_cache_es/ 81 | .rts2_cache_umd/ 82 | 83 | # Optional REPL history 84 | 85 | .node_repl_history 86 | 87 | # Output of 'npm pack' 88 | 89 | \*.tgz 90 | 91 | # Yarn Integrity file 92 | 93 | .yarn-integrity 94 | 95 | # dotenv environment variable files 96 | 97 | .env 98 | .env.development.local 99 | .env.test.local 100 | .env.production.local 101 | .env.local 102 | 103 | # parcel-bundler cache (https://parceljs.org/) 104 | 105 | .cache 106 | .parcel-cache 107 | 108 | # Next.js build output 109 | 110 | .next 111 | out 112 | 113 | # Nuxt.js build / generate output 114 | 115 | .nuxt 116 | dist 117 | 118 | # Gatsby files 119 | 120 | .cache/ 121 | 122 | # Comment in the public line in if your project uses Gatsby and not Next.js 123 | 124 | # https://nextjs.org/blog/next-9-1#public-directory-support 125 | 126 | # public 127 | 128 | # vuepress build output 129 | 130 | .vuepress/dist 131 | 132 | # vuepress v2.x temp and cache directory 133 | 134 | .temp 135 | .cache 136 | 137 | # Docusaurus cache and generated files 138 | 139 | .docusaurus 140 | 141 | # Serverless directories 142 | 143 | .serverless/ 144 | 145 | # FuseBox cache 146 | 147 | .fusebox/ 148 | 149 | # DynamoDB Local files 150 | 151 | .dynamodb/ 152 | 153 | # TernJS port file 154 | 155 | .tern-port 156 | 157 | # Stores VSCode versions used for testing VSCode 
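The translation example above (examples/translate/src/index.ts) sends the whole converted page in one prompt and leans on `maxSteps` with `experimental_continueSteps` to keep generating past the output limit. A rough alternative is to split the Markdown and translate section by section; the heading-based splitter below is only an illustrative assumption, not something taken from this repository.

```ts
import { google } from "@ai-sdk/google";
import { generateText } from "ai";

// Naive splitter: break the converted Markdown into "## " sections so each
// translation request stays well under the model's output limit.
const splitByHeadings = (markdown: string): string[] =>
	markdown.split(/\n(?=## )/g).filter((section) => section.trim() !== "");

export const translateInChunks = async (markdown: string, targetLanguage: string): Promise<string> => {
	const sections = splitByHeadings(markdown);
	const translated: string[] = [];

	for (const section of sections) {
		const { text } = await generateText({
			model: google("gemini-1.5-flash-latest"),
			temperature: 0,
			prompt: `Translate the following Markdown section into ${targetLanguage}, keeping the Markdown structure intact.\n\n${section}`,
		});
		translated.push(text);
	}

	return translated.join("\n\n");
};
```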
extensions 158 | 159 | .vscode-test 160 | 161 | # yarn v2 162 | 163 | .yarn/cache 164 | .yarn/unplugged 165 | .yarn/build-state.yml 166 | .yarn/install-state.gz 167 | .pnp.\* 168 | 169 | # wrangler project 170 | 171 | .dev.vars 172 | .wrangler/ 173 | -------------------------------------------------------------------------------- /examples/worker/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # worker 2 | 3 | ## 0.0.18 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 0.0.17 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 0.0.16 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 0.0.15 25 | 26 | ### Patch Changes 27 | 28 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 29 | 30 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 31 | - webforai@2.0.0 32 | 33 | ## 0.0.14 34 | 35 | ### Patch Changes 36 | 37 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 38 | - webforai@1.6.3 39 | 40 | ## 0.0.13 41 | 42 | ### Patch Changes 43 | 44 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 45 | - webforai@1.6.2 46 | 47 | ## 0.0.12 48 | 49 | ### Patch Changes 50 | 51 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 52 | - webforai@1.6.1 53 | 54 | ## 0.0.11 55 | 56 | ### Patch Changes 57 | 58 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 59 | - webforai@1.6.0 60 | 61 | ## 0.0.10 62 | 63 | ### Patch Changes 64 | 65 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 66 | - webforai@1.5.1 67 | 68 | ## 0.0.9 69 | 70 | ### Patch Changes 71 | 72 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 73 | - webforai@1.5.0 74 | 75 | ## 0.0.8 76 | 77 | ### Patch Changes 78 | 79 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 80 | - webforai@1.4.1 81 | 82 | ## 0.0.7 83 | 84 | ### Patch Changes 85 | 86 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 87 | - webforai@1.4.0 88 | 89 | ## 0.0.6 90 | 91 | 
### Patch Changes 92 | 93 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 94 | - webforai@1.3.3 95 | 96 | ## 0.0.5 97 | 98 | ### Patch Changes 99 | 100 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 101 | - webforai@1.3.2 102 | 103 | ## 0.0.4 104 | 105 | ### Patch Changes 106 | 107 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 108 | - webforai@1.3.1 109 | 110 | ## 0.0.3 111 | 112 | ### Patch Changes 113 | 114 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 115 | 116 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 117 | - webforai@1.3.0 118 | 119 | ## 0.0.2 120 | 121 | ### Patch Changes 122 | 123 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 124 | 125 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflows 126 | 127 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 128 | - webforai@1.2.3 129 | 130 | ## 0.0.1 131 | 132 | ### Patch Changes 133 | 134 | - 920f310: Update Linter and Workflows 135 | - Updated dependencies [920f310] 136 | - webforai@1.2.2 137 | -------------------------------------------------------------------------------- /examples/worker/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "worker", 3 | "version": "0.0.18", 4 | "private": true, 5 | "scripts": { 6 | "deploy": "wrangler deploy", 7 | "dev": "wrangler dev --remote", 8 | "start": "wrangler dev" 9 | }, 10 | "devDependencies": { 11 | "@cloudflare/puppeteer": "^0.0.6", 12 | "@cloudflare/vitest-pool-workers": "^0.1.0", 13 | "@cloudflare/workers-types": "^4.20241018.0", 14 | "typescript": "^5.4.5", 15 | "vitest": "1.3.0", 16 | "wrangler": "^3.81.0" 17 | }, 18 | "dependencies": { 19 | "@hono/valibot-validator": "^0.2.2", 20 | "hono": "^4.6.5", 21 | "valibot": "^0.30.0", 22 | "webforai": "workspace:^" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/worker/src/index.ts: -------------------------------------------------------------------------------- 1 | import { DurableObject } from "cloudflare:workers"; 2 | import puppeteer from "@cloudflare/puppeteer"; 3 | import { vValidator } from "@hono/valibot-validator"; 4 | import { Hono } from "hono"; 5 | import { cache } from "hono/cache"; 6 | import { url, literal, object, optional, string, union } from "valibot"; 7 | import { htmlToMarkdown } from "webforai"; 8 | 9 | type 
Bindings = { MYBROWSER: puppeteer.BrowserWorker; BROWSER: DurableObjectNamespace }; 10 | 11 | const app = new Hono<{ Bindings: Bindings }>(); 12 | 13 | const BROWSER_KEYS = ["browser1", "browser2"]; 14 | 15 | const schema = object({ 16 | url: string([url()]), 17 | mode: optional(union([literal("readability"), literal("ai")])), 18 | }); 19 | 20 | app.get( 21 | "/", 22 | cache({ cacheName: "html-to-markdown", cacheControl: "max-age=3600" }), 23 | vValidator("query", schema), 24 | async (c) => { 25 | const { url, mode } = c.req.valid("query"); 26 | 27 | const pickedKey = BROWSER_KEYS[Math.floor(Math.random() * BROWSER_KEYS.length)]; 28 | const browser = c.env.BROWSER.get(c.env.BROWSER.idFromName(pickedKey)); 29 | const result = await browser.renderUrl(url); 30 | 31 | if (!result.success) { 32 | return c.text(result.error, 500); 33 | } 34 | 35 | const aiModeOptions = { linkAsText: true, tableAsText: true, hideImage: true }; 36 | const readabilityModeOptions = { linkAsText: false, tableAsText: false, hideImage: false }; 37 | const markdown = htmlToMarkdown(result.html, { 38 | baseUrl: url, 39 | ...(mode === "ai" ? aiModeOptions : readabilityModeOptions), 40 | }); 41 | return c.text(markdown); 42 | }, 43 | ); 44 | 45 | // biome-ignore lint/style/noDefaultExport: This is the default export for the worker script 46 | export default app; 47 | 48 | const KEEP_BROWSER_ALIVE_IN_SECONDS = 60; 49 | 50 | export class BrowserDO extends DurableObject { 51 | private browser: puppeteer.Browser | null = null; 52 | private keptAliveInSeconds = 0; 53 | 54 | async renderUrl(url: string): Promise<{ success: true; html: string } | { success: false; error: string }> { 55 | const normalizedUrl = new URL(url).toString(); 56 | 57 | try { 58 | if (!this.browser?.isConnected()) { 59 | const sessions = await puppeteer.sessions(this.env.MYBROWSER); 60 | const freeSession = sessions.find((s) => !s.connectionId); 61 | if (freeSession) { 62 | this.browser = await puppeteer.connect(this.env.MYBROWSER, freeSession.sessionId); 63 | } else { 64 | this.browser = await puppeteer.launch(this.env.MYBROWSER); 65 | } 66 | } 67 | } catch (e) { 68 | console.error(e); 69 | return { success: false, error: "Failed to launch browser" }; 70 | } 71 | 72 | this.keptAliveInSeconds = 0; 73 | const page = await this.browser.newPage(); 74 | await page.goto(normalizedUrl, { waitUntil: "networkidle0" }); 75 | 76 | //scriptタグを削除 77 | await page.evaluate(() => { 78 | const scripts = document.querySelectorAll("script"); 79 | for (const script of Array.from(scripts)) { 80 | script.remove(); 81 | } 82 | }); 83 | 84 | const html = await page.content(); 85 | 86 | const cleanup = async () => { 87 | await page.close(); 88 | this.keptAliveInSeconds = 0; 89 | const currentAlarm = await this.ctx.storage.getAlarm(); 90 | if (currentAlarm) { 91 | return; 92 | } 93 | const tenSeconds = 10 * 1000; 94 | await this.ctx.storage.setAlarm(Date.now() + tenSeconds); 95 | }; 96 | this.ctx.waitUntil(cleanup()); 97 | 98 | return { success: true, html }; 99 | } 100 | 101 | async alarm() { 102 | this.keptAliveInSeconds += 10; 103 | if (this.keptAliveInSeconds < KEEP_BROWSER_ALIVE_IN_SECONDS) { 104 | await this.ctx.storage.setAlarm(Date.now() + 10 * 1000); 105 | if (this.browser?.isConnected()) { 106 | await this.browser.version(); 107 | } 108 | } else { 109 | await this.browser?.close(); 110 | this.browser = null; 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /examples/worker/wrangler.toml: 
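The Cloudflare Worker above (examples/worker/src/index.ts) exposes a single `GET /` route that takes a `url` query parameter and an optional `mode` of `"readability"` or `"ai"`, and responds with the converted Markdown as plain text (or a plain-text error with status 500). A client-side sketch of calling it follows; `WORKER_URL` is a placeholder for wherever the worker is deployed, not a value defined in this repository.

```ts
// Placeholder for the deployed worker's origin.
const WORKER_URL = "https://example-worker.example.com";

export const fetchMarkdown = async (pageUrl: string, mode: "readability" | "ai" = "readability"): Promise<string> => {
	const endpoint = new URL(WORKER_URL);
	endpoint.searchParams.set("url", pageUrl);
	endpoint.searchParams.set("mode", mode);

	const response = await fetch(endpoint);
	if (!response.ok) {
		// The worker returns a plain-text error with status 500 when rendering fails.
		throw new Error(`Worker request failed: ${response.status} ${await response.text()}`);
	}
	return response.text();
};

// Example:
// const markdown = await fetchMarkdown("https://webforai.dev/", "ai");
```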
-------------------------------------------------------------------------------- 1 | name = "webforai" 2 | main = "src/index.ts" 3 | compatibility_date = "2024-04-03" 4 | compatibility_flags = ["nodejs_compat"] 5 | 6 | 7 | browser = { binding = "MYBROWSER" } 8 | 9 | [durable_objects] 10 | bindings = [ 11 | { name = "BROWSER", class_name = "BrowserDO" } 12 | ] 13 | 14 | [[migrations]] 15 | new_classes = ["BrowserDO"] 16 | tag = "v1" -------------------------------------------------------------------------------- /images/logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/images/logo.webp -------------------------------------------------------------------------------- /images/voice-genius.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/images/voice-genius.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webforai", 3 | "version": "0.0.0", 4 | "description": "A library that provides a web interface for AI", 5 | "author": "inaridiy", 6 | "repository": { 7 | "type": "git", 8 | "url": "https://github.com/inaridiy/webforai.git" 9 | }, 10 | "homepage": "https://webforai.dev/", 11 | "bugs": "https://github.com/inaridiy/webforai/issues", 12 | "keywords": ["ai", "web", "scraping"], 13 | "private": true, 14 | "packageManager": "pnpm@9.1.4", 15 | "engines": { 16 | "node": ">=18.0.0" 17 | }, 18 | "scripts": { 19 | "test": "vitest", 20 | "build": "pnpm run --r --filter \"./packages/**\" build", 21 | "format": "biome format .", 22 | "format:fix": "pnpm format --write .", 23 | "lint": "biome check .", 24 | "lint:fix": "pnpm lint --apply", 25 | "lint:repo": "sherif", 26 | "typecheck": "pnpm run --filter \"./packages/**\" typecheck", 27 | "ci:prepublish": "pnpm run build", 28 | "ci:version": "changeset version", 29 | "ci:publish": "pnpm ci:prepublish && changeset publish", 30 | "preinstall": "npx only-allow pnpm", 31 | "prepare": "pnpm simple-git-hooks", 32 | "postinstall": "pnpm -w build" 33 | }, 34 | "devDependencies": { 35 | "@biomejs/biome": "1.7.3", 36 | "@changesets/changelog-github": "^0.5.0", 37 | "@changesets/cli": "^2.27.5", 38 | "sherif": "^0.8.4", 39 | "simple-git-hooks": "^2.11.1", 40 | "vitest": "1.3.0" 41 | }, 42 | "simple-git-hooks": { 43 | "pre-commit": "pnpm format && pnpm lint && pnpm lint:repo && pnpm typecheck" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /packages/webforai/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # webforai 2 | 3 | ## 2.1.1 4 | 5 | ### Patch Changes 6 | 7 | - [#58](https://github.com/inaridiy/webforai/pull/58) [`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c) Thanks [@inaridiy](https://github.com/inaridiy)! - fix 8 | 9 | ## 2.1.0 10 | 11 | ### Minor Changes 12 | 13 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Add minimal filter extractor 14 | 15 | ## 2.0.1 16 | 17 | ### Patch Changes 18 | 19 | - [#52](https://github.com/inaridiy/webforai/pull/52) [`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3) Thanks [@moons-14](https://github.com/moons-14)! - update package.json homepage 20 | 21 | ## 2.0.0 22 | 23 | ### Major Changes 24 | 25 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 26 | 27 | - [#49](https://github.com/inaridiy/webforai/pull/49) [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40) Thanks [@inaridiy](https://github.com/inaridiy)! - “Readability” Extractor renamed takumi and license changed to Apache2 license. 28 | 29 | ## 1.6.3 30 | 31 | ### Patch Changes 32 | 33 | - [#47](https://github.com/inaridiy/webforai/pull/47) [`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59) Thanks [@inaridiy](https://github.com/inaridiy)! - Fix Playwright installation command for improved reliability 34 | 35 | ## 1.6.2 36 | 37 | ### Patch Changes 38 | 39 | - [#45](https://github.com/inaridiy/webforai/pull/45) [`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357) Thanks [@inaridiy](https://github.com/inaridiy)! - Re Re Fix CLI 40 | 41 | - [#45](https://github.com/inaridiy/webforai/pull/45) [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5) Thanks [@inaridiy](https://github.com/inaridiy)! - Re Re Fix CLI 42 | 43 | ## 1.6.1 44 | 45 | ### Patch Changes 46 | 47 | - [#41](https://github.com/inaridiy/webforai/pull/41) [`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd) Thanks [@inaridiy](https://github.com/inaridiy)! - Fix with { type : "json" } Error 48 | 49 | ## 1.6.0 50 | 51 | ### Minor Changes 52 | 53 | - [#40](https://github.com/inaridiy/webforai/pull/40) [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244) Thanks [@inaridiy](https://github.com/inaridiy)! - The CLI has been completely corrected. Maybe. 54 | 55 | ### Patch Changes 56 | 57 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! - improve seido 58 | 59 | ## 1.5.1 60 | 61 | ### Patch Changes 62 | 63 | - [#37](https://github.com/inaridiy/webforai/pull/37) [`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f) Thanks [@inaridiy](https://github.com/inaridiy)! - Fix CLI Dep Error 64 | 65 | ## 1.5.0 66 | 67 | ### Minor Changes 68 | 69 | - [#30](https://github.com/inaridiy/webforai/pull/30) [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac) Thanks [@moons-14](https://github.com/moons-14)! - webforai can now be run from the cli 70 | 71 | ### Patch Changes 72 | 73 | - [#31](https://github.com/inaridiy/webforai/pull/31) [`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6) Thanks [@moons-14](https://github.com/moons-14)! 
- PRESET_EXTRACT_HAST and DEFAULT_EXTRACT_HAST can be referenced externally 74 | 75 | ## 1.4.1 76 | 77 | ### Patch Changes 78 | 79 | - [#27](https://github.com/inaridiy/webforai/pull/27) [`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61) Thanks [@inaridiy](https://github.com/inaridiy)! - Improve extract algorithm 80 | 81 | ## 1.4.0 82 | 83 | ### Minor Changes 84 | 85 | - [#25](https://github.com/inaridiy/webforai/pull/25) [`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87) Thanks [@moons-14](https://github.com/moons-14)! - Add loader using puppeteer 86 | 87 | ## 1.3.3 88 | 89 | ### Patch Changes 90 | 91 | - [#20](https://github.com/inaridiy/webforai/pull/20) [`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d) Thanks [@inaridiy](https://github.com/inaridiy)! - Minimal Param update 92 | 93 | - [#22](https://github.com/inaridiy/webforai/pull/22) [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791) Thanks [@inaridiy](https://github.com/inaridiy)! - Add fetch loader and improve playwright loader 94 | 95 | ## 1.3.2 96 | 97 | ### Patch Changes 98 | 99 | - [#18](https://github.com/inaridiy/webforai/pull/18) [`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a) Thanks [@inaridiy](https://github.com/inaridiy)! - Improve 100 | 101 | - [#18](https://github.com/inaridiy/webforai/pull/18) [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78) Thanks [@inaridiy](https://github.com/inaridiy)! - accuracy improvement 102 | 103 | - [#18](https://github.com/inaridiy/webforai/pull/18) [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor performance improvements 104 | 105 | ## 1.3.1 106 | 107 | ### Patch Changes 108 | 109 | - [#15](https://github.com/inaridiy/webforai/pull/15) [`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Document 110 | 111 | ## 1.3.0 112 | 113 | ### Minor Changes 114 | 115 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 116 | 117 | ## 1.2.3 118 | 119 | ### Patch Changes 120 | 121 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 122 | 123 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflows 124 | 125 | ## 1.2.2 126 | 127 | ### Patch Changes 128 | 129 | - 920f310: Update Linter and Workflows 130 | -------------------------------------------------------------------------------- /packages/webforai/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | A esm-native library that converts HTML to Markdown & Useful Utilities with simple, lightweight and epic quality. 
14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | ## Documentation 32 | 33 | [Head to the documentation](https://webforai.dev/) to read and learn more about Webforai. 34 | 35 | ## Overview 36 | 37 | ```bash 38 | npx webforai@latest 39 | ``` 40 | 41 | or 42 | 43 | ```ts 44 | import { htmlToMarkdown, htmlToMdast } from "webforai"; 45 | import { loadHtml } from "webforai/loaders/playwright"; 46 | 47 | // Load html from url 48 | const url = "https://www.npmjs.com/package/webforai"; 49 | const html = await loadHtml(url); 50 | 51 | // Convert html to markdown 52 | const markdown = htmlToMarkdown(html, { baseUrl: url }); 53 | ``` 54 | 55 | ## Support 56 | 57 | - [GitHub Sponsors](https://github.com/sponsors/inaridiy) 58 | - [inaridiy.eth](https://x.com/inaridiy) 59 | 60 | ## License 61 | 62 | [Apache 2.0](/LICENSE) License 63 | -------------------------------------------------------------------------------- /packages/webforai/build.ts: -------------------------------------------------------------------------------- 1 | /* 2 | For `build.ts`, further inspire @honojs/hono with inspire @kaze-style/react. 3 | https://github.com/honojs/hono/blob/main/build.ts 4 | https://github.com/taishinaritomi/kaze-style/blob/main/scripts/build.ts 5 | MIT License 6 | Copyright (c) 2024 - present, inaridiy and webforai contributors 7 | */ 8 | 9 | import { exec } from "node:child_process"; 10 | import fs from "node:fs"; 11 | import path from "node:path"; 12 | import arg from "arg"; 13 | import { context } from "esbuild"; 14 | import type { BuildOptions, Plugin, PluginBuild } from "esbuild"; 15 | import { glob } from "glob"; 16 | 17 | const args = arg({ 18 | "--watch": Boolean, 19 | }); 20 | 21 | const isWatch = args["--watch"]; 22 | 23 | const entryPoints = glob.sync("./src/**/*.ts", { 24 | ignore: ["./src/**/*.test.ts", "./src/cli/**/*.ts"], 25 | }); 26 | 27 | const addExtension = (extension = ".js", fileExtension = ".ts"): Plugin => ({ 28 | name: "add-extension", 29 | setup(build: PluginBuild) { 30 | build.onResolve({ filter: /.*/ }, (args) => { 31 | if (args.importer) { 32 | const p = path.join(args.resolveDir, args.path); 33 | let tsPath = `${p}${fileExtension}`; 34 | 35 | let importPath = ""; 36 | if (path.basename(args.importer).split(".")[0] === args.path) { 37 | importPath = args.path; 38 | } else if (fs.existsSync(tsPath)) { 39 | importPath = args.path + extension; 40 | } else { 41 | tsPath = path.join(args.resolveDir, args.path, `index${fileExtension}`); 42 | if (fs.existsSync(tsPath)) { 43 | importPath = `${args.path}/index${extension}`; 44 | } 45 | } 46 | 47 | return { path: importPath, external: true }; 48 | } 49 | }); 50 | }, 51 | }); 52 | 53 | const commonOptions: BuildOptions = { 54 | entryPoints, 55 | logLevel: "info", 56 | platform: "node", 57 | }; 58 | 59 | const cjsBuild = () => 60 | context({ 61 | ...commonOptions, 62 | outbase: "./src", 63 | outdir: "./dist/cjs", 64 | format: "cjs", 65 | }); 66 | 67 | const esmBuild = () => 68 | context({ 69 | ...commonOptions, 70 | bundle: true, 71 | outbase: "./src", 72 | outdir: "./dist", 73 | format: "esm", 74 | plugins: [addExtension(".js")], 75 | }); 76 | 77 | const cliBuild = () => 78 | context({ 79 | entryPoints: ["./src/cli/bin.ts"], 80 | banner: { 81 | js: "#!/usr/bin/env node", 82 | }, 83 | outfile: "./dist/bin.js", 84 | format: "esm", 85 | packages: "external", 86 | bundle: true, 87 | }); 88 | 89 | const [esmCtx, cjsCtx, cliCtx] = await Promise.all([esmBuild(), cjsBuild(), cliBuild()]); 90 | if (isWatch) { 91 | 
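The Overview in packages/webforai/README.md above shows the default conversion. The options that appear elsewhere in this repository (`baseUrl` and `extractors: false` in examples/simple, plus the `linkAsText` / `tableAsText` / `hideImage` trio used as the "ai" mode in examples/worker and the CLI) can be combined as in the sketch below; this is a usage sketch based on those examples, not a complete list of the available options.

```ts
import { htmlToMarkdown } from "webforai";
import { loadHtml } from "webforai/loaders/playwright";

const url = "https://www.npmjs.com/package/webforai";
const html = await loadHtml(url);

// Keep everything, resolving relative links against the page URL.
const readableMarkdown = htmlToMarkdown(html, { baseUrl: url });

// Skip the content extractors entirely and convert the raw page.
const rawMarkdown = htmlToMarkdown(html, { baseUrl: url, extractors: false });

// Flatten links and tables to text and drop images, as the worker's "ai" mode does.
const aiMarkdown = htmlToMarkdown(html, {
	baseUrl: url,
	linkAsText: true,
	tableAsText: true,
	hideImage: true,
});

console.info({ readable: readableMarkdown.length, raw: rawMarkdown.length, ai: aiMarkdown.length });
```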
Promise.all([esmCtx.watch(), cjsCtx.watch(), cliCtx.watch()]); 92 | } else { 93 | Promise.all([esmCtx.rebuild(), cjsCtx.rebuild(), cliCtx.rebuild()]).then(() => 94 | Promise.all([esmCtx.dispose(), cjsCtx.dispose(), cliCtx.dispose()]), 95 | ); 96 | } 97 | 98 | exec(`tsc ${isWatch ? "-w" : ""} --declaration --project tsconfig.build.json`); 99 | -------------------------------------------------------------------------------- /packages/webforai/package.cjs.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "commonjs" 3 | } 4 | -------------------------------------------------------------------------------- /packages/webforai/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webforai", 3 | "version": "2.1.1", 4 | "description": "A library that provides a web interface for AI", 5 | "author": "inaridiy", 6 | "license": "Apache-2.0", 7 | "keywords": [ 8 | "web", 9 | "ai", 10 | "html", 11 | "html2md", 12 | "markdown", 13 | "mdast", 14 | "hast" 15 | ], 16 | "repository": { 17 | "type": "git", 18 | "url": "https://github.com/inaridiy/webforai.git" 19 | }, 20 | "homepage": "https://webforai.dev/", 21 | "scripts": { 22 | "copy:package.cjs.json": "pnpm ncp ./package.cjs.json ./dist/cjs/package.json && pnpm ncp ./package.cjs.json ./dist/types/package.json ", 23 | "clean": "rimraf dist", 24 | "build": "pnpm clean && tsx build.ts && pnpm copy:package.cjs.json", 25 | "typecheck": "tsc --noEmit", 26 | "prerelease": "pnpm build", 27 | "release": "np" 28 | }, 29 | "files": [ 30 | "dist", 31 | "!dist/types/**/*.js" 32 | ], 33 | "main": "dist/cjs/index.js", 34 | "type": "module", 35 | "module": "dist/index.js", 36 | "types": "dist/types/index.d.ts", 37 | "bin": "dist/bin.js", 38 | "exports": { 39 | ".": { 40 | "types": "./dist/types/index.d.ts", 41 | "import": "./dist/index.js", 42 | "require": "./dist/cjs/index.js" 43 | }, 44 | "./types": { 45 | "types": "./dist/types/index.d.ts", 46 | "import": "./dist/index.js", 47 | "require": "./dist/cjs/index.js" 48 | }, 49 | "./loaders/playwright": { 50 | "types": "./dist/types/loaders/playwright.d.ts", 51 | "import": "./dist/loaders/playwright.js", 52 | "require": "./dist/cjs/loaders/playwright.js" 53 | }, 54 | "./loaders/fetch": { 55 | "types": "./dist/types/loaders/fetch.d.ts", 56 | "import": "./dist/loaders/fetch.js", 57 | "require": "./dist/cjs/loaders/fetch.js" 58 | }, 59 | "./loaders/cf-puppeteer": { 60 | "types": "./dist/types/loaders/cf-puppeteer.d.ts", 61 | "import": "./dist/loaders/cf-puppeteer.js", 62 | "require": "./dist/cjs/loaders/cf-puppeteer.js" 63 | }, 64 | "./loaders/puppeteer": { 65 | "types": "./dist/types/loaders/puppeteer.d.ts", 66 | "import": "./dist/loaders/puppeteer.js", 67 | "require": "./dist/cjs/loaders/puppeteer.js" 68 | } 69 | }, 70 | "typesVersions": { 71 | "*": { 72 | "types": [ 73 | "./dist/types/index.d.ts" 74 | ], 75 | "loaders/playwright": [ 76 | "./dist/types/loaders/playwright.d.ts" 77 | ], 78 | "loaders/cf-puppeteer": [ 79 | "./dist/types/loaders/cf-puppeteer.d.ts" 80 | ], 81 | "loaders/fetch": [ 82 | "./dist/types/loaders/fetch.d.ts" 83 | ], 84 | "loaders/puppeteer": [ 85 | "./dist/types/loaders/puppeteer.d.ts" 86 | ] 87 | } 88 | }, 89 | "peerDependencies": { 90 | "@cloudflare/puppeteer": ">=0.0.6", 91 | "playwright-core": ">=1.4", 92 | "puppeteer": ">=22" 93 | }, 94 | "peerDependenciesMeta": { 95 | "@cloudflare/puppeteer": { 96 | "optional": true 97 | }, 98 | "playwright-core": { 99 | "optional": false 100 | }, 101 
| "puppeteer": { 102 | "optional": true 103 | } 104 | }, 105 | "dependencies": { 106 | "@clack/prompts": "^0.7.0", 107 | "boxen": "^8.0.1", 108 | "commander": "^12.1.0", 109 | "hast-util-from-html": "^2.0.3", 110 | "hast-util-select": "^6.0.2", 111 | "hast-util-to-html": "^9.0.3", 112 | "hast-util-to-mdast": "^10.1.0", 113 | "hast-util-to-string": "^3.0.0", 114 | "hast-util-to-text": "^4.0.0", 115 | "mathml-to-latex": "^1.4.1", 116 | "mdast-util-gfm": "^3.0.0", 117 | "mdast-util-math": "^3.0.0", 118 | "mdast-util-to-markdown": "^2.1.0", 119 | "picocolors": "^1.0.1", 120 | "trim-trailing-lines": "^2.1.0", 121 | "unist-util-filter": "^5.0.1", 122 | "zx": "^8.1.5" 123 | }, 124 | "devDependencies": { 125 | "@cloudflare/puppeteer": "^0.0.6", 126 | "@tsconfig/recommended": "^1.0.3", 127 | "@types/hast": "^3.0.2", 128 | "@types/mdast": "^4.0.2", 129 | "@types/node": "^20.14.10", 130 | "arg": "^5.0.2", 131 | "esbuild": "^0.19.11", 132 | "fastest-levenshtein": "^1.0.16", 133 | "glob": "^10.3.10", 134 | "ncp": "^2.0.0", 135 | "np": "^9.2.0", 136 | "playwright-core": "^1.40.1", 137 | "puppeteer": "^23.2.2", 138 | "rimraf": "^5.0.5", 139 | "tsx": "^4.19.1", 140 | "typescript": "^5.4.5" 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/bin.ts: -------------------------------------------------------------------------------- 1 | import { program } from "commander"; 2 | import packageInfo from "../../package.json"; 3 | import { webforaiCommand } from "./commands/webforai"; 4 | import { LOADERS, MODES } from "./constants"; 5 | 6 | program 7 | .name("webforai") 8 | .description("CLI tool for ultra-precise HTML to Markdown conversion") 9 | .version(packageInfo.version, "-v, --version", "output the current version"); 10 | 11 | program 12 | .argument("[source]", "URL or path to process") 13 | .option("-o, --output ", "Path to output file or directory") 14 | .option("-m, --mode ", `Processing mode (${MODES.join(", ")})`) 15 | .option("-l, --loader ", `Loader to use (${LOADERS.join(", ")})`) 16 | .option("-d, --debug", "output extra debugging information") 17 | .action(webforaiCommand); 18 | 19 | program.parse(); 20 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/commands/webforai/index.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import path from "node:path"; 3 | import { intro, log, outro, spinner } from "@clack/prompts"; 4 | import pc from "picocolors"; 5 | import packageInfo from "../../../../package.json"; 6 | import { htmlToMarkdown } from "../../../html-to-markdown"; 7 | import { inputOutputPath } from "../../helpers/inputOutputPath"; 8 | import { inputSourcePath } from "../../helpers/inputSourcePath"; 9 | import { selectExtractMode } from "../../helpers/selectExtractMode"; 10 | import { selectLoader } from "../../helpers/selectLoader"; 11 | import { isUrl } from "../../utils"; 12 | import { loadHtml } from "./loadHtml"; 13 | 14 | const aiModeOptions = { linkAsText: true, tableAsText: true, hideImage: true }; 15 | const readabilityModeOptions = { linkAsText: false, tableAsText: false, hideImage: false }; 16 | 17 | export const webforaiCommand = async ( 18 | initialPath: string, 19 | options: { output?: string; mode?: string; loader?: string; debug?: boolean }, 20 | ) => { 21 | intro(pc.bold(pc.green(`webforai CLI version ${packageInfo.version}`))); 22 | 23 | const sourcePath = 
initialPath ?? (await inputSourcePath()); 24 | options.debug && log.info(`sourcePath: ${sourcePath}`); 25 | 26 | const loader = isUrl(sourcePath) ? options.loader ?? (await selectLoader()) : "local"; 27 | options.debug && log.info(`loader: ${loader}`); 28 | 29 | const outputPath = options.output ?? (await inputOutputPath(sourcePath)); 30 | options.debug && log.info(`outputPath: ${outputPath}`); 31 | 32 | const mode = options.mode ?? (await selectExtractMode()); 33 | options.debug && log.info(`mode: ${mode}`); 34 | 35 | let html: string; 36 | const s = spinner(); 37 | try { 38 | s.start("Loading content..."); 39 | html = await loadHtml(sourcePath, loader, { debug: options.debug }); 40 | s.stop(pc.green("Content loaded!")); 41 | } catch (error) { 42 | s.stop(pc.red("Content loading failed!")); 43 | console.error(error); 44 | process.exit(1); 45 | } 46 | options.debug && log.info(`html: ${html}`); 47 | 48 | const markdown = htmlToMarkdown(html, { 49 | baseUrl: isUrl(sourcePath) ? sourcePath : undefined, 50 | ...(mode === "ai" ? aiModeOptions : readabilityModeOptions), 51 | }); 52 | options.debug && log.info(`markdown: ${markdown}`); 53 | 54 | const directory = path.dirname(outputPath); 55 | const isDirectoryExists = await fs.stat(directory).then((stat) => stat.isDirectory()); 56 | 57 | if (!isDirectoryExists) { 58 | await fs.mkdir(directory, { recursive: true }); 59 | } 60 | await fs.writeFile(outputPath, markdown); 61 | 62 | outro(pc.green(`${pc.bold("Done!")} Markdown saved to ${outputPath}`)); 63 | }; 64 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/commands/webforai/loadHtml.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import { fileURLToPath } from "node:url"; 3 | import { log } from "@clack/prompts"; 4 | import boxen from "boxen"; 5 | import pc from "picocolors"; 6 | import { chromium } from "playwright-core"; 7 | import { loadHtml as loadHtmlByFetch } from "../../../loaders/fetch"; 8 | import { loadHtml as loadHtmlByPlaywright } from "../../../loaders/playwright"; 9 | 10 | const checkPlaywrightAvailable = async () => { 11 | const path = chromium.executablePath(); 12 | try { 13 | await fs.access(path); 14 | return true; 15 | } catch { 16 | return false; 17 | } 18 | }; 19 | 20 | const getPlaywrightVersion = async () => { 21 | const path = await import.meta.resolve("playwright-core/package.json"); 22 | const pwPackageJson = await fs 23 | .readFile(fileURLToPath(path), "utf-8") 24 | .then((res) => JSON.parse(res.toString())) 25 | .catch(() => null); 26 | return pwPackageJson?.version; 27 | }; 28 | 29 | export const loadHtml = async (sourcePath: string, loader: string, options: { debug?: boolean }) => { 30 | if (loader === "local") { 31 | options.debug && log.info(`Loading HTML from local file: ${sourcePath}`); 32 | const content = await fs.readFile(sourcePath, "utf-8"); 33 | options.debug && log.info(`HTML loaded: ${content.slice(0, 100)}`); 34 | return content; 35 | } 36 | 37 | if (loader === "fetch") { 38 | options.debug && log.info(`Loading HTML from URL: ${sourcePath}`); 39 | const content = await loadHtmlByFetch(sourcePath); 40 | options.debug && log.info(`HTML loaded: ${content.slice(0, 100)}`); 41 | return content; 42 | } 43 | 44 | if (loader === "playwright") { 45 | options.debug && log.info(`Loading HTML from playwright: ${sourcePath}`); 46 | const isPlaywrightAvailable = await checkPlaywrightAvailable(); 47 | options.debug && 
log.info(`Playwright available: ${isPlaywrightAvailable}`); 48 | 49 | const pwVersion = await getPlaywrightVersion(); 50 | 51 | if (!isPlaywrightAvailable) { 52 | const message = [ 53 | pc.bold("Error: Playwright is not available"), 54 | "", 55 | "To use the Playwright loader, please install Playwright by running:", 56 | "", 57 | ` npx playwright@${pwVersion} install chromium`, 58 | "", 59 | "Hint 1: If you receive a warning like this:", 60 | ` "WARNING: It looks like you are running 'npx playwright install' without first installing your project's dependencies."`, 61 | "You can safely ignore this warning.", 62 | "", 63 | "Hint 2: If you encounter the following message:", 64 | ` "Host system is missing dependencies to run browsers."`, 65 | "You should install the necessary dependencies by executing:", 66 | "", 67 | ` sudo npx playwright@${pwVersion} install-deps`, 68 | ]; 69 | 70 | log.error(boxen(message.join("\n"), { padding: 1, borderStyle: "round" })); 71 | throw new Error("Playwright is not available"); 72 | } 73 | const content = await loadHtmlByPlaywright(sourcePath); 74 | options.debug && log.info(`HTML loaded: ${content.slice(0, 100)}`); 75 | return content; 76 | } 77 | 78 | throw new Error(`Unsupported loader: ${loader}`); 79 | }; 80 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/constants.ts: -------------------------------------------------------------------------------- 1 | export const DEFAULT_PATH = "https://example.com"; 2 | 3 | export const LOADERS = ["fetch", "playwright"] as const; 4 | export type Loaders = (typeof LOADERS)[number]; 5 | 6 | export const MODES: string[] = ["default", "ai"]; 7 | export type Modes = (typeof MODES)[number]; 8 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/assertContinue.ts: -------------------------------------------------------------------------------- 1 | import { cancel, isCancel } from "@clack/prompts"; 2 | 3 | export function assertContinue(message: T | symbol, cancelMessage = "Canceled."): asserts message is T { 4 | if (isCancel(message)) { 5 | cancel(cancelMessage); 6 | process.exit(1); 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/inputOutputPath.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import { confirm, text } from "@clack/prompts"; 3 | import { getNextAvailableFilePath, sourcePathToOutputPath } from "../utils"; 4 | import { assertContinue } from "./assertContinue"; 5 | 6 | export const inputOutputPath = async (sourcePath: string) => { 7 | const outputPath = await text({ 8 | message: "Enter the output file path:", 9 | placeholder: "output.md", 10 | initialValue: sourcePathToOutputPath(sourcePath), 11 | validate: (value: string) => { 12 | if (value.trim() === "") { 13 | return "Output path is required"; 14 | } 15 | if (fs.existsSync(value) && fs.statSync(value).isDirectory()) { 16 | return "No directory can be specified."; 17 | } 18 | if (!fs.existsSync(value) && value.endsWith("/")) { 19 | return "No directory can be specified."; 20 | } 21 | }, 22 | }); 23 | assertContinue(outputPath); 24 | 25 | if (!fs.existsSync(outputPath)) { 26 | return outputPath; 27 | } 28 | 29 | const isOutputFileOverwrite = await confirm({ 30 | message: "The file already exists. 
Overwrite?", 31 | initialValue: false, 32 | }); 33 | assertContinue(isOutputFileOverwrite); 34 | 35 | if (isOutputFileOverwrite) { 36 | return outputPath; 37 | } 38 | 39 | const escapedOutputPath = await text({ 40 | message: "Enter the output file path:", 41 | placeholder: "output.md", 42 | initialValue: getNextAvailableFilePath(outputPath), 43 | validate: (value: string) => { 44 | if (value.trim() === "") { 45 | return "Output path is required"; 46 | } 47 | if (fs.existsSync(value)) { 48 | return "The file already exists"; 49 | } 50 | if (!fs.existsSync(value) && value.endsWith("/")) { 51 | return "No directory can be specified."; 52 | } 53 | }, 54 | }); 55 | 56 | assertContinue(escapedOutputPath); 57 | 58 | return escapedOutputPath; 59 | }; 60 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/inputSourcePath.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import { text } from "@clack/prompts"; 3 | import { DEFAULT_PATH } from "../constants"; 4 | import { isUrl } from "../utils"; 5 | import { assertContinue } from "./assertContinue"; 6 | 7 | export const inputSourcePath = async () => { 8 | const result = await text({ 9 | message: "Enter the URL or html path to be converted to markdown:", 10 | placeholder: DEFAULT_PATH, 11 | initialValue: "", 12 | validate: (value: string) => { 13 | if (value.trim() === "") { 14 | return "Source is required"; 15 | } 16 | if (!isUrl(value)) { 17 | if (!fs.existsSync(value)) { 18 | return "It appears that you are specifying a local file, but the file cannot be found. hint: when specifying a url, start with http or https."; 19 | } 20 | if (fs.statSync(value).isDirectory()) { 21 | return "You are specifying a local file, but you cannot specify a directory. 
hint: when specifying a url, start with http or https."; 22 | } 23 | } 24 | }, 25 | }); 26 | assertContinue(result); 27 | 28 | return result; 29 | }; 30 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/selectExtractMode.ts: -------------------------------------------------------------------------------- 1 | import { select } from "@clack/prompts"; 2 | import { MODES } from "../constants"; 3 | import { assertContinue } from "./assertContinue"; 4 | 5 | export const selectExtractMode = async () => { 6 | const result = await select({ 7 | message: "Select processing mode:", 8 | options: MODES.map((mode) => ({ value: mode, label: mode })), 9 | initialValue: MODES[0], 10 | }); 11 | 12 | assertContinue(result); 13 | 14 | return result; 15 | }; 16 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/selectLoader.ts: -------------------------------------------------------------------------------- 1 | import { select } from "@clack/prompts"; 2 | import { LOADERS } from "../constants"; 3 | import { assertContinue } from "./assertContinue"; 4 | 5 | const loadersHint = { 6 | fetch: "Fetch HTML content from the given URL", 7 | playwright: "Retrieve HTML content after rendering using Playwright; Playwright must be installed in advance.", 8 | puppeteer: "Retrieve HTML content after rendering using Puppeteer; Puppeteer must be installed in advance.", 9 | } as const; 10 | 11 | export const selectLoader = async () => { 12 | const result = await select({ 13 | message: "Select loader:", 14 | initialValue: "fetch", 15 | options: LOADERS.map((mode) => ({ value: mode, label: mode, hint: loadersHint[mode] || "" })), 16 | }); 17 | assertContinue(result); 18 | 19 | return result; 20 | }; 21 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/utils.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import path from "node:path"; 3 | 4 | export const isUrl = (maybeUrl: string) => { 5 | try { 6 | new URL(maybeUrl); 7 | return true; 8 | } catch { 9 | return false; 10 | } 11 | }; 12 | 13 | export function changeFileExtension(filePath: string, newExtension: string): string { 14 | const parsedPath = filePath.split("/"); 15 | const fileName = parsedPath[parsedPath.length - 1]; 16 | 17 | const formattedNewExtension = newExtension.startsWith(".") ? newExtension : `.${newExtension}`; 18 | 19 | if (fileName.startsWith(".")) { 20 | const parts = fileName.split("."); 21 | if (parts.length === 2) { 22 | return parsedPath.slice(0, -1).concat(`${fileName}${formattedNewExtension}`).join("/"); 23 | } 24 | parts[parts.length - 1] = newExtension.replace(/^\./, ""); 25 | return parsedPath.slice(0, -1).concat(parts.join(".")).join("/"); 26 | } 27 | 28 | const lastDotIndex = fileName.lastIndexOf("."); 29 | const baseName = lastDotIndex !== -1 ? 
fileName.slice(0, lastDotIndex) : fileName; 30 | const newFileName = `${baseName}${formattedNewExtension}`; 31 | 32 | parsedPath[parsedPath.length - 1] = newFileName; 33 | return parsedPath.join("/"); 34 | } 35 | 36 | export function urlToFilename(url: string): string { 37 | try { 38 | const urlObj = new URL(url); 39 | 40 | const domainParts = urlObj.hostname 41 | .split(".") 42 | .reverse() 43 | .reduce((acc: string[], part: string, index: number) => { 44 | if (index === 0) { 45 | return acc; 46 | } 47 | if (acc.length >= 2) { 48 | return acc; 49 | } 50 | if (part === "www") { 51 | return acc; 52 | } 53 | // biome-ignore lint/performance/noAccumulatingSpread: 54 | return [part, ...acc]; 55 | }, []); 56 | const domainString = domainParts.reverse().join("-"); 57 | 58 | const pathParts = urlObj.pathname.split("/").filter(Boolean); 59 | const relevantPathParts = pathParts.slice(-2); 60 | const pathString = relevantPathParts.map((part) => decodeURIComponent(part)).join("-"); 61 | 62 | let filename = [domainString, pathString].filter(Boolean).join("-"); 63 | 64 | filename = filename 65 | .toLowerCase() 66 | // biome-ignore lint/suspicious/noControlCharactersInRegex: 67 | .replace(/[<>:"/\\|?*\x00-\x1F]/g, "") 68 | .replace(/[\s.]+/g, "-") 69 | .replace(/^-+|-+$/g, ""); 70 | 71 | return filename || "output"; 72 | } catch { 73 | return "output"; 74 | } 75 | } 76 | 77 | export const sourcePathToOutputPath = (sourcePath: string) => { 78 | return isUrl(sourcePath) ? `${urlToFilename(sourcePath)}.md` : changeFileExtension(sourcePath, "md"); 79 | }; 80 | 81 | export function getNextAvailableFilePath(filePath: string): string { 82 | const parsedPath = path.parse(filePath); 83 | const directory = parsedPath.dir; 84 | const fullName = parsedPath.base; 85 | 86 | const [firstPart, ...restParts] = fullName.split("."); 87 | const restName = restParts.length > 0 ? 
`.${restParts.join(".")}` : ""; 88 | 89 | const baseName = firstPart.replace(/_\d+$/, ""); 90 | 91 | let counter = 1; 92 | let nextFilePath = filePath; 93 | 94 | while (fs.existsSync(nextFilePath)) { 95 | const match = firstPart.match(/_(\d+)$/); 96 | if (match) { 97 | counter = Number.parseInt(match[1], 10) + 1; 98 | } 99 | const newName = `${baseName}_${counter}${restName}`; 100 | nextFilePath = path.join(directory, newName); 101 | counter++; 102 | } 103 | 104 | return nextFilePath; 105 | } 106 | -------------------------------------------------------------------------------- /packages/webforai/src/constants.ts: -------------------------------------------------------------------------------- 1 | import { takumiExtractor } from "./extractors/presets/takumi"; 2 | 3 | export const DEFAULT_EXTRACTORS = [takumiExtractor]; 4 | -------------------------------------------------------------------------------- /packages/webforai/src/extract-mdast.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, Parent } from "mdast"; 2 | import { filter } from "unist-util-filter"; 3 | 4 | const DECLATION_TYPES = ["blockquote", "strong", "emphasis", "delete"]; 5 | 6 | const emptyDeclarationFilter = (node: Mdast) => { 7 | if (!DECLATION_TYPES.includes(node.type)) { 8 | return true; 9 | } 10 | if ((node as Parent).children.length === 0) { 11 | return false; 12 | } 13 | 14 | return true; 15 | }; 16 | 17 | export const extractMdast = (node: Mdast) => { 18 | const extracted = filter(node, (node) => { 19 | if (!emptyDeclarationFilter(node as Mdast)) { 20 | return false; 21 | } 22 | return true; 23 | }); 24 | return extracted as Mdast; 25 | }; 26 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/index.ts: -------------------------------------------------------------------------------- 1 | // biome-ignore lint/performance/noBarrelFile: module index 2 | export { 3 | pipeExtractors, 4 | type ExtractorSelectors, 5 | type ExtractorSelector, 6 | } from "./pipeExtractors"; 7 | export { takumiExtractor } from "./presets/takumi"; 8 | export { type ExtractParams, type Extractor } from "./types"; 9 | export { minimalFilter } from "./presets/minimal-filter"; 10 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/pipeExtractors.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | import { DEFAULT_EXTRACTORS } from "../constants"; 3 | import type { ExtractParams, Extractor } from "./types"; 4 | 5 | export type ExtractorSelector = Extractor | false; 6 | export type ExtractorSelectors = ExtractorSelector | ExtractorSelector[]; 7 | 8 | export const pipeExtractors = (params: ExtractParams, extractors: ExtractorSelectors = DEFAULT_EXTRACTORS): Hast => { 9 | const { hast, lang } = params; 10 | const _extractors = Array.isArray(extractors) ? 
extractors : [extractors]; 11 | 12 | const extracted = 13 | _extractors.reduce((acc, extractor) => { 14 | if (extractor === false) { 15 | return acc; 16 | } 17 | if (typeof extractor === "function") { 18 | return extractor({ hast: acc, lang }); 19 | } 20 | throw new Error(`Invalid extractor: ${extractor}`); 21 | }, hast) || hast; 22 | 23 | return extracted; 24 | }; 25 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/presets/minimal-filter.ts: -------------------------------------------------------------------------------- 1 | import type { Element, Nodes as Hast } from "hast"; 2 | import { select } from "hast-util-select"; 3 | import { toString as hastToString } from "hast-util-to-string"; 4 | import { filter } from "unist-util-filter"; 5 | import type { ExtractParams } from "../types"; 6 | import { classnames, isStrInclude, matchString } from "./utils"; 7 | 8 | const UNLIKELY_ROLES = ["menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog"]; 9 | 10 | /* 11 | * This section of the code is influenced by @mozilla/readability, licensed under Apache License 2.0. 12 | * Original copyright (c) 2010 Arc90 Inc 13 | * See https://github.com/mozilla/readability for the full license text. 14 | * Modifications made by inaridiy 15 | * - Added and edited some regular expressions. 16 | */ 17 | const REGEXPS = { 18 | hidden: /hidden|invisible|fallback-image/i, 19 | byline: /byline|author|dateline|writtenby|p-author/i, 20 | specialUnlikelyCandidates: /frb-|uls-menu|language-link/i, 21 | unlikelyCandidates: 22 | /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|tooltip|disqus|extra|footer|gdpr|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore|avatar/i, 23 | okMaybeItsaCandidate: /and|article|body|column|content|main|shadow|code/i, 24 | }; 25 | 26 | const metadataFilter = (node: Hast) => { 27 | return !( 28 | ["comment", "doctype"].includes(node.type) || 29 | (node.type === "element" && ["script", "style", "link", "meta", "noscript", "svg", "title"].includes(node.tagName)) 30 | ); 31 | }; 32 | 33 | const universalElementFilter = (node: Hast) => { 34 | if (node.type !== "element") { 35 | return true; 36 | } 37 | const element = node as Element; 38 | 39 | if (["aside", "nav"].includes(element.tagName)) { 40 | return false; 41 | } 42 | 43 | // Remove elements with hidden properties 44 | if (["hidden", "aria-hidden"].some((key) => element.properties[key])) { 45 | return false; 46 | } 47 | if (classnames(element).some((classname) => REGEXPS.hidden.test(classname))) { 48 | return false; 49 | } 50 | 51 | // Remove dialog elements 52 | if (element.tagName === "dialog") { 53 | return false; 54 | } 55 | if (element.properties.role === "dialog" && element.properties["aria-modal"]) { 56 | return false; 57 | } 58 | 59 | // Remove byline elements 60 | if (element.properties.rel === "author" && isStrInclude(element.properties.itemprop, "author")) { 61 | return false; 62 | } 63 | if (REGEXPS.byline.test(matchString(element))) { 64 | return false; 65 | } 66 | 67 | // Remove unlikely roles 68 | if (element.properties.role && UNLIKELY_ROLES.includes(element.properties.role as string)) { 69 | return false; 70 | } 71 | 72 | return true; 73 | }; 74 | 75 | /** 76 | * Simple filter to remove unwanted elements from the HAST tree. 
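 * If the aggressive element filtering leaves too little text (under a third of the metadata-filtered text and under 5000 characters), the result falls back to the metadata-only filtered tree.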
77 | * 78 | * @param params - {@link ExtractParams} 79 | * @returns The HAST tree. 80 | */ 81 | export const minimalFilter = (params: ExtractParams): Hast => { 82 | const { hast } = params; 83 | const body = select("body", hast) ?? hast; 84 | 85 | const metadataFilteredHast = filter(body, (node) => metadataFilter(node as Hast)); 86 | const metadataFilteredHastText = metadataFilteredHast && hastToString(metadataFilteredHast); 87 | if (!(metadataFilteredHast && metadataFilteredHastText)) { 88 | return body; 89 | } 90 | 91 | const baseFilterd = filter(metadataFilteredHast, (node) => universalElementFilter(node as Hast)); 92 | const baseFilterdText = baseFilterd ? hastToString(baseFilterd) : ""; 93 | 94 | const isOverExtracted = baseFilterdText.length > metadataFilteredHastText.length / 3 || baseFilterdText.length > 5000; 95 | const baseTree = isOverExtracted && baseFilterd ? baseFilterd : metadataFilteredHast; 96 | 97 | return baseTree; 98 | }; 99 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/presets/utils.ts: -------------------------------------------------------------------------------- 1 | import type { Element } from "hast"; 2 | 3 | export const matchString = (element: Element) => 4 | `${element.tagName} ${element.properties.id} ${classnames(element).join(" ")}`; 5 | 6 | export const classnames = (element: Element) => { 7 | if (Array.isArray(element.properties.className)) { 8 | return element.properties.className as string[]; 9 | } 10 | return []; 11 | }; 12 | 13 | export const isStrInclude = (value: unknown, match: string) => { 14 | if (typeof value === "string") { 15 | return value.includes(match); 16 | } 17 | return false; 18 | }; 19 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/types.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | 3 | export type ExtractParams = { hast: Hast; lang?: string; url?: string }; 4 | export type Extractor = (param: ExtractParams) => Hast; 5 | -------------------------------------------------------------------------------- /packages/webforai/src/html-to-markdown.test.ts: -------------------------------------------------------------------------------- 1 | import { distance } from "fastest-levenshtein"; 2 | import { describe, expect, it } from "vitest"; 3 | import { htmlToMarkdown } from "./html-to-markdown"; 4 | import { loadHtml } from "./loaders/fetch"; 5 | 6 | const html = ` 7 | Hello, world! 8 | This is a paragraph. 9 | Example 10 | 11 | 12 | Item 1 13 | Item 2 14 | 15 | `; 16 | 17 | const expected = `# Hello, world! 18 | 19 | This is a paragraph. 20 | 21 | [Example](/example.html) 22 | 23 |  24 | 25 | * Item 1 26 | * Item 2 27 | `; 28 | 29 | const baseUrlReplaced = `# Hello, world! 30 | 31 | This is a paragraph. 32 | 33 | [Example](https://example.com/example.html) 34 | 35 |  36 | 37 | * Item 1 38 | * Item 2 39 | `; 40 | 41 | const linkAsText = `# Hello, world! 42 | 43 | This is a paragraph. 44 | 45 | Example 46 | 47 |  48 | 49 | - Item 1 50 | - Item 2 51 | `; 52 | 53 | const imageHidden = `# Hello, world! 54 | 55 | This is a paragraph. 
56 | 57 | [Example](/example.html) 58 | 59 | - Item 1 60 | - Item 2 61 | `; 62 | 63 | const htmlTable = ` 64 | 65 | 66 | Header 1 67 | Header 2 68 | 69 | 70 | Cell 1 71 | Cell 2 72 | 73 | 74 | `; 75 | 76 | const expectedTableMarkdown = ` 77 | | Header 1 | Header 2 | 78 | | -------- | -------- | 79 | | Cell 1 | Cell 2 | 80 | `; 81 | 82 | const expectedTableText = `Header 1 Header 2 83 | Cell 1 Cell 2`; 84 | 85 | describe("htmlToMarkdown", () => { 86 | it("should convert HTML to Markdown", () => { 87 | const markdown = htmlToMarkdown(html, { extractors: false }); 88 | const d = distance(markdown, expected); 89 | expect(d).lte(5); 90 | }); 91 | 92 | it("should convert HTML to Markdown with replaced base URL", () => { 93 | const markdown = htmlToMarkdown(html, { baseUrl: "https://example.com", extractors: false }); 94 | const d = distance(markdown, baseUrlReplaced); 95 | expect(d).lte(5); 96 | }); 97 | 98 | it("should convert HTML to Markdown with links as text", () => { 99 | const markdown = htmlToMarkdown(html, { linkAsText: true, extractors: false }); 100 | const d = distance(markdown, linkAsText); 101 | expect(d).lte(5); 102 | }); 103 | 104 | it("should convert HTML to Markdown with hidden images", () => { 105 | const markdown = htmlToMarkdown(html, { hideImage: true, extractors: false }); 106 | const d = distance(markdown, imageHidden); 107 | expect(d).lte(5); 108 | }); 109 | 110 | it("should convert HTML table to Markdown table", () => { 111 | const markdown = htmlToMarkdown(htmlTable, { extractors: false }); 112 | const d = distance(markdown, expectedTableMarkdown); 113 | expect(d).lte(5); 114 | }); 115 | 116 | it("should convert HTML table with table as text option", () => { 117 | const markdown = htmlToMarkdown(htmlTable, { tableAsText: true, extractors: false }); 118 | const d = distance(markdown, expectedTableText); 119 | expect(d).lte(10); // Allow a higher distance due to the difference in formatting 120 | }); 121 | }); 122 | 123 | describe("htmlToMarkdown E2E", () => { 124 | it("Converting for good", async () => { 125 | const html1 = await loadHtml("https://www.npmjs.com/package/webforai"); 126 | const markdown1 = htmlToMarkdown(html1, { linkAsText: true, hideImage: true }); 127 | 128 | const html2 = await loadHtml("https://github.com/inaridiy/webforai"); 129 | const markdown2 = htmlToMarkdown(html2, { linkAsText: true, hideImage: true }); 130 | 131 | // @ts-ignore 132 | 133 | const d = distance(markdown1, markdown2); 134 | expect(d).lte(2500); // I'd like to optimise more! 135 | }); 136 | }); 137 | -------------------------------------------------------------------------------- /packages/webforai/src/html-to-markdown.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | import { type HtmlToMdastOptions, htmlToMdast } from "./html-to-mdast"; 3 | import { type MdastToMarkdownOptions, mdastToMarkdown } from "./mdast-to-markdown"; 4 | 5 | export interface HtmlToMarkdownOptions extends HtmlToMdastOptions { 6 | /** The base URL to use for replacing relative links. */ 7 | baseUrl?: string; 8 | /** Formatting options passed to [mdast-util-to-markdown](https://github.com/syntax-tree/mdast-util-to-markdown). */ 9 | formatting?: Omit; 10 | } 11 | 12 | /** 13 | * Converts HTML or HAST to a Markdown string. 14 | * 15 | * @param htmlOrHast - The HTML string or HAST tree to convert. 16 | * @param options - {@link HtmlToMarkdownOptions} to customize the conversion. 17 | * @returns The Markdown string. 
18 | * 19 | * @example 20 | * ```ts 21 | * import { htmlToMarkdown } from "webforai" 22 | * 23 | * const html = 'Hello, world!'; 24 | * const markdown = htmlToMarkdown(html); 25 | * 26 | * console.log(markdown); // Output: "# Hello, world!" 27 | * ``` 28 | */ 29 | export const htmlToMarkdown = (htmlOrHast: string | Hast, options?: HtmlToMarkdownOptions): string => { 30 | const { baseUrl, formatting: toMarkdownOptions, ...toMdastOptions } = options || {}; 31 | const mdast = htmlToMdast(htmlOrHast, toMdastOptions); 32 | const markdown = mdastToMarkdown(mdast, { baseUrl, ...toMarkdownOptions }); 33 | return markdown; 34 | }; 35 | -------------------------------------------------------------------------------- /packages/webforai/src/html-to-mdast.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | import { fromHtml } from "hast-util-from-html"; 3 | import { toMdast } from "hast-util-to-mdast"; 4 | import type { Nodes as Mdast } from "mdast"; 5 | 6 | import { extractMdast } from "./extract-mdast"; 7 | import { type ExtractorSelectors, pipeExtractors } from "./extractors"; 8 | import { customAHandler } from "./mdast-handlers/custom-a-handler"; 9 | import { customCodeHandler } from "./mdast-handlers/custom-code-handler"; 10 | import { customDivHandler } from "./mdast-handlers/custom-div-handler"; 11 | import { customImgHandler } from "./mdast-handlers/custom-img-handler"; 12 | import { customTableHandler } from "./mdast-handlers/custom-table-handler"; 13 | import { mathHandler } from "./mdast-handlers/math-handler"; 14 | import { getLangFromHast, getLangFromStr, getUrlFromHast } from "./utils/hast-utils"; 15 | 16 | export type HtmlToMdastOptions = { 17 | /** 18 | * An array of extractors to extract specific elements from the HTML. 19 | * You can define your own functions in addition to the Extractor provided as a preset. 20 | */ 21 | extractors?: ExtractorSelectors; 22 | /** Whether to convert links to plain text. */ 23 | linkAsText?: boolean; 24 | /** Whether to convert tables to plain text. */ 25 | tableAsText?: boolean; 26 | /** Whether to hide images. */ 27 | hideImage?: boolean; 28 | /** The language of the HTML. */ 29 | lang?: string; 30 | /** The URL of the HTML. */ 31 | url?: string; 32 | }; 33 | 34 | /** 35 | * Converts an HTML string or HAST tree to an MDAST tree. 36 | * 37 | * @param htmlOrHast - The HTML string or HAST tree to convert. 38 | * @param options - {@link HtmlToMdastOptions} to customize the conversion. 39 | * @returns The MDAST tree. 40 | * 41 | * @example 42 | * ```ts 43 | * import { htmlToMdast } from 'webforai'; 44 | * 45 | * const html = 'Hello, world!'; 46 | * const mdast = htmlToMdast(html); 47 | * 48 | * console.log(mdast); // Output: { type: 'root', children: [ { type: 'heading', depth: 1, children: [ { type: 'text', value: 'Hello, world!' } ] } ] } 49 | * ``` 50 | */ 51 | export const htmlToMdast = (htmlOrHast: string | Hast, options?: HtmlToMdastOptions): Mdast => { 52 | const { extractors, url: defaultUrl, lang: defaultLang } = options || {}; 53 | 54 | const [lang, hast] = 55 | typeof htmlOrHast === "string" 56 | ? 
[defaultLang || getLangFromStr(htmlOrHast), fromHtml(htmlOrHast, { fragment: true })] 57 | : [defaultLang || getLangFromHast(htmlOrHast), htmlOrHast]; 58 | 59 | const url = defaultUrl || getUrlFromHast(hast); 60 | 61 | const extractedHast = pipeExtractors({ hast, lang, url }, extractors); 62 | 63 | const mdast = toMdast(extractedHast, { 64 | handlers: { 65 | math: mathHandler, 66 | div: customDivHandler, 67 | pre: customCodeHandler, 68 | a: customAHandler({ asText: options?.linkAsText }), 69 | img: customImgHandler({ hideImage: options?.hideImage }), 70 | table: customTableHandler({ asText: options?.tableAsText }), 71 | }, 72 | }); 73 | 74 | const extractedMdast = extractMdast(mdast); 75 | 76 | return extractedMdast; 77 | }; 78 | -------------------------------------------------------------------------------- /packages/webforai/src/index.ts: -------------------------------------------------------------------------------- 1 | // biome-ignore lint/performance/noBarrelFile: module index 2 | export { htmlToMarkdown, type HtmlToMarkdownOptions } from "./html-to-markdown"; 3 | export { mdastSplitter } from "./md-splitter"; 4 | export { htmlToMdast, type HtmlToMdastOptions } from "./html-to-mdast"; 5 | export { mdastToMarkdown } from "./mdast-to-markdown"; 6 | export { 7 | pipeExtractors, 8 | takumiExtractor, 9 | minimalFilter, 10 | type ExtractorSelectors, 11 | type ExtractorSelector, 12 | type ExtractParams, 13 | type Extractor, 14 | } from "./extractors"; 15 | -------------------------------------------------------------------------------- /packages/webforai/src/link-replacer.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { linkReplacer } from "./link-replacer"; 3 | 4 | const markdown = `# Hello, world! 5 | 6 | This is a paragraph. 7 | 8 | [Example](/example.html) 9 | 10 |  11 | 12 | [Absolute Link](https://www.google.com) 13 | 14 | [Link with hash](/page#hash) 15 | 16 | [Link with query](/page?query=string)`; 17 | 18 | const expected = `# Hello, world! 19 | 20 | This is a paragraph. 
21 | 22 | [Example](https://example.com/example.html) 23 | 24 |  25 | 26 | [Absolute Link](https://www.google.com) 27 | 28 | [Link with hash](https://example.com/page#hash) 29 | 30 | [Link with query](https://example.com/page?query=string)`; 31 | 32 | describe("linkReplacer", () => { 33 | it("should replace relative links", () => { 34 | const replaced = linkReplacer(markdown, "https://example.com"); 35 | 36 | expect(replaced).toEqual(expected); 37 | }); 38 | 39 | it("should not replace absolute links", () => { 40 | const replaced = linkReplacer("[Absolute Link](https://www.google.com)", "https://example.com"); 41 | 42 | expect(replaced).toEqual("[Absolute Link](https://www.google.com)"); 43 | }); 44 | 45 | it("should handle links with hashes", () => { 46 | const replaced = linkReplacer("[Link with hash](/page#hash)", "https://example.com"); 47 | 48 | expect(replaced).toEqual("[Link with hash](https://example.com/page#hash)"); 49 | }); 50 | 51 | it("should handle links with query parameters", () => { 52 | const replaced = linkReplacer("[Link with query](/page?query=string)", "https://example.com"); 53 | 54 | expect(replaced).toEqual("[Link with query](https://example.com/page?query=string)"); 55 | }); 56 | }); 57 | -------------------------------------------------------------------------------- /packages/webforai/src/link-replacer.ts: -------------------------------------------------------------------------------- 1 | export const linkReplacer = (markdown: string, base: string) => { 2 | const regex = /(!?\[.*?\]\()([^)\s]+)(\))/g; 3 | return markdown.replace(regex, (match, pre, url, post) => { 4 | if (/^(https?:|#)/.test(url)) { 5 | return match; 6 | } 7 | try { 8 | const absoluteUrl = new URL(url, base).href; 9 | return `${pre}${absoluteUrl}${post}`; 10 | } catch { 11 | return match; 12 | } 13 | }); 14 | }; 15 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/cf-puppeteer.ts: -------------------------------------------------------------------------------- 1 | import puppeteer from "@cloudflare/puppeteer"; 2 | 3 | const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); 4 | 5 | export const loadHtml = async (url: string, ctx: puppeteer.BrowserWorker) => { 6 | const browser = await puppeteer.launch(ctx); 7 | const page = await browser.newPage(); 8 | await page.goto(url); 9 | 10 | const html = await page.content(); 11 | 12 | await Promise.race([page.waitForNetworkIdle(), sleep(10000)]); 13 | 14 | await page.close(); 15 | 16 | return html; 17 | }; 18 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/fetch.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { loadHtml } from "./fetch"; 3 | 4 | describe("Fetch loader", () => { 5 | it("should load the HTML of a URL", async () => { 6 | const html = await loadHtml("https://example.com"); 7 | expect(html).toContain("Example Domain"); 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/fetch.ts: -------------------------------------------------------------------------------- 1 | export const USER_AGENT = 2 | "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/125.0.0.0 safari/537.36"; 3 | 4 | /** 5 | * Useful function for load the HTML of a URL using the Fetch API. 
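 * Performs a single GET request with a desktop Chrome user agent and returns the raw response body as text.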
6 | * **Not recommended** for use in production environments. 7 | * @param url - The URL to load. 8 | * @param userAgent - The user agent to use. Default is a Chrome user agent. 9 | * @returns The HTML content of the URL. 10 | */ 11 | export const loadHtml = async (url: string, userAgent = USER_AGENT) => { 12 | const response = await fetch(url, { headers: { "User-Agent": userAgent } }); 13 | return response.text(); 14 | }; 15 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/playwright.test.ts: -------------------------------------------------------------------------------- 1 | import { chromium } from "playwright-core"; 2 | import { describe, expect, it } from "vitest"; 3 | import { loadHtml } from "./playwright"; 4 | 5 | describe("Playwright loader", () => { 6 | it("should load the HTML of a URL", async () => { 7 | const html = await loadHtml("https://example.com"); 8 | expect(html).toContain("Example Domain"); 9 | }); 10 | 11 | it("should load the HTML of a URL using a custom context", async () => { 12 | const context = await chromium.launch({ headless: true }); 13 | const html = await loadHtml("https://example.com", { browser: context }); 14 | 15 | expect(html).toContain("Example Domain"); 16 | expect(context.isConnected()).toBe(true); 17 | await context.close(); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/playwright.ts: -------------------------------------------------------------------------------- 1 | import { type Browser, chromium, devices } from "playwright-core"; 2 | 3 | export type LoadHtmlOptions = { 4 | browser?: Browser; 5 | timeout?: number; 6 | waitUntil?: "load" | "domcontentloaded" | "networkidle"; 7 | superBypassMode?: boolean; 8 | }; 9 | 10 | const SUPER_BYPASS_DEVICE = { 11 | userAgent: 12 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 13 | viewport: { width: 1920, height: 1080 }, 14 | deviceScaleFactor: 1, 15 | hasTouch: false, 16 | isMobile: false, 17 | javaScriptEnabled: true, 18 | locale: "en-US", 19 | timezoneId: "America/New_York", 20 | }; 21 | 22 | /** 23 | * Useful function for load the HTML of a URL using Playwright. 24 | * **Not recommended** for use in production environments. 25 | * @param url - The URL to load. 26 | * @param context - The Playwright browser context to use. If not provided, a new browser context will be created and closed after loading the URL. 27 | * @returns The HTML content of the URL. 28 | * @example 29 | * ```ts 30 | * import { loadHtml } from "webforai/loaders/playwright"; 31 | * 32 | * const html = await loadHtml("https://example.com"); 33 | * console.log(html); 34 | * ``` 35 | */ 36 | export const loadHtml = async (url: string, options?: LoadHtmlOptions) => { 37 | const { browser, waitUntil, timeout, superBypassMode } = options ?? {}; 38 | const _browser = browser ?? (await chromium.launch({ headless: true })); 39 | const context = await _browser.newContext(superBypassMode ? 
SUPER_BYPASS_DEVICE : devices["Desktop Chrome"]); 40 | 41 | if (superBypassMode) { 42 | await context.addInitScript(() => { 43 | Object.defineProperty(navigator, "webdriver", { get: () => undefined }); 44 | Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] }); 45 | Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3, 4, 5] }); 46 | }); 47 | } 48 | 49 | const page = await context.newPage(); 50 | if (superBypassMode) { 51 | await page.route("**/*.js", (route) => { 52 | if (route.request().url().includes("captcha-delivery")) { 53 | return route.abort(); 54 | } 55 | return route.continue(); 56 | }); 57 | } 58 | 59 | await page.goto(url, { waitUntil: waitUntil ?? "load", timeout }); 60 | await page.evaluate(() => { 61 | const elements = document.querySelectorAll("*"); 62 | for (const element of elements) { 63 | const rect = element.getBoundingClientRect(); 64 | element.setAttribute("data-rwidth", rect.width.toString()); 65 | element.setAttribute("data-rheight", rect.height.toString()); 66 | } 67 | }); 68 | const html = await page.content(); 69 | await page.close(); 70 | 71 | if (browser) { 72 | await context.close(); 73 | } else { 74 | await _browser.close(); 75 | } 76 | 77 | return html; 78 | }; 79 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/puppeteer.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { loadHtml } from "./puppeteer"; 3 | 4 | describe("Puppeteer loader", () => { 5 | it("should load the HTML of a URL", async () => { 6 | const html = await loadHtml("https://example.com"); 7 | expect(html).toContain("Example Domain"); 8 | }); 9 | 10 | it("should load the HTML of a URL using a custom puppeteer context", async () => { 11 | const html = await loadHtml("https://example.com", { headless: true }); 12 | 13 | expect(html).toContain("Example Domain"); 14 | }); 15 | }); 16 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/puppeteer.ts: -------------------------------------------------------------------------------- 1 | import puppeteer from "puppeteer"; 2 | import type { PuppeteerLaunchOptions } from "puppeteer"; 3 | 4 | const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); 5 | 6 | export const loadHtml = async (url: string, ctx?: PuppeteerLaunchOptions) => { 7 | const browser = await puppeteer.launch( 8 | ctx || { 9 | headless: true, 10 | args: ["--no-sandbox", "--disable-setuid-sandbox"], 11 | }, 12 | ); 13 | const page = await browser.newPage(); 14 | await page.goto(url); 15 | 16 | const html = await page.content(); 17 | 18 | await Promise.race([page.waitForNetworkIdle(), sleep(10000)]); 19 | 20 | await browser.close(); 21 | 22 | return html; 23 | }; 24 | -------------------------------------------------------------------------------- /packages/webforai/src/md-splitter.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, RootContent } from "mdast"; 2 | import { mdastToMarkdown } from "./mdast-to-markdown"; 3 | import { chunk } from "./utils/common"; 4 | import { internalType, unwarpRoot, warpRoot } from "./utils/mdast-utils"; 5 | 6 | const PRIORITY_SPLITTERS = ["h1", "h2", "h3", "h4", "h5", "h6", "list", "table", "code"]; 7 | type SplitterGenerator = Generator; 8 | const getSplitterGenerator = function* () { 9 | for (const 
splitter of PRIORITY_SPLITTERS) { 10 | yield splitter; 11 | } 12 | }; 13 | 14 | const _mdastSplitter = async ( 15 | contents: RootContent[], 16 | checker: (markdown: string) => Promise, 17 | splitterGenerator: SplitterGenerator, 18 | ): Promise => { 19 | const splitter = splitterGenerator.next().value; 20 | const markdown = mdastToMarkdown(warpRoot(contents)); 21 | if ((await checker(markdown)) || contents.length === 1) { 22 | return [contents]; 23 | } 24 | const chunked = splitter 25 | ? contents.reduce((acc, content) => { 26 | if (internalType(content) === splitter || acc.length === 0) { 27 | acc.push([content]); 28 | return acc; 29 | } 30 | acc[acc.length - 1].push(content); 31 | return acc; 32 | }, []) 33 | : chunk(contents, Math.ceil(contents.length / 2)); 34 | 35 | const splitting = chunked.map((chunk) => _mdastSplitter(chunk, checker, splitterGenerator)); 36 | 37 | return Promise.all(splitting).then((chunks) => chunks.flat()); 38 | }; 39 | 40 | export const mdastSplitter = ( 41 | mdast: Mdast, 42 | checker: (markdown: string) => Promise, 43 | _options?: { signal?: AbortSignal }, //TODO 44 | ): Promise => { 45 | return _mdastSplitter(unwarpRoot(mdast), checker, getSplitterGenerator()); 46 | }; 47 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-a-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | import { toString as hastToString } from "hast-util-to-string"; 3 | 4 | export const customAHandler = 5 | (options?: { asText?: boolean }): Handle => 6 | (state, node) => { 7 | if (options?.asText) { 8 | const text = hastToString(node); 9 | if (3 >= text.length) { 10 | return undefined; 11 | } 12 | 13 | const link = { type: "text", value: text } as const; 14 | state.patch(node, link); 15 | return link; 16 | } 17 | 18 | const link = defaultHandlers.a(state, node); 19 | if (link.children.length > 0) { 20 | return link; 21 | } 22 | return undefined; 23 | }; 24 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-br-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | 3 | export const customBrHandler: Handle = (state, node) => { 4 | return defaultHandlers.br(state, node); 5 | }; 6 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-code-handler.ts: -------------------------------------------------------------------------------- 1 | import type { Handle } from "hast-util-to-mdast"; 2 | import { toText } from "hast-util-to-text"; 3 | import type { Code } from "mdast"; 4 | import { trimTrailingLines } from "trim-trailing-lines"; 5 | import { detectLanguage } from "../utils/detect-code-lang"; 6 | 7 | const LANGUAGE_MATCH_REGEX = [/language-(\w+)/, /highlight-source-(\w+)/, /CodeBlock--language-(\w+)/]; 8 | 9 | export const customCodeHandler: Handle = (state, node) => { 10 | const classNames = (node.properties?.className as string[]) || []; 11 | const codeValue = trimTrailingLines(toText(node)).trim(); 12 | 13 | const classLang = classNames 14 | .map((className) => { 15 | const match = LANGUAGE_MATCH_REGEX.map((regex) => className.match(regex)).find((match) => match); 16 | return match?.[1]; 17 | }) 18 | .find((className) => className); 19 | 20 | 
const lang = classLang || detectLanguage(codeValue) || null; 21 | 22 | const result: Code = { type: "code", lang, meta: null, value: codeValue }; 23 | state.patch(node, result); 24 | return result; 25 | }; 26 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-div-handler.ts: -------------------------------------------------------------------------------- 1 | import { select } from "hast-util-select"; 2 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 3 | import { toString as hastToString } from "hast-util-to-string"; 4 | import { toText } from "hast-util-to-text"; 5 | import type { Code } from "mdast"; 6 | import { trimTrailingLines } from "trim-trailing-lines"; 7 | import { detectLanguage } from "../utils/detect-code-lang"; 8 | 9 | const CODE_BLOCK_REGEX = /highlight-source|language-|codegroup|codeblock|code-block/i; 10 | 11 | const CODE_FILENAME_SELECTORS = "[class*='fileName'],[class*='fileName'],[class*='title'],[class*='Title']"; 12 | 13 | const LANGUAGE_MATCH_REGEX = [/language-(\w+)/, /highlight-source-(\w+)/, /CodeBlock--language-(\w+)/]; 14 | 15 | const findRecursive = (array: T[], condition: (value: T) => boolean | T[], maxDepth = 3): T | null => { 16 | if (maxDepth <= 0) { 17 | return null; 18 | } 19 | for (const value of array) { 20 | const result = condition(value); 21 | if (Array.isArray(result)) { 22 | return findRecursive(result, condition, maxDepth - 1); 23 | } 24 | if (result) { 25 | return value; 26 | } 27 | } 28 | 29 | return null; 30 | }; 31 | export const customDivHandler: Handle = (state, node) => { 32 | const classNames = Array.isArray(node.properties.className) ? (node.properties.className as string[]) : []; 33 | const codeBlock = findRecursive(node.children, (child) => { 34 | if (child.type !== "element") { 35 | return false; 36 | } 37 | if (child.tagName === "pre") { 38 | return true; 39 | } 40 | return child.children.filter((child) => child.type === "element"); 41 | }); 42 | 43 | if (codeBlock && classNames.some((className) => CODE_BLOCK_REGEX.test(className))) { 44 | const codeBlockClassNames = codeBlock.type === "element" ? (codeBlock.properties.className as string[]) ?? [] : []; 45 | const codeValue = trimTrailingLines(toText(codeBlock)).trim(); 46 | 47 | const filenameElement = select(CODE_FILENAME_SELECTORS, node); 48 | const fileLang = filenameElement ? 
hastToString(filenameElement).match(/\.(\w+)$/)?.[1] : null; 49 | 50 | const classLang = [...classNames, ...codeBlockClassNames] 51 | .map((className) => { 52 | const match = LANGUAGE_MATCH_REGEX.map((regex) => className.match(regex)).find((match) => match); 53 | 54 | return match?.[1]; 55 | }) 56 | .find((className) => className); 57 | 58 | const lang = fileLang || classLang || detectLanguage(codeValue) || null; 59 | 60 | const result: Code = { type: "code", lang, meta: null, value: codeValue }; 61 | state.patch(node, result); 62 | return result; 63 | } 64 | 65 | return defaultHandlers.div(state, node); 66 | }; 67 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-img-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | 3 | export const customImgHandler = 4 | (options?: { hideImage?: boolean }): Handle => 5 | (state, node) => { 6 | if (options?.hideImage) { 7 | return undefined; 8 | } 9 | return defaultHandlers.image(state, node); 10 | }; 11 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-table-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | import { toText } from "hast-util-to-text"; 3 | 4 | export const customTableHandler = 5 | (options?: { asText?: boolean }): Handle => 6 | (state, node) => { 7 | if (options?.asText) { 8 | const paragraph = { type: "paragraph" as const, children: [{ type: "text", value: toText(node) } as const] }; 9 | state.patch(node, paragraph); 10 | return paragraph; 11 | } 12 | return defaultHandlers.table(state, node); 13 | }; 14 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/empty-handler.ts: -------------------------------------------------------------------------------- 1 | export const emptyHandler = () => { 2 | // Do nothing 3 | }; 4 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/math-handler.ts: -------------------------------------------------------------------------------- 1 | import { toHtml } from "hast-util-to-html"; 2 | import type { Handle } from "hast-util-to-mdast"; 3 | import { MathMLToLaTeX } from "mathml-to-latex"; 4 | import type { InlineMath, Math as mdMath } from "mdast-util-math"; 5 | 6 | export const mathHandler: Handle = (state, node) => { 7 | const mathMl = toHtml(node); 8 | const latex = MathMLToLaTeX.convert(mathMl); 9 | const result: InlineMath | mdMath = { type: "inlineMath", value: latex }; 10 | state.patch(node, result); 11 | return result; 12 | }; 13 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-to-markdown.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, RootContent } from "mdast"; 2 | 3 | import { gfmToMarkdown } from "mdast-util-gfm"; 4 | import { mathToMarkdown } from "mdast-util-math"; 5 | import { type Options as ToMarkdownOptions, toMarkdown } from "mdast-util-to-markdown"; 6 | 7 | import { linkReplacer } from "./link-replacer"; 8 | import { warpRoot } from "./utils/mdast-utils"; 9 | 10 | /** 11 | * Options for the `mdastToMarkdown` function. 
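 * Extends the options of `mdast-util-to-markdown` with an optional `baseUrl` used to resolve relative links.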
12 | */ 13 | export interface MdastToMarkdownOptions extends ToMarkdownOptions { 14 | /** 15 | * The base URL to use for replacing relative links. 16 | */ 17 | baseUrl?: string; 18 | } 19 | 20 | /** 21 | * Default options for the `mdastToMarkdown` function. 22 | */ 23 | export const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = { 24 | extensions: [gfmToMarkdown(), mathToMarkdown()], 25 | bullet: "-", 26 | }; 27 | 28 | /** 29 | * Converts an MDAST tree to a Markdown string. 30 | * 31 | * @param mdast - The MDAST tree to convert. 32 | * @param options - Options for the conversion. 33 | * @returns The Markdown string. 34 | * 35 | * @example 36 | * ```ts 37 | * import { mdastToMarkdown } from './your-library'; 38 | * 39 | * const mdast = { 40 | * type: 'root', 41 | * children: [ 42 | * { 43 | * type: 'paragraph', 44 | * children: [ 45 | * { type: 'text', value: 'Hello, world!' } 46 | * ] 47 | * } 48 | * ] 49 | * }; 50 | * 51 | * const markdown = mdastToMarkdown(mdast); 52 | * console.log(markdown); // Output: "Hello, world!" 53 | * ``` 54 | */ 55 | export const mdastToMarkdown = (mdast: Mdast | RootContent[], options?: MdastToMarkdownOptions): string => { 56 | const { baseUrl, ...toMarkdownOptions } = { ...DEFAULT_MDAST_TO_MARKDOWN_OPTIONS, ...options }; 57 | 58 | let markdown = toMarkdown(warpRoot(mdast), toMarkdownOptions).replace(/\*\*\*\*/g, ""); 59 | 60 | if (baseUrl) { 61 | markdown = linkReplacer(markdown, baseUrl); 62 | } 63 | 64 | return markdown; 65 | }; 66 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/common.ts: -------------------------------------------------------------------------------- 1 | export const chunk = (array: T[], size: number): T[][] => { 2 | return array.reduce((acc, _, index) => { 3 | if (index % size === 0) { 4 | acc.push(array.slice(index, index + size)); 5 | return acc; 6 | } 7 | return acc; 8 | }, []); 9 | }; 10 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/detect-code-lang.ts: -------------------------------------------------------------------------------- 1 | /* 2 | This code is derived from speed-highlight (https://github.com/speed-highlight/core), 3 | which is licensed under the CC0 1.0 Universal License. 4 | It was a very good simple code language selection algorithm and will be used. Thank you! 
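Each language entry pairs regular expressions with weights; detectLanguage scores the given code against every entry and returns the highest-scoring language above a small threshold, falling back to "plain".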
5 | */ 6 | 7 | //TODO: Add more languages and improve the algorithm 8 | const languages = [ 9 | ["bash", [/#!(\/usr)?\/bin\/bash/g, 500], [/\b(if|elif|then|fi|echo)\b|\$/g, 10]], 10 | ["html", [/<\/?[a-z-]+[^\n>]*>/g, 10], [/^\s+<-]/gm, 10], [/^@@ ?[-+,0-9 ]+ ?@@/gm, 25]], 37 | ["md", [/^(>|\t\*|\t\d+.)/gm, 10], [/\[.*\](.*)/g, 10]], 38 | ["docker", [/^(FROM|ENTRYPOINT|RUN)/gm, 500]], 39 | ["xml", [/<\/?[a-z-]+[^\n>]*>/g, 10], [/^<\?xml/g, 500]], 40 | ["c", [/#include\b|\bprintf\s+\(/g, 100]], 41 | ["rs", [/^\s+(use|fn|mut|match)\b/gm, 100]], 42 | ["go", [/\b(func|fmt|package)\b/g, 100]], 43 | ["java", [/^import\s+java/gm, 500]], 44 | ["asm", [/^(section|global main|extern|\t(call|mov|ret))/gm, 100]], 45 | ["css", [/^(@import|@page|@media|(\.|#)[a-z]+)/gm, 20]], 46 | ["json", [/\b(true|false|null|\{})\b|\"[^"]+\":/g, 10]], 47 | ["yaml", [/^(\s+)?[a-z][a-z0-9]*:/gim, 10]], 48 | ] as const; 49 | 50 | export const detectLanguage = (code: string) => { 51 | return ( 52 | languages 53 | .map( 54 | ([lang, ...features]) => 55 | [lang, features.reduce((acc, [match, score]) => acc + [...code.matchAll(match)].length * score, 0)] as const, 56 | ) 57 | .filter(([_, score]) => score > 20) 58 | .sort((a, b) => b[1] - a[1])[0]?.[0] || "plain" 59 | ); 60 | }; 61 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/hast-utils.ts: -------------------------------------------------------------------------------- 1 | import type { Element, Nodes as Hast } from "hast"; 2 | import { select, selectAll } from "hast-util-select"; 3 | 4 | export const getLangFromHast = (node: Hast) => { 5 | const html = select("html", node); 6 | if (html && typeof html.properties.lang === "string") { 7 | return html.properties.lang; 8 | } 9 | if (node.type !== "element") { 10 | return; 11 | } 12 | const element = node as Element; 13 | if (element.tagName !== "html") { 14 | return; 15 | } 16 | 17 | const langAttr = element.properties.lang || element.properties["xml:lang"]; 18 | if (langAttr) { 19 | return langAttr as string; 20 | } 21 | 22 | return undefined; 23 | }; 24 | 25 | export const getLangFromStr = (str: string) => { 26 | const match = str.match(/lang=["']([^"']+)["']/); 27 | if (match) { 28 | return match[1]; 29 | } 30 | return undefined; 31 | }; 32 | 33 | export const getUrlFromHast = (node: Hast): string | undefined => { 34 | if (node.type !== "element") { 35 | return undefined; 36 | } 37 | 38 | const metaTagAttributes = ["og:url", "twitter:url"]; 39 | const metaTags = selectAll("meta", node); 40 | 41 | for (const meta of metaTags) { 42 | const property = meta.properties.property || meta.properties.name; 43 | if (typeof property === "string" && metaTagAttributes.includes(property)) { 44 | return typeof meta.properties.content === "string" ? 
meta.properties.content : undefined; 45 | } 46 | } 47 | 48 | return undefined; 49 | }; 50 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/mdast-utils.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, RootContent } from "mdast"; 2 | 3 | export const unwarpRoot = (mdast: Mdast): RootContent[] => { 4 | if (mdast.type === "root") { 5 | return mdast.children; 6 | } 7 | return [mdast]; 8 | }; 9 | 10 | export const warpRoot = (mdast: RootContent[] | Mdast): Mdast => { 11 | if (Array.isArray(mdast)) { 12 | return { type: "root", children: mdast }; 13 | } 14 | return mdast; 15 | }; 16 | 17 | export const internalType = (content: RootContent): string => { 18 | if (content.type === "heading") { 19 | return `h${content.depth}`; 20 | } 21 | return content.type; 22 | }; 23 | -------------------------------------------------------------------------------- /packages/webforai/tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "module": "ES2020", 5 | "rootDir": "./src/", 6 | "outDir": "./dist/types/", 7 | "noUnusedLocals": true, 8 | "noUnusedParameters": true, 9 | "sourceMap": true 10 | }, 11 | "include": ["src/**/*.ts", "src/**/*.mts"], 12 | "exclude": [ 13 | "src/mod.ts", 14 | "src/helper.ts", 15 | "src/middleware.ts", 16 | "src/deno/**/*.ts", 17 | "src/test-utils/*.ts", 18 | "src/**/*.test.ts", 19 | "src/**/*.test.tsx" 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /packages/webforai/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "declaration": true, 6 | "moduleResolution": "Bundler", 7 | "outDir": "./dist", 8 | "esModuleInterop": true, 9 | "forceConsistentCasingInFileNames": true, 10 | "strict": true, 11 | "skipLibCheck": false, 12 | "noUnusedLocals": false, 13 | "noUnusedParameters": false, 14 | "resolveJsonModule": true, 15 | "types": ["node"] 16 | }, 17 | "include": [ 18 | "src/**/*.ts", 19 | "src/**/*.d.ts", 20 | "src/**/*.mts", 21 | "src/**/*.test.ts", 22 | "src/**/*.test.tsx", 23 | "bin/**/*.ts", 24 | "bin/**/*.d.ts", 25 | "bin/**/*.mts", 26 | "bin/**/*.test.ts", 27 | "bin/**/*.test.tsx" 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - "packages/*" 3 | - "examples/*" 4 | - "site" -------------------------------------------------------------------------------- /site/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # site 2 | 3 | ## 0.1.0 4 | 5 | ### Minor Changes 6 | 7 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! - Add minimal filter extractor 8 | 9 | ## 0.0.2 10 | 11 | ### Patch Changes 12 | 13 | - [#53](https://github.com/inaridiy/webforai/pull/53) [`c3f012c`](https://github.com/inaridiy/webforai/commit/c3f012ca740ef33538ca5d4874277008daf5c5a1) Thanks [@moons-14](https://github.com/moons-14)! - Add description for cf puppeteer loader. 
14 | 15 | ## 0.0.1 16 | 17 | ### Patch Changes 18 | 19 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 20 | -------------------------------------------------------------------------------- /site/README.md: -------------------------------------------------------------------------------- 1 | This is a [Vocs](https://vocs.dev) project bootstrapped with the Vocs CLI. 2 | -------------------------------------------------------------------------------- /site/docs/footer.tsx: -------------------------------------------------------------------------------- 1 | // biome-ignore lint/style/noDefaultExport: 2 | export default function Footer() { 3 | return ( 4 | 5 | Released under the MIT License. 6 | Copyright © 2024-present inaridiy and contributors. 7 | 8 | ); 9 | } 10 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/custom-extractor.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Custom extractor 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-20 6 | --- 7 | 8 | # Custom extractor 9 | 10 | ::authors 11 | 12 | The default takumi-extractor in webforai is powerful, but occasionally it might not perform well on websites with unique structures. There may also be cases where you need to extract content other than the main body. 13 | 14 | For such scenarios, you can create a custom extractor to handle specific requirements. 15 | 16 | In the following example, we’ll build a custom extractor to pull the main content from an Amazon product page. 17 | 18 | ```ts [src/index.ts] twoslash 19 | import { select } from "hast-util-select"; 20 | import { type Extractor, htmlToMarkdown, takumiExtractor } from "webforai"; 21 | import { loadHtml } from "webforai/loaders/playwright"; 22 | 23 | const url = "https://www.amazon.com/Generative-Deep-Learning-Teaching-Machines/dp/1098134184/ref=sr_1_8?sr=8-8s"; 24 | const html = await loadHtml(url); 25 | 26 | const amazonShopItemExtractor: Extractor = (params) => { 27 | const { hast } = params; 28 | const mainContent = select("div#centerCol", hast); 29 | if (!mainContent) { 30 | return hast; 31 | } 32 | return mainContent; 33 | }; 34 | 35 | const cleanedContent = await htmlToMarkdown(html, { baseUrl: url, extractors: [amazonShopItemExtractor, takumiExtractor] }); 36 | 37 | console.info(cleanedContent); 38 | ``` 39 | 40 | This custom extractor targets the #centerCol element on Amazon product pages. If found, it returns only that content; otherwise, it defaults to the original structure. 41 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/index.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Cookbook 3 | --- 4 | 5 | # Cookbook 6 | 7 | Welcome to the Webforai Cookbook, a collection of practical recipes to help you harness the full potential of Webforai. 8 | 9 | Here, you'll find examples that guide you through various tasks, from basic usage to advanced customization. 10 | 11 | ## Recipes 12 | 13 | ### Basic Usage 14 | - [Simple Usage](/cookbook/simple) 15 | Learn the most basic way to convert HTML to Markdown. 
16 | 17 | ### Advanced Conversion 18 | - [Web Page Translation](/cookbook/translation) 19 | Translate web content into any language using Webforai and Vercel AI SDK. 20 | 21 | - [Structured Data Output](/cookbook/structured-output) 22 | Extract structured JSON data directly from websites with AI SDK. 23 | 24 | ### Customization & Extensions 25 | - [Custom Extractor](/cookbook/custom-extractor) 26 | Create custom extractors for specific website structures. 27 | 28 | ### Usage in Specific Environments 29 | - [Cloudflare Workers](/cookbook/cf-workers) 30 | Learn how to use Webforai in Cloudflare Workers. 31 | 32 | ## Contribute 33 | 34 | These recipes are here to help you explore Webforai’s capabilities in real-world scenarios. We welcome contributions—whether you have a new recipe idea or improvements for existing ones. Submit a pull request on our GitHub repository. 35 | 36 | For more detailed information and advanced features, check out the [API Documentation](/docs/html-to-markdown). 37 | 38 | Start building amazing projects with Webforai today! -------------------------------------------------------------------------------- /site/docs/pages/cookbook/simple.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Simple usage 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-19 6 | --- 7 | 8 | # Simple usage 9 | 10 | ::authors 11 | 12 | The simplest way to use Webforai 13 | 14 | ::::steps 15 | ## Install dependencies 16 | 17 | Install the necessary packages: 18 | 19 | :::code-group 20 | ```bash [npm] 21 | npm init -y 22 | npm install webforai 23 | npm install -D tsx 24 | ``` 25 | 26 | ```bash [pnpm] 27 | pnpm init -y 28 | pnpm install webforai 29 | pnpm install -D tsx 30 | ``` 31 | ::: 32 | 33 | ## Write code 34 | 35 | Here’s how to convert HTML to Markdown using **webforai**: 36 | 37 | ```ts [src/index.ts] twoslash 38 | import { htmlToMarkdown } from "webforai"; 39 | import { loadHtml } from "webforai/loaders/fetch"; 40 | 41 | const html = await loadHtml("https://example.com"); 42 | const markdown = htmlToMarkdown(html); 43 | 44 | console.log(markdown); 45 | ``` 46 | 47 | ## Launch 48 | 49 | ```bash 50 | tsx src/index.ts 51 | 52 | # => # Example Domain 53 | # => 54 | # => This domain is for use in illustrative examples in documents. You may use this 55 | # => domain in literature without prior coordination or asking for permission. 56 | # => 57 | # => More information... 58 | ``` 59 | :::: 60 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/structured-output.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Structured Output with ai SDK 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-19 6 | --- 7 | 8 | # Structured Output 9 | 10 | ::authors 11 | You can get **structured JSON** directly from any website by using webforai and the [Vercel AI SDK](https://sdk.vercel.ai/). 12 | 13 | ::::steps 14 | ## Install dependencies 15 | 16 | Install the necessary packages: 17 | 18 | :::code-group 19 | ```bash [npm] 20 | npm init -y 21 | npm install webforai ai @ai-sdk/google zod 22 | npm install -D tsx 23 | ``` 24 | 25 | ```bash [pnpm] 26 | pnpm init -y 27 | pnpm install webforai ai @ai-sdk/google zod 28 | pnpm install -D tsx 29 | ``` 30 | ::: 31 | 32 | ## Prepare API Key 33 | 34 | This example uses **Google Generative AI (Gemini 1.5 Flash)** via the AI SDK. 
35 | Set your **Google Generative AI API key** as an environment variable **GOOGLE_GENERATIVE_AI_API_KEY**. You can get the key [here](https://aistudio.google.com/app/apikey). 36 | 37 | For other providers, see the [AI SDK provider documentation](https://sdk.vercel.ai/providers/ai-sdk-providers). 38 | 39 | ## Write code 40 | 41 | Here’s how to convert HTML to Markdown using **webforai** and then transform it into a structured object with **AI SDK**: 42 | 43 | ```ts [src/index.ts] twoslash 44 | import { google } from "@ai-sdk/google"; 45 | import { generateObject } from "ai"; 46 | import { htmlToMarkdown } from "webforai"; 47 | import { loadHtml } from "webforai/loaders/fetch"; 48 | import { z } from "zod"; 49 | 50 | const html = await loadHtml("https://github.com/inaridiy?tab=repositories"); 51 | const markdown = htmlToMarkdown(html); 52 | 53 | const { object: repositories } = await generateObject({ 54 | model: google("gemini-1.5-flash-latest"), 55 | schema: z.object({ 56 | repositories: z.array( 57 | z.object({ 58 | name: z.string(), 59 | url: z.string(), 60 | stars: z.number(), 61 | license: z.string(), 62 | }), 63 | ), 64 | }), 65 | prompt: `Please generate a list of repositories from the following markdown content.\n\n${markdown}`, 66 | }); 67 | 68 | console.log(repositories); 69 | ``` 70 | 71 | ## Launch 🚀 72 | 73 | Just run the following command: 74 | 75 | ```bash 76 | tsx src/index.ts 77 | 78 | # => { 79 | # => repositories: [ 80 | # => { 81 | # => name: 'webforai', 82 | # => url: 'https://github.com/inaridiy/webforai', 83 | # => stars: 46, 84 | # => license: 'MIT' 85 | # => } 86 | # => ] 87 | # => ... 88 | # => } 89 | ``` 90 | 91 | :::: 92 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/translation.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Translation web page 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-19 6 | --- 7 | 8 | # Translation web content 9 | ::authors 10 | 11 | You can translate web content to any language by using webforai and the [Vercel AI SDK](https://sdk.vercel.ai/). 12 | 13 | ::::steps 14 | ## Install dependencies 15 | 16 | Install the necessary packages: 17 | 18 | :::code-group 19 | ```bash [npm] 20 | npm init -y 21 | npm install webforai ai @ai-sdk/google zod 22 | npm install -D tsx 23 | ``` 24 | 25 | ```bash [pnpm] 26 | pnpm init -y 27 | pnpm install webforai ai @ai-sdk/google zod 28 | pnpm install -D tsx 29 | ``` 30 | ::: 31 | 32 | ## Prepare API Key 33 | 34 | This example uses **Google Generative AI (Gemini 1.5 Flash)** via the AI SDK. 35 | Set your **Google Generative AI API key** as an environment variable **GOOGLE_GENERATIVE_AI_API_KEY**. You can get the key [here](https://aistudio.google.com/app/apikey). 36 | 37 | For other providers, see the [AI SDK provider documentation](https://sdk.vercel.ai/providers/ai-sdk-providers). 38 | 39 | ## Write code 40 | 41 | Here's an example of how to translate a web page using webforai and the Vercel AI SDK. 42 | A little trick in this code is the use of `experimental_continueSteps`. 43 | If you enable this flag, it will also make it OK if the outputToken is exceeded. 
44 | 45 | ```ts [src/index.ts] twoslash 46 | import { google } from "@ai-sdk/google"; 47 | import { generateText } from "ai"; 48 | import { htmlToMarkdown } from "webforai"; 49 | import { loadHtml } from "webforai/loaders/playwright"; 50 | 51 | const url = "https://github.com/inaridiy"; 52 | const targetLanguage = "ja"; // Translate to Japanese 53 | 54 | const html = await loadHtml(url, { superBypassMode: true }); 55 | const markdown = htmlToMarkdown(html); 56 | 57 | const prompt = `Translate mechanically converted HTML-based Markdown into ${targetLanguage}, while refining and correcting the content for clarity and coherence. 58 | 59 | The Markdown provided may contain redundant or unnecessary information and errors due to mechanical conversion. Your task is to translate the text into Japanese, fixing these issues and improving the overall quality of the Markdown document. 60 | 61 | 62 | ${markdown} 63 | `; 64 | 65 | const response = await generateText({ 66 | model: google("gemini-1.5-flash-latest"), 67 | temperature: 0, 68 | prompt, 69 | maxSteps: 10, 70 | experimental_continueSteps: true, // For long content, you need to set this option. 71 | }); 72 | 73 | console.info(response.text); 74 | 75 | ``` 76 | 77 | ## Launch 🚀 78 | 79 | Just run the following command: 80 | 81 | ```bash 82 | tsx src/index.ts 83 | 84 | # => Output the translated content. 85 | ``` 86 | 87 | :::: -------------------------------------------------------------------------------- /site/docs/pages/docs/html-to-markdown.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: htmlToMarkdown 3 | --- 4 | 5 | # htmlToMarkdown 6 | 7 | Useful and high-quality HTML to Markdown converter. 8 | Internally, it just calls [htmlToMdast](/docs/html-to-mdast) and [mdastToMarkdown](/docs/mdast-to-markdown) in that order. 9 | 10 | ## Usage 11 | 12 | ```ts twoslash 13 | import { htmlToMarkdown } from "webforai"; 14 | 15 | const html = "<h1>Hello, world!</h1>"; 16 | const markdown = htmlToMarkdown(html); 17 | // @log: => "# Hello, world!" 18 | ``` 19 | 20 | ## Returns 21 | 22 | `string` 23 | 24 | The converted Markdown string. 25 | 26 | ## Parameters 27 | 28 | ### htmlOrHast 29 | 30 | type: `string | Hast` 31 | 32 | The HTML string or HAST tree to convert. 33 | 34 | ```ts 35 | const markdown = htmlToMarkdown("<h1>Hello, world!</h1>"); 36 | // => "# Hello, world!" 37 | ``` 38 | 39 | ### options.baseUrl 40 | 41 | type: `string` 42 | 43 | The base URL to use for replacing relative links. 44 | 45 | ```ts 46 | const markdown = htmlToMarkdown("<a href='/foo'>bar</a>", { 47 | baseUrl: "https://example.com", 48 | }); 49 | // => "[bar](https://example.com/foo)" 50 | ``` 51 | 52 | ### options.extractors 53 | 54 | type: `ExtractorSelectors` 55 | 56 | An array of extractors to extract specific elements from the HTML. 57 | You can define your own functions in addition to the Extractor provided as a preset. 58 | 59 | ```ts twoslash 60 | import { htmlToMarkdown, type Extractor, takumiExtractor } from "webforai" 61 | 62 | const yourCustomExtractor: Extractor = (params) => { 63 | const { hast, url } = params 64 | // ... your logic ... 65 | return hast 66 | }; 67 | 68 | const html = "<h1>Hello, world!</h1>" 69 | const markdown = htmlToMarkdown(html, { 70 | extractors: [yourCustomExtractor, takumiExtractor] 71 | }); 72 | // => "# Hello, world!" 73 | ``` 74 | 75 | ### options.formatting 76 | 77 | type: `Omit` 78 | 79 | Formatting options passed to [mdast-util-to-markdown](https://github.com/syntax-tree/mdast-util-to-markdown).
80 | 81 | ```ts 82 | const markdown = htmlToMarkdown("Hello, world!", { 83 | formatting: { 84 | bullet: "*", 85 | }, 86 | }); 87 | // => "* Hello, world!" 88 | ``` 89 | 90 | ### options.linkAsText 91 | 92 | type: `boolean` 93 | 94 | Whether to convert links to plain text. 95 | 96 | ```ts 97 | const markdown = htmlToMarkdown("bar", { 98 | linkAsText: true, 99 | }); 100 | // => "bar" 101 | ``` 102 | 103 | ### options.tableAsText 104 | 105 | type: `boolean` 106 | 107 | Whether to convert tables to plain text. 108 | 109 | 110 | ### options.hideImage 111 | 112 | type: `boolean` 113 | 114 | Whether to hide images. 115 | 116 | ### options.lang 117 | 118 | type: `string` 119 | 120 | The language of the HTML. -------------------------------------------------------------------------------- /site/docs/pages/docs/html-to-mdast.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: htmlToMdast 3 | --- 4 | 5 | # htmlToMdast 6 | 7 | Convert HTML to Mdast. 8 | If you simply want to convert from HTML to Markdown, we recommend using [htmlToMarkdown](/docs/html-to-markdown). 9 | 10 | ## Usage 11 | 12 | ```ts twoslash 13 | import { htmlToMdast } from "webforai"; 14 | 15 | const mdast = htmlToMdast("Hello, world!"); 16 | // @log: => { 17 | // @log: type: "root", 18 | // @log: children: [{ type: "heading", depth: 1, children: [{ type: "text", value: "Hello, world!" }] }] 19 | // @log: } 20 | ``` 21 | 22 | ## Returns 23 | 24 | `mdast.Nodes` 25 | 26 | The converted Mdast tree. 27 | 28 | ## Parameters 29 | 30 | ### htmlOrHast 31 | 32 | type: `string | Hast` 33 | 34 | The HTML string or HAST tree to convert. 35 | 36 | ```ts 37 | const mdast = htmlToMdast("Hello, world!"); 38 | // => { 39 | // type: "root", 40 | // children: [{ type: "heading", depth: 1, children: [{ type: "text", value: "Hello, world!" }] }] 41 | // } 42 | ``` 43 | ### options.extractors 44 | 45 | type: `ExtractorSelectors` 46 | 47 | An array of extractors to extract specific elements from the HTML. 48 | You can define your own functions in addition to the Extractor provided as a preset. 49 | 50 | ```ts twoslash 51 | import { htmlToMdast, type Extractor, takumiExtractor } from "webforai" 52 | 53 | const yourCustomExtractor: Extractor = (params) => { 54 | const { hast, url } = params 55 | // ... your logic ... 56 | return hast 57 | }; 58 | 59 | const html = "Hello, world!" 60 | const mdast = htmlToMdast(html, { 61 | extractors: [yourCustomExtractor, takumiExtractor] 62 | }); 63 | ``` 64 | ### options.linkAsText 65 | 66 | type: `boolean` 67 | 68 | Whether to convert links to plain text. 69 | 70 | ```ts 71 | const mdast = htmlToMdast("bar", { 72 | linkAsText: true, 73 | }); 74 | ``` 75 | 76 | ### options.tableAsText 77 | 78 | type: `boolean` 79 | 80 | Whether to convert tables to plain text. 81 | 82 | 83 | ### options.hideImage 84 | 85 | type: `boolean` 86 | 87 | Whether to hide images. 88 | 89 | ### options.lang 90 | 91 | type: `string` 92 | 93 | The language of the HTML. -------------------------------------------------------------------------------- /site/docs/pages/docs/loaders.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Loaders Utilities 3 | --- 4 | 5 | # Loaders Utilities 6 | 7 | The **Loaders Utilities** provide simple tools to easily fetch HTML from websites. 8 | All the utilities are designed to be straightforward, requiring no configuration. 9 | 10 | :::warning 11 | However, they are not recommended for production use. 
12 | ::: 13 | 14 | ## Overview of Loaders 15 | 16 | Webforai provides four different loaders: 17 | 18 | - **Fetch Loader**: The simplest option, using JavaScript's built-in Fetch API. 19 | - **Playwright Loader**: Ideal for sites requiring JavaScript execution, like SPAs. 20 | - **Puppeteer Loader**: Another option for handling websites with JavaScript execution. 21 | - **CF Puppeteer Loader**: Option to handle websites running JavaScript on cloudflare workers. 22 | 23 | ## Fetch Loader 24 | 25 | The **Fetch Loader** is the simplest utility, using JavaScript’s **Fetch API**. 26 | It retrieves HTML from a given URL, using a basic User-Agent for the request. 27 | 28 | ### Usage 29 | 30 | ```ts twoslash 31 | import { loadHtml } from "webforai/loaders/fetch"; 32 | 33 | const html = await loadHtml("https://example.com"); 34 | ``` 35 | 36 | ## Playwright Loader 37 | 38 | The **Playwright Loader** is a more powerful tool, using [Playwright](https://playwright.dev/) to 39 | fetch HTML from websites that need JavaScript execution, like SPAs (Single Page Applications). 40 | 41 | ### Usage 42 | 43 | Before using the Playwright Loader, you need to install the Playwright browser and its dependencies. 44 | 45 | :::code-group 46 | 47 | ```bash [npm] 48 | npx playwright-core install 49 | ``` 50 | 51 | ```bash [pnpm] 52 | pnpm install playwright-core 53 | ``` 54 | ::: 55 | 56 | And then you can use the Playwright Loader as follows: 57 | 58 | :::code-group 59 | 60 | ```ts twoslash [basic-usage] 61 | import { loadHtml } from "webforai/loaders/playwright"; 62 | 63 | const html = await loadHtml("https://example.com"); 64 | ``` 65 | 66 | ```ts twoslash [super-bypass-mode] 67 | import { loadHtml } from "webforai/loaders/playwright"; 68 | 69 | const html = await loadHtml("https://example.com", { 70 | superBypassMode: true, 71 | }); 72 | 73 | ``` 74 | ::: 75 | 76 | 77 | ## Puppeteer Loader 78 | 79 | The **Puppeteer Loader** is another advanced tool that uses [Puppeteer](https://pptr.dev/) to 80 | load HTML from sites that rely on JavaScript execution, similar to Playwright. 81 | 82 | ### Usage 83 | 84 | Before using the Puppeteer Loader, you need to install the Puppeteer browser and its dependencies. 85 | 86 | :::code-group 87 | 88 | ```bash [npm] 89 | npm install puppeteer 90 | ``` 91 | 92 | ```bash [pnpm] 93 | pnpm install puppeteer 94 | ``` 95 | ::: 96 | 97 | And then you can use the Puppeteer Loader as follows: 98 | 99 | ```ts twoslash 100 | import { loadHtml } from "webforai/loaders/puppeteer"; 101 | 102 | const html = await loadHtml("https://example.com"); 103 | ``` 104 | 105 | ## CF Puppeteer Loader 106 | The **CF Puppeteer Loader** is the best option for loading HTML from sites that rely on JavaScript execution on [cloudflare workers](https://workers.cloudflare.com/). This loader relies on [puppeteer on cloudflare workers](https://developers.cloudflare.com/browser-rendering/platform/puppeteer/). 107 | 108 | ### Usage 109 | Before using the CF Puppeteer Loader, you need to prepare a wrangler environment and install @cloudflare/puppeteer. Refer to the [cookbook](/cookbook/cf-workers) for instructions on how to create a project. 
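As a rough sketch, this is how the loader typically fits into a Worker once everything is set up (the install commands follow below). The binding name `MYBROWSER` and the `Fetcher` binding type are assumptions for illustration, mirroring common Browser Rendering setups; adjust them to your own `wrangler.toml` and generated types.

```ts
// Minimal sketch (assumptions noted above): convert a page to Markdown inside a Worker.
import puppeteer from "@cloudflare/puppeteer";
import { htmlToMarkdown } from "webforai";
import { loadHtml } from "webforai/loaders/cf-puppeteer";

type Env = { MYBROWSER: Fetcher }; // Browser Rendering binding (name and type assumed)

export default {
	async fetch(_request: Request, env: Env): Promise<Response> {
		// Launch a browser session on the Browser Rendering binding
		const browser = await puppeteer.launch(env.MYBROWSER);
		try {
			// Load the rendered HTML with the CF Puppeteer loader, then convert it
			const html = await loadHtml("https://example.com", browser);
			const markdown = htmlToMarkdown(html);
			return new Response(markdown, { headers: { "content-type": "text/markdown" } });
		} finally {
			await browser.close();
		}
	},
};
```

See the [cookbook](/cookbook/cf-workers) for the full, working setup.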
110 | 111 | :::code-group 112 | 113 | ```bash [npm] 114 | npm install @cloudflare/puppeteer --save-dev 115 | ``` 116 | 117 | ```bash [pnpm] 118 | pnpm install -D @cloudflare/puppeteer 119 | ``` 120 | ::: 121 | 122 | And then you can use the CF Puppeteer Loader as follows: 123 | 124 | ```ts 125 | import { loadHtml } from "webforai/loaders/cf-puppeteer"; 126 | 127 | const html = await loadHtml("https://example.com", browser); // browser is the puppeteer browser instance 128 | ``` 129 | -------------------------------------------------------------------------------- /site/docs/pages/docs/mdast-to-markdown.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: mdastToMarkdown 3 | --- 4 | 5 | # mdastToMarkdown 6 | 7 | Convert Mdast to Markdown. 8 | 9 | ```ts 10 | import { mdastToMarkdown } from "webforai"; 11 | 12 | const mdast = { 13 | type: 'root', 14 | children: [ 15 | { 16 | type: 'paragraph', 17 | children: [ 18 | { type: 'text', value: 'Hello, world!' } 19 | ] 20 | } 21 | ] 22 | }; 23 | 24 | const markdown = mdastToMarkdown(mdast); 25 | // => "Hello, world!" 26 | ``` 27 | 28 | ## Returns 29 | 30 | `string` 31 | 32 | The converted Markdown string. 33 | 34 | ## Parameters 35 | 36 | ### mdast 37 | 38 | type: `Mdast` 39 | 40 | The Mdast tree to convert. 41 | 42 | ### options.baseUrl 43 | 44 | type: `string` 45 | 46 | The base URL to use for replacing relative links. 47 | 48 | ### options (formatting) 49 | 50 | 51 | Formatting options passed to [mdast-util-to-markdown](https://github.com/syntax-tree/mdast-util-to-markdown). 52 | 53 | default: `DEFAULT_MDAST_TO_MARKDOWN_OPTIONS` 54 | ```ts 55 | const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = { 56 | extensions: [gfmToMarkdown(), mathToMarkdown()], 57 | bullet: "-", 58 | }; 59 | ``` 60 | 61 | -------------------------------------------------------------------------------- /site/docs/pages/getting-started.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | --- 4 | 5 | # Getting Started 6 | 7 | ## Overview 8 | 9 | Welcome to webforai, a library designed to convert **HTML to Markdown** with simple utilities. 10 | Whether you're working in a browser, Node.js, or even on Cloudflare Workers, webforai is your go-to tool for bridging the web and LLMs. 11 | 12 | ## Installation 13 | 14 | :::code-group 15 | 16 | ```bash [npm] 17 | npm i webforai 18 | ``` 19 | 20 | ```bash [pnpm] 21 | pnpm i webforai 22 | ``` 23 | 24 | ```bash [yarn] 25 | yarn add webforai 26 | ``` 27 | 28 | ::: 29 | 30 | ## Quick Start (CLI) 31 | 32 | You can convert HTML to Markdown with the following command. 33 | 34 | ```bash 35 | $ npx webforai@latest https://www.npmjs.com/package/webforai 36 | 37 | ┌ webforai CLI version 1.6.3 38 | │ 39 | ◇ Select loader: # [!code hl] 40 | │ fetch # fetch(default) or playwright 41 | │ 42 | ◇ Enter the output file path: # [!code hl] 43 | │ npmjs-package-webforai.md # default is `{escaped-url}.md` 44 | │ 45 | ◇ Select processing mode: # [!code hl] 46 | │ default # default or ai mode. ai mode removes images, links, and so on. 47 | │ 48 | ◇ Content loaded! 49 | │ 50 | └ Done! Markdown saved to npmjs-package-webforai.md 51 | 52 | ``` 53 | 54 | ## Quick Start (Library) 55 | 56 | 57 | ::::steps 58 | 59 | ### Load HTML with utilities 60 | 61 | First, load HTML using the `loadHtml` utility. Using this function, you can get HTML from a URL in a simple way. It supports versions for **fetch**, **Playwright**, and **Puppeteer**.
62 | 63 | :::code-group 64 | 65 | ```tsx [fetch] twoslash 66 | import { loadHtml } from "webforai/loaders/fetch"; // [!code hl] 67 | 68 | // Load html from url 69 | const url = "https://www.npmjs.com/package/webforai"; 70 | const html = await loadHtml(url); // [!code hl] 71 | ``` 72 | 73 | 74 | ```tsx [playwright] twoslash 75 | // Before using playwright loader, run `npx playwright install` 76 | import { loadHtml } from "webforai/loaders/playwright"; // [!code hl] 77 | 78 | // Load html from url 79 | const url = "https://www.npmjs.com/package/webforai"; 80 | const html = await loadHtml(url, { superBypassMode: true }); // [!code hl] 81 | // @log: Only playwright loader supports super bypass mode. 82 | // @log: This is useful to bypass some anti-bot measures. 83 | ``` 84 | 85 | ```tsx [puppeteer] twoslash 86 | // Before using puppeteer loader, run `npm i puppeteer` 87 | import { loadHtml } from "webforai/loaders/puppeteer"; // [!code hl] 88 | 89 | // Load html from url 90 | const url = "https://www.npmjs.com/package/webforai"; 91 | const html = await loadHtml(url); // [!code hl] 92 | ``` 93 | 94 | ::: 95 | 96 | :::warning 97 | The `loadHtml` function is designed for ease of use and is not recommended for intensive use in production environments. 98 | ::: 99 | 100 | ### Convert HTML to Markdown 101 | 102 | Finally, convert HTML to Markdown with the `htmlToMarkdown` function. 103 | 104 | ```tsx 105 | import { htmlToMarkdown } from "webforai"; // [!code focus] 106 | import { loadHtml } from "webforai/loaders/fetch"; 107 | 108 | // Load html from url 109 | const url = "https://www.npmjs.com/package/webforai"; 110 | const html = await loadHtml(url); 111 | 112 | const markdown = htmlToMarkdown(html); // [!code focus] 113 | ``` 114 | :::: 115 | -------------------------------------------------------------------------------- /site/docs/pages/how-it-works.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: How it works 3 | --- 4 | 5 | # How it works 6 | 7 | ## Overview 8 | 9 | The core function of webforai is **converting HTML to Markdown**, built on the Syntax Tree ecosystem. This process happens in three steps: 10 | 11 | 1. **Convert HTML to [Hast](https://github.com/syntax-tree/hast)**. (Hypertext Abstract Syntax Tree) 12 | 2. **Convert Hast to [Mdast](https://github.com/syntax-tree/mdast)**. (Markdown Abstract Syntax Tree) 13 | 3. **Convert Mdast to Markdown**. 14 | 15 | What makes this special is the **content extraction** applied to the Hast produced in step 1. This ensures that only the main content—the part humans care about—is extracted from the HTML. 16 | After that, the rest of the transformation is handled using fine-tuned utilities from the Syntax Tree ecosystem. 17 | 18 | 19 | 20 | ## Extractor 21 | 22 | In webforai, the process of extracting the main content from a web page is abstracted into a component called the **Extractor**. 23 | This is a flexible system designed to make content extraction simple and customizable. 24 | 25 | ### Extractor Interface 26 | 27 | The Extractor is a function that takes in two things: 28 | 29 | - A **Hast** object, which represents the structure of the HTML. 30 | - Optional metadata, such as the language or URL of the page. 31 | 32 | The Extractor processes this input and returns a new Hast object that represents the cleaned-up, extracted content.
33 | 34 | ```ts twoslash 35 | import type { Nodes as Hast } from "hast"; 36 | 37 | type ExtractParams = { hast: Hast; lang?: string; url?: string }; 38 | type Extractor = (params: ExtractParams) => Hast; 39 | ``` 40 | 41 | ### Default Extractor 42 | 43 | By default, webforai provides a built-in Extractor called `takumi-extractor`. This extractor is adjusted to produce a high average quality for a typical web page. 44 | I do my best to adjust it to the best of my ability using various flags and scoring with reference to **Mozilla's readability** and other algorithms. 45 | 46 | ### Customizing the Extraction 47 | 48 | **webforai** allows you to define **multiple extractors** and chain them together. 49 | The Hast object is passed from one Extractor to the next in the order they are defined, allowing you to fine-tune the extraction process. 50 | 51 | You can also create **your own custom Extractor** to implement specific algorithms or extraction logic. 52 | 53 | ```ts twoslash 54 | import { htmlToMarkdown } from "webforai"; 55 | import { loadHtml } from "webforai/loaders/fetch"; 56 | import type { Extractor } from "webforai"; 57 | 58 | // [!code focus] 59 | const customExtractor: Extractor = (params) => {// [!code focus] 60 | const { hast, url } = params;// [!code focus] 61 | // Your custom extraction logic here // [!code focus] 62 | return hast; // [!code focus] 63 | }; // [!code focus] 64 | 65 | const html = await loadHtml("https://example.com"); 66 | const markdown = await htmlToMarkdown(html, { // [!code focus] 67 | extractors: [customExtractor], // [!code focus] 68 | }); // [!code focus] 69 | ``` 70 | 71 | -------------------------------------------------------------------------------- /site/docs/pages/index.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | layout: landing 3 | content: 4 | width: 100% 5 | --- 6 | 7 | import { HomePage } from "vocs/components"; 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | A esm-native library that converts HTML to Markdown & Useful Utilities with simple, lightweight and epic quality. 16 | 17 | 18 | Get started 19 | Cookbook 20 | GitHub 21 | 22 | 23 | 24 | 25 | 26 | :::code-group 27 | 28 | ```bash [npm] 29 | npm i webforai 30 | # or just run 31 | npx webforai@latest 32 | ``` 33 | 34 | ```bash [pnpm] 35 | pnpm i webforai 36 | # or just run 37 | pnpx webforai@latest 38 | ``` 39 | 40 | ```bash [yarn] 41 | yarn add webforai 42 | # or just run 43 | npx webforai@latest 44 | ``` 45 | 46 | ::: 47 | 48 | 49 | 50 | 51 | 52 | 53 | license 54 | 55 | 56 | Apache-2.0 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | stars 66 | 67 | 68 | 46 69 | 70 | 71 | 72 | 73 | 74 | 75 | {" <= Let's star!"} 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | # 90 | # Overview 91 | 92 | ```ts twoslash 93 | import { htmlToMarkdown, htmlToMdast } from "webforai"; 94 | import { loadHtml } from "webforai/loaders/playwright"; 95 | 96 | // Load html from url 97 | const url = "https://www.npmjs.com/package/webforai"; 98 | const html = await loadHtml(url); // [!code hl] 99 | 100 | // Convert html to markdown 101 | const markdown = htmlToMarkdown(html, { baseUrl: url }); // [!code hl] 102 | ``` 103 | 104 | # 105 | # Features 106 | 107 | - High-quality HTML to Markdown conversion with simple, customizable options 108 | - ESM-native, compatible with various environments (browser, Cloudflare Worker, Node.js, etc.) 
109 | - Lightweight (only 146kb minified and gzipped) 110 | - Flexibility because it's built on the [Syntax Tree](https://github.com/syntax-tree) ecosystem. 111 | - MathML to LaTeX with the [mathml-to-latex](https://www.npmjs.com/package/mathml-to-latex) package. 112 | - Out-of-the-box loaders such as fetch, [playwright](https://www.npmjs.com/package/playwright), [puppeteer](https://www.npmjs.com/package/puppeteer). 113 | - CLI tool `npx webforai` for quick HTML to Markdown conversion 114 | 115 | 116 | 117 | 118 | 119 | # 120 | # Sponsors 121 | 122 | ::sponsors 123 | 124 | -------------------------------------------------------------------------------- /site/docs/pages/installation.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Installation 3 | --- 4 | 5 | # Installation 6 | 7 | Install the `webforai` package with your preferred package manager. 8 | 9 | ## Package Manager 10 | 11 | :::code-group 12 | 13 | ```bash [npm] 14 | # install core package 15 | npm i webforai 16 | 17 | # or install with playwright browser binaries to use the playwright loader. 18 | npm i webforai@latest 19 | npx playwright install 20 | 21 | # or install with puppeteer browser binaries to use the puppeteer loader. 22 | npm i webforai@latest puppeteer 23 | ``` 24 | 25 | ```bash [pnpm] 26 | # install core package 27 | pnpm i webforai 28 | 29 | # or install with playwright browser binaries to use the playwright loader. 30 | pnpm i webforai@latest 31 | pnpm playwright install 32 | 33 | # or install with puppeteer browser binaries to use the puppeteer loader. 34 | pnpm i webforai@latest puppeteer 35 | ``` 36 | 37 | ```bash [yarn] 38 | # install core package 39 | yarn add webforai 40 | 41 | # or install with playwright browser binaries to use the playwright loader. 42 | yarn add webforai@latest 43 | npx playwright install 44 | 45 | # or install with puppeteer browser binaries to use the puppeteer loader.
46 | yarn add webforai@latest puppeteer 47 | ``` 48 | ::: 49 | 50 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/site/docs/public/images/logo-dark.png -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-pad-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-pad-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/site/docs/public/images/logo-light.png -------------------------------------------------------------------------------- /site/docs/styles.css: -------------------------------------------------------------------------------- 1 | @layer vocs_preflight { 2 | @tailwind base; 3 | } 4 | 5 | @tailwind components; 6 | @tailwind utilities; 7 | 8 | #home-install .vocs_CodeGroup { 9 | display: flex; 10 | height: 100%; 11 | flex-direction: column; 12 | } 13 | 14 | #home-install .vocs_Tabs_content { 15 | flex: 1; 16 | } 17 | 18 | #home-install .vocs_Code { 19 | font-size: 18px; 20 | } -------------------------------------------------------------------------------- /site/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "site", 3 | "version": "0.1.0", 4 | "type": "module", 5 | "private": true, 6 | "scripts": { 7 | "dev": "vocs dev", 8 | "build": "vocs build", 9 | "preview": "vocs preview", 10 | "worker:dev": "wrangler dev", 11 | "worker:deploy": "wrangler deploy", 12 | "cf-typegen": "wrangler types" 13 | }, 14 | "devDependencies": { 15 | "@ai-sdk/google": "^0.0.48", 16 | "@cloudflare/pages-plugin-vercel-og": "^0.1.2", 17 | "@cloudflare/workers-types": "^4.20241018.0", 18 | "@hono/zod-validator": "^0.4.1", 19 | "@types/hast": "^3.0.2", 20 | "@types/node": "^20.14.10", 21 | "@types/react": "^18.3.11", 22 | "@types/react-dom": "^18.3.1", 23 | "ai": "^3.4.7", 24 | "autoprefixer": "^10.4.20", 25 | "hast-util-select": "^6.0.2", 26 | "postcss": "^8.4.47", 27 | "react": "^18.3.1", 28 | "react-dom": "latest", 29 | "react-wrap-balancer": "^1.1.1", 30 | "tailwindcss": "^3.4.13", 31 | "typescript": "latest", 32 | "vocs": "1.0.0-alpha.61", 33 | "webforai": "workspace:*", 34 | "wrangler": "^3.81.0", 35 | "zod": "^3.23.8", 36 | "hono": "^4.6.5" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /site/postcss.config.js: 
-------------------------------------------------------------------------------- 1 | // biome-ignore lint/style/noDefaultExport: 2 | export default { 3 | plugins: { 4 | tailwindcss: {}, 5 | autoprefixer: {}, 6 | }, 7 | }; 8 | -------------------------------------------------------------------------------- /site/tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | // biome-ignore lint/style/noDefaultExport: tailwindcss requires default export 3 | export default { 4 | content: ["./docs/**/*.{js,ts,jsx,tsx,md,mdx}"], 5 | darkMode: "class", 6 | theme: { 7 | extend: {}, 8 | }, 9 | plugins: [], 10 | }; 11 | -------------------------------------------------------------------------------- /site/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "useDefineForClassFields": true, 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "module": "ESNext", 7 | "skipLibCheck": true, 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "noEmit": true, 15 | "jsx": "react-jsx", 16 | 17 | /* Linting */ 18 | "strict": true, 19 | "noUnusedLocals": true, 20 | "noUnusedParameters": true, 21 | "noFallthroughCasesInSwitch": true, 22 | "types": ["@cloudflare/workers-types"] 23 | }, 24 | "include": ["**/*.ts", "**/*.tsx"] 25 | } 26 | -------------------------------------------------------------------------------- /site/vocs.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vocs"; 2 | import { version } from "../packages/webforai/package.json"; 3 | 4 | // biome-ignore lint/style/noDefaultExport: This is a config file 5 | export default defineConfig({ 6 | title: "Webforai", 7 | description: "A esm-native library that converts HTML to Markdown.", 8 | baseUrl: "https://webforai.dev", 9 | logoUrl: { 10 | light: "/images/logo-light.png", 11 | dark: "/images/logo-dark.png", 12 | }, 13 | iconUrl: { 14 | light: "/images/logo-light.png", 15 | dark: "/images/logo-dark.png", 16 | }, 17 | editLink: { 18 | pattern: "https://github.com/inaridiy/webforai/edit/main/site/docs/pages/:path", 19 | text: "Suggest changes to this page", 20 | }, 21 | theme: { 22 | accentColor: { 23 | light: "#1f8fff", 24 | dark: "#4db8ff", 25 | }, 26 | }, 27 | ogImageUrl: { 28 | "/": "https://webforai.dev/api/ogp?logo=%logo&title=%title&description=%description", 29 | }, 30 | socials: [ 31 | { 32 | icon: "github", 33 | link: "https://github.com/inaridiy/webforai", 34 | }, 35 | { 36 | icon: "x", 37 | link: "https://twitter.com/inaridiy", 38 | }, 39 | ], 40 | topNav: [ 41 | { text: "Getting Started", link: "/getting-started" }, 42 | { text: "Cookbook", link: "/cookbook" }, 43 | { 44 | text: version, // <= should update automatically 45 | items: [ 46 | { 47 | text: "Releases", 48 | link: "https://github.com/inaridiy/webforai/releases", 49 | }, 50 | { 51 | text: "Contributing", 52 | link: "https://github.com/inaridiy/webforai", 53 | }, 54 | ], 55 | }, 56 | ], 57 | sidebar: [ 58 | { 59 | text: "Installation", 60 | link: "/installation", 61 | }, 62 | { 63 | text: "Getting Started", 64 | link: "/getting-started", 65 | }, 66 | { 67 | text: "How it works", 68 | link: "/how-it-works", 69 | }, 70 | { 71 | text: "API Reference", 72 | items: [ 73 | { 74 | text: "htmlToMarkdown", 75 | 
link: "/docs/html-to-markdown", 76 | }, 77 | { 78 | text: "htmlToMdast", 79 | link: "/docs/html-to-mdast", 80 | }, 81 | { 82 | text: "mdastToMarkdown", 83 | link: "/docs/mdast-to-markdown", 84 | }, 85 | { 86 | text: "loaders", 87 | link: "/docs/loaders", 88 | }, 89 | ], 90 | }, 91 | { 92 | text: "Cookbook", 93 | link: "/cookbook", 94 | 95 | items: [ 96 | { 97 | text: "Simple usage", 98 | link: "/cookbook/simple", 99 | }, 100 | { 101 | text: "Structured output", 102 | link: "/cookbook/structured-output", 103 | }, 104 | { 105 | text: "Translation", 106 | link: "/cookbook/translation", 107 | }, 108 | { 109 | text: "Custom extractor", 110 | link: "/cookbook/custom-extractor", 111 | }, 112 | { 113 | text: "With Cloudflare Workers", 114 | link: "/cookbook/cf-workers", 115 | }, 116 | ], 117 | }, 118 | ], 119 | sponsors: [ 120 | { 121 | name: "Personal", 122 | height: 60, 123 | items: [ 124 | [ 125 | { 126 | name: "ClankPan ∞", 127 | link: "https://x.com/ClankPan", 128 | image: "https://pbs.twimg.com/profile_images/1407277306414989315/iIZ-R1jd_400x400.jpg", 129 | }, 130 | ], 131 | ], 132 | }, 133 | ], 134 | }); 135 | -------------------------------------------------------------------------------- /site/worker-configuration.d.ts: -------------------------------------------------------------------------------- 1 | // Generated by Wrangler by running `wrangler types` 2 | 3 | interface Env { 4 | ASSETS: Fetcher; 5 | } 6 | -------------------------------------------------------------------------------- /site/workers/index.tsx: -------------------------------------------------------------------------------- 1 | import { ImageResponse } from "@cloudflare/pages-plugin-vercel-og/api"; 2 | import { zValidator } from "@hono/zod-validator"; 3 | import { Hono } from "hono"; 4 | import { z } from "zod"; 5 | 6 | const fetchImage = async (env: Env, url: string) => { 7 | const res = await env.ASSETS.fetch(url).then((r) => (r.status !== 404 ? 
r : fetch(url))); 8 | 9 | const contentType = res.headers.get("Content-Type") || "application/octet-stream"; 10 | const arrayBuffer = await res.arrayBuffer(); 11 | const base64String = btoa(String.fromCharCode(...new Uint8Array(arrayBuffer))); 12 | const dataURL = `data:${contentType};base64,${base64String}`; 13 | 14 | return dataURL; 15 | }; 16 | 17 | // biome-ignore lint/style/useNamingConvention: library definition 18 | const app = new Hono<{ Bindings: Env }>().get( 19 | "/api/ogp", 20 | zValidator( 21 | "query", 22 | z.object({ logo: z.string().optional(), title: z.string().optional(), description: z.string().optional() }), 23 | ), 24 | async (c) => { 25 | const { logo, title, description } = c.req.valid("query"); 26 | 27 | const logoDataUrl = logo && (await fetchImage(c.env, logo)); 28 | 29 | return new ImageResponse( 30 | 42 | {/* biome-ignore lint/a11y/useAltText: */} 43 | {logoDataUrl && } 44 | {title} 45 | {description && {description}} 46 | , 47 | { 48 | width: 1200, 49 | height: 630, 50 | }, 51 | ); 52 | }, 53 | ); 54 | 55 | // biome-ignore lint/style/noDefaultExport: worker 56 | export default app; 57 | 58 | // https://webforai.dev/ogp?logo=https://webforai.dev/images/logo-dark.png&title=Getting%20Started&description=hoge 59 | -------------------------------------------------------------------------------- /site/wrangler.toml: -------------------------------------------------------------------------------- 1 | #:schema node_modules/wrangler/config-schema.json 2 | name = "webforai-site" 3 | main = "workers/index.tsx" 4 | compatibility_date = "2024-10-18" 5 | compatibility_flags = ["nodejs_compat"] 6 | assets = { directory = "./docs/dist", binding = "ASSETS" } 7 | 8 | # Workers Logs 9 | # Docs: https://developers.cloudflare.com/workers/observability/logs/workers-logs/ 10 | # Configuration: https://developers.cloudflare.com/workers/observability/logs/workers-logs/#enable-workers-logs 11 | [observability] 12 | enabled = true 13 | -------------------------------------------------------------------------------- /vitest.config.ts: -------------------------------------------------------------------------------- 1 | /// 2 | import { defineConfig } from "vitest/config"; 3 | 4 | // biome-ignore lint/style/noDefaultExport: This is a configuration file 5 | export default defineConfig({ 6 | assetsInclude: ["**/*.html", "**/*.md"], 7 | test: {}, 8 | }); 9 | --------------------------------------------------------------------------------