├── .changeset ├── README.md └── config.json ├── .clinerules ├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ ├── ci.yaml │ └── release.yaml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── biome.json ├── examples ├── ai-learning │ ├── .env.example │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ ├── auto-optimize.ts │ │ ├── datasets.ts │ │ ├── index.ts │ │ ├── manual.ts │ │ └── utils.ts │ └── tsconfig.json ├── bench │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json ├── scraping │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json ├── simple │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json ├── translate │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ │ └── index.ts │ └── tsconfig.json └── worker │ ├── .gitignore │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ └── index.ts │ ├── tsconfig.json │ └── wrangler.toml ├── images ├── logo.webp └── voice-genius.png ├── package.json ├── packages └── webforai │ ├── CHANGELOG.md │ ├── README.md │ ├── build.ts │ ├── package.cjs.json │ ├── package.json │ ├── src │ ├── cli │ │ ├── bin.ts │ │ ├── commands │ │ │ └── webforai │ │ │ │ ├── index.ts │ │ │ │ └── loadHtml.ts │ │ ├── constants.ts │ │ ├── helpers │ │ │ ├── assertContinue.ts │ │ │ ├── inputOutputPath.ts │ │ │ ├── inputSourcePath.ts │ │ │ ├── selectExtractMode.ts │ │ │ └── selectLoader.ts │ │ └── utils.ts │ ├── constants.ts │ ├── extract-mdast.ts │ ├── extractors │ │ ├── index.ts │ │ ├── pipeExtractors.ts │ │ ├── presets │ │ │ ├── minimal-filter.ts │ │ │ ├── takumi.ts │ │ │ └── utils.ts │ │ └── types.ts │ ├── html-to-markdown.test.ts │ ├── html-to-markdown.ts │ ├── html-to-mdast.ts │ ├── index.ts │ ├── link-replacer.test.ts │ ├── link-replacer.ts │ ├── loaders │ │ ├── cf-puppeteer.ts │ │ ├── fetch.test.ts │ │ ├── fetch.ts │ │ ├── playwright.test.ts │ │ ├── playwright.ts │ │ ├── puppeteer.test.ts │ │ └── puppeteer.ts │ ├── md-splitter.ts │ ├── mdast-handlers │ │ ├── custom-a-handler.ts │ │ ├── custom-br-handler.ts │ │ ├── custom-code-handler.ts │ │ ├── custom-div-handler.ts │ │ ├── custom-img-handler.ts │ │ ├── custom-table-handler.ts │ │ ├── empty-handler.ts │ │ └── math-handler.ts │ ├── mdast-to-markdown.ts │ └── utils │ │ ├── common.ts │ │ ├── detect-code-lang.ts │ │ ├── hast-utils.ts │ │ └── mdast-utils.ts │ ├── tsconfig.build.json │ └── tsconfig.json ├── pnpm-lock.yaml ├── pnpm-workspace.yaml ├── site ├── CHANGELOG.md ├── README.md ├── docs │ ├── footer.tsx │ ├── pages │ │ ├── cookbook │ │ │ ├── cf-workers.mdx │ │ │ ├── custom-extractor.mdx │ │ │ ├── index.mdx │ │ │ ├── simple.mdx │ │ │ ├── structured-output.mdx │ │ │ └── translation.mdx │ │ ├── docs │ │ │ ├── html-to-markdown.mdx │ │ │ ├── html-to-mdast.mdx │ │ │ ├── loaders.mdx │ │ │ └── mdast-to-markdown.mdx │ │ ├── getting-started.mdx │ │ ├── how-it-works.mdx │ │ ├── index.mdx │ │ └── installation.mdx │ ├── public │ │ └── images │ │ │ ├── how-it-works.svg │ │ │ ├── logo-dark.png │ │ │ ├── logo-full-dark.svg │ │ │ ├── logo-full-light.svg │ │ │ ├── logo-full-pad-dark.svg │ │ │ ├── logo-full-pad-light.svg │ │ │ └── logo-light.png │ └── styles.css ├── package.json ├── postcss.config.js ├── tailwind.config.js ├── tsconfig.json ├── vocs.config.ts ├── worker-configuration.d.ts ├── workers │ └── index.tsx └── wrangler.toml └── vitest.config.ts /.changeset/README.md: 
-------------------------------------------------------------------------------- 1 | # Changesets 2 | 3 | Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works 4 | with multi-package repos, or single-package repos to help you version and publish your code. You can 5 | find the full documentation for it [in our repository](https://github.com/changesets/changesets) 6 | 7 | We have a quick list of common questions to get you started engaging with this project in 8 | [our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md) 9 | -------------------------------------------------------------------------------- /.changeset/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://unpkg.com/@changesets/config@3.0.1/schema.json", 3 | "changelog": ["@changesets/changelog-github", { "repo": "inaridiy/webforai" }], 4 | "commit": false, 5 | "fixed": [], 6 | "linked": [], 7 | "access": "public", 8 | "baseBranch": "main", 9 | "updateInternalDependencies": "patch", 10 | "ignore": [] 11 | } 12 | -------------------------------------------------------------------------------- /.clinerules: -------------------------------------------------------------------------------- 1 | { 2 | "project": { 3 | "name": "WebForAI", 4 | "description": "A library that converts HTML to Markdown with various loaders and extractors for AI consumption", 5 | "repository": "https://github.com/inaridiy/webforai", 6 | "homepage": "https://webforai.dev/" 7 | }, 8 | "structure": { 9 | "monorepo": true, 10 | "packageManager": "pnpm", 11 | "mainPackage": "packages/webforai", 12 | "directories": { 13 | "packages": { 14 | "description": "Contains the main WebForAI package", 15 | "patterns": ["packages/**"] 16 | }, 17 | "examples": { 18 | "description": "Example projects demonstrating WebForAI usage", 19 | "patterns": ["examples/**"] 20 | }, 21 | "site": { 22 | "description": "Documentation website", 23 | "patterns": ["site/**"] 24 | }, 25 | "apps": { 26 | "description": "Application implementations", 27 | "patterns": ["apps/**"] 28 | } 29 | } 30 | }, 31 | "capabilities": { 32 | "core": [ 33 | "HTML to Markdown conversion", 34 | "HTML to MDAST conversion", 35 | "MDAST to Markdown conversion", 36 | "Web content loading via various methods" 37 | ], 38 | "loaders": ["Playwright", "Puppeteer", "Cloudflare Puppeteer", "Fetch API"], 39 | "extractors": ["Content extraction presets", "Custom extraction pipelines"] 40 | }, 41 | "algorithms": { 42 | "htmlToMarkdown": { 43 | "description": "Main conversion pipeline that transforms HTML to Markdown", 44 | "flow": "HTML → HAST → MDAST → Markdown", 45 | "steps": [ 46 | "Parse HTML into HAST (HTML Abstract Syntax Tree)", 47 | "Apply content extractors to clean and focus on main content", 48 | "Transform HAST to MDAST (Markdown Abstract Syntax Tree)", 49 | "Convert MDAST to Markdown text with formatting options" 50 | ] 51 | }, 52 | "contentExtraction": { 53 | "description": "Intelligent algorithms to extract the main content from web pages", 54 | "implementations": [ 55 | { 56 | "name": "takumiExtractor", 57 | "description": "Advanced content extractor inspired by Mozilla Readability", 58 | "techniques": [ 59 | "Metadata filtering to remove scripts, styles, and other non-content elements", 60 | "Universal element filtering to remove navigation, asides, and hidden content", 61 | "Content selection using common article selectors", 62 | "Link density 
analysis to identify content-rich areas", 63 | "Language-specific content length thresholds" 64 | ] 65 | } 66 | ] 67 | }, 68 | "mdastHandlers": { 69 | "description": "Custom handlers for transforming specific HTML elements to Markdown", 70 | "handlers": [ 71 | "customAHandler: Enhanced link handling with text-only option", 72 | "customCodeHandler: Code block handling with language detection", 73 | "customDivHandler: Special div element processing", 74 | "customImgHandler: Image handling with hide option", 75 | "customTableHandler: Table processing with text-only option", 76 | "mathHandler: Mathematical notation conversion" 77 | ] 78 | }, 79 | "linkProcessing": { 80 | "description": "Utilities for handling and transforming links", 81 | "features": ["Relative to absolute URL conversion", "Base URL integration", "Link text extraction"] 82 | } 83 | }, 84 | "development": { 85 | "nodeVersion": ">=18.0.0", 86 | "commands": { 87 | "build": "pnpm run --r --filter \"./packages/**\" build", 88 | "test": "vitest", 89 | "format": "biome format .", 90 | "lint": "biome check ." 91 | }, 92 | "tools": ["TypeScript", "Biome", "Vitest", "Changesets"] 93 | }, 94 | "customModes": [ 95 | { 96 | "slug": "webforai-dev", 97 | "name": "WebForAI Developer", 98 | "roleDefinition": "You are Roo, a specialized developer for the WebForAI library. You understand HTML parsing, Markdown generation, and web content extraction techniques. You're familiar with the project's architecture including loaders, extractors, and MDAST/HAST transformations.", 99 | "groups": ["read", "edit", "browser", "command", "mcp"] 100 | }, 101 | { 102 | "slug": "webforai-docs", 103 | "name": "WebForAI Documentation", 104 | "roleDefinition": "You are Roo, a documentation specialist for the WebForAI library. You excel at creating clear, concise documentation with practical examples. 
You understand the library's capabilities and can explain complex concepts in an accessible way.", 105 | "groups": [ 106 | "read", 107 | ["edit", { "fileRegex": "\\.(md|mdx)$", "description": "Markdown and MDX files only" }], 108 | "browser", 109 | "command", 110 | "mcp" 111 | ] 112 | } 113 | ] 114 | } 115 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: inaridiy -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: ci 2 | on: 3 | push: 4 | branches: [main, develop] 5 | pull_request: 6 | branches: ['*'] 7 | paths-ignore: 8 | - 'docs/**' 9 | - '.vscode/**' 10 | - 'README.md' 11 | - '.gitignore' 12 | - 'LICENSE' 13 | 14 | jobs: 15 | lint: 16 | name: 'Lint' 17 | runs-on: ubuntu-22.04 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: pnpm/action-setup@v2 21 | name: Install pnpm 22 | id: pnpm-install 23 | with: 24 | version: 9.1.4 25 | run_install: true 26 | - run: pnpm format 27 | - run: pnpm lint 28 | - run: pnpm lint:repo 29 | - run: pnpm build 30 | test: 31 | name: 'Test' 32 | runs-on: ubuntu-22.04 33 | steps: 34 | - uses: actions/checkout@v4 35 | - uses: pnpm/action-setup@v2 36 | name: Install pnpm 37 | id: pnpm-install 38 | with: 39 | version: 9.1.4 40 | run_install: true 41 | - name: Install Playwright Browsers 42 | run: | 43 | pnpm install -w playwright 44 | pnpm exec playwright install chromium 45 | pnpm exec playwright install-deps 46 | - name: Run tests 47 | run: pnpm test -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Changesets 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | version: 13 | timeout-minutes: 15 14 | runs-on: ubuntu-latest 15 | permissions: 16 | contents: write 17 | id-token: write 18 | pull-requests: write 19 | steps: 20 | - uses: actions/checkout@v4 21 | - run: | 22 | echo "SKIP_SIMPLE_GIT_HOOKS=1" >> $GITHUB_ENV 23 | - uses: pnpm/action-setup@v2 24 | name: Install pnpm 25 | id: pnpm-install 26 | with: 27 | version: 9.1.4 28 | run_install: true 29 | - name: Setup npmrc 30 | run: echo "//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}" > .npmrc 31 | - name: create and publish versions 32 | uses: changesets/action@v1 33 | with: 34 | version: pnpm ci:version 35 | publish: pnpm ci:publish 36 | title: 'chore: version packages' 37 | commit: 'chore: version packages' 38 | env: 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 40 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | .npmrc 4 | .DS_Store 5 | .wrangler -------------------------------------------------------------------------------- /.vscode/settings.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "editor.defaultFormatter": "biomejs.biome", 3 | "editor.codeActionsOnSave": { 4 | "source.organizeImports.biome": "explicit" 5 | }, 6 | "[typescript]": { 7 | "editor.defaultFormatter": "biomejs.biome" 8 | }, 9 | "[json]": { 10 | "editor.defaultFormatter": "biomejs.biome" 11 | }, 12 | "markdown.preview.breaks": true, 13 | "[markdown]": { 14 | "editor.defaultFormatter": "esbenp.prettier-vscode" 15 | }, 16 | "[jsonc]": { 17 | "editor.defaultFormatter": "biomejs.biome" 18 | }, 19 | "[html]": { 20 | "editor.defaultFormatter": "vscode.html-language-features" 21 | }, 22 | "[javascript]": { 23 | "editor.defaultFormatter": "biomejs.biome" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | packages/webforai/README.md -------------------------------------------------------------------------------- /biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.5.3/schema.json", 3 | "files": { 4 | "ignore": ["worker-configuration.d.ts", "package.json"] 5 | }, 6 | "vcs": { 7 | "enabled": true, 8 | "clientKind": "git", 9 | "useIgnoreFile": true 10 | }, 11 | "organizeImports": { 12 | "enabled": true 13 | }, 14 | "formatter": { 15 | "enabled": true, 16 | "lineWidth": 120 17 | }, 18 | "linter": { 19 | "enabled": true, 20 | "rules": { 21 | "all": true, 22 | "style": { 23 | "useNamingConvention": { 24 | "level": "warn", 25 | "options": { 26 | "strictCase": false 27 | } 28 | } 29 | }, 30 | "correctness": { 31 | "noUndeclaredVariables": "off" 32 | }, 33 | "complexity": { 34 | "noExcessiveCognitiveComplexity": { 35 | "level": "error", 36 | "options": { 37 | "maxAllowedComplexity": 20 38 | } 39 | } 40 | } 41 | } 42 | }, 43 | 44 | "overrides": [ 45 | { 46 | "include": ["examples/**"], 47 | "ignore": ["**/*.json"], 48 | "linter": { 49 | "rules": { 50 | "recommended": true, 51 | "style": { 52 | "useNamingConvention": "off" 53 | }, 54 | "correctness": { 55 | "noUndeclaredVariables": "off" 56 | } 57 | } 58 | } 59 | } 60 | ] 61 | } 62 | -------------------------------------------------------------------------------- /examples/ai-learning/.env.example: -------------------------------------------------------------------------------- 1 | GOOGLE_GENERATIVE_AI_API_KEY -------------------------------------------------------------------------------- /examples/ai-learning/.gitignore: -------------------------------------------------------------------------------- 1 | .output 2 | .env 3 | .cache 4 | .output -------------------------------------------------------------------------------- /examples/ai-learning/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # bench 2 | 3 | ## 1.1.1 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.1.0 11 | 12 | ### Minor Changes 13 | 14 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Add minimal filter extractor 15 | 16 | ### Patch Changes 17 | 18 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 19 | - webforai@2.1.0 20 | 21 | ## 1.0.17 22 | 23 | ### Patch Changes 24 | 25 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 26 | - webforai@2.0.1 27 | 28 | ## 1.0.16 29 | 30 | ### Patch Changes 31 | 32 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 33 | 34 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 35 | - webforai@2.0.0 36 | 37 | ## 1.0.15 38 | 39 | ### Patch Changes 40 | 41 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 42 | - webforai@1.6.3 43 | 44 | ## 1.0.14 45 | 46 | ### Patch Changes 47 | 48 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 49 | - webforai@1.6.2 50 | 51 | ## 1.0.13 52 | 53 | ### Patch Changes 54 | 55 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 56 | - webforai@1.6.1 57 | 58 | ## 1.0.12 59 | 60 | ### Patch Changes 61 | 62 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 63 | 64 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 65 | - webforai@1.6.0 66 | 67 | ## 1.0.11 68 | 69 | ### Patch Changes 70 | 71 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 72 | - webforai@1.5.1 73 | 74 | ## 1.0.10 75 | 76 | ### Patch Changes 77 | 78 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 79 | - webforai@1.5.0 80 | 81 | ## 1.0.9 82 | 83 | ### Patch Changes 84 | 85 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 86 | - webforai@1.4.1 87 | 88 | ## 1.0.8 89 | 90 | ### Patch Changes 91 | 92 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 93 | - webforai@1.4.0 94 | 95 | ## 1.0.7 96 | 97 | ### Patch Changes 98 | 99 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 100 | - webforai@1.3.3 101 | 102 | ## 1.0.6 103 | 104 | ### Patch Changes 105 | 106 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 107 | - webforai@1.3.2 108 | 109 | ## 1.0.5 110 | 111 | ### Patch Changes 112 | 113 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 114 | - webforai@1.3.1 115 | 116 | ## 1.0.4 117 | 118 | ### Patch Changes 119 | 120 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 121 | 122 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 123 | - webforai@1.3.0 124 | 125 | ## 1.0.3 126 | 127 | ### Patch Changes 128 | 129 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 130 | 131 | ## 1.0.2 132 | 133 | ### Patch Changes 134 | 135 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 136 | 137 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 138 | 139 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 140 | - webforai@1.2.3 141 | 142 | ## 1.0.1 143 | 144 | ### Patch Changes 145 | 146 | - 920f310: Update Linter and Workflows 147 | - Updated dependencies [920f310] 148 | - webforai@1.2.2 149 | -------------------------------------------------------------------------------- /examples/ai-learning/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ai-learning", 3 | "version": "1.1.1", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "@ai-sdk/google": "^0.0.48", 16 | "ai": "^3.4.7", 17 | "arg": "^5.0.2", 18 | "dotenv": "^16.4.5", 19 | "hast-util-from-html": "^2.0.3", 20 | "hast-util-select": "^6.0.2", 21 | "hast-util-to-html": "^9.0.3", 22 | "hast-util-to-string": "^3.0.0", 23 | "playwright": "^1.40.1", 24 | "tsx": "^4.19.1", 25 | "unist-util-filter": "^5.0.1", 26 | "webforai": "workspace:^", 27 | "zod": "^3.23.8" 28 | }, 29 | "devDependencies": { 30 | "@tsconfig/recommended": "^1.0.3", 31 | "@types/hast": "^3.0.2", 32 | "@types/node": "^20.14.10", 33 | "typescript": "^5.4.5" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /examples/ai-learning/src/auto-optimize.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import { google } from "@ai-sdk/google"; 3 | import { generateObject } from "ai"; 4 | import dotenv from "dotenv"; 5 | import type { Element } from "hast"; 6 | import { fromHtml } from "hast-util-from-html"; 7 | import { toHtml } from "hast-util-to-html"; 8 | import { tsImport } from "tsx/esm/api"; 9 | import { filter } from "unist-util-filter"; 10 | import { htmlToMarkdown } from "webforai"; 11 | import { z } from "zod"; 12 | import { persitCachedLoadHtml } from "./utils.js"; 13 | 14 | dotenv.config(); 15 | 16 | const target = "https://github.com/wevm/viem/issues/2658"; 17 | const html = await persitCachedLoadHtml(target); 18 | 19 | const htmlToMarkdownWithGenerated = async (html: string, generatedPath: string, parentPath: string) => { 20 | try { 21 | const { extractor: generatedExtractor } = await tsImport(generatedPath, parentPath); 22 | return htmlToMarkdown(html, { baseUrl: target, extractors: [generatedExtractor] }); 23 | } catch (e) { 24 | console.info("Failed to load generated extractor, using default extractor", e); 25 | return htmlToMarkdown(html, { baseUrl: target, extractors: false }); 26 | } 27 | }; 28 | 29 | const generateExtractor = async ( 30 | html: string, 31 | rawMarkdown: string, 32 | userRequirements: string, 33 | genericAlgorithm: string, 34 | ) => { 35 | const result = await generateObject({ 36 | model: google("gemini-1.5-pro-latest"), 37 | schema: z.object({ code: z.string() }), 38 | prompt: `You are tasked with implementing an algorithm to extract the main content from HTML during the process of converting HTML to Markdown. Your goal is to create a TypeScript function that takes in HTML and other parameters, and returns a filtered HTML Abstract Syntax Tree (HAST) containing only the main content. 
39 | 40 | You will be working with the following inputs: 41 | 42 | 1. HTML content: 43 | 44 | ${html} 45 | 46 | 47 | 2. Raw Markdown converted from the HTML without content extraction: 48 | 49 | ${rawMarkdown} 50 | 51 | 52 | 3. User requirements for content extraction: 53 | 54 | ${userRequirements} 55 | 56 | 57 | 4. A generic content extraction algorithm for reference: 58 | 59 | ${genericAlgorithm} 60 | 61 | 62 | You may use the following libraries in your implementation: 63 | - unist-util-filter 64 | - hast-util-to-string 65 | - hast-util-select 66 | 67 | Your task is to implement the following function: 68 | 69 | \`\`\`typescript 70 | type ExtractParams = { hast: Hast; lang?: string; url?: string }; 71 | 72 | export const extractor = (params: ExtractParams): Hast => { 73 | // Your implementation here 74 | } 75 | \`\`\` 76 | 77 | Throughout your implementation, use comments to explain your reasoning and approach. Consider edge cases and potential issues that may arise with different types of HTML structures. 78 | 79 | After implementing the extractor function, provide a brief explanation of how to test and refine the algorithm using sample HTML inputs and user requirements. 80 | 81 | Write your complete TypeScript implementation, including imports, helper functions, and the main extractor function.`, 82 | }); 83 | 84 | return result.object.code; 85 | }; 86 | 87 | const rawContent = htmlToMarkdown(html, { baseUrl: target, extractors: false }); 88 | const simpleExtractedHtml = toHtml( 89 | filter(fromHtml(rawContent), (node) => { 90 | return !( 91 | ["comment", "doctype"].includes(node.type) || 92 | (node.type === "element" && 93 | ["script", "style", "link", "meta", "noscript", "svg", "title"].includes((node as Element).tagName)) 94 | ); 95 | }) ?? 
[], 96 | ); 97 | 98 | const userRequirements = "Issueの議論のみ抽出してください"; 99 | const exampleCode = await fs.readFile("./.output/example-extractor.ts", "utf-8"); 100 | 101 | const extractor = await generateExtractor(simpleExtractedHtml, rawContent, userRequirements, exampleCode); 102 | 103 | await fs.writeFile("./.output/generated-extractors.ts", extractor); 104 | 105 | const extractedContent = await htmlToMarkdownWithGenerated(html, "../.output/generated-extractors.ts", import.meta.url); 106 | 107 | await fs.writeFile("./.output/extracted-content.md", extractedContent); 108 | await fs.writeFile("./.output/raw-content.md", rawContent); 109 | -------------------------------------------------------------------------------- /examples/ai-learning/src/datasets.ts: -------------------------------------------------------------------------------- 1 | export const TECH_DOCUMENTS = [ 2 | "https://react.dev/", 3 | "https://react.dev/learn", 4 | "https://nextjs.org/", 5 | "https://nextjs.org/showcase", 6 | "https://nextjs.org/docs", 7 | "https://nextjs.org/docs/app/building-your-application/routing/dynamic-routes", 8 | "https://docs.expo.dev/", 9 | "https://docs.expo.dev/tutorial/introduction/", 10 | "https://vuejs.org/", 11 | "https://vuejs.org/guide/introduction.html", 12 | "https://hono.dev/", 13 | "https://hono.dev/docs/", 14 | "https://esbuild.github.io/getting-started/", 15 | "https://vitejs.dev/config/", 16 | "https://tailwindcss.com/", 17 | "https://tailwindcss.com/docs/installation", 18 | "https://ui.shadcn.com/docs", 19 | "https://ui.shadcn.com/docs/components/select", 20 | "https://orm.drizzle.team/docs/overview", 21 | "https://orm.drizzle.team/docs/rqb", 22 | "https://developers.cloudflare.com/pages/framework-guides/deploy-a-hono-site/", 23 | "https://emotion.sh/docs/introduction", 24 | "https://jotai.org/", 25 | "https://clerk.com/docs/quickstarts/nextjs", 26 | "https://www.prisma.io/docs/orm/overview/introduction/what-is-prisma", 27 | "https://www.npmjs.com/package/webforai", 28 | ]; 29 | 30 | export const ARTICLES = [ 31 | "https://blog.cloudflare.com/", 32 | "https://blog.cloudflare.com/more-npm-packages-on-cloudflare-workers-combining-polyfills-and-native-code/", 33 | "https://gigazine.net/", 34 | "https://gigazine.net/news/20240917-synchron-brain-computer-interface-alexa/", 35 | "https://dev.classmethod.jp/", 36 | "https://dev.classmethod.jp/articles/gha-volta-error-could-not-unpack-node/", 37 | "https://zenn.dev/", 38 | "https://zenn.dev/inaridiy/articles/f1ed9e73cb182b", 39 | "https://ics.media/", 40 | "https://ics.media/entry/231120/", 41 | "https://saruwakakun.com/html-css/basic", 42 | "https://saruwakakun.com/html-css/basic/tools", 43 | "https://qiita.com/", 44 | "https://qiita.com/Tadataka_Takahashi/items/556e0277017677cef68a", 45 | "https://www.wikipedia.org/", 46 | "https://ja.wikipedia.org/wiki/%E6%9C%A8%E6%9D%91%E6%8B%93%E5%93%89", 47 | ]; 48 | 49 | export const NEWS = [ 50 | "https://www.nytimes.com/international/", 51 | "https://www.wsj.com/", 52 | "https://www.cnn.co.jp/usa/35223960.html", 53 | "https://www.bbc.com/news", 54 | "https://www.bbc.com/news/articles/cx2kdd3n7yqo", 55 | "https://www3.nhk.or.jp/news/html/20240329/k10014405791000.html", 56 | ]; 57 | 58 | export const EC_SITE = [ 59 | "https://www.amazon.co.jp/dp/B08ZSHSFXQ", 60 | "https://store.shopping.yahoo.co.jp/gimi1225/p2750.html?sc_i=shopping-pc-web-top--pm_mod-itm_1", 61 | ]; 62 | -------------------------------------------------------------------------------- /examples/ai-learning/src/index.ts: 
-------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import dotenv from "dotenv"; 3 | import { htmlToMarkdown } from "webforai"; 4 | import { ARTICLES, EC_SITE, NEWS, TECH_DOCUMENTS } from "./datasets.js"; 5 | import { persitCachedLoadHtml, scoreMarkdown } from "./utils.js"; 6 | 7 | dotenv.config(); 8 | 9 | const TARGETS = [...ARTICLES, ...EC_SITE, ...NEWS, ...TECH_DOCUMENTS]; 10 | 11 | const contents: { url: string; html: string; extractedContent: string; rawContent: string }[] = []; 12 | 13 | for (const url of TARGETS) { 14 | const html = await persitCachedLoadHtml(url); 15 | 16 | const extractedContent = htmlToMarkdown(html, { baseUrl: url }); 17 | const rawContent = htmlToMarkdown(html, { baseUrl: url, extractors: false }); 18 | 19 | contents.push({ url, html, extractedContent, rawContent }); 20 | } 21 | 22 | const scores: { url: string; score: number; issues: string[] }[] = []; 23 | 24 | for (const content of contents) { 25 | const result = await scoreMarkdown(content); 26 | 27 | console.info(`${content.url} - ${result.object.score}`); 28 | scores.push({ url: content.url, score: result.object.score, issues: result.object.issues }); 29 | } 30 | 31 | console.info(scores); 32 | await fs.mkdirSync("./output", { recursive: true }); 33 | await fs.writeFileSync("./output/scores.json", JSON.stringify(scores, null, 2)); 34 | console.info(`Avg Score: ${scores.reduce((acc, curr) => acc + curr.score, 0) / scores.length}`); 35 | -------------------------------------------------------------------------------- /examples/ai-learning/src/manual.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import dotenv from "dotenv"; 3 | import { chromium } from "playwright"; 4 | import { htmlToMarkdown } from "webforai"; 5 | 6 | dotenv.config(); 7 | 8 | const url = "https://ui.shadcn.com/docs"; 9 | const loadHtml = async (url: string) => { 10 | const browser = await chromium.launch({ headless: true }); 11 | const context = await browser.newContext({ 12 | userAgent: 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 14 | viewport: { width: 1920, height: 1080 }, 15 | deviceScaleFactor: 1, 16 | hasTouch: false, 17 | isMobile: false, 18 | javaScriptEnabled: true, 19 | locale: "en-US", 20 | timezoneId: "America/New_York", 21 | }); 22 | 23 | // Hide WebDriver characteristics 24 | await context.addInitScript(() => { 25 | Object.defineProperty(navigator, "webdriver", { get: () => undefined }); 26 | Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] }); 27 | Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3, 4, 5] }); 28 | }); 29 | 30 | const page = await context.newPage(); 31 | await page.route("**/*.js", (route) => { 32 | if (route.request().url().includes("captcha-delivery")) { 33 | return route.abort(); 34 | } 35 | return route.continue(); 36 | }); 37 | 38 | await page.goto(url, { waitUntil: "networkidle", timeout: 10_000 }).catch(() => { 39 | /** */ 40 | }); 41 | const html = await page.content(); 42 | await page.close(); 43 | await browser.close(); 44 | 45 | return html; 46 | }; 47 | 48 | const html = await loadHtml(url); 49 | 50 | await fs.mkdirSync(".output", { recursive: true }); 51 | 52 | await fs.writeFileSync(".output/html.html", html); 53 | 54 | const rawContent = await htmlToMarkdown(html, { baseUrl: url, extractors: false }); 55 | const cleanedContent = await htmlToMarkdown(html, {
baseUrl: url }); 56 | 57 | await fs.writeFileSync(".output/raw.md", rawContent); 58 | await fs.writeFileSync(".output/cleaned.md", cleanedContent); 59 | -------------------------------------------------------------------------------- /examples/ai-learning/src/utils.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import path from "node:path"; 3 | import { google } from "@ai-sdk/google"; 4 | import { generateObject } from "ai"; 5 | import { tsImport } from "tsx/esm/api"; 6 | import { htmlToMarkdown } from "webforai"; 7 | import { loadHtml } from "webforai/loaders/playwright"; 8 | import { z } from "zod"; 9 | 10 | export const persitCachedLoadHtml = async (url: string) => { 11 | const cacheDir = ".cache"; 12 | await fs.mkdir(cacheDir, { recursive: true }); 13 | const cachePath = path.join(cacheDir, `${url.replace(/[^a-zA-Z0-9]/g, "_")}.txt`); 14 | if (await fs.stat(cachePath).catch(() => false)) { 15 | return fs.readFile(cachePath, "utf-8"); 16 | } 17 | const html = await loadHtml(url, { superBypassMode: true }); 18 | await fs.writeFile(cachePath, html); 19 | return html; 20 | }; 21 | 22 | export const htmlToMarkdownWithGenerated = async ( 23 | url: string, 24 | html: string, 25 | generatedPath: string, 26 | parentPath: string, 27 | ) => { 28 | try { 29 | const { extractor: generatedExtractor } = await tsImport(generatedPath, parentPath); 30 | return htmlToMarkdown(html, { baseUrl: url, extractors: [generatedExtractor] }); 31 | } catch { 32 | return htmlToMarkdown(html, { baseUrl: url, extractors: false }); 33 | } 34 | }; 35 | 36 | export const scoreMarkdown = async (content: { rawContent: string; extractedContent: string }) => { 37 | const result = await generateObject({ 38 | model: google("gemini-1.5-flash-latest"), 39 | temperature: 0, 40 | schema: z.object({ 41 | analysis: z.string().describe("Detailed analysis of the cleaning process. 400 characters max."), 42 | issues: z.array(z.string()).describe("List of issues found in the cleaned Markdown. 12 issues max."), 43 | score: z.number().min(0).max(100), 44 | }), 45 | prompt: ` 46 | You are tasked with evaluating the effectiveness of an algorithm that extracts the main content from a website's HTML and converts it to Markdown format. Your goal is to compare the original Markdown output (which includes all content) with a cleaned version that attempts to remove unnecessary elements like advertisements and navigation. 47 | 48 | First, you will be presented with the original Markdown content: 49 | 50 | 51 | ${content.rawContent} 52 | 53 | 54 | Next, you will see the cleaned Markdown content: 55 | 56 | 57 | ${content.extractedContent} 58 | 59 | 60 | Compare these two versions carefully. Your task is to evaluate how accurately the cleaning process has extracted only the main content, removing unnecessary elements while preserving the essential information. 61 | 62 | When evaluating, consider the following criteria: 63 | 1. Removal of advertisements 64 | 2. Removal of navigation elements 65 | 3. Removal of sidebars or other non-essential sections 66 | 4. Preservation of the main article or content 67 | 5. Preservation of important headings and subheadings 68 | 6. Preservation of relevant images or media 69 | 7. 
Maintenance of the content's logical flow and structure 70 | 71 | Based on these criteria, assign a score from 0 to 100, where 100 represents perfect extraction of only the main content, and 0 represents no improvement or significant loss of important content. 72 | 73 | In addition to the score, identify any problems or issues you notice in the cleaned version. List these problems in bullet points, adjusting the granularity to provide a maximum of 12 points.`, 74 | }).catch((err) => { 75 | console.error(err); 76 | return { object: { score: -1, issues: [], analysis: "" } }; 77 | }); 78 | 79 | return result; 80 | }; 81 | -------------------------------------------------------------------------------- /examples/ai-learning/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/bench/.gitignore: -------------------------------------------------------------------------------- 1 | .output -------------------------------------------------------------------------------- /examples/bench/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # bench 2 | 3 | ## 1.0.19 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.0.18 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 1.0.17 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 1.0.16 25 | 26 | ### Patch Changes 27 | 28 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 29 | 30 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 31 | - webforai@2.0.0 32 | 33 | ## 1.0.15 34 | 35 | ### Patch Changes 36 | 37 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 38 | - webforai@1.6.3 39 | 40 | ## 1.0.14 41 | 42 | ### Patch Changes 43 | 44 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 45 | - webforai@1.6.2 46 | 47 | ## 1.0.13 48 | 49 | ### Patch Changes 50 | 51 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 52 | - webforai@1.6.1 53 | 54 | ## 1.0.12 55 | 56 | ### Patch Changes 57 | 58 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 59 | 60 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 61 | - webforai@1.6.0 62 | 63 | ## 1.0.11 64 | 65 | ### Patch Changes 66 | 67 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 68 | - webforai@1.5.1 69 | 70 | ## 1.0.10 71 | 72 | ### Patch Changes 73 | 74 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 75 | - webforai@1.5.0 76 | 77 | ## 1.0.9 78 | 79 | ### Patch Changes 80 | 81 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 82 | - webforai@1.4.1 83 | 84 | ## 1.0.8 85 | 86 | ### Patch Changes 87 | 88 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 89 | - webforai@1.4.0 90 | 91 | ## 1.0.7 92 | 93 | ### Patch Changes 94 | 95 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 96 | - webforai@1.3.3 97 | 98 | ## 1.0.6 99 | 100 | ### Patch Changes 101 | 102 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 103 | - webforai@1.3.2 104 | 105 | ## 1.0.5 106 | 107 | ### Patch Changes 108 | 109 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 110 | - webforai@1.3.1 111 | 112 | ## 1.0.4 113 | 114 | ### Patch Changes 115 | 116 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 117 | 118 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 119 | - webforai@1.3.0 120 | 121 | ## 1.0.3 122 | 123 | ### Patch Changes 124 | 125 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 126 | 127 | ## 1.0.2 128 | 129 | ### Patch Changes 130 | 131 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 132 | 133 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 134 | 135 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 136 | - webforai@1.2.3 137 | 138 | ## 1.0.1 139 | 140 | ### Patch Changes 141 | 142 | - 920f310: Update Linter and Workflows 143 | - Updated dependencies [920f310] 144 | - webforai@1.2.2 145 | -------------------------------------------------------------------------------- /examples/bench/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bench", 3 | "version": "1.0.19", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "arg": "^5.0.2", 16 | "playwright": "^1.40.1", 17 | "tsx": "^4.19.1", 18 | "webforai": "workspace:^" 19 | }, 20 | "devDependencies": { 21 | "@tsconfig/recommended": "^1.0.3", 22 | "@types/node": "^20.14.10", 23 | "typescript": "^5.4.5" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /examples/bench/src/index.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import { htmlToMarkdown } from "webforai"; 3 | import { loadHtml } from "webforai/loaders/playwright"; 4 | 5 | await fs.mkdir(".output", { recursive: true }); 6 | 7 | const id = Date.now(); 8 | await fs.mkdir(`.output/${id}`, { recursive: true }); 9 | 10 | const targets = [ 11 | "https://nextjs.org/docs/app/building-your-application/routing/pages-and-layouts", 12 | "https://ja.wikipedia.org/wiki/%E6%9C%A8%E6%9D%91%E6%8B%93%E5%93%89", 13 | "https://zenn.dev/frontendflat/articles/9d15b1b7abd524", 14 | "https://zenn.dev/dmmdata/articles/694e32c34dbd4c", 15 | "https://www3.nhk.or.jp/news/html/20240329/k10014405791000.html", 16 | "https://gigazine.net/", 17 | "https://www.npmjs.com/package/webforai", 18 | "https://developers.cloudflare.com/browser-rendering/get-started/reuse-sessions/", 19 | "https://news.livedoor.com/topics/detail/26152830", 20 | "https://viem.sh/docs/actions/public/getLogs.html", 21 | "https://www.google.com/search?q=%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF&oq=%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQLhhA0gEIMTM0OGowajSoAgCwAgE&sourceid=chrome&ie=UTF-8", 22 | "https://www.amazon.co.jp/Hold-On-Holdon-Q1J-%E3%83%8A%E3%82%A4%E3%83%88%E3%83%96%E3%83%AB%E3%83%BC/dp/B0872VRY3K/?_encoding=UTF8&ref_=pd_gw_ci_mcx_mr_hp_atf_m", 23 | ]; 24 | 25 | for (const url of targets) { 26 | const html = await loadHtml(url); 27 | await fs.writeFile(`.output/${id}/${url.split("/").slice(-1)[0]}.html`, html); 28 | 29 | const markdown = htmlToMarkdown(html, { 30 | baseUrl: url, 31 | extractors: "takumi", 32 | linkAsText: true, 33 | tableAsText: true, 34 | hideImage: true, 35 | }); 36 | 37 | await fs.writeFile(`.output/${id}/${url.split("/").slice(-1)[0]}.md`, markdown); 38 | } 39 | -------------------------------------------------------------------------------- /examples/bench/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | 
-------------------------------------------------------------------------------- /examples/scraping/.gitignore: -------------------------------------------------------------------------------- 1 | .output 2 | .env -------------------------------------------------------------------------------- /examples/scraping/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # scraping 2 | 3 | ## 1.0.19 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.0.18 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 1.0.17 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 1.0.16 25 | 26 | ### Patch Changes 27 | 28 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 29 | - webforai@2.0.0 30 | 31 | ## 1.0.15 32 | 33 | ### Patch Changes 34 | 35 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 36 | - webforai@1.6.3 37 | 38 | ## 1.0.14 39 | 40 | ### Patch Changes 41 | 42 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 43 | - webforai@1.6.2 44 | 45 | ## 1.0.13 46 | 47 | ### Patch Changes 48 | 49 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 50 | - webforai@1.6.1 51 | 52 | ## 1.0.12 53 | 54 | ### Patch Changes 55 | 56 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 57 | 58 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 59 | - webforai@1.6.0 60 | 61 | ## 1.0.11 62 | 63 | ### Patch Changes 64 | 65 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 66 | - webforai@1.5.1 67 | 68 | ## 1.0.10 69 | 70 | ### Patch Changes 71 | 72 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 73 | - webforai@1.5.0 74 | 75 | ## 1.0.9 76 | 77 | ### Patch Changes 78 | 79 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 80 | - webforai@1.4.1 81 | 82 | ## 1.0.8 83 | 84 | ### Patch Changes 85 | 86 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 87 | - webforai@1.4.0 88 | 89 | ## 1.0.7 90 | 91 | ### Patch Changes 92 | 93 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 94 | - webforai@1.3.3 95 | 96 | ## 1.0.6 97 | 98 | ### Patch Changes 99 | 100 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 101 | - webforai@1.3.2 102 | 103 | ## 1.0.5 104 | 105 | ### Patch Changes 106 | 107 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 108 | - webforai@1.3.1 109 | 110 | ## 1.0.4 111 | 112 | ### Patch Changes 113 | 114 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 115 | 116 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 117 | - webforai@1.3.0 118 | 119 | ## 1.0.3 120 | 121 | ### Patch Changes 122 | 123 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 124 | 125 | ## 1.0.2 126 | 127 | ### Patch Changes 128 | 129 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 130 | 131 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 132 | 133 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 134 | - webforai@1.2.3 135 | 136 | ## 1.0.1 137 | 138 | ### Patch Changes 139 | 140 | - 920f310: Update Linter and Workflows 141 | - Updated dependencies [920f310] 142 | - webforai@1.2.2 143 | -------------------------------------------------------------------------------- /examples/scraping/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scraping", 3 | "version": "1.0.19", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "arg": "^5.0.2", 16 | "dotenv": "^16.4.5", 17 | "openai": "^4.29.1", 18 | "playwright": "^1.40.1", 19 | "webforai": "workspace:^" 20 | }, 21 | "devDependencies": { 22 | "@tsconfig/recommended": "^1.0.3", 23 | "@types/node": "^20.14.10", 24 | "tsx": "^4.19.1", 25 | "typescript": "^5.4.5" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /examples/scraping/src/index.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import dotenv from "dotenv"; 3 | import { OpenAI } from "openai"; 4 | import { htmlToMarkdown } from "webforai"; 5 | import { loadHtml } from "webforai/loaders/playwright"; 6 | 7 | dotenv.config(); 8 | 9 | const openai = new OpenAI({ 10 | apiKey: process.env.OPENAI_API_KEY, 11 | }); 12 | 13 | await fs.mkdir(".output", { recursive: true }); 14 | 15 | const packages = [ 16 | "https://www.npmjs.com/package/webforai", 17 | "https://crates.io/crates/openai", 18 | "https://github.com/openai/openai-python", 19 | ]; 20 | 21 | const scrapedPackages = []; 22 | for (const packageUrl of packages) { 23 | const html = await loadHtml(packageUrl); 24 | const markdown = htmlToMarkdown(html, { baseUrl: packageUrl }); 25 | 26 | const prompt = `Extract the JSON information from the package's Markdown documentation according to the schema below. 27 | 28 | \`\`\`json 29 | { 30 | "name": "package-name", 31 | "description": "package-description", 32 | "language": "package-language", 33 | "license": "package-license", 34 | } 35 | \`\`\` 36 | 37 | --- 38 | ${markdown} 39 | `; 40 | 41 | const response = await openai.chat.completions.create({ 42 | model: "gpt-3.5-turbo-0125", 43 | response_format: { type: "json_object" }, 44 | messages: [{ role: "user", content: prompt }], 45 | }); 46 | const json = JSON.parse(response.choices[0].message.content ?? 
""); 47 | scrapedPackages.push(json); 48 | } 49 | 50 | await fs.writeFile(".output/scraped-packages.json", JSON.stringify(scrapedPackages, null, 2)); 51 | -------------------------------------------------------------------------------- /examples/scraping/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/simple/.gitignore: -------------------------------------------------------------------------------- 1 | .output -------------------------------------------------------------------------------- /examples/simple/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # simple 2 | 3 | ## 1.1.1 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.1.0 11 | 12 | ### Minor Changes 13 | 14 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! - Add minimal filter extractor 15 | 16 | ### Patch Changes 17 | 18 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 19 | - webforai@2.1.0 20 | 21 | ## 1.0.17 22 | 23 | ### Patch Changes 24 | 25 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 26 | - webforai@2.0.1 27 | 28 | ## 1.0.16 29 | 30 | ### Patch Changes 31 | 32 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 33 | - webforai@2.0.0 34 | 35 | ## 1.0.15 36 | 37 | ### Patch Changes 38 | 39 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 40 | - webforai@1.6.3 41 | 42 | ## 1.0.14 43 | 44 | ### Patch Changes 45 | 46 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 47 | - webforai@1.6.2 48 | 49 | ## 1.0.13 50 | 51 | ### Patch Changes 52 | 53 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 54 | - webforai@1.6.1 55 | 56 | ## 1.0.12 57 | 58 | ### Patch Changes 59 | 60 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! 
- improve seido 61 | 62 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 63 | - webforai@1.6.0 64 | 65 | ## 1.0.11 66 | 67 | ### Patch Changes 68 | 69 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 70 | - webforai@1.5.1 71 | 72 | ## 1.0.10 73 | 74 | ### Patch Changes 75 | 76 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 77 | - webforai@1.5.0 78 | 79 | ## 1.0.9 80 | 81 | ### Patch Changes 82 | 83 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 84 | - webforai@1.4.1 85 | 86 | ## 1.0.8 87 | 88 | ### Patch Changes 89 | 90 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 91 | - webforai@1.4.0 92 | 93 | ## 1.0.7 94 | 95 | ### Patch Changes 96 | 97 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 98 | - webforai@1.3.3 99 | 100 | ## 1.0.6 101 | 102 | ### Patch Changes 103 | 104 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 105 | - webforai@1.3.2 106 | 107 | ## 1.0.5 108 | 109 | ### Patch Changes 110 | 111 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 112 | - webforai@1.3.1 113 | 114 | ## 1.0.4 115 | 116 | ### Patch Changes 117 | 118 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 119 | 120 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 121 | - webforai@1.3.0 122 | 123 | ## 1.0.3 124 | 125 | ### Patch Changes 126 | 127 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 128 | 129 | ## 1.0.2 130 | 131 | ### Patch Changes 132 | 133 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 134 | 135 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 136 | 137 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 138 | - webforai@1.2.3 139 | 140 | ## 1.0.1 141 | 142 | ### Patch Changes 143 | 144 | - 920f310: Update Linter and Workflows 145 | - Updated dependencies [920f310] 146 | - webforai@1.2.2 147 | -------------------------------------------------------------------------------- /examples/simple/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "simple", 3 | "version": "1.1.1", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "arg": "^5.0.2", 16 | "playwright": "^1.40.1", 17 | "webforai": "workspace:^" 18 | }, 19 | "devDependencies": { 20 | "@tsconfig/recommended": "^1.0.3", 21 | "@types/node": "^20.14.10", 22 | "tsx": "^4.19.1", 23 | "typescript": "^5.4.5" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /examples/simple/src/index.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from "node:fs"; 2 | import arg from "arg"; 3 | import { htmlToMarkdown } from "webforai"; 4 | import { loadHtml } from "webforai/loaders/playwright"; 5 | 6 | await fs.mkdir(".output", { recursive: true }); 7 | 8 | const args = arg({ "--url": String }); 9 | 10 | const url = args["--url"] ?? "https://webforai.dev/"; 11 | 12 | const html = await loadHtml(url); 13 | 14 | await fs.writeFile(".output/output.html", html); 15 | 16 | const rawMarkdown = htmlToMarkdown(html, { baseUrl: url, extractors: false }); 17 | 18 | await fs.writeFile(".output/output.raw.md", rawMarkdown); 19 | 20 | const markdown = htmlToMarkdown(html, { baseUrl: url }); 21 | 22 | await fs.writeFile(".output/output.md", markdown); 23 | -------------------------------------------------------------------------------- /examples/simple/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/translate/.gitignore: -------------------------------------------------------------------------------- 1 | .output 2 | .env -------------------------------------------------------------------------------- /examples/translate/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # translate 2 | 3 | ## 1.0.20 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 1.0.19 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 1.0.18 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 1.0.17 25 | 26 | ### 
Patch Changes 27 | 28 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 29 | 30 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 31 | - webforai@2.0.0 32 | 33 | ## 1.0.16 34 | 35 | ### Patch Changes 36 | 37 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 38 | - webforai@1.6.3 39 | 40 | ## 1.0.15 41 | 42 | ### Patch Changes 43 | 44 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 45 | - webforai@1.6.2 46 | 47 | ## 1.0.14 48 | 49 | ### Patch Changes 50 | 51 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 52 | - webforai@1.6.1 53 | 54 | ## 1.0.13 55 | 56 | ### Patch Changes 57 | 58 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! - improve seido 59 | 60 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 61 | - webforai@1.6.0 62 | 63 | ## 1.0.12 64 | 65 | ### Patch Changes 66 | 67 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 68 | - webforai@1.5.1 69 | 70 | ## 1.0.11 71 | 72 | ### Patch Changes 73 | 74 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 75 | - webforai@1.5.0 76 | 77 | ## 1.0.10 78 | 79 | ### Patch Changes 80 | 81 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 82 | - webforai@1.4.1 83 | 84 | ## 1.0.9 85 | 86 | ### Patch Changes 87 | 88 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 89 | - webforai@1.4.0 90 | 91 | ## 1.0.8 92 | 93 | ### Patch Changes 94 | 95 | - [#23](https://github.com/inaridiy/webforai/pull/23) [`2513931`](https://github.com/inaridiy/webforai/commit/25139317b242a28df6c2833646a43b42c633e681) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Add Gemini Translate example 96 | 97 | ## 1.0.7 98 | 99 | ### Patch Changes 100 | 101 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 102 | - webforai@1.3.3 103 | 104 | ## 1.0.6 105 | 106 | ### Patch Changes 107 | 108 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 109 | - webforai@1.3.2 110 | 111 | ## 1.0.5 112 | 113 | ### Patch Changes 114 | 115 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 116 | - webforai@1.3.1 117 | 118 | ## 1.0.4 119 | 120 | ### Patch Changes 121 | 122 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 123 | 124 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 125 | - webforai@1.3.0 126 | 127 | ## 1.0.3 128 | 129 | ### Patch Changes 130 | 131 | - [`64b3a07`](https://github.com/inaridiy/webforai/commit/64b3a07304d364320364b499ca73df24cd312afd) Thanks [@inaridiy](https://github.com/inaridiy)! - Update 132 | 133 | ## 1.0.2 134 | 135 | ### Patch Changes 136 | 137 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 138 | 139 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Update Workflows 140 | 141 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 142 | - webforai@1.2.3 143 | 144 | ## 1.0.1 145 | 146 | ### Patch Changes 147 | 148 | - 920f310: Update Linter and Workflows 149 | - Updated dependencies [920f310] 150 | - webforai@1.2.2 151 | -------------------------------------------------------------------------------- /examples/translate/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "translate", 3 | "version": "1.0.20", 4 | "description": "", 5 | "main": "index.js", 6 | "type": "module", 7 | "private": true, 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [], 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "@ai-sdk/google": "^0.0.48", 16 | "@anthropic-ai/sdk": "^0.18.0", 17 | "@google/generative-ai": "^0.12.0", 18 | "ai": "^3.4.7", 19 | "arg": "^5.0.2", 20 | "dotenv": "^16.4.5", 21 | "playwright": "^1.40.1", 22 | "webforai": "workspace:^" 23 | }, 24 | "devDependencies": { 25 | "@tsconfig/recommended": "^1.0.3", 26 | "@types/node": "^20.14.10", 27 | "tsx": "^4.19.1", 28 | "typescript": "^5.4.5" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /examples/translate/src/index.ts: -------------------------------------------------------------------------------- 1 | import { google } from "@ai-sdk/google"; 2 | import { generateText } from "ai"; 3 | import dotevn from "dotenv"; 4 | import { htmlToMarkdown } from "webforai"; 5 | import { loadHtml } from "webforai/loaders/playwright"; 6 | 7 | dotevn.config(); 8 | 9 | const url = "https://blog.cloudflare.com/the-story-of-web-framework-hono-from-the-creator-of-hono/"; 10 | const targetLanguage = "ja"; 11 | 12 | const html = await loadHtml(url, { superBypassMode: true }); 13 | const markdown = htmlToMarkdown(html); 14 | 15 | const prompt = `Translate mechanically converted HTML-based Markdown into ${targetLanguage}, while refining and correcting the content for clarity and coherence. 16 | 17 | The Markdown provided may contain redundant or unnecessary information and errors due to mechanical conversion. Your task is to translate the text into Japanese, fixing these issues and improving the overall quality of the Markdown document. 
18 | 19 | 20 | ${markdown} 21 | `; 22 | 23 | const response = await generateText({ 24 | model: google("gemini-1.5-flash-latest"), 25 | temperature: 0, 26 | prompt, 27 | maxSteps: 10, 28 | experimental_continueSteps: true, 29 | }); 30 | 31 | console.info(response.text); 32 | -------------------------------------------------------------------------------- /examples/translate/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@tsconfig/recommended/tsconfig.json", 3 | "compilerOptions": { 4 | "module": "NodeNext", 5 | "target": "ESNext" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /examples/worker/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | 3 | logs 4 | _.log 5 | npm-debug.log_ 6 | yarn-debug.log* 7 | yarn-error.log* 8 | lerna-debug.log* 9 | .pnpm-debug.log* 10 | 11 | # Diagnostic reports (https://nodejs.org/api/report.html) 12 | 13 | report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json 14 | 15 | # Runtime data 16 | 17 | pids 18 | _.pid 19 | _.seed 20 | \*.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | 24 | lib-cov 25 | 26 | # Coverage directory used by tools like istanbul 27 | 28 | coverage 29 | \*.lcov 30 | 31 | # nyc test coverage 32 | 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 36 | 37 | .grunt 38 | 39 | # Bower dependency directory (https://bower.io/) 40 | 41 | bower_components 42 | 43 | # node-waf configuration 44 | 45 | .lock-wscript 46 | 47 | # Compiled binary addons (https://nodejs.org/api/addons.html) 48 | 49 | build/Release 50 | 51 | # Dependency directories 52 | 53 | node_modules/ 54 | jspm_packages/ 55 | 56 | # Snowpack dependency directory (https://snowpack.dev/) 57 | 58 | web_modules/ 59 | 60 | # TypeScript cache 61 | 62 | \*.tsbuildinfo 63 | 64 | # Optional npm cache directory 65 | 66 | .npm 67 | 68 | # Optional eslint cache 69 | 70 | .eslintcache 71 | 72 | # Optional stylelint cache 73 | 74 | .stylelintcache 75 | 76 | # Microbundle cache 77 | 78 | .rpt2_cache/ 79 | .rts2_cache_cjs/ 80 | .rts2_cache_es/ 81 | .rts2_cache_umd/ 82 | 83 | # Optional REPL history 84 | 85 | .node_repl_history 86 | 87 | # Output of 'npm pack' 88 | 89 | \*.tgz 90 | 91 | # Yarn Integrity file 92 | 93 | .yarn-integrity 94 | 95 | # dotenv environment variable files 96 | 97 | .env 98 | .env.development.local 99 | .env.test.local 100 | .env.production.local 101 | .env.local 102 | 103 | # parcel-bundler cache (https://parceljs.org/) 104 | 105 | .cache 106 | .parcel-cache 107 | 108 | # Next.js build output 109 | 110 | .next 111 | out 112 | 113 | # Nuxt.js build / generate output 114 | 115 | .nuxt 116 | dist 117 | 118 | # Gatsby files 119 | 120 | .cache/ 121 | 122 | # Comment in the public line in if your project uses Gatsby and not Next.js 123 | 124 | # https://nextjs.org/blog/next-9-1#public-directory-support 125 | 126 | # public 127 | 128 | # vuepress build output 129 | 130 | .vuepress/dist 131 | 132 | # vuepress v2.x temp and cache directory 133 | 134 | .temp 135 | .cache 136 | 137 | # Docusaurus cache and generated files 138 | 139 | .docusaurus 140 | 141 | # Serverless directories 142 | 143 | .serverless/ 144 | 145 | # FuseBox cache 146 | 147 | .fusebox/ 148 | 149 | # DynamoDB Local files 150 | 151 | .dynamodb/ 152 | 153 | # TernJS port file 154 | 155 | .tern-port 156 | 157 | # Stores VSCode versions used for testing VSCode 
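The translation example above (examples/translate/src/index.ts) sends the whole converted page in one prompt and leans on `maxSteps` with `experimental_continueSteps` to keep generating past the output limit. A rough alternative is to split the Markdown and translate section by section; the heading-based splitter below is only an illustrative assumption, not something taken from this repository.

```ts
import { google } from "@ai-sdk/google";
import { generateText } from "ai";

// Naive splitter: break the converted Markdown into "## " sections so each
// translation request stays well under the model's output limit.
const splitByHeadings = (markdown: string): string[] =>
	markdown.split(/\n(?=## )/g).filter((section) => section.trim() !== "");

export const translateInChunks = async (markdown: string, targetLanguage: string): Promise<string> => {
	const sections = splitByHeadings(markdown);
	const translated: string[] = [];

	for (const section of sections) {
		const { text } = await generateText({
			model: google("gemini-1.5-flash-latest"),
			temperature: 0,
			prompt: `Translate the following Markdown section into ${targetLanguage}, keeping the Markdown structure intact.\n\n${section}`,
		});
		translated.push(text);
	}

	return translated.join("\n\n");
};
```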
extensions 158 | 159 | .vscode-test 160 | 161 | # yarn v2 162 | 163 | .yarn/cache 164 | .yarn/unplugged 165 | .yarn/build-state.yml 166 | .yarn/install-state.gz 167 | .pnp.\* 168 | 169 | # wrangler project 170 | 171 | .dev.vars 172 | .wrangler/ 173 | -------------------------------------------------------------------------------- /examples/worker/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # worker 2 | 3 | ## 0.0.18 4 | 5 | ### Patch Changes 6 | 7 | - Updated dependencies [[`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c)]: 8 | - webforai@2.1.1 9 | 10 | ## 0.0.17 11 | 12 | ### Patch Changes 13 | 14 | - Updated dependencies [[`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4)]: 15 | - webforai@2.1.0 16 | 17 | ## 0.0.16 18 | 19 | ### Patch Changes 20 | 21 | - Updated dependencies [[`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3)]: 22 | - webforai@2.0.1 23 | 24 | ## 0.0.15 25 | 26 | ### Patch Changes 27 | 28 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 29 | 30 | - Updated dependencies [[`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f), [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40)]: 31 | - webforai@2.0.0 32 | 33 | ## 0.0.14 34 | 35 | ### Patch Changes 36 | 37 | - Updated dependencies [[`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59)]: 38 | - webforai@1.6.3 39 | 40 | ## 0.0.13 41 | 42 | ### Patch Changes 43 | 44 | - Updated dependencies [[`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357), [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5)]: 45 | - webforai@1.6.2 46 | 47 | ## 0.0.12 48 | 49 | ### Patch Changes 50 | 51 | - Updated dependencies [[`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd)]: 52 | - webforai@1.6.1 53 | 54 | ## 0.0.11 55 | 56 | ### Patch Changes 57 | 58 | - Updated dependencies [[`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9), [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244)]: 59 | - webforai@1.6.0 60 | 61 | ## 0.0.10 62 | 63 | ### Patch Changes 64 | 65 | - Updated dependencies [[`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f)]: 66 | - webforai@1.5.1 67 | 68 | ## 0.0.9 69 | 70 | ### Patch Changes 71 | 72 | - Updated dependencies [[`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6), [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac)]: 73 | - webforai@1.5.0 74 | 75 | ## 0.0.8 76 | 77 | ### Patch Changes 78 | 79 | - Updated dependencies [[`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61)]: 80 | - webforai@1.4.1 81 | 82 | ## 0.0.7 83 | 84 | ### Patch Changes 85 | 86 | - Updated dependencies [[`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87)]: 87 | - webforai@1.4.0 88 | 89 | ## 0.0.6 90 | 91 | 
### Patch Changes 92 | 93 | - Updated dependencies [[`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d), [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791)]: 94 | - webforai@1.3.3 95 | 96 | ## 0.0.5 97 | 98 | ### Patch Changes 99 | 100 | - Updated dependencies [[`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a), [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78), [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd)]: 101 | - webforai@1.3.2 102 | 103 | ## 0.0.4 104 | 105 | ### Patch Changes 106 | 107 | - Updated dependencies [[`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f)]: 108 | - webforai@1.3.1 109 | 110 | ## 0.0.3 111 | 112 | ### Patch Changes 113 | 114 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 115 | 116 | - Updated dependencies [[`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7)]: 117 | - webforai@1.3.0 118 | 119 | ## 0.0.2 120 | 121 | ### Patch Changes 122 | 123 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 124 | 125 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflows 126 | 127 | - Updated dependencies [[`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953), [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b)]: 128 | - webforai@1.2.3 129 | 130 | ## 0.0.1 131 | 132 | ### Patch Changes 133 | 134 | - 920f310: Update Linter and Workflows 135 | - Updated dependencies [920f310] 136 | - webforai@1.2.2 137 | -------------------------------------------------------------------------------- /examples/worker/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "worker", 3 | "version": "0.0.18", 4 | "private": true, 5 | "scripts": { 6 | "deploy": "wrangler deploy", 7 | "dev": "wrangler dev --remote", 8 | "start": "wrangler dev" 9 | }, 10 | "devDependencies": { 11 | "@cloudflare/puppeteer": "^0.0.6", 12 | "@cloudflare/vitest-pool-workers": "^0.1.0", 13 | "@cloudflare/workers-types": "^4.20241018.0", 14 | "typescript": "^5.4.5", 15 | "vitest": "1.3.0", 16 | "wrangler": "^3.81.0" 17 | }, 18 | "dependencies": { 19 | "@hono/valibot-validator": "^0.2.2", 20 | "hono": "^4.6.5", 21 | "valibot": "^0.30.0", 22 | "webforai": "workspace:^" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /examples/worker/src/index.ts: -------------------------------------------------------------------------------- 1 | import { DurableObject } from "cloudflare:workers"; 2 | import puppeteer from "@cloudflare/puppeteer"; 3 | import { vValidator } from "@hono/valibot-validator"; 4 | import { Hono } from "hono"; 5 | import { cache } from "hono/cache"; 6 | import { url, literal, object, optional, string, union } from "valibot"; 7 | import { htmlToMarkdown } from "webforai"; 8 | 9 | type 
Bindings = { MYBROWSER: puppeteer.BrowserWorker; BROWSER: DurableObjectNamespace }; 10 | 11 | const app = new Hono<{ Bindings: Bindings }>(); 12 | 13 | const BROWSER_KEYS = ["browser1", "browser2"]; 14 | 15 | const schema = object({ 16 | url: string([url()]), 17 | mode: optional(union([literal("readability"), literal("ai")])), 18 | }); 19 | 20 | app.get( 21 | "/", 22 | cache({ cacheName: "html-to-markdown", cacheControl: "max-age=3600" }), 23 | vValidator("query", schema), 24 | async (c) => { 25 | const { url, mode } = c.req.valid("query"); 26 | 27 | const pickedKey = BROWSER_KEYS[Math.floor(Math.random() * BROWSER_KEYS.length)]; 28 | const browser = c.env.BROWSER.get(c.env.BROWSER.idFromName(pickedKey)); 29 | const result = await browser.renderUrl(url); 30 | 31 | if (!result.success) { 32 | return c.text(result.error, 500); 33 | } 34 | 35 | const aiModeOptions = { linkAsText: true, tableAsText: true, hideImage: true }; 36 | const readabilityModeOptions = { linkAsText: false, tableAsText: false, hideImage: false }; 37 | const markdown = htmlToMarkdown(result.html, { 38 | baseUrl: url, 39 | ...(mode === "ai" ? aiModeOptions : readabilityModeOptions), 40 | }); 41 | return c.text(markdown); 42 | }, 43 | ); 44 | 45 | // biome-ignore lint/style/noDefaultExport: This is the default export for the worker script 46 | export default app; 47 | 48 | const KEEP_BROWSER_ALIVE_IN_SECONDS = 60; 49 | 50 | export class BrowserDO extends DurableObject { 51 | private browser: puppeteer.Browser | null = null; 52 | private keptAliveInSeconds = 0; 53 | 54 | async renderUrl(url: string): Promise<{ success: true; html: string } | { success: false; error: string }> { 55 | const normalizedUrl = new URL(url).toString(); 56 | 57 | try { 58 | if (!this.browser?.isConnected()) { 59 | const sessions = await puppeteer.sessions(this.env.MYBROWSER); 60 | const freeSession = sessions.find((s) => !s.connectionId); 61 | if (freeSession) { 62 | this.browser = await puppeteer.connect(this.env.MYBROWSER, freeSession.sessionId); 63 | } else { 64 | this.browser = await puppeteer.launch(this.env.MYBROWSER); 65 | } 66 | } 67 | } catch (e) { 68 | console.error(e); 69 | return { success: false, error: "Failed to launch browser" }; 70 | } 71 | 72 | this.keptAliveInSeconds = 0; 73 | const page = await this.browser.newPage(); 74 | await page.goto(normalizedUrl, { waitUntil: "networkidle0" }); 75 | 76 | //scriptタグを削除 77 | await page.evaluate(() => { 78 | const scripts = document.querySelectorAll("script"); 79 | for (const script of Array.from(scripts)) { 80 | script.remove(); 81 | } 82 | }); 83 | 84 | const html = await page.content(); 85 | 86 | const cleanup = async () => { 87 | await page.close(); 88 | this.keptAliveInSeconds = 0; 89 | const currentAlarm = await this.ctx.storage.getAlarm(); 90 | if (currentAlarm) { 91 | return; 92 | } 93 | const tenSeconds = 10 * 1000; 94 | await this.ctx.storage.setAlarm(Date.now() + tenSeconds); 95 | }; 96 | this.ctx.waitUntil(cleanup()); 97 | 98 | return { success: true, html }; 99 | } 100 | 101 | async alarm() { 102 | this.keptAliveInSeconds += 10; 103 | if (this.keptAliveInSeconds < KEEP_BROWSER_ALIVE_IN_SECONDS) { 104 | await this.ctx.storage.setAlarm(Date.now() + 10 * 1000); 105 | if (this.browser?.isConnected()) { 106 | await this.browser.version(); 107 | } 108 | } else { 109 | await this.browser?.close(); 110 | this.browser = null; 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /examples/worker/wrangler.toml: 
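The Cloudflare Worker above (examples/worker/src/index.ts) exposes a single `GET /` route that takes a `url` query parameter and an optional `mode` of `"readability"` or `"ai"`, and responds with the converted Markdown as plain text (or a plain-text error with status 500). A client-side sketch of calling it follows; `WORKER_URL` is a placeholder for wherever the worker is deployed, not a value defined in this repository.

```ts
// Placeholder for the deployed worker's origin.
const WORKER_URL = "https://example-worker.example.com";

export const fetchMarkdown = async (pageUrl: string, mode: "readability" | "ai" = "readability"): Promise<string> => {
	const endpoint = new URL(WORKER_URL);
	endpoint.searchParams.set("url", pageUrl);
	endpoint.searchParams.set("mode", mode);

	const response = await fetch(endpoint);
	if (!response.ok) {
		// The worker returns a plain-text error with status 500 when rendering fails.
		throw new Error(`Worker request failed: ${response.status} ${await response.text()}`);
	}
	return response.text();
};

// Example:
// const markdown = await fetchMarkdown("https://webforai.dev/", "ai");
```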
-------------------------------------------------------------------------------- 1 | name = "webforai" 2 | main = "src/index.ts" 3 | compatibility_date = "2024-04-03" 4 | compatibility_flags = ["nodejs_compat"] 5 | 6 | 7 | browser = { binding = "MYBROWSER" } 8 | 9 | [durable_objects] 10 | bindings = [ 11 | { name = "BROWSER", class_name = "BrowserDO" } 12 | ] 13 | 14 | [[migrations]] 15 | new_classes = ["BrowserDO"] 16 | tag = "v1" -------------------------------------------------------------------------------- /images/logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/images/logo.webp -------------------------------------------------------------------------------- /images/voice-genius.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/images/voice-genius.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webforai", 3 | "version": "0.0.0", 4 | "description": "A library that provides a web interface for AI", 5 | "author": "inaridiy", 6 | "repository": { 7 | "type": "git", 8 | "url": "https://github.com/inaridiy/webforai.git" 9 | }, 10 | "homepage": "https://webforai.dev/", 11 | "bugs": "https://github.com/inaridiy/webforai/issues", 12 | "keywords": ["ai", "web", "scraping"], 13 | "private": true, 14 | "packageManager": "pnpm@9.1.4", 15 | "engines": { 16 | "node": ">=18.0.0" 17 | }, 18 | "scripts": { 19 | "test": "vitest", 20 | "build": "pnpm run --r --filter \"./packages/**\" build", 21 | "format": "biome format .", 22 | "format:fix": "pnpm format --write .", 23 | "lint": "biome check .", 24 | "lint:fix": "pnpm lint --apply", 25 | "lint:repo": "sherif", 26 | "typecheck": "pnpm run --filter \"./packages/**\" typecheck", 27 | "ci:prepublish": "pnpm run build", 28 | "ci:version": "changeset version", 29 | "ci:publish": "pnpm ci:prepublish && changeset publish", 30 | "preinstall": "npx only-allow pnpm", 31 | "prepare": "pnpm simple-git-hooks", 32 | "postinstall": "pnpm -w build" 33 | }, 34 | "devDependencies": { 35 | "@biomejs/biome": "1.7.3", 36 | "@changesets/changelog-github": "^0.5.0", 37 | "@changesets/cli": "^2.27.5", 38 | "sherif": "^0.8.4", 39 | "simple-git-hooks": "^2.11.1", 40 | "vitest": "1.3.0" 41 | }, 42 | "simple-git-hooks": { 43 | "pre-commit": "pnpm format && pnpm lint && pnpm lint:repo && pnpm typecheck" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /packages/webforai/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # webforai 2 | 3 | ## 2.1.1 4 | 5 | ### Patch Changes 6 | 7 | - [#58](https://github.com/inaridiy/webforai/pull/58) [`7fa6ec7`](https://github.com/inaridiy/webforai/commit/7fa6ec75b9966a92820a21a4b8f9eb85c5c2020c) Thanks [@inaridiy](https://github.com/inaridiy)! - fix 8 | 9 | ## 2.1.0 10 | 11 | ### Minor Changes 12 | 13 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! 
- Add minimal filter extractor 14 | 15 | ## 2.0.1 16 | 17 | ### Patch Changes 18 | 19 | - [#52](https://github.com/inaridiy/webforai/pull/52) [`a2e0fc0`](https://github.com/inaridiy/webforai/commit/a2e0fc0b00554a3d860f0cdf67494c1691f9eeb3) Thanks [@moons-14](https://github.com/moons-14)! - update package.json homepage 20 | 21 | ## 2.0.0 22 | 23 | ### Major Changes 24 | 25 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 26 | 27 | - [#49](https://github.com/inaridiy/webforai/pull/49) [`baaf7ea`](https://github.com/inaridiy/webforai/commit/baaf7ea33b9a75d07dfb910231d64d5b6efc6f40) Thanks [@inaridiy](https://github.com/inaridiy)! - “Readability” Extractor renamed takumi and license changed to Apache2 license. 28 | 29 | ## 1.6.3 30 | 31 | ### Patch Changes 32 | 33 | - [#47](https://github.com/inaridiy/webforai/pull/47) [`fc84541`](https://github.com/inaridiy/webforai/commit/fc84541331ebc2dbf7d80af0694d9cdc79145a59) Thanks [@inaridiy](https://github.com/inaridiy)! - Fix Playwright installation command for improved reliability 34 | 35 | ## 1.6.2 36 | 37 | ### Patch Changes 38 | 39 | - [#45](https://github.com/inaridiy/webforai/pull/45) [`00d2a55`](https://github.com/inaridiy/webforai/commit/00d2a55b4a25f6d84da37cb89ec629b5f6179357) Thanks [@inaridiy](https://github.com/inaridiy)! - Re Re Fix CLI 40 | 41 | - [#45](https://github.com/inaridiy/webforai/pull/45) [`a7db6ef`](https://github.com/inaridiy/webforai/commit/a7db6ef61f1321e010ed16022086a684c19d77a5) Thanks [@inaridiy](https://github.com/inaridiy)! - Re Re Fix CLI 42 | 43 | ## 1.6.1 44 | 45 | ### Patch Changes 46 | 47 | - [#41](https://github.com/inaridiy/webforai/pull/41) [`5e82341`](https://github.com/inaridiy/webforai/commit/5e82341c2a3b8275d78ce808c7d4f81dacb1acdd) Thanks [@inaridiy](https://github.com/inaridiy)! - Fix with { type : "json" } Error 48 | 49 | ## 1.6.0 50 | 51 | ### Minor Changes 52 | 53 | - [#40](https://github.com/inaridiy/webforai/pull/40) [`30d181e`](https://github.com/inaridiy/webforai/commit/30d181e7a29452b37f6dd665de3e7a3b743c1244) Thanks [@inaridiy](https://github.com/inaridiy)! - The CLI has been completely corrected. Maybe. 54 | 55 | ### Patch Changes 56 | 57 | - [#38](https://github.com/inaridiy/webforai/pull/38) [`78f2c44`](https://github.com/inaridiy/webforai/commit/78f2c445f88136bdc596e57b91bec1b223f782d9) Thanks [@inaridiy](https://github.com/inaridiy)! - improve seido 58 | 59 | ## 1.5.1 60 | 61 | ### Patch Changes 62 | 63 | - [#37](https://github.com/inaridiy/webforai/pull/37) [`39c1122`](https://github.com/inaridiy/webforai/commit/39c112205d0aa2a07a46c92b94be6ed91239cf0f) Thanks [@inaridiy](https://github.com/inaridiy)! - Fix CLI Dep Error 64 | 65 | ## 1.5.0 66 | 67 | ### Minor Changes 68 | 69 | - [#30](https://github.com/inaridiy/webforai/pull/30) [`167acb1`](https://github.com/inaridiy/webforai/commit/167acb1dca303d651d997c23224d3366ff4375ac) Thanks [@moons-14](https://github.com/moons-14)! - webforai can now be run from the cli 70 | 71 | ### Patch Changes 72 | 73 | - [#31](https://github.com/inaridiy/webforai/pull/31) [`b13719f`](https://github.com/inaridiy/webforai/commit/b13719fd719511d0aeb1ef3749e88fd8145337a6) Thanks [@moons-14](https://github.com/moons-14)! 
- PRESET_EXTRACT_HAST and DEFAULT_EXTRACT_HAST can be referenced externally 74 | 75 | ## 1.4.1 76 | 77 | ### Patch Changes 78 | 79 | - [#27](https://github.com/inaridiy/webforai/pull/27) [`fd7348f`](https://github.com/inaridiy/webforai/commit/fd7348f59a19a027b9bdf012b11d40c38d56cf61) Thanks [@inaridiy](https://github.com/inaridiy)! - Improve extract algorithm 80 | 81 | ## 1.4.0 82 | 83 | ### Minor Changes 84 | 85 | - [#25](https://github.com/inaridiy/webforai/pull/25) [`5bdea98`](https://github.com/inaridiy/webforai/commit/5bdea98cc7cafd79020123260db721fc6ffefd87) Thanks [@moons-14](https://github.com/moons-14)! - Add loader using puppeteer 86 | 87 | ## 1.3.3 88 | 89 | ### Patch Changes 90 | 91 | - [#20](https://github.com/inaridiy/webforai/pull/20) [`c5e8416`](https://github.com/inaridiy/webforai/commit/c5e841610360346fcba388c777869706dcd5997d) Thanks [@inaridiy](https://github.com/inaridiy)! - Minimal Param update 92 | 93 | - [#22](https://github.com/inaridiy/webforai/pull/22) [`97863ea`](https://github.com/inaridiy/webforai/commit/97863ea7f9f4837b96f376bd33371c6ed756d791) Thanks [@inaridiy](https://github.com/inaridiy)! - Add fetch loader and improve playwright loader 94 | 95 | ## 1.3.2 96 | 97 | ### Patch Changes 98 | 99 | - [#18](https://github.com/inaridiy/webforai/pull/18) [`3c6da39`](https://github.com/inaridiy/webforai/commit/3c6da3952f176769cf8aa899f6c7207c231d806a) Thanks [@inaridiy](https://github.com/inaridiy)! - Improve 100 | 101 | - [#18](https://github.com/inaridiy/webforai/pull/18) [`4437e28`](https://github.com/inaridiy/webforai/commit/4437e28e1e7807fd061aee99510ea2d3f71a2a78) Thanks [@inaridiy](https://github.com/inaridiy)! - accuracy improvement 102 | 103 | - [#18](https://github.com/inaridiy/webforai/pull/18) [`4764767`](https://github.com/inaridiy/webforai/commit/47647676a838b922e2cf32b1d3637c8153b996dd) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor performance improvements 104 | 105 | ## 1.3.1 106 | 107 | ### Patch Changes 108 | 109 | - [#15](https://github.com/inaridiy/webforai/pull/15) [`680a226`](https://github.com/inaridiy/webforai/commit/680a22638409517658c3918d90d070b1fa53cc3f) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Document 110 | 111 | ## 1.3.0 112 | 113 | ### Minor Changes 114 | 115 | - [#12](https://github.com/inaridiy/webforai/pull/12) [`5e188a7`](https://github.com/inaridiy/webforai/commit/5e188a7c4d386e6351a5120213f18948ec5ec6f7) Thanks [@inaridiy](https://github.com/inaridiy)! - Minor interface improvements. 116 | 117 | ## 1.2.3 118 | 119 | ### Patch Changes 120 | 121 | - [`ba73738`](https://github.com/inaridiy/webforai/commit/ba73738c24f509c8f1f060f0314bc0c6e3abc953) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflow 122 | 123 | - [`663b15d`](https://github.com/inaridiy/webforai/commit/663b15d87a3085ef1d1657dd73a637e43aa4340b) Thanks [@inaridiy](https://github.com/inaridiy)! - Update Workflows 124 | 125 | ## 1.2.2 126 | 127 | ### Patch Changes 128 | 129 | - 920f310: Update Linter and Workflows 130 | -------------------------------------------------------------------------------- /packages/webforai/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | A esm-native library that converts HTML to Markdown & Useful Utilities with simple, lightweight and epic quality. 
14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | ## Documentation 32 | 33 | [Head to the documentation](https://webforai.dev/) to read and learn more about Webforai. 34 | 35 | ## Overview 36 | 37 | ```bash 38 | npx webforai@latest 39 | ``` 40 | 41 | or 42 | 43 | ```ts 44 | import { htmlToMarkdown, htmlToMdast } from "webforai"; 45 | import { loadHtml } from "webforai/loaders/playwright"; 46 | 47 | // Load html from url 48 | const url = "https://www.npmjs.com/package/webforai"; 49 | const html = await loadHtml(url); 50 | 51 | // Convert html to markdown 52 | const markdown = htmlToMarkdown(html, { baseUrl: url }); 53 | ``` 54 | 55 | ## Support 56 | 57 | - [GitHub Sponsors](https://github.com/sponsors/inaridiy) 58 | - [inaridiy.eth](https://x.com/inaridiy) 59 | 60 | ## License 61 | 62 | [Apache 2.0](/LICENSE) License 63 | -------------------------------------------------------------------------------- /packages/webforai/build.ts: -------------------------------------------------------------------------------- 1 | /* 2 | For `build.ts`, further inspire @honojs/hono with inspire @kaze-style/react. 3 | https://github.com/honojs/hono/blob/main/build.ts 4 | https://github.com/taishinaritomi/kaze-style/blob/main/scripts/build.ts 5 | MIT License 6 | Copyright (c) 2024 - present, inaridiy and webforai contributors 7 | */ 8 | 9 | import { exec } from "node:child_process"; 10 | import fs from "node:fs"; 11 | import path from "node:path"; 12 | import arg from "arg"; 13 | import { context } from "esbuild"; 14 | import type { BuildOptions, Plugin, PluginBuild } from "esbuild"; 15 | import { glob } from "glob"; 16 | 17 | const args = arg({ 18 | "--watch": Boolean, 19 | }); 20 | 21 | const isWatch = args["--watch"]; 22 | 23 | const entryPoints = glob.sync("./src/**/*.ts", { 24 | ignore: ["./src/**/*.test.ts", "./src/cli/**/*.ts"], 25 | }); 26 | 27 | const addExtension = (extension = ".js", fileExtension = ".ts"): Plugin => ({ 28 | name: "add-extension", 29 | setup(build: PluginBuild) { 30 | build.onResolve({ filter: /.*/ }, (args) => { 31 | if (args.importer) { 32 | const p = path.join(args.resolveDir, args.path); 33 | let tsPath = `${p}${fileExtension}`; 34 | 35 | let importPath = ""; 36 | if (path.basename(args.importer).split(".")[0] === args.path) { 37 | importPath = args.path; 38 | } else if (fs.existsSync(tsPath)) { 39 | importPath = args.path + extension; 40 | } else { 41 | tsPath = path.join(args.resolveDir, args.path, `index${fileExtension}`); 42 | if (fs.existsSync(tsPath)) { 43 | importPath = `${args.path}/index${extension}`; 44 | } 45 | } 46 | 47 | return { path: importPath, external: true }; 48 | } 49 | }); 50 | }, 51 | }); 52 | 53 | const commonOptions: BuildOptions = { 54 | entryPoints, 55 | logLevel: "info", 56 | platform: "node", 57 | }; 58 | 59 | const cjsBuild = () => 60 | context({ 61 | ...commonOptions, 62 | outbase: "./src", 63 | outdir: "./dist/cjs", 64 | format: "cjs", 65 | }); 66 | 67 | const esmBuild = () => 68 | context({ 69 | ...commonOptions, 70 | bundle: true, 71 | outbase: "./src", 72 | outdir: "./dist", 73 | format: "esm", 74 | plugins: [addExtension(".js")], 75 | }); 76 | 77 | const cliBuild = () => 78 | context({ 79 | entryPoints: ["./src/cli/bin.ts"], 80 | banner: { 81 | js: "#!/usr/bin/env node", 82 | }, 83 | outfile: "./dist/bin.js", 84 | format: "esm", 85 | packages: "external", 86 | bundle: true, 87 | }); 88 | 89 | const [esmCtx, cjsCtx, cliCtx] = await Promise.all([esmBuild(), cjsBuild(), cliBuild()]); 90 | if (isWatch) { 91 | 
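The Overview in packages/webforai/README.md above shows the default conversion. The options that appear elsewhere in this repository (`baseUrl` and `extractors: false` in examples/simple, plus the `linkAsText` / `tableAsText` / `hideImage` trio used as the "ai" mode in examples/worker and the CLI) can be combined as in the sketch below; this is a usage sketch based on those examples, not a complete list of the available options.

```ts
import { htmlToMarkdown } from "webforai";
import { loadHtml } from "webforai/loaders/playwright";

const url = "https://www.npmjs.com/package/webforai";
const html = await loadHtml(url);

// Keep everything, resolving relative links against the page URL.
const readableMarkdown = htmlToMarkdown(html, { baseUrl: url });

// Skip the content extractors entirely and convert the raw page.
const rawMarkdown = htmlToMarkdown(html, { baseUrl: url, extractors: false });

// Flatten links and tables to text and drop images, as the worker's "ai" mode does.
const aiMarkdown = htmlToMarkdown(html, {
	baseUrl: url,
	linkAsText: true,
	tableAsText: true,
	hideImage: true,
});

console.info({ readable: readableMarkdown.length, raw: rawMarkdown.length, ai: aiMarkdown.length });
```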
Promise.all([esmCtx.watch(), cjsCtx.watch(), cliCtx.watch()]); 92 | } else { 93 | Promise.all([esmCtx.rebuild(), cjsCtx.rebuild(), cliCtx.rebuild()]).then(() => 94 | Promise.all([esmCtx.dispose(), cjsCtx.dispose(), cliCtx.dispose()]), 95 | ); 96 | } 97 | 98 | exec(`tsc ${isWatch ? "-w" : ""} --declaration --project tsconfig.build.json`); 99 | -------------------------------------------------------------------------------- /packages/webforai/package.cjs.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "commonjs" 3 | } 4 | -------------------------------------------------------------------------------- /packages/webforai/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webforai", 3 | "version": "2.1.1", 4 | "description": "A library that provides a web interface for AI", 5 | "author": "inaridiy", 6 | "license": "Apache-2.0", 7 | "keywords": [ 8 | "web", 9 | "ai", 10 | "html", 11 | "html2md", 12 | "markdown", 13 | "mdast", 14 | "hast" 15 | ], 16 | "repository": { 17 | "type": "git", 18 | "url": "https://github.com/inaridiy/webforai.git" 19 | }, 20 | "homepage": "https://webforai.dev/", 21 | "scripts": { 22 | "copy:package.cjs.json": "pnpm ncp ./package.cjs.json ./dist/cjs/package.json && pnpm ncp ./package.cjs.json ./dist/types/package.json ", 23 | "clean": "rimraf dist", 24 | "build": "pnpm clean && tsx build.ts && pnpm copy:package.cjs.json", 25 | "typecheck": "tsc --noEmit", 26 | "prerelease": "pnpm build", 27 | "release": "np" 28 | }, 29 | "files": [ 30 | "dist", 31 | "!dist/types/**/*.js" 32 | ], 33 | "main": "dist/cjs/index.js", 34 | "type": "module", 35 | "module": "dist/index.js", 36 | "types": "dist/types/index.d.ts", 37 | "bin": "dist/bin.js", 38 | "exports": { 39 | ".": { 40 | "types": "./dist/types/index.d.ts", 41 | "import": "./dist/index.js", 42 | "require": "./dist/cjs/index.js" 43 | }, 44 | "./types": { 45 | "types": "./dist/types/index.d.ts", 46 | "import": "./dist/index.js", 47 | "require": "./dist/cjs/index.js" 48 | }, 49 | "./loaders/playwright": { 50 | "types": "./dist/types/loaders/playwright.d.ts", 51 | "import": "./dist/loaders/playwright.js", 52 | "require": "./dist/cjs/loaders/playwright.js" 53 | }, 54 | "./loaders/fetch": { 55 | "types": "./dist/types/loaders/fetch.d.ts", 56 | "import": "./dist/loaders/fetch.js", 57 | "require": "./dist/cjs/loaders/fetch.js" 58 | }, 59 | "./loaders/cf-puppeteer": { 60 | "types": "./dist/types/loaders/cf-puppeteer.d.ts", 61 | "import": "./dist/loaders/cf-puppeteer.js", 62 | "require": "./dist/cjs/loaders/cf-puppeteer.js" 63 | }, 64 | "./loaders/puppeteer": { 65 | "types": "./dist/types/loaders/puppeteer.d.ts", 66 | "import": "./dist/loaders/puppeteer.js", 67 | "require": "./dist/cjs/loaders/puppeteer.js" 68 | } 69 | }, 70 | "typesVersions": { 71 | "*": { 72 | "types": [ 73 | "./dist/types/index.d.ts" 74 | ], 75 | "loaders/playwright": [ 76 | "./dist/types/loaders/playwright.d.ts" 77 | ], 78 | "loaders/cf-puppeteer": [ 79 | "./dist/types/loaders/cf-puppeteer.d.ts" 80 | ], 81 | "loaders/fetch": [ 82 | "./dist/types/loaders/fetch.d.ts" 83 | ], 84 | "loaders/puppeteer": [ 85 | "./dist/types/loaders/puppeteer.d.ts" 86 | ] 87 | } 88 | }, 89 | "peerDependencies": { 90 | "@cloudflare/puppeteer": ">=0.0.6", 91 | "playwright-core": ">=1.4", 92 | "puppeteer": ">=22" 93 | }, 94 | "peerDependenciesMeta": { 95 | "@cloudflare/puppeteer": { 96 | "optional": true 97 | }, 98 | "playwright-core": { 99 | "optional": false 100 | }, 101 
| "puppeteer": { 102 | "optional": true 103 | } 104 | }, 105 | "dependencies": { 106 | "@clack/prompts": "^0.7.0", 107 | "boxen": "^8.0.1", 108 | "commander": "^12.1.0", 109 | "hast-util-from-html": "^2.0.3", 110 | "hast-util-select": "^6.0.2", 111 | "hast-util-to-html": "^9.0.3", 112 | "hast-util-to-mdast": "^10.1.0", 113 | "hast-util-to-string": "^3.0.0", 114 | "hast-util-to-text": "^4.0.0", 115 | "mathml-to-latex": "^1.4.1", 116 | "mdast-util-gfm": "^3.0.0", 117 | "mdast-util-math": "^3.0.0", 118 | "mdast-util-to-markdown": "^2.1.0", 119 | "picocolors": "^1.0.1", 120 | "trim-trailing-lines": "^2.1.0", 121 | "unist-util-filter": "^5.0.1", 122 | "zx": "^8.1.5" 123 | }, 124 | "devDependencies": { 125 | "@cloudflare/puppeteer": "^0.0.6", 126 | "@tsconfig/recommended": "^1.0.3", 127 | "@types/hast": "^3.0.2", 128 | "@types/mdast": "^4.0.2", 129 | "@types/node": "^20.14.10", 130 | "arg": "^5.0.2", 131 | "esbuild": "^0.19.11", 132 | "fastest-levenshtein": "^1.0.16", 133 | "glob": "^10.3.10", 134 | "ncp": "^2.0.0", 135 | "np": "^9.2.0", 136 | "playwright-core": "^1.40.1", 137 | "puppeteer": "^23.2.2", 138 | "rimraf": "^5.0.5", 139 | "tsx": "^4.19.1", 140 | "typescript": "^5.4.5" 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/bin.ts: -------------------------------------------------------------------------------- 1 | import { program } from "commander"; 2 | import packageInfo from "../../package.json"; 3 | import { webforaiCommand } from "./commands/webforai"; 4 | import { LOADERS, MODES } from "./constants"; 5 | 6 | program 7 | .name("webforai") 8 | .description("CLI tool for ultra-precise HTML to Markdown conversion") 9 | .version(packageInfo.version, "-v, --version", "output the current version"); 10 | 11 | program 12 | .argument("[source]", "URL or path to process") 13 | .option("-o, --output ", "Path to output file or directory") 14 | .option("-m, --mode ", `Processing mode (${MODES.join(", ")})`) 15 | .option("-l, --loader ", `Loader to use (${LOADERS.join(", ")})`) 16 | .option("-d, --debug", "output extra debugging information") 17 | .action(webforaiCommand); 18 | 19 | program.parse(); 20 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/commands/webforai/index.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import path from "node:path"; 3 | import { intro, log, outro, spinner } from "@clack/prompts"; 4 | import pc from "picocolors"; 5 | import packageInfo from "../../../../package.json"; 6 | import { htmlToMarkdown } from "../../../html-to-markdown"; 7 | import { inputOutputPath } from "../../helpers/inputOutputPath"; 8 | import { inputSourcePath } from "../../helpers/inputSourcePath"; 9 | import { selectExtractMode } from "../../helpers/selectExtractMode"; 10 | import { selectLoader } from "../../helpers/selectLoader"; 11 | import { isUrl } from "../../utils"; 12 | import { loadHtml } from "./loadHtml"; 13 | 14 | const aiModeOptions = { linkAsText: true, tableAsText: true, hideImage: true }; 15 | const readabilityModeOptions = { linkAsText: false, tableAsText: false, hideImage: false }; 16 | 17 | export const webforaiCommand = async ( 18 | initialPath: string, 19 | options: { output?: string; mode?: string; loader?: string; debug?: boolean }, 20 | ) => { 21 | intro(pc.bold(pc.green(`webforai CLI version ${packageInfo.version}`))); 22 | 23 | const sourcePath = 
initialPath ?? (await inputSourcePath()); 24 | options.debug && log.info(`sourcePath: ${sourcePath}`); 25 | 26 | const loader = isUrl(sourcePath) ? options.loader ?? (await selectLoader()) : "local"; 27 | options.debug && log.info(`loader: ${loader}`); 28 | 29 | const outputPath = options.output ?? (await inputOutputPath(sourcePath)); 30 | options.debug && log.info(`outputPath: ${outputPath}`); 31 | 32 | const mode = options.mode ?? (await selectExtractMode()); 33 | options.debug && log.info(`mode: ${mode}`); 34 | 35 | let html: string; 36 | const s = spinner(); 37 | try { 38 | s.start("Loading content..."); 39 | html = await loadHtml(sourcePath, loader, { debug: options.debug }); 40 | s.stop(pc.green("Content loaded!")); 41 | } catch (error) { 42 | s.stop(pc.red("Content loading failed!")); 43 | console.error(error); 44 | process.exit(1); 45 | } 46 | options.debug && log.info(`html: ${html}`); 47 | 48 | const markdown = htmlToMarkdown(html, { 49 | baseUrl: isUrl(sourcePath) ? sourcePath : undefined, 50 | ...(mode === "ai" ? aiModeOptions : readabilityModeOptions), 51 | }); 52 | options.debug && log.info(`markdown: ${markdown}`); 53 | 54 | const directory = path.dirname(outputPath); 55 | const isDirectoryExists = await fs.stat(directory).then((stat) => stat.isDirectory()); 56 | 57 | if (!isDirectoryExists) { 58 | await fs.mkdir(directory, { recursive: true }); 59 | } 60 | await fs.writeFile(outputPath, markdown); 61 | 62 | outro(pc.green(`${pc.bold("Done!")} Markdown saved to ${outputPath}`)); 63 | }; 64 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/commands/webforai/loadHtml.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import { fileURLToPath } from "node:url"; 3 | import { log } from "@clack/prompts"; 4 | import boxen from "boxen"; 5 | import pc from "picocolors"; 6 | import { chromium } from "playwright-core"; 7 | import { loadHtml as loadHtmlByFetch } from "../../../loaders/fetch"; 8 | import { loadHtml as loadHtmlByPlaywright } from "../../../loaders/playwright"; 9 | 10 | const checkPlaywrightAvailable = async () => { 11 | const path = chromium.executablePath(); 12 | try { 13 | await fs.access(path); 14 | return true; 15 | } catch { 16 | return false; 17 | } 18 | }; 19 | 20 | const getPlaywrightVersion = async () => { 21 | const path = await import.meta.resolve("playwright-core/package.json"); 22 | const pwPackageJson = await fs 23 | .readFile(fileURLToPath(path), "utf-8") 24 | .then((res) => JSON.parse(res.toString())) 25 | .catch(() => null); 26 | return pwPackageJson?.version; 27 | }; 28 | 29 | export const loadHtml = async (sourcePath: string, loader: string, options: { debug?: boolean }) => { 30 | if (loader === "local") { 31 | options.debug && log.info(`Loading HTML from local file: ${sourcePath}`); 32 | const content = await fs.readFile(sourcePath, "utf-8"); 33 | options.debug && log.info(`HTML loaded: ${content.slice(0, 100)}`); 34 | return content; 35 | } 36 | 37 | if (loader === "fetch") { 38 | options.debug && log.info(`Loading HTML from URL: ${sourcePath}`); 39 | const content = await loadHtmlByFetch(sourcePath); 40 | options.debug && log.info(`HTML loaded: ${content.slice(0, 100)}`); 41 | return content; 42 | } 43 | 44 | if (loader === "playwright") { 45 | options.debug && log.info(`Loading HTML from playwright: ${sourcePath}`); 46 | const isPlaywrightAvailable = await checkPlaywrightAvailable(); 47 | options.debug && 
log.info(`Playwright available: ${isPlaywrightAvailable}`); 48 | 49 | const pwVersion = await getPlaywrightVersion(); 50 | 51 | if (!isPlaywrightAvailable) { 52 | const message = [ 53 | pc.bold("Error: Playwright is not available"), 54 | "", 55 | "To use the Playwright loader, please install Playwright by running:", 56 | "", 57 | ` npx playwright@${pwVersion} install chromium`, 58 | "", 59 | "Hint 1: If you receive a warning like this:", 60 | ` "WARNING: It looks like you are running 'npx playwright install' without first installing your project's dependencies."`, 61 | "You can safely ignore this warning.", 62 | "", 63 | "Hint 2: If you encounter the following message:", 64 | ` "Host system is missing dependencies to run browsers."`, 65 | "You should install the necessary dependencies by executing:", 66 | "", 67 | ` sudo npx playwright@${pwVersion} install-deps`, 68 | ]; 69 | 70 | log.error(boxen(message.join("\n"), { padding: 1, borderStyle: "round" })); 71 | throw new Error("Playwright is not available"); 72 | } 73 | const content = await loadHtmlByPlaywright(sourcePath); 74 | options.debug && log.info(`HTML loaded: ${content.slice(0, 100)}`); 75 | return content; 76 | } 77 | 78 | throw new Error(`Unsupported loader: ${loader}`); 79 | }; 80 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/constants.ts: -------------------------------------------------------------------------------- 1 | export const DEFAULT_PATH = "https://example.com"; 2 | 3 | export const LOADERS = ["fetch", "playwright"] as const; 4 | export type Loaders = (typeof LOADERS)[number]; 5 | 6 | export const MODES: string[] = ["default", "ai"]; 7 | export type Modes = (typeof MODES)[number]; 8 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/assertContinue.ts: -------------------------------------------------------------------------------- 1 | import { cancel, isCancel } from "@clack/prompts"; 2 | 3 | export function assertContinue(message: T | symbol, cancelMessage = "Canceled."): asserts message is T { 4 | if (isCancel(message)) { 5 | cancel(cancelMessage); 6 | process.exit(1); 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/inputOutputPath.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import { confirm, text } from "@clack/prompts"; 3 | import { getNextAvailableFilePath, sourcePathToOutputPath } from "../utils"; 4 | import { assertContinue } from "./assertContinue"; 5 | 6 | export const inputOutputPath = async (sourcePath: string) => { 7 | const outputPath = await text({ 8 | message: "Enter the output file path:", 9 | placeholder: "output.md", 10 | initialValue: sourcePathToOutputPath(sourcePath), 11 | validate: (value: string) => { 12 | if (value.trim() === "") { 13 | return "Output path is required"; 14 | } 15 | if (fs.existsSync(value) && fs.statSync(value).isDirectory()) { 16 | return "No directory can be specified."; 17 | } 18 | if (!fs.existsSync(value) && value.endsWith("/")) { 19 | return "No directory can be specified."; 20 | } 21 | }, 22 | }); 23 | assertContinue(outputPath); 24 | 25 | if (!fs.existsSync(outputPath)) { 26 | return outputPath; 27 | } 28 | 29 | const isOutputFileOverwrite = await confirm({ 30 | message: "The file already exists. 
Overwrite?", 31 | initialValue: false, 32 | }); 33 | assertContinue(isOutputFileOverwrite); 34 | 35 | if (isOutputFileOverwrite) { 36 | return outputPath; 37 | } 38 | 39 | const escapedOutputPath = await text({ 40 | message: "Enter the output file path:", 41 | placeholder: "output.md", 42 | initialValue: getNextAvailableFilePath(outputPath), 43 | validate: (value: string) => { 44 | if (value.trim() === "") { 45 | return "Output path is required"; 46 | } 47 | if (fs.existsSync(value)) { 48 | return "The file already exists"; 49 | } 50 | if (!fs.existsSync(value) && value.endsWith("/")) { 51 | return "No directory can be specified."; 52 | } 53 | }, 54 | }); 55 | 56 | assertContinue(escapedOutputPath); 57 | 58 | return escapedOutputPath; 59 | }; 60 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/inputSourcePath.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import { text } from "@clack/prompts"; 3 | import { DEFAULT_PATH } from "../constants"; 4 | import { isUrl } from "../utils"; 5 | import { assertContinue } from "./assertContinue"; 6 | 7 | export const inputSourcePath = async () => { 8 | const result = await text({ 9 | message: "Enter the URL or html path to be converted to markdown:", 10 | placeholder: DEFAULT_PATH, 11 | initialValue: "", 12 | validate: (value: string) => { 13 | if (value.trim() === "") { 14 | return "Source is required"; 15 | } 16 | if (!isUrl(value)) { 17 | if (!fs.existsSync(value)) { 18 | return "It appears that you are specifying a local file, but the file cannot be found. hint: when specifying a url, start with http or https."; 19 | } 20 | if (fs.statSync(value).isDirectory()) { 21 | return "You are specifying a local file, but you cannot specify a directory. 
hint: when specifying a url, start with http or https."; 22 | } 23 | } 24 | }, 25 | }); 26 | assertContinue(result); 27 | 28 | return result; 29 | }; 30 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/selectExtractMode.ts: -------------------------------------------------------------------------------- 1 | import { select } from "@clack/prompts"; 2 | import { MODES } from "../constants"; 3 | import { assertContinue } from "./assertContinue"; 4 | 5 | export const selectExtractMode = async () => { 6 | const result = await select({ 7 | message: "Select processing mode:", 8 | options: MODES.map((mode) => ({ value: mode, label: mode })), 9 | initialValue: MODES[0], 10 | }); 11 | 12 | assertContinue(result); 13 | 14 | return result; 15 | }; 16 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/helpers/selectLoader.ts: -------------------------------------------------------------------------------- 1 | import { select } from "@clack/prompts"; 2 | import { LOADERS } from "../constants"; 3 | import { assertContinue } from "./assertContinue"; 4 | 5 | const loadersHint = { 6 | fetch: "Fetch HTML content from the given URL", 7 | playwright: "Retrieve HTML content after rendering using Playwright; Playwright must be installed in advance.", 8 | puppeteer: "Retrieve HTML content after rendering using Puppeteer; Puppeteer must be installed in advance.", 9 | } as const; 10 | 11 | export const selectLoader = async () => { 12 | const result = await select({ 13 | message: "Select loader:", 14 | initialValue: "fetch", 15 | options: LOADERS.map((mode) => ({ value: mode, label: mode, hint: loadersHint[mode] || "" })), 16 | }); 17 | assertContinue(result); 18 | 19 | return result; 20 | }; 21 | -------------------------------------------------------------------------------- /packages/webforai/src/cli/utils.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import path from "node:path"; 3 | 4 | export const isUrl = (maybeUrl: string) => { 5 | try { 6 | new URL(maybeUrl); 7 | return true; 8 | } catch { 9 | return false; 10 | } 11 | }; 12 | 13 | export function changeFileExtension(filePath: string, newExtension: string): string { 14 | const parsedPath = filePath.split("/"); 15 | const fileName = parsedPath[parsedPath.length - 1]; 16 | 17 | const formattedNewExtension = newExtension.startsWith(".") ? newExtension : `.${newExtension}`; 18 | 19 | if (fileName.startsWith(".")) { 20 | const parts = fileName.split("."); 21 | if (parts.length === 2) { 22 | return parsedPath.slice(0, -1).concat(`${fileName}${formattedNewExtension}`).join("/"); 23 | } 24 | parts[parts.length - 1] = newExtension.replace(/^\./, ""); 25 | return parsedPath.slice(0, -1).concat(parts.join(".")).join("/"); 26 | } 27 | 28 | const lastDotIndex = fileName.lastIndexOf("."); 29 | const baseName = lastDotIndex !== -1 ? 
fileName.slice(0, lastDotIndex) : fileName; 30 | const newFileName = `${baseName}${formattedNewExtension}`; 31 | 32 | parsedPath[parsedPath.length - 1] = newFileName; 33 | return parsedPath.join("/"); 34 | } 35 | 36 | export function urlToFilename(url: string): string { 37 | try { 38 | const urlObj = new URL(url); 39 | 40 | const domainParts = urlObj.hostname 41 | .split(".") 42 | .reverse() 43 | .reduce((acc: string[], part: string, index: number) => { 44 | if (index === 0) { 45 | return acc; 46 | } 47 | if (acc.length >= 2) { 48 | return acc; 49 | } 50 | if (part === "www") { 51 | return acc; 52 | } 53 | // biome-ignore lint/performance/noAccumulatingSpread: 54 | return [part, ...acc]; 55 | }, []); 56 | const domainString = domainParts.reverse().join("-"); 57 | 58 | const pathParts = urlObj.pathname.split("/").filter(Boolean); 59 | const relevantPathParts = pathParts.slice(-2); 60 | const pathString = relevantPathParts.map((part) => decodeURIComponent(part)).join("-"); 61 | 62 | let filename = [domainString, pathString].filter(Boolean).join("-"); 63 | 64 | filename = filename 65 | .toLowerCase() 66 | // biome-ignore lint/suspicious/noControlCharactersInRegex: 67 | .replace(/[<>:"/\\|?*\x00-\x1F]/g, "") 68 | .replace(/[\s.]+/g, "-") 69 | .replace(/^-+|-+$/g, ""); 70 | 71 | return filename || "output"; 72 | } catch { 73 | return "output"; 74 | } 75 | } 76 | 77 | export const sourcePathToOutputPath = (sourcePath: string) => { 78 | return isUrl(sourcePath) ? `${urlToFilename(sourcePath)}.md` : changeFileExtension(sourcePath, "md"); 79 | }; 80 | 81 | export function getNextAvailableFilePath(filePath: string): string { 82 | const parsedPath = path.parse(filePath); 83 | const directory = parsedPath.dir; 84 | const fullName = parsedPath.base; 85 | 86 | const [firstPart, ...restParts] = fullName.split("."); 87 | const restName = restParts.length > 0 ? 
`.${restParts.join(".")}` : ""; 88 | 89 | const baseName = firstPart.replace(/_\d+$/, ""); 90 | 91 | let counter = 1; 92 | let nextFilePath = filePath; 93 | 94 | while (fs.existsSync(nextFilePath)) { 95 | const match = firstPart.match(/_(\d+)$/); 96 | if (match) { 97 | counter = Number.parseInt(match[1], 10) + 1; 98 | } 99 | const newName = `${baseName}_${counter}${restName}`; 100 | nextFilePath = path.join(directory, newName); 101 | counter++; 102 | } 103 | 104 | return nextFilePath; 105 | } 106 | -------------------------------------------------------------------------------- /packages/webforai/src/constants.ts: -------------------------------------------------------------------------------- 1 | import { takumiExtractor } from "./extractors/presets/takumi"; 2 | 3 | export const DEFAULT_EXTRACTORS = [takumiExtractor]; 4 | -------------------------------------------------------------------------------- /packages/webforai/src/extract-mdast.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, Parent } from "mdast"; 2 | import { filter } from "unist-util-filter"; 3 | 4 | const DECLATION_TYPES = ["blockquote", "strong", "emphasis", "delete"]; 5 | 6 | const emptyDeclarationFilter = (node: Mdast) => { 7 | if (!DECLATION_TYPES.includes(node.type)) { 8 | return true; 9 | } 10 | if ((node as Parent).children.length === 0) { 11 | return false; 12 | } 13 | 14 | return true; 15 | }; 16 | 17 | export const extractMdast = (node: Mdast) => { 18 | const extracted = filter(node, (node) => { 19 | if (!emptyDeclarationFilter(node as Mdast)) { 20 | return false; 21 | } 22 | return true; 23 | }); 24 | return extracted as Mdast; 25 | }; 26 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/index.ts: -------------------------------------------------------------------------------- 1 | // biome-ignore lint/performance/noBarrelFile: module index 2 | export { 3 | pipeExtractors, 4 | type ExtractorSelectors, 5 | type ExtractorSelector, 6 | } from "./pipeExtractors"; 7 | export { takumiExtractor } from "./presets/takumi"; 8 | export { type ExtractParams, type Extractor } from "./types"; 9 | export { minimalFilter } from "./presets/minimal-filter"; 10 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/pipeExtractors.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | import { DEFAULT_EXTRACTORS } from "../constants"; 3 | import type { ExtractParams, Extractor } from "./types"; 4 | 5 | export type ExtractorSelector = Extractor | false; 6 | export type ExtractorSelectors = ExtractorSelector | ExtractorSelector[]; 7 | 8 | export const pipeExtractors = (params: ExtractParams, extractors: ExtractorSelectors = DEFAULT_EXTRACTORS): Hast => { 9 | const { hast, lang } = params; 10 | const _extractors = Array.isArray(extractors) ? 
extractors : [extractors]; 11 | 12 | const extracted = 13 | _extractors.reduce((acc, extractor) => { 14 | if (extractor === false) { 15 | return acc; 16 | } 17 | if (typeof extractor === "function") { 18 | return extractor({ hast: acc, lang }); 19 | } 20 | throw new Error(`Invalid extractor: ${extractor}`); 21 | }, hast) || hast; 22 | 23 | return extracted; 24 | }; 25 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/presets/minimal-filter.ts: -------------------------------------------------------------------------------- 1 | import type { Element, Nodes as Hast } from "hast"; 2 | import { select } from "hast-util-select"; 3 | import { toString as hastToString } from "hast-util-to-string"; 4 | import { filter } from "unist-util-filter"; 5 | import type { ExtractParams } from "../types"; 6 | import { classnames, isStrInclude, matchString } from "./utils"; 7 | 8 | const UNLIKELY_ROLES = ["menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog"]; 9 | 10 | /* 11 | * This section of the code is influenced by @mozilla/readability, licensed under Apache License 2.0. 12 | * Original copyright (c) 2010 Arc90 Inc 13 | * See https://github.com/mozilla/readability for the full license text. 14 | * Modifications made by inaridiy 15 | * - Added and edited some regular expressions. 16 | */ 17 | const REGEXPS = { 18 | hidden: /hidden|invisible|fallback-image/i, 19 | byline: /byline|author|dateline|writtenby|p-author/i, 20 | specialUnlikelyCandidates: /frb-|uls-menu|language-link/i, 21 | unlikelyCandidates: 22 | /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|tooltip|disqus|extra|footer|gdpr|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore|avatar/i, 23 | okMaybeItsaCandidate: /and|article|body|column|content|main|shadow|code/i, 24 | }; 25 | 26 | const metadataFilter = (node: Hast) => { 27 | return !( 28 | ["comment", "doctype"].includes(node.type) || 29 | (node.type === "element" && ["script", "style", "link", "meta", "noscript", "svg", "title"].includes(node.tagName)) 30 | ); 31 | }; 32 | 33 | const universalElementFilter = (node: Hast) => { 34 | if (node.type !== "element") { 35 | return true; 36 | } 37 | const element = node as Element; 38 | 39 | if (["aside", "nav"].includes(element.tagName)) { 40 | return false; 41 | } 42 | 43 | // Remove elements with hidden properties 44 | if (["hidden", "aria-hidden"].some((key) => element.properties[key])) { 45 | return false; 46 | } 47 | if (classnames(element).some((classname) => REGEXPS.hidden.test(classname))) { 48 | return false; 49 | } 50 | 51 | // Remove dialog elements 52 | if (element.tagName === "dialog") { 53 | return false; 54 | } 55 | if (element.properties.role === "dialog" && element.properties["aria-modal"]) { 56 | return false; 57 | } 58 | 59 | // Remove byline elements 60 | if (element.properties.rel === "author" && isStrInclude(element.properties.itemprop, "author")) { 61 | return false; 62 | } 63 | if (REGEXPS.byline.test(matchString(element))) { 64 | return false; 65 | } 66 | 67 | // Remove unlikely roles 68 | if (element.properties.role && UNLIKELY_ROLES.includes(element.properties.role as string)) { 69 | return false; 70 | } 71 | 72 | return true; 73 | }; 74 | 75 | /** 76 | * Simple filter to remove unwanted elements from the HAST tree. 
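 * If the aggressive element filtering leaves too little text (under a third of the metadata-filtered text and under 5000 characters), the result falls back to the metadata-only filtered tree.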
77 | * 78 | * @param params - {@link ExtractParams} 79 | * @returns The HAST tree. 80 | */ 81 | export const minimalFilter = (params: ExtractParams): Hast => { 82 | const { hast } = params; 83 | const body = select("body", hast) ?? hast; 84 | 85 | const metadataFilteredHast = filter(body, (node) => metadataFilter(node as Hast)); 86 | const metadataFilteredHastText = metadataFilteredHast && hastToString(metadataFilteredHast); 87 | if (!(metadataFilteredHast && metadataFilteredHastText)) { 88 | return body; 89 | } 90 | 91 | const baseFilterd = filter(metadataFilteredHast, (node) => universalElementFilter(node as Hast)); 92 | const baseFilterdText = baseFilterd ? hastToString(baseFilterd) : ""; 93 | 94 | const isOverExtracted = baseFilterdText.length > metadataFilteredHastText.length / 3 || baseFilterdText.length > 5000; 95 | const baseTree = isOverExtracted && baseFilterd ? baseFilterd : metadataFilteredHast; 96 | 97 | return baseTree; 98 | }; 99 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/presets/utils.ts: -------------------------------------------------------------------------------- 1 | import type { Element } from "hast"; 2 | 3 | export const matchString = (element: Element) => 4 | `${element.tagName} ${element.properties.id} ${classnames(element).join(" ")}`; 5 | 6 | export const classnames = (element: Element) => { 7 | if (Array.isArray(element.properties.className)) { 8 | return element.properties.className as string[]; 9 | } 10 | return []; 11 | }; 12 | 13 | export const isStrInclude = (value: unknown, match: string) => { 14 | if (typeof value === "string") { 15 | return value.includes(match); 16 | } 17 | return false; 18 | }; 19 | -------------------------------------------------------------------------------- /packages/webforai/src/extractors/types.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | 3 | export type ExtractParams = { hast: Hast; lang?: string; url?: string }; 4 | export type Extractor = (param: ExtractParams) => Hast; 5 | -------------------------------------------------------------------------------- /packages/webforai/src/html-to-markdown.test.ts: -------------------------------------------------------------------------------- 1 | import { distance } from "fastest-levenshtein"; 2 | import { describe, expect, it } from "vitest"; 3 | import { htmlToMarkdown } from "./html-to-markdown"; 4 | import { loadHtml } from "./loaders/fetch"; 5 | 6 | const html = ` 7 | Hello, world! 8 | This is a paragraph. 9 | Example 10 | 11 | 12 | Item 1 13 | Item 2 14 | 15 | `; 16 | 17 | const expected = `# Hello, world! 18 | 19 | This is a paragraph. 20 | 21 | [Example](/example.html) 22 | 23 |  24 | 25 | * Item 1 26 | * Item 2 27 | `; 28 | 29 | const baseUrlReplaced = `# Hello, world! 30 | 31 | This is a paragraph. 32 | 33 | [Example](https://example.com/example.html) 34 | 35 |  36 | 37 | * Item 1 38 | * Item 2 39 | `; 40 | 41 | const linkAsText = `# Hello, world! 42 | 43 | This is a paragraph. 44 | 45 | Example 46 | 47 |  48 | 49 | - Item 1 50 | - Item 2 51 | `; 52 | 53 | const imageHidden = `# Hello, world! 54 | 55 | This is a paragraph. 
56 | 57 | [Example](/example.html) 58 | 59 | - Item 1 60 | - Item 2 61 | `; 62 | 63 | const htmlTable = ` 64 | 65 | 66 | Header 1 67 | Header 2 68 | 69 | 70 | Cell 1 71 | Cell 2 72 | 73 | 74 | `; 75 | 76 | const expectedTableMarkdown = ` 77 | | Header 1 | Header 2 | 78 | | -------- | -------- | 79 | | Cell 1 | Cell 2 | 80 | `; 81 | 82 | const expectedTableText = `Header 1 Header 2 83 | Cell 1 Cell 2`; 84 | 85 | describe("htmlToMarkdown", () => { 86 | it("should convert HTML to Markdown", () => { 87 | const markdown = htmlToMarkdown(html, { extractors: false }); 88 | const d = distance(markdown, expected); 89 | expect(d).lte(5); 90 | }); 91 | 92 | it("should convert HTML to Markdown with replaced base URL", () => { 93 | const markdown = htmlToMarkdown(html, { baseUrl: "https://example.com", extractors: false }); 94 | const d = distance(markdown, baseUrlReplaced); 95 | expect(d).lte(5); 96 | }); 97 | 98 | it("should convert HTML to Markdown with links as text", () => { 99 | const markdown = htmlToMarkdown(html, { linkAsText: true, extractors: false }); 100 | const d = distance(markdown, linkAsText); 101 | expect(d).lte(5); 102 | }); 103 | 104 | it("should convert HTML to Markdown with hidden images", () => { 105 | const markdown = htmlToMarkdown(html, { hideImage: true, extractors: false }); 106 | const d = distance(markdown, imageHidden); 107 | expect(d).lte(5); 108 | }); 109 | 110 | it("should convert HTML table to Markdown table", () => { 111 | const markdown = htmlToMarkdown(htmlTable, { extractors: false }); 112 | const d = distance(markdown, expectedTableMarkdown); 113 | expect(d).lte(5); 114 | }); 115 | 116 | it("should convert HTML table with table as text option", () => { 117 | const markdown = htmlToMarkdown(htmlTable, { tableAsText: true, extractors: false }); 118 | const d = distance(markdown, expectedTableText); 119 | expect(d).lte(10); // Allow a higher distance due to the difference in formatting 120 | }); 121 | }); 122 | 123 | describe("htmlToMarkdown E2E", () => { 124 | it("Converting for good", async () => { 125 | const html1 = await loadHtml("https://www.npmjs.com/package/webforai"); 126 | const markdown1 = htmlToMarkdown(html1, { linkAsText: true, hideImage: true }); 127 | 128 | const html2 = await loadHtml("https://github.com/inaridiy/webforai"); 129 | const markdown2 = htmlToMarkdown(html2, { linkAsText: true, hideImage: true }); 130 | 131 | // @ts-ignore 132 | 133 | const d = distance(markdown1, markdown2); 134 | expect(d).lte(2500); // I'd like to optimise more! 135 | }); 136 | }); 137 | -------------------------------------------------------------------------------- /packages/webforai/src/html-to-markdown.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | import { type HtmlToMdastOptions, htmlToMdast } from "./html-to-mdast"; 3 | import { type MdastToMarkdownOptions, mdastToMarkdown } from "./mdast-to-markdown"; 4 | 5 | export interface HtmlToMarkdownOptions extends HtmlToMdastOptions { 6 | /** The base URL to use for replacing relative links. */ 7 | baseUrl?: string; 8 | /** Formatting options passed to [mdast-util-to-markdown](https://github.com/syntax-tree/mdast-util-to-markdown). */ 9 | formatting?: Omit; 10 | } 11 | 12 | /** 13 | * Converts HTML or HAST to a Markdown string. 14 | * 15 | * @param htmlOrHast - The HTML string or HAST tree to convert. 16 | * @param options - {@link HtmlToMarkdownOptions} to customize the conversion. 17 | * @returns The Markdown string. 
18 | * 19 | * @example 20 | * ```ts 21 | * import { htmlToMarkdown } from "webforai" 22 | * 23 | * const html = 'Hello, world!'; 24 | * const markdown = htmlToMarkdown(html); 25 | * 26 | * console.log(markdown); // Output: "# Hello, world!" 27 | * ``` 28 | */ 29 | export const htmlToMarkdown = (htmlOrHast: string | Hast, options?: HtmlToMarkdownOptions): string => { 30 | const { baseUrl, formatting: toMarkdownOptions, ...toMdastOptions } = options || {}; 31 | const mdast = htmlToMdast(htmlOrHast, toMdastOptions); 32 | const markdown = mdastToMarkdown(mdast, { baseUrl, ...toMarkdownOptions }); 33 | return markdown; 34 | }; 35 | -------------------------------------------------------------------------------- /packages/webforai/src/html-to-mdast.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Hast } from "hast"; 2 | import { fromHtml } from "hast-util-from-html"; 3 | import { toMdast } from "hast-util-to-mdast"; 4 | import type { Nodes as Mdast } from "mdast"; 5 | 6 | import { extractMdast } from "./extract-mdast"; 7 | import { type ExtractorSelectors, pipeExtractors } from "./extractors"; 8 | import { customAHandler } from "./mdast-handlers/custom-a-handler"; 9 | import { customCodeHandler } from "./mdast-handlers/custom-code-handler"; 10 | import { customDivHandler } from "./mdast-handlers/custom-div-handler"; 11 | import { customImgHandler } from "./mdast-handlers/custom-img-handler"; 12 | import { customTableHandler } from "./mdast-handlers/custom-table-handler"; 13 | import { mathHandler } from "./mdast-handlers/math-handler"; 14 | import { getLangFromHast, getLangFromStr, getUrlFromHast } from "./utils/hast-utils"; 15 | 16 | export type HtmlToMdastOptions = { 17 | /** 18 | * An array of extractors to extract specific elements from the HTML. 19 | * You can define your own functions in addition to the Extractor provided as a preset. 20 | */ 21 | extractors?: ExtractorSelectors; 22 | /** Whether to convert links to plain text. */ 23 | linkAsText?: boolean; 24 | /** Whether to convert tables to plain text. */ 25 | tableAsText?: boolean; 26 | /** Whether to hide images. */ 27 | hideImage?: boolean; 28 | /** The language of the HTML. */ 29 | lang?: string; 30 | /** The URL of the HTML. */ 31 | url?: string; 32 | }; 33 | 34 | /** 35 | * Converts an HTML string or HAST tree to an MDAST tree. 36 | * 37 | * @param htmlOrHast - The HTML string or HAST tree to convert. 38 | * @param options - {@link HtmlToMdastOptions} to customize the conversion. 39 | * @returns The MDAST tree. 40 | * 41 | * @example 42 | * ```ts 43 | * import { htmlToMdast } from 'webforai'; 44 | * 45 | * const html = 'Hello, world!'; 46 | * const mdast = htmlToMdast(html); 47 | * 48 | * console.log(mdast); // Output: { type: 'root', children: [ { type: 'heading', depth: 1, children: [ { type: 'text', value: 'Hello, world!' } ] } ] } 49 | * ``` 50 | */ 51 | export const htmlToMdast = (htmlOrHast: string | Hast, options?: HtmlToMdastOptions): Mdast => { 52 | const { extractors, url: defaultUrl, lang: defaultLang } = options || {}; 53 | 54 | const [lang, hast] = 55 | typeof htmlOrHast === "string" 56 | ? 
[defaultLang || getLangFromStr(htmlOrHast), fromHtml(htmlOrHast, { fragment: true })] 57 | : [defaultLang || getLangFromHast(htmlOrHast), htmlOrHast]; 58 | 59 | const url = defaultUrl || getUrlFromHast(hast); 60 | 61 | const extractedHast = pipeExtractors({ hast, lang, url }, extractors); 62 | 63 | const mdast = toMdast(extractedHast, { 64 | handlers: { 65 | math: mathHandler, 66 | div: customDivHandler, 67 | pre: customCodeHandler, 68 | a: customAHandler({ asText: options?.linkAsText }), 69 | img: customImgHandler({ hideImage: options?.hideImage }), 70 | table: customTableHandler({ asText: options?.tableAsText }), 71 | }, 72 | }); 73 | 74 | const extractedMdast = extractMdast(mdast); 75 | 76 | return extractedMdast; 77 | }; 78 | -------------------------------------------------------------------------------- /packages/webforai/src/index.ts: -------------------------------------------------------------------------------- 1 | // biome-ignore lint/performance/noBarrelFile: module index 2 | export { htmlToMarkdown, type HtmlToMarkdownOptions } from "./html-to-markdown"; 3 | export { mdastSplitter } from "./md-splitter"; 4 | export { htmlToMdast, type HtmlToMdastOptions } from "./html-to-mdast"; 5 | export { mdastToMarkdown } from "./mdast-to-markdown"; 6 | export { 7 | pipeExtractors, 8 | takumiExtractor, 9 | minimalFilter, 10 | type ExtractorSelectors, 11 | type ExtractorSelector, 12 | type ExtractParams, 13 | type Extractor, 14 | } from "./extractors"; 15 | -------------------------------------------------------------------------------- /packages/webforai/src/link-replacer.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { linkReplacer } from "./link-replacer"; 3 | 4 | const markdown = `# Hello, world! 5 | 6 | This is a paragraph. 7 | 8 | [Example](/example.html) 9 | 10 |  11 | 12 | [Absolute Link](https://www.google.com) 13 | 14 | [Link with hash](/page#hash) 15 | 16 | [Link with query](/page?query=string)`; 17 | 18 | const expected = `# Hello, world! 19 | 20 | This is a paragraph. 
21 | 22 | [Example](https://example.com/example.html) 23 | 24 |  25 | 26 | [Absolute Link](https://www.google.com) 27 | 28 | [Link with hash](https://example.com/page#hash) 29 | 30 | [Link with query](https://example.com/page?query=string)`; 31 | 32 | describe("linkReplacer", () => { 33 | it("should replace relative links", () => { 34 | const replaced = linkReplacer(markdown, "https://example.com"); 35 | 36 | expect(replaced).toEqual(expected); 37 | }); 38 | 39 | it("should not replace absolute links", () => { 40 | const replaced = linkReplacer("[Absolute Link](https://www.google.com)", "https://example.com"); 41 | 42 | expect(replaced).toEqual("[Absolute Link](https://www.google.com)"); 43 | }); 44 | 45 | it("should handle links with hashes", () => { 46 | const replaced = linkReplacer("[Link with hash](/page#hash)", "https://example.com"); 47 | 48 | expect(replaced).toEqual("[Link with hash](https://example.com/page#hash)"); 49 | }); 50 | 51 | it("should handle links with query parameters", () => { 52 | const replaced = linkReplacer("[Link with query](/page?query=string)", "https://example.com"); 53 | 54 | expect(replaced).toEqual("[Link with query](https://example.com/page?query=string)"); 55 | }); 56 | }); 57 | -------------------------------------------------------------------------------- /packages/webforai/src/link-replacer.ts: -------------------------------------------------------------------------------- 1 | export const linkReplacer = (markdown: string, base: string) => { 2 | const regex = /(!?\[.*?\]\()([^)\s]+)(\))/g; 3 | return markdown.replace(regex, (match, pre, url, post) => { 4 | if (/^(https?:|#)/.test(url)) { 5 | return match; 6 | } 7 | try { 8 | const absoluteUrl = new URL(url, base).href; 9 | return `${pre}${absoluteUrl}${post}`; 10 | } catch { 11 | return match; 12 | } 13 | }); 14 | }; 15 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/cf-puppeteer.ts: -------------------------------------------------------------------------------- 1 | import puppeteer from "@cloudflare/puppeteer"; 2 | 3 | const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); 4 | 5 | export const loadHtml = async (url: string, ctx: puppeteer.BrowserWorker) => { 6 | const browser = await puppeteer.launch(ctx); 7 | const page = await browser.newPage(); 8 | await page.goto(url); 9 | 10 | const html = await page.content(); 11 | 12 | await Promise.race([page.waitForNetworkIdle(), sleep(10000)]); 13 | 14 | await page.close(); 15 | 16 | return html; 17 | }; 18 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/fetch.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { loadHtml } from "./fetch"; 3 | 4 | describe("Fetch loader", () => { 5 | it("should load the HTML of a URL", async () => { 6 | const html = await loadHtml("https://example.com"); 7 | expect(html).toContain("Example Domain"); 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/fetch.ts: -------------------------------------------------------------------------------- 1 | export const USER_AGENT = 2 | "mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/125.0.0.0 safari/537.36"; 3 | 4 | /** 5 | * Useful function for load the HTML of a URL using the Fetch API. 
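 * Performs a single GET request with a desktop Chrome user agent and returns the raw response body as text.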
6 | * **Not recommended** for use in production environments. 7 | * @param url - The URL to load. 8 | * @param userAgent - The user agent to use. Default is a Chrome user agent. 9 | * @returns The HTML content of the URL. 10 | */ 11 | export const loadHtml = async (url: string, userAgent = USER_AGENT) => { 12 | const response = await fetch(url, { headers: { "User-Agent": userAgent } }); 13 | return response.text(); 14 | }; 15 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/playwright.test.ts: -------------------------------------------------------------------------------- 1 | import { chromium } from "playwright-core"; 2 | import { describe, expect, it } from "vitest"; 3 | import { loadHtml } from "./playwright"; 4 | 5 | describe("Playwright loader", () => { 6 | it("should load the HTML of a URL", async () => { 7 | const html = await loadHtml("https://example.com"); 8 | expect(html).toContain("Example Domain"); 9 | }); 10 | 11 | it("should load the HTML of a URL using a custom context", async () => { 12 | const context = await chromium.launch({ headless: true }); 13 | const html = await loadHtml("https://example.com", { browser: context }); 14 | 15 | expect(html).toContain("Example Domain"); 16 | expect(context.isConnected()).toBe(true); 17 | await context.close(); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/playwright.ts: -------------------------------------------------------------------------------- 1 | import { type Browser, chromium, devices } from "playwright-core"; 2 | 3 | export type LoadHtmlOptions = { 4 | browser?: Browser; 5 | timeout?: number; 6 | waitUntil?: "load" | "domcontentloaded" | "networkidle"; 7 | superBypassMode?: boolean; 8 | }; 9 | 10 | const SUPER_BYPASS_DEVICE = { 11 | userAgent: 12 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 13 | viewport: { width: 1920, height: 1080 }, 14 | deviceScaleFactor: 1, 15 | hasTouch: false, 16 | isMobile: false, 17 | javaScriptEnabled: true, 18 | locale: "en-US", 19 | timezoneId: "America/New_York", 20 | }; 21 | 22 | /** 23 | * Useful function for load the HTML of a URL using Playwright. 24 | * **Not recommended** for use in production environments. 25 | * @param url - The URL to load. 26 | * @param context - The Playwright browser context to use. If not provided, a new browser context will be created and closed after loading the URL. 27 | * @returns The HTML content of the URL. 28 | * @example 29 | * ```ts 30 | * import { loadHtml } from "webforai/loaders/playwright"; 31 | * 32 | * const html = await loadHtml("https://example.com"); 33 | * console.log(html); 34 | * ``` 35 | */ 36 | export const loadHtml = async (url: string, options?: LoadHtmlOptions) => { 37 | const { browser, waitUntil, timeout, superBypassMode } = options ?? {}; 38 | const _browser = browser ?? (await chromium.launch({ headless: true })); 39 | const context = await _browser.newContext(superBypassMode ? 
SUPER_BYPASS_DEVICE : devices["Desktop Chrome"]); 40 | 41 | if (superBypassMode) { 42 | await context.addInitScript(() => { 43 | Object.defineProperty(navigator, "webdriver", { get: () => undefined }); 44 | Object.defineProperty(navigator, "languages", { get: () => ["en-US", "en"] }); 45 | Object.defineProperty(navigator, "plugins", { get: () => [1, 2, 3, 4, 5] }); 46 | }); 47 | } 48 | 49 | const page = await context.newPage(); 50 | if (superBypassMode) { 51 | await page.route("**/*.js", (route) => { 52 | if (route.request().url().includes("captcha-delivery")) { 53 | return route.abort(); 54 | } 55 | return route.continue(); 56 | }); 57 | } 58 | 59 | await page.goto(url, { waitUntil: waitUntil ?? "load", timeout }); 60 | await page.evaluate(() => { 61 | const elements = document.querySelectorAll("*"); 62 | for (const element of elements) { 63 | const rect = element.getBoundingClientRect(); 64 | element.setAttribute("data-rwidth", rect.width.toString()); 65 | element.setAttribute("data-rheight", rect.height.toString()); 66 | } 67 | }); 68 | const html = await page.content(); 69 | await page.close(); 70 | 71 | if (browser) { 72 | await context.close(); 73 | } else { 74 | await _browser.close(); 75 | } 76 | 77 | return html; 78 | }; 79 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/puppeteer.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { loadHtml } from "./puppeteer"; 3 | 4 | describe("Puppeteer loader", () => { 5 | it("should load the HTML of a URL", async () => { 6 | const html = await loadHtml("https://example.com"); 7 | expect(html).toContain("Example Domain"); 8 | }); 9 | 10 | it("should load the HTML of a URL using a custom puppeteer context", async () => { 11 | const html = await loadHtml("https://example.com", { headless: true }); 12 | 13 | expect(html).toContain("Example Domain"); 14 | }); 15 | }); 16 | -------------------------------------------------------------------------------- /packages/webforai/src/loaders/puppeteer.ts: -------------------------------------------------------------------------------- 1 | import puppeteer from "puppeteer"; 2 | import type { PuppeteerLaunchOptions } from "puppeteer"; 3 | 4 | const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); 5 | 6 | export const loadHtml = async (url: string, ctx?: PuppeteerLaunchOptions) => { 7 | const browser = await puppeteer.launch( 8 | ctx || { 9 | headless: true, 10 | args: ["--no-sandbox", "--disable-setuid-sandbox"], 11 | }, 12 | ); 13 | const page = await browser.newPage(); 14 | await page.goto(url); 15 | 16 | const html = await page.content(); 17 | 18 | await Promise.race([page.waitForNetworkIdle(), sleep(10000)]); 19 | 20 | await browser.close(); 21 | 22 | return html; 23 | }; 24 | -------------------------------------------------------------------------------- /packages/webforai/src/md-splitter.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, RootContent } from "mdast"; 2 | import { mdastToMarkdown } from "./mdast-to-markdown"; 3 | import { chunk } from "./utils/common"; 4 | import { internalType, unwarpRoot, warpRoot } from "./utils/mdast-utils"; 5 | 6 | const PRIORITY_SPLITTERS = ["h1", "h2", "h3", "h4", "h5", "h6", "list", "table", "code"]; 7 | type SplitterGenerator = Generator; 8 | const getSplitterGenerator = function* () { 9 | for (const 
splitter of PRIORITY_SPLITTERS) { 10 | yield splitter; 11 | } 12 | }; 13 | 14 | const _mdastSplitter = async ( 15 | contents: RootContent[], 16 | checker: (markdown: string) => Promise, 17 | splitterGenerator: SplitterGenerator, 18 | ): Promise => { 19 | const splitter = splitterGenerator.next().value; 20 | const markdown = mdastToMarkdown(warpRoot(contents)); 21 | if ((await checker(markdown)) || contents.length === 1) { 22 | return [contents]; 23 | } 24 | const chunked = splitter 25 | ? contents.reduce((acc, content) => { 26 | if (internalType(content) === splitter || acc.length === 0) { 27 | acc.push([content]); 28 | return acc; 29 | } 30 | acc[acc.length - 1].push(content); 31 | return acc; 32 | }, []) 33 | : chunk(contents, Math.ceil(contents.length / 2)); 34 | 35 | const splitting = chunked.map((chunk) => _mdastSplitter(chunk, checker, splitterGenerator)); 36 | 37 | return Promise.all(splitting).then((chunks) => chunks.flat()); 38 | }; 39 | 40 | export const mdastSplitter = ( 41 | mdast: Mdast, 42 | checker: (markdown: string) => Promise, 43 | _options?: { signal?: AbortSignal }, //TODO 44 | ): Promise => { 45 | return _mdastSplitter(unwarpRoot(mdast), checker, getSplitterGenerator()); 46 | }; 47 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-a-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | import { toString as hastToString } from "hast-util-to-string"; 3 | 4 | export const customAHandler = 5 | (options?: { asText?: boolean }): Handle => 6 | (state, node) => { 7 | if (options?.asText) { 8 | const text = hastToString(node); 9 | if (3 >= text.length) { 10 | return undefined; 11 | } 12 | 13 | const link = { type: "text", value: text } as const; 14 | state.patch(node, link); 15 | return link; 16 | } 17 | 18 | const link = defaultHandlers.a(state, node); 19 | if (link.children.length > 0) { 20 | return link; 21 | } 22 | return undefined; 23 | }; 24 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-br-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | 3 | export const customBrHandler: Handle = (state, node) => { 4 | return defaultHandlers.br(state, node); 5 | }; 6 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-code-handler.ts: -------------------------------------------------------------------------------- 1 | import type { Handle } from "hast-util-to-mdast"; 2 | import { toText } from "hast-util-to-text"; 3 | import type { Code } from "mdast"; 4 | import { trimTrailingLines } from "trim-trailing-lines"; 5 | import { detectLanguage } from "../utils/detect-code-lang"; 6 | 7 | const LANGUAGE_MATCH_REGEX = [/language-(\w+)/, /highlight-source-(\w+)/, /CodeBlock--language-(\w+)/]; 8 | 9 | export const customCodeHandler: Handle = (state, node) => { 10 | const classNames = (node.properties?.className as string[]) || []; 11 | const codeValue = trimTrailingLines(toText(node)).trim(); 12 | 13 | const classLang = classNames 14 | .map((className) => { 15 | const match = LANGUAGE_MATCH_REGEX.map((regex) => className.match(regex)).find((match) => match); 16 | return match?.[1]; 17 | }) 18 | .find((className) => className); 19 | 20 | 
const lang = classLang || detectLanguage(codeValue) || null; 21 | 22 | const result: Code = { type: "code", lang, meta: null, value: codeValue }; 23 | state.patch(node, result); 24 | return result; 25 | }; 26 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-div-handler.ts: -------------------------------------------------------------------------------- 1 | import { select } from "hast-util-select"; 2 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 3 | import { toString as hastToString } from "hast-util-to-string"; 4 | import { toText } from "hast-util-to-text"; 5 | import type { Code } from "mdast"; 6 | import { trimTrailingLines } from "trim-trailing-lines"; 7 | import { detectLanguage } from "../utils/detect-code-lang"; 8 | 9 | const CODE_BLOCK_REGEX = /highlight-source|language-|codegroup|codeblock|code-block/i; 10 | 11 | const CODE_FILENAME_SELECTORS = "[class*='fileName'],[class*='fileName'],[class*='title'],[class*='Title']"; 12 | 13 | const LANGUAGE_MATCH_REGEX = [/language-(\w+)/, /highlight-source-(\w+)/, /CodeBlock--language-(\w+)/]; 14 | 15 | const findRecursive = (array: T[], condition: (value: T) => boolean | T[], maxDepth = 3): T | null => { 16 | if (maxDepth <= 0) { 17 | return null; 18 | } 19 | for (const value of array) { 20 | const result = condition(value); 21 | if (Array.isArray(result)) { 22 | return findRecursive(result, condition, maxDepth - 1); 23 | } 24 | if (result) { 25 | return value; 26 | } 27 | } 28 | 29 | return null; 30 | }; 31 | export const customDivHandler: Handle = (state, node) => { 32 | const classNames = Array.isArray(node.properties.className) ? (node.properties.className as string[]) : []; 33 | const codeBlock = findRecursive(node.children, (child) => { 34 | if (child.type !== "element") { 35 | return false; 36 | } 37 | if (child.tagName === "pre") { 38 | return true; 39 | } 40 | return child.children.filter((child) => child.type === "element"); 41 | }); 42 | 43 | if (codeBlock && classNames.some((className) => CODE_BLOCK_REGEX.test(className))) { 44 | const codeBlockClassNames = codeBlock.type === "element" ? (codeBlock.properties.className as string[]) ?? [] : []; 45 | const codeValue = trimTrailingLines(toText(codeBlock)).trim(); 46 | 47 | const filenameElement = select(CODE_FILENAME_SELECTORS, node); 48 | const fileLang = filenameElement ? 
hastToString(filenameElement).match(/\.(\w+)$/)?.[1] : null; 49 | 50 | const classLang = [...classNames, ...codeBlockClassNames] 51 | .map((className) => { 52 | const match = LANGUAGE_MATCH_REGEX.map((regex) => className.match(regex)).find((match) => match); 53 | 54 | return match?.[1]; 55 | }) 56 | .find((className) => className); 57 | 58 | const lang = fileLang || classLang || detectLanguage(codeValue) || null; 59 | 60 | const result: Code = { type: "code", lang, meta: null, value: codeValue }; 61 | state.patch(node, result); 62 | return result; 63 | } 64 | 65 | return defaultHandlers.div(state, node); 66 | }; 67 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-img-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | 3 | export const customImgHandler = 4 | (options?: { hideImage?: boolean }): Handle => 5 | (state, node) => { 6 | if (options?.hideImage) { 7 | return undefined; 8 | } 9 | return defaultHandlers.image(state, node); 10 | }; 11 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/custom-table-handler.ts: -------------------------------------------------------------------------------- 1 | import { type Handle, defaultHandlers } from "hast-util-to-mdast"; 2 | import { toText } from "hast-util-to-text"; 3 | 4 | export const customTableHandler = 5 | (options?: { asText?: boolean }): Handle => 6 | (state, node) => { 7 | if (options?.asText) { 8 | const paragraph = { type: "paragraph" as const, children: [{ type: "text", value: toText(node) } as const] }; 9 | state.patch(node, paragraph); 10 | return paragraph; 11 | } 12 | return defaultHandlers.table(state, node); 13 | }; 14 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/empty-handler.ts: -------------------------------------------------------------------------------- 1 | export const emptyHandler = () => { 2 | // Do nothing 3 | }; 4 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-handlers/math-handler.ts: -------------------------------------------------------------------------------- 1 | import { toHtml } from "hast-util-to-html"; 2 | import type { Handle } from "hast-util-to-mdast"; 3 | import { MathMLToLaTeX } from "mathml-to-latex"; 4 | import type { InlineMath, Math as mdMath } from "mdast-util-math"; 5 | 6 | export const mathHandler: Handle = (state, node) => { 7 | const mathMl = toHtml(node); 8 | const latex = MathMLToLaTeX.convert(mathMl); 9 | const result: InlineMath | mdMath = { type: "inlineMath", value: latex }; 10 | state.patch(node, result); 11 | return result; 12 | }; 13 | -------------------------------------------------------------------------------- /packages/webforai/src/mdast-to-markdown.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, RootContent } from "mdast"; 2 | 3 | import { gfmToMarkdown } from "mdast-util-gfm"; 4 | import { mathToMarkdown } from "mdast-util-math"; 5 | import { type Options as ToMarkdownOptions, toMarkdown } from "mdast-util-to-markdown"; 6 | 7 | import { linkReplacer } from "./link-replacer"; 8 | import { warpRoot } from "./utils/mdast-utils"; 9 | 10 | /** 11 | * Options for the `mdastToMarkdown` function. 
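 * Extends the options of `mdast-util-to-markdown` with an optional `baseUrl` used to resolve relative links.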
12 | */ 13 | export interface MdastToMarkdownOptions extends ToMarkdownOptions { 14 | /** 15 | * The base URL to use for replacing relative links. 16 | */ 17 | baseUrl?: string; 18 | } 19 | 20 | /** 21 | * Default options for the `mdastToMarkdown` function. 22 | */ 23 | export const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = { 24 | extensions: [gfmToMarkdown(), mathToMarkdown()], 25 | bullet: "-", 26 | }; 27 | 28 | /** 29 | * Converts an MDAST tree to a Markdown string. 30 | * 31 | * @param mdast - The MDAST tree to convert. 32 | * @param options - Options for the conversion. 33 | * @returns The Markdown string. 34 | * 35 | * @example 36 | * ```ts 37 | * import { mdastToMarkdown } from './your-library'; 38 | * 39 | * const mdast = { 40 | * type: 'root', 41 | * children: [ 42 | * { 43 | * type: 'paragraph', 44 | * children: [ 45 | * { type: 'text', value: 'Hello, world!' } 46 | * ] 47 | * } 48 | * ] 49 | * }; 50 | * 51 | * const markdown = mdastToMarkdown(mdast); 52 | * console.log(markdown); // Output: "Hello, world!" 53 | * ``` 54 | */ 55 | export const mdastToMarkdown = (mdast: Mdast | RootContent[], options?: MdastToMarkdownOptions): string => { 56 | const { baseUrl, ...toMarkdownOptions } = { ...DEFAULT_MDAST_TO_MARKDOWN_OPTIONS, ...options }; 57 | 58 | let markdown = toMarkdown(warpRoot(mdast), toMarkdownOptions).replace(/\*\*\*\*/g, ""); 59 | 60 | if (baseUrl) { 61 | markdown = linkReplacer(markdown, baseUrl); 62 | } 63 | 64 | return markdown; 65 | }; 66 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/common.ts: -------------------------------------------------------------------------------- 1 | export const chunk = (array: T[], size: number): T[][] => { 2 | return array.reduce((acc, _, index) => { 3 | if (index % size === 0) { 4 | acc.push(array.slice(index, index + size)); 5 | return acc; 6 | } 7 | return acc; 8 | }, []); 9 | }; 10 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/detect-code-lang.ts: -------------------------------------------------------------------------------- 1 | /* 2 | This code is derived from speed-highlight (https://github.com/speed-highlight/core), 3 | which is licensed under the CC0 1.0 Universal License. 4 | It was a very good simple code language selection algorithm and will be used. Thank you! 
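Each language entry pairs regular expressions with weights; detectLanguage scores the given code against every entry and returns the highest-scoring language above a small threshold, falling back to "plain".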
5 | */ 6 | 7 | //TODO: Add more languages and improve the algorithm 8 | const languages = [ 9 | ["bash", [/#!(\/usr)?\/bin\/bash/g, 500], [/\b(if|elif|then|fi|echo)\b|\$/g, 10]], 10 | ["html", [/<\/?[a-z-]+[^\n>]*>/g, 10], [/^\s+<-]/gm, 10], [/^@@ ?[-+,0-9 ]+ ?@@/gm, 25]], 37 | ["md", [/^(>|\t\*|\t\d+.)/gm, 10], [/\[.*\](.*)/g, 10]], 38 | ["docker", [/^(FROM|ENTRYPOINT|RUN)/gm, 500]], 39 | ["xml", [/<\/?[a-z-]+[^\n>]*>/g, 10], [/^<\?xml/g, 500]], 40 | ["c", [/#include\b|\bprintf\s+\(/g, 100]], 41 | ["rs", [/^\s+(use|fn|mut|match)\b/gm, 100]], 42 | ["go", [/\b(func|fmt|package)\b/g, 100]], 43 | ["java", [/^import\s+java/gm, 500]], 44 | ["asm", [/^(section|global main|extern|\t(call|mov|ret))/gm, 100]], 45 | ["css", [/^(@import|@page|@media|(\.|#)[a-z]+)/gm, 20]], 46 | ["json", [/\b(true|false|null|\{})\b|\"[^"]+\":/g, 10]], 47 | ["yaml", [/^(\s+)?[a-z][a-z0-9]*:/gim, 10]], 48 | ] as const; 49 | 50 | export const detectLanguage = (code: string) => { 51 | return ( 52 | languages 53 | .map( 54 | ([lang, ...features]) => 55 | [lang, features.reduce((acc, [match, score]) => acc + [...code.matchAll(match)].length * score, 0)] as const, 56 | ) 57 | .filter(([_, score]) => score > 20) 58 | .sort((a, b) => b[1] - a[1])[0]?.[0] || "plain" 59 | ); 60 | }; 61 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/hast-utils.ts: -------------------------------------------------------------------------------- 1 | import type { Element, Nodes as Hast } from "hast"; 2 | import { select, selectAll } from "hast-util-select"; 3 | 4 | export const getLangFromHast = (node: Hast) => { 5 | const html = select("html", node); 6 | if (html && typeof html.properties.lang === "string") { 7 | return html.properties.lang; 8 | } 9 | if (node.type !== "element") { 10 | return; 11 | } 12 | const element = node as Element; 13 | if (element.tagName !== "html") { 14 | return; 15 | } 16 | 17 | const langAttr = element.properties.lang || element.properties["xml:lang"]; 18 | if (langAttr) { 19 | return langAttr as string; 20 | } 21 | 22 | return undefined; 23 | }; 24 | 25 | export const getLangFromStr = (str: string) => { 26 | const match = str.match(/lang=["']([^"']+)["']/); 27 | if (match) { 28 | return match[1]; 29 | } 30 | return undefined; 31 | }; 32 | 33 | export const getUrlFromHast = (node: Hast): string | undefined => { 34 | if (node.type !== "element") { 35 | return undefined; 36 | } 37 | 38 | const metaTagAttributes = ["og:url", "twitter:url"]; 39 | const metaTags = selectAll("meta", node); 40 | 41 | for (const meta of metaTags) { 42 | const property = meta.properties.property || meta.properties.name; 43 | if (typeof property === "string" && metaTagAttributes.includes(property)) { 44 | return typeof meta.properties.content === "string" ? 
meta.properties.content : undefined; 45 | } 46 | } 47 | 48 | return undefined; 49 | }; 50 | -------------------------------------------------------------------------------- /packages/webforai/src/utils/mdast-utils.ts: -------------------------------------------------------------------------------- 1 | import type { Nodes as Mdast, RootContent } from "mdast"; 2 | 3 | export const unwarpRoot = (mdast: Mdast): RootContent[] => { 4 | if (mdast.type === "root") { 5 | return mdast.children; 6 | } 7 | return [mdast]; 8 | }; 9 | 10 | export const warpRoot = (mdast: RootContent[] | Mdast): Mdast => { 11 | if (Array.isArray(mdast)) { 12 | return { type: "root", children: mdast }; 13 | } 14 | return mdast; 15 | }; 16 | 17 | export const internalType = (content: RootContent): string => { 18 | if (content.type === "heading") { 19 | return `h${content.depth}`; 20 | } 21 | return content.type; 22 | }; 23 | -------------------------------------------------------------------------------- /packages/webforai/tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "module": "ES2020", 5 | "rootDir": "./src/", 6 | "outDir": "./dist/types/", 7 | "noUnusedLocals": true, 8 | "noUnusedParameters": true, 9 | "sourceMap": true 10 | }, 11 | "include": ["src/**/*.ts", "src/**/*.mts"], 12 | "exclude": [ 13 | "src/mod.ts", 14 | "src/helper.ts", 15 | "src/middleware.ts", 16 | "src/deno/**/*.ts", 17 | "src/test-utils/*.ts", 18 | "src/**/*.test.ts", 19 | "src/**/*.test.tsx" 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /packages/webforai/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "ESNext", 5 | "declaration": true, 6 | "moduleResolution": "Bundler", 7 | "outDir": "./dist", 8 | "esModuleInterop": true, 9 | "forceConsistentCasingInFileNames": true, 10 | "strict": true, 11 | "skipLibCheck": false, 12 | "noUnusedLocals": false, 13 | "noUnusedParameters": false, 14 | "resolveJsonModule": true, 15 | "types": ["node"] 16 | }, 17 | "include": [ 18 | "src/**/*.ts", 19 | "src/**/*.d.ts", 20 | "src/**/*.mts", 21 | "src/**/*.test.ts", 22 | "src/**/*.test.tsx", 23 | "bin/**/*.ts", 24 | "bin/**/*.d.ts", 25 | "bin/**/*.mts", 26 | "bin/**/*.test.ts", 27 | "bin/**/*.test.tsx" 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - "packages/*" 3 | - "examples/*" 4 | - "site" -------------------------------------------------------------------------------- /site/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # site 2 | 3 | ## 0.1.0 4 | 5 | ### Minor Changes 6 | 7 | - [#56](https://github.com/inaridiy/webforai/pull/56) [`ea8b326`](https://github.com/inaridiy/webforai/commit/ea8b3261eb2a7ec5b635a54a17ed18cca50106f4) Thanks [@inaridiy](https://github.com/inaridiy)! - Add minimal filter extractor 8 | 9 | ## 0.0.2 10 | 11 | ### Patch Changes 12 | 13 | - [#53](https://github.com/inaridiy/webforai/pull/53) [`c3f012c`](https://github.com/inaridiy/webforai/commit/c3f012ca740ef33538ca5d4874277008daf5c5a1) Thanks [@moons-14](https://github.com/moons-14)! - Add description for cf puppeteer loader. 
14 | 15 | ## 0.0.1 16 | 17 | ### Patch Changes 18 | 19 | - [#50](https://github.com/inaridiy/webforai/pull/50) [`ff85d73`](https://github.com/inaridiy/webforai/commit/ff85d73a6d64a52a990b031f50430fe2956c5f2f) Thanks [@inaridiy](https://github.com/inaridiy)! - New Documentation Site 20 | -------------------------------------------------------------------------------- /site/README.md: -------------------------------------------------------------------------------- 1 | This is a [Vocs](https://vocs.dev) project bootstrapped with the Vocs CLI. 2 | -------------------------------------------------------------------------------- /site/docs/footer.tsx: -------------------------------------------------------------------------------- 1 | // biome-ignore lint/style/noDefaultExport: 2 | export default function Footer() { 3 | return ( 4 | 5 | Released under the MIT License. 6 | Copyright © 2024-present inaridiy and contributors. 7 | 8 | ); 9 | } 10 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/custom-extractor.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Custom extractor 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-20 6 | --- 7 | 8 | # Custom extractor 9 | 10 | ::authors 11 | 12 | The default takumi-extractor in webforai is powerful, but occasionally it might not perform well on websites with unique structures. There may also be cases where you need to extract content other than the main body. 13 | 14 | For such scenarios, you can create a custom extractor to handle specific requirements. 15 | 16 | In the following example, we’ll build a custom extractor to pull the main content from an Amazon product page. 17 | 18 | ```ts [src/index.ts] twoslash 19 | import { select } from "hast-util-select"; 20 | import { type Extractor, htmlToMarkdown, takumiExtractor } from "webforai"; 21 | import { loadHtml } from "webforai/loaders/playwright"; 22 | 23 | const url = "https://www.amazon.com/Generative-Deep-Learning-Teaching-Machines/dp/1098134184/ref=sr_1_8?sr=8-8s"; 24 | const html = await loadHtml(url); 25 | 26 | const amazonShopItemExtractor: Extractor = (params) => { 27 | const { hast } = params; 28 | const mainContent = select("div#centerCol", hast); 29 | if (!mainContent) { 30 | return hast; 31 | } 32 | return mainContent; 33 | }; 34 | 35 | const cleanedContent = await htmlToMarkdown(html, { baseUrl: url, extractors: [amazonShopItemExtractor, takumiExtractor] }); 36 | 37 | console.info(cleanedContent); 38 | ``` 39 | 40 | This custom extractor targets the #centerCol element on Amazon product pages. If found, it returns only that content; otherwise, it defaults to the original structure. 41 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/index.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Cookbook 3 | --- 4 | 5 | # Cookbook 6 | 7 | Welcome to the Webforai Cookbook, a collection of practical recipes to help you harness the full potential of Webforai. 8 | 9 | Here, you'll find examples that guide you through various tasks, from basic usage to advanced customization. 10 | 11 | ## Recipes 12 | 13 | ### Basic Usage 14 | - [Simple Usage](/cookbook/simple) 15 | Learn the most basic way to convert HTML to Markdown. 
16 | 17 | ### Advanced Conversion 18 | - [Web Page Translation](/cookbook/translation) 19 | Translate web content into any language using Webforai and Vercel AI SDK. 20 | 21 | - [Structured Data Output](/cookbook/structured-output) 22 | Extract structured JSON data directly from websites with AI SDK. 23 | 24 | ### Customization & Extensions 25 | - [Custom Extractor](/cookbook/custom-extractor) 26 | Create custom extractors for specific website structures. 27 | 28 | ### Usage in Specific Environments 29 | - [Cloudflare Workers](/cookbook/cf-workers) 30 | Learn how to use Webforai in Cloudflare Workers. 31 | 32 | ## Contribute 33 | 34 | These recipes are here to help you explore Webforai’s capabilities in real-world scenarios. We welcome contributions—whether you have a new recipe idea or improvements for existing ones. Submit a pull request on our GitHub repository. 35 | 36 | For more detailed information and advanced features, check out the [API Documentation](/docs/html-to-markdown). 37 | 38 | Start building amazing projects with Webforai today! -------------------------------------------------------------------------------- /site/docs/pages/cookbook/simple.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Simple usage 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-19 6 | --- 7 | 8 | # Simple usage 9 | 10 | ::authors 11 | 12 | The simplest way to use Webforai 13 | 14 | ::::steps 15 | ## Install dependencies 16 | 17 | Install the necessary packages: 18 | 19 | :::code-group 20 | ```bash [npm] 21 | npm init -y 22 | npm install webforai 23 | npm install -D tsx 24 | ``` 25 | 26 | ```bash [pnpm] 27 | pnpm init -y 28 | pnpm install webforai 29 | pnpm install -D tsx 30 | ``` 31 | ::: 32 | 33 | ## Write code 34 | 35 | Here’s how to convert HTML to Markdown using **webforai**: 36 | 37 | ```ts [src/index.ts] twoslash 38 | import { htmlToMarkdown } from "webforai"; 39 | import { loadHtml } from "webforai/loaders/fetch"; 40 | 41 | const html = await loadHtml("https://example.com"); 42 | const markdown = htmlToMarkdown(html); 43 | 44 | console.log(markdown); 45 | ``` 46 | 47 | ## Launch 48 | 49 | ```bash 50 | tsx src/index.ts 51 | 52 | # => # Example Domain 53 | # => 54 | # => This domain is for use in illustrative examples in documents. You may use this 55 | # => domain in literature without prior coordination or asking for permission. 56 | # => 57 | # => More information... 58 | ``` 59 | :::: 60 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/structured-output.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Structured Output with ai SDK 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-19 6 | --- 7 | 8 | # Structured Output 9 | 10 | ::authors 11 | You can get **structured JSON** directly from any website by using webforai and the [Vercel AI SDK](https://sdk.vercel.ai/). 12 | 13 | ::::steps 14 | ## Install dependencies 15 | 16 | Install the necessary packages: 17 | 18 | :::code-group 19 | ```bash [npm] 20 | npm init -y 21 | npm install webforai ai @ai-sdk/google zod 22 | npm install -D tsx 23 | ``` 24 | 25 | ```bash [pnpm] 26 | pnpm init -y 27 | pnpm install webforai ai @ai-sdk/google zod 28 | pnpm install -D tsx 29 | ``` 30 | ::: 31 | 32 | ## Prepare API Key 33 | 34 | This example uses **Google Generative AI (Gemini 1.5 Flash)** via the AI SDK. 
35 | Set your **Google Generative AI API key** as an environment variable **GOOGLE_GENERATIVE_AI_API_KEY**. You can get the key [here](https://aistudio.google.com/app/apikey). 36 | 37 | For other providers, see the [AI SDK provider documentation](https://sdk.vercel.ai/providers/ai-sdk-providers). 38 | 39 | ## Write code 40 | 41 | Here’s how to convert HTML to Markdown using **webforai** and then transform it into a structured object with **AI SDK**: 42 | 43 | ```ts [src/index.ts] twoslash 44 | import { google } from "@ai-sdk/google"; 45 | import { generateObject } from "ai"; 46 | import { htmlToMarkdown } from "webforai"; 47 | import { loadHtml } from "webforai/loaders/fetch"; 48 | import { z } from "zod"; 49 | 50 | const html = await loadHtml("https://github.com/inaridiy?tab=repositories"); 51 | const markdown = htmlToMarkdown(html); 52 | 53 | const { object: repositories } = await generateObject({ 54 | model: google("gemini-1.5-flash-latest"), 55 | schema: z.object({ 56 | repositories: z.array( 57 | z.object({ 58 | name: z.string(), 59 | url: z.string(), 60 | stars: z.number(), 61 | license: z.string(), 62 | }), 63 | ), 64 | }), 65 | prompt: `Please generate a list of repositories from the following markdown content.\n\n${markdown}`, 66 | }); 67 | 68 | console.log(repositories); 69 | ``` 70 | 71 | ## Launch 🚀 72 | 73 | Just run the following command: 74 | 75 | ```bash 76 | tsx src/index.ts 77 | 78 | # => { 79 | # => repositories: [ 80 | # => { 81 | # => name: 'webforai', 82 | # => url: 'https://github.com/inaridiy/webforai', 83 | # => stars: 46, 84 | # => license: 'MIT' 85 | # => } 86 | # => ] 87 | # => ... 88 | # => } 89 | ``` 90 | 91 | :::: 92 | -------------------------------------------------------------------------------- /site/docs/pages/cookbook/translation.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Translation web page 3 | authors: 4 | - "[inaridiy](https://github.com/inaridiy)" 5 | date: 2024-10-19 6 | --- 7 | 8 | # Translation web content 9 | ::authors 10 | 11 | You can translate web content to any language by using webforai and the [Vercel AI SDK](https://sdk.vercel.ai/). 12 | 13 | ::::steps 14 | ## Install dependencies 15 | 16 | Install the necessary packages: 17 | 18 | :::code-group 19 | ```bash [npm] 20 | npm init -y 21 | npm install webforai ai @ai-sdk/google zod 22 | npm install -D tsx 23 | ``` 24 | 25 | ```bash [pnpm] 26 | pnpm init -y 27 | pnpm install webforai ai @ai-sdk/google zod 28 | pnpm install -D tsx 29 | ``` 30 | ::: 31 | 32 | ## Prepare API Key 33 | 34 | This example uses **Google Generative AI (Gemini 1.5 Flash)** via the AI SDK. 35 | Set your **Google Generative AI API key** as an environment variable **GOOGLE_GENERATIVE_AI_API_KEY**. You can get the key [here](https://aistudio.google.com/app/apikey). 36 | 37 | For other providers, see the [AI SDK provider documentation](https://sdk.vercel.ai/providers/ai-sdk-providers). 38 | 39 | ## Write code 40 | 41 | Here's an example of how to translate a web page using webforai and the Vercel AI SDK. 42 | A little trick in this code is the use of `experimental_continueSteps`. 43 | If you enable this flag, it will also make it OK if the outputToken is exceeded. 
44 | 45 | ```ts [src/index.ts] twoslash 46 | import { google } from "@ai-sdk/google"; 47 | import { generateText } from "ai"; 48 | import { htmlToMarkdown } from "webforai"; 49 | import { loadHtml } from "webforai/loaders/playwright"; 50 | 51 | const url = "https://github.com/inaridiy"; 52 | const targetLanguage = "ja"; // Translate to Japanese 53 | 54 | const html = await loadHtml(url, { superBypassMode: true }); 55 | const markdown = htmlToMarkdown(html); 56 | 57 | const prompt = `Translate mechanically converted HTML-based Markdown into ${targetLanguage}, while refining and correcting the content for clarity and coherence. 58 | 59 | The Markdown provided may contain redundant or unnecessary information and errors due to mechanical conversion. Your task is to translate the text into Japanese, fixing these issues and improving the overall quality of the Markdown document. 60 | 61 | 62 | ${markdown} 63 | `; 64 | 65 | const response = await generateText({ 66 | model: google("gemini-1.5-flash-latest"), 67 | temperature: 0, 68 | prompt, 69 | maxSteps: 10, 70 | experimental_continueSteps: true, // For long content, you need to set this option. 71 | }); 72 | 73 | console.info(response.text); 74 | 75 | ``` 76 | 77 | ## Launch 🚀 78 | 79 | Just run the following command: 80 | 81 | ```bash 82 | tsx src/index.ts 83 | 84 | # => Output the translated content. 85 | ``` 86 | 87 | :::: -------------------------------------------------------------------------------- /site/docs/pages/docs/html-to-markdown.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: htmlToMarkdown 3 | --- 4 | 5 | # htmlToMarkdown 6 | 7 | Useful and high-quality HTML to Markdown converter. 8 | Internally, it just calls [htmlToMdast](/docs/html-to-mdast) and [mdastToMarkdown](/docs/mdast-to-markdown) in that order. 9 | 10 | ## Usage 11 | 12 | ```ts twoslash 13 | import { htmlToMarkdown } from "webforai"; 14 | 15 | const html = "<h1>Hello, world!</h1>"; 16 | const markdown = htmlToMarkdown(html); 17 | // @log: => "# Hello, world!" 18 | ``` 19 | 20 | ## Returns 21 | 22 | `string` 23 | 24 | The converted Markdown string. 25 | 26 | ## Parameters 27 | 28 | ### htmlOrHast 29 | 30 | type: `string | Hast` 31 | 32 | The HTML string or HAST tree to convert. 33 | 34 | ```ts 35 | const markdown = htmlToMarkdown("<h1>Hello, world!</h1>"); 36 | // => "# Hello, world!" 37 | ``` 38 | 39 | ### options.baseUrl 40 | 41 | type: `string` 42 | 43 | The base URL to use for replacing relative links. 44 | 45 | ```ts 46 | const markdown = htmlToMarkdown("<a href='/foo'>bar</a>", { 47 | baseUrl: "https://example.com", 48 | }); 49 | // => "[bar](https://example.com/foo)" 50 | ``` 51 | 52 | ### options.extractors 53 | 54 | type: `ExtractorSelectors` 55 | 56 | An array of extractors to extract specific elements from the HTML. 57 | You can define your own functions in addition to the Extractor provided as a preset. 58 | 59 | ```ts twoslash 60 | import { htmlToMarkdown, type Extractor, takumiExtractor } from "webforai" 61 | 62 | const yourCustomExtractor: Extractor = (params) => { 63 | const { hast, url } = params 64 | // ... your logic ... 65 | return hast 66 | }; 67 | 68 | const html = "<h1>Hello, world!</h1>" 69 | const markdown = htmlToMarkdown(html, { 70 | extractors: [yourCustomExtractor, takumiExtractor] 71 | }); 72 | // => "# Hello, world!" 73 | ``` 74 | 75 | ### options.formatting 76 | 77 | type: `Omit` 78 | 79 | Formatting options passed to [mdast-util-to-markdown](https://github.com/syntax-tree/mdast-util-to-markdown).
80 | 81 | ```ts 82 | const markdown = htmlToMarkdown("Hello, world!", { 83 | formatting: { 84 | bullet: "*", 85 | }, 86 | }); 87 | // => "* Hello, world!" 88 | ``` 89 | 90 | ### options.linkAsText 91 | 92 | type: `boolean` 93 | 94 | Whether to convert links to plain text. 95 | 96 | ```ts 97 | const markdown = htmlToMarkdown("bar", { 98 | linkAsText: true, 99 | }); 100 | // => "bar" 101 | ``` 102 | 103 | ### options.tableAsText 104 | 105 | type: `boolean` 106 | 107 | Whether to convert tables to plain text. 108 | 109 | 110 | ### options.hideImage 111 | 112 | type: `boolean` 113 | 114 | Whether to hide images. 115 | 116 | ### options.lang 117 | 118 | type: `string` 119 | 120 | The language of the HTML. -------------------------------------------------------------------------------- /site/docs/pages/docs/html-to-mdast.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: htmlToMdast 3 | --- 4 | 5 | # htmlToMdast 6 | 7 | Convert HTML to Mdast. 8 | If you simply want to convert from HTML to Markdown, we recommend using [htmlToMarkdown](/docs/html-to-markdown). 9 | 10 | ## Usage 11 | 12 | ```ts twoslash 13 | import { htmlToMdast } from "webforai"; 14 | 15 | const mdast = htmlToMdast("Hello, world!"); 16 | // @log: => { 17 | // @log: type: "root", 18 | // @log: children: [{ type: "heading", depth: 1, children: [{ type: "text", value: "Hello, world!" }] }] 19 | // @log: } 20 | ``` 21 | 22 | ## Returns 23 | 24 | `mdast.Nodes` 25 | 26 | The converted Mdast tree. 27 | 28 | ## Parameters 29 | 30 | ### htmlOrHast 31 | 32 | type: `string | Hast` 33 | 34 | The HTML string or HAST tree to convert. 35 | 36 | ```ts 37 | const mdast = htmlToMdast("Hello, world!"); 38 | // => { 39 | // type: "root", 40 | // children: [{ type: "heading", depth: 1, children: [{ type: "text", value: "Hello, world!" }] }] 41 | // } 42 | ``` 43 | ### options.extractors 44 | 45 | type: `ExtractorSelectors` 46 | 47 | An array of extractors to extract specific elements from the HTML. 48 | You can define your own functions in addition to the Extractor provided as a preset. 49 | 50 | ```ts twoslash 51 | import { htmlToMdast, type Extractor, takumiExtractor } from "webforai" 52 | 53 | const yourCustomExtractor: Extractor = (params) => { 54 | const { hast, url } = params 55 | // ... your logic ... 56 | return hast 57 | }; 58 | 59 | const html = "Hello, world!" 60 | const mdast = htmlToMdast(html, { 61 | extractors: [yourCustomExtractor, takumiExtractor] 62 | }); 63 | ``` 64 | ### options.linkAsText 65 | 66 | type: `boolean` 67 | 68 | Whether to convert links to plain text. 69 | 70 | ```ts 71 | const mdast = htmlToMdast("bar", { 72 | linkAsText: true, 73 | }); 74 | ``` 75 | 76 | ### options.tableAsText 77 | 78 | type: `boolean` 79 | 80 | Whether to convert tables to plain text. 81 | 82 | 83 | ### options.hideImage 84 | 85 | type: `boolean` 86 | 87 | Whether to hide images. 88 | 89 | ### options.lang 90 | 91 | type: `string` 92 | 93 | The language of the HTML. -------------------------------------------------------------------------------- /site/docs/pages/docs/loaders.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Loaders Utilities 3 | --- 4 | 5 | # Loaders Utilities 6 | 7 | The **Loaders Utilities** provide simple tools to easily fetch HTML from websites. 8 | All the utilities are designed to be straightforward, requiring no configuration. 9 | 10 | :::warning 11 | However, they are not recommended for production use. 
12 | ::: 13 | 14 | ## Overview of Loaders 15 | 16 | Webforai provides four different loaders: 17 | 18 | - **Fetch Loader**: The simplest option, using JavaScript's built-in Fetch API. 19 | - **Playwright Loader**: Ideal for sites requiring JavaScript execution, like SPAs. 20 | - **Puppeteer Loader**: Another option for handling websites with JavaScript execution. 21 | - **CF Puppeteer Loader**: Option to handle websites running JavaScript on cloudflare workers. 22 | 23 | ## Fetch Loader 24 | 25 | The **Fetch Loader** is the simplest utility, using JavaScript’s **Fetch API**. 26 | It retrieves HTML from a given URL, using a basic User-Agent for the request. 27 | 28 | ### Usage 29 | 30 | ```ts twoslash 31 | import { loadHtml } from "webforai/loaders/fetch"; 32 | 33 | const html = await loadHtml("https://example.com"); 34 | ``` 35 | 36 | ## Playwright Loader 37 | 38 | The **Playwright Loader** is a more powerful tool, using [Playwright](https://playwright.dev/) to 39 | fetch HTML from websites that need JavaScript execution, like SPAs (Single Page Applications). 40 | 41 | ### Usage 42 | 43 | Before using the Playwright Loader, you need to install the Playwright browser and its dependencies. 44 | 45 | :::code-group 46 | 47 | ```bash [npm] 48 | npx playwright-core install 49 | ``` 50 | 51 | ```bash [pnpm] 52 | pnpm install playwright-core 53 | ``` 54 | ::: 55 | 56 | And then you can use the Playwright Loader as follows: 57 | 58 | :::code-group 59 | 60 | ```ts twoslash [basic-usage] 61 | import { loadHtml } from "webforai/loaders/playwright"; 62 | 63 | const html = await loadHtml("https://example.com"); 64 | ``` 65 | 66 | ```ts twoslash [super-bypass-mode] 67 | import { loadHtml } from "webforai/loaders/playwright"; 68 | 69 | const html = await loadHtml("https://example.com", { 70 | superBypassMode: true, 71 | }); 72 | 73 | ``` 74 | ::: 75 | 76 | 77 | ## Puppeteer Loader 78 | 79 | The **Puppeteer Loader** is another advanced tool that uses [Puppeteer](https://pptr.dev/) to 80 | load HTML from sites that rely on JavaScript execution, similar to Playwright. 81 | 82 | ### Usage 83 | 84 | Before using the Puppeteer Loader, you need to install the Puppeteer browser and its dependencies. 85 | 86 | :::code-group 87 | 88 | ```bash [npm] 89 | npm install puppeteer 90 | ``` 91 | 92 | ```bash [pnpm] 93 | pnpm install puppeteer 94 | ``` 95 | ::: 96 | 97 | And then you can use the Puppeteer Loader as follows: 98 | 99 | ```ts twoslash 100 | import { loadHtml } from "webforai/loaders/puppeteer"; 101 | 102 | const html = await loadHtml("https://example.com"); 103 | ``` 104 | 105 | ## CF Puppeteer Loader 106 | The **CF Puppeteer Loader** is the best option for loading HTML from sites that rely on JavaScript execution on [cloudflare workers](https://workers.cloudflare.com/). This loader relies on [puppeteer on cloudflare workers](https://developers.cloudflare.com/browser-rendering/platform/puppeteer/). 107 | 108 | ### Usage 109 | Before using the CF Puppeteer Loader, you need to prepare a wrangler environment and install @cloudflare/puppeteer. Refer to the [cookbook](/cookbook/cf-workers) for instructions on how to create a project. 
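As a rough sketch, this is how the loader typically fits into a Worker once everything is set up (the install commands follow below). The binding name `MYBROWSER` and the `Fetcher` binding type are assumptions for illustration, mirroring common Browser Rendering setups; adjust them to your own `wrangler.toml` and generated types.

```ts
// Minimal sketch (assumptions noted above): convert a page to Markdown inside a Worker.
import puppeteer from "@cloudflare/puppeteer";
import { htmlToMarkdown } from "webforai";
import { loadHtml } from "webforai/loaders/cf-puppeteer";

type Env = { MYBROWSER: Fetcher }; // Browser Rendering binding (name and type assumed)

export default {
	async fetch(_request: Request, env: Env): Promise<Response> {
		// Launch a browser session on the Browser Rendering binding
		const browser = await puppeteer.launch(env.MYBROWSER);
		try {
			// Load the rendered HTML with the CF Puppeteer loader, then convert it
			const html = await loadHtml("https://example.com", browser);
			const markdown = htmlToMarkdown(html);
			return new Response(markdown, { headers: { "content-type": "text/markdown" } });
		} finally {
			await browser.close();
		}
	},
};
```

See the [cookbook](/cookbook/cf-workers) for the full, working setup.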
110 | 111 | :::code-group 112 | 113 | ```bash [npm] 114 | npm install @cloudflare/puppeteer --save-dev 115 | ``` 116 | 117 | ```bash [pnpm] 118 | pnpm install -D @cloudflare/puppeteer 119 | ``` 120 | ::: 121 | 122 | And then you can use the CF Puppeteer Loader as follows: 123 | 124 | ```ts 125 | import { loadHtml } from "webforai/loaders/cf-puppeteer"; 126 | 127 | const html = await loadHtml("https://example.com", browser); // browser is the puppeteer browser instance 128 | ``` 129 | -------------------------------------------------------------------------------- /site/docs/pages/docs/mdast-to-markdown.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: mdastToMarkdown 3 | --- 4 | 5 | # mdastToMarkdown 6 | 7 | Convert Mdast to Markdown. 8 | 9 | ```ts 10 | import { mdastToMarkdown } from "webforai"; 11 | 12 | const mdast = { 13 | type: 'root', 14 | children: [ 15 | { 16 | type: 'paragraph', 17 | children: [ 18 | { type: 'text', value: 'Hello, world!' } 19 | ] 20 | } 21 | ] 22 | }; 23 | 24 | const markdown = mdastToMarkdown(mdast); 25 | // => "Hello, world!" 26 | ``` 27 | 28 | ## Returns 29 | 30 | `string` 31 | 32 | The converted Markdown string. 33 | 34 | ## Parameters 35 | 36 | ### mdast 37 | 38 | type: `Mdast` 39 | 40 | The Mdast tree to convert. 41 | 42 | ### options.baseUrl 43 | 44 | type: `string` 45 | 46 | The base URL to use for replacing relative links. 47 | 48 | ### options (formatting) 49 | 50 | 51 | Formatting options passed to [mdast-util-to-markdown](https://github.com/syntax-tree/mdast-util-to-markdown). 52 | 53 | default: `DEFAULT_MDAST_TO_MARKDOWN_OPTIONS` 54 | ```ts 55 | const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = { 56 | extensions: [gfmToMarkdown(), mathToMarkdown()], 57 | bullet: "-", 58 | }; 59 | ``` 60 | 61 | -------------------------------------------------------------------------------- /site/docs/pages/getting-started.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Getting Started 3 | --- 4 | 5 | # Getting Started 6 | 7 | ## Overview 8 | 9 | Welcome to webforai, a library designed to convert **HTML to Markdown** with simple utilities. 10 | Whether you're working in a browser, Node.js, or even on Cloudflare Workers, webforai is your go-to tool for bridging the web and LLMs. 11 | 12 | ## Installation 13 | 14 | :::code-group 15 | 16 | ```bash [npm] 17 | npm i webforai 18 | ``` 19 | 20 | ```bash [pnpm] 21 | pnpm i webforai 22 | ``` 23 | 24 | ```bash [yarn] 25 | yarn add webforai 26 | ``` 27 | 28 | ::: 29 | 30 | ## Quick Start (CLI) 31 | 32 | You can convert HTML to Markdown with the following command. 33 | 34 | ```bash 35 | $ npx webforai@latest https://www.npmjs.com/package/webforai 36 | 37 | ┌ webforai CLI version 1.6.3 38 | │ 39 | ◇ Select loader: # [!code hl] 40 | │ fetch # fetch(default) or playwright 41 | │ 42 | ◇ Enter the output file path: # [!code hl] 43 | │ npmjs-package-webforai.md # default is `{escaped-url}.md` 44 | │ 45 | ◇ Select processing mode: # [!code hl] 46 | │ default # default or ai mode. ai mode removes images, links, and so on. 47 | │ 48 | ◇ Content loaded! 49 | │ 50 | └ Done! Markdown saved to npmjs-package-webforai.md 51 | 52 | ``` 53 | 54 | ## Quick Start (Library) 55 | 56 | 57 | ::::steps 58 | 59 | ### Load HTML with utilities 60 | 61 | First, load HTML using the `loadHtml` utility. Using this function, you can get HTML from a URL in a simple way. It supports versions for **fetch**, **Playwright**, and **Puppeteer**.
62 | 63 | :::code-group 64 | 65 | ```tsx [fetch] twoslash 66 | import { loadHtml } from "webforai/loaders/fetch"; // [!code hl] 67 | 68 | // Load html from url 69 | const url = "https://www.npmjs.com/package/webforai"; 70 | const html = await loadHtml(url); // [!code hl] 71 | ``` 72 | 73 | 74 | ```tsx [playwright] twoslash 75 | // Before using playwright loader, run `npx playwright install` 76 | import { loadHtml } from "webforai/loaders/playwright"; // [!code hl] 77 | 78 | // Load html from url 79 | const url = "https://www.npmjs.com/package/webforai"; 80 | const html = await loadHtml(url, { superBypassMode: true }); // [!code hl] 81 | // @log: Only playwright loader supports super bypass mode. 82 | // @log: This is useful to bypass some anti-bot measures. 83 | ``` 84 | 85 | ```tsx [puppeteer] twoslash 86 | // Before using puppeteer loader, run `npm i puppeteer` 87 | import { loadHtml } from "webforai/loaders/puppeteer"; // [!code hl] 88 | 89 | // Load html from url 90 | const url = "https://www.npmjs.com/package/webforai"; 91 | const html = await loadHtml(url); // [!code hl] 92 | ``` 93 | 94 | ::: 95 | 96 | :::warning 97 | The `loadHtml` function is designed for ease of use and is not recommended for intensive use in production environments. 98 | ::: 99 | 100 | ### Convert HTML to Markdown 101 | 102 | Finally, convert HTML to Markdown with the `htmlToMarkdown` function. 103 | 104 | ```tsx 105 | import { htmlToMarkdown } from "webforai"; // [!code focus] 106 | import { loadHtml } from "webforai/loaders/fetch"; 107 | 108 | // Load html from url 109 | const url = "https://www.npmjs.com/package/webforai"; 110 | const html = await loadHtml(url); 111 | 112 | const markdown = htmlToMarkdown(html); // [!code focus] 113 | ``` 114 | :::: 115 | -------------------------------------------------------------------------------- /site/docs/pages/how-it-works.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: How it works 3 | --- 4 | 5 | # How it works 6 | 7 | ## Overview 8 | 9 | The core function of webforai is **converting HTML to Markdown**, built on the Syntax Tree ecosystem. This process happens in three steps: 10 | 11 | 1. **Convert HTML to [Hast](https://github.com/syntax-tree/hast)**. (Hypertext Abstract Syntax Tree) 12 | 2. **Convert Hast to [Mdast](https://github.com/syntax-tree/mdast)**. (Markdown Abstract Syntax Tree) 13 | 3. **Convert Mdast to Markdown**. 14 | 15 | What makes this special is the **content extraction** applied to the Hast produced in step 1. This ensures that only the main content—the part humans care about—is extracted from the HTML. 16 | After that, the rest of the transformation is handled using fine-tuned utilities from the Syntax Tree ecosystem. 17 | 18 | 19 | 20 | ## Extractor 21 | 22 | In webforai, the process of extracting the main content from a web page is abstracted into a component called the **Extractor**. 23 | This is a flexible system designed to make content extraction simple and customizable. 24 | 25 | ### Extractor Interface 26 | 27 | The Extractor is a function that takes in two things: 28 | 29 | - A **Hast** object, which represents the structure of the HTML. 30 | - Optional metadata, such as the language or URL of the page. 31 | 32 | The Extractor processes this input and returns a new Hast object that represents the cleaned-up, extracted content.
33 | 34 | ```ts twoslash 35 | import type { Nodes as Hast } from "hast"; 36 | 37 | type ExtractParams = { hast: Hast; lang?: string; url?: string }; 38 | type Extractor = (params: ExtractParams) => Hast; 39 | ``` 40 | 41 | ### Default Extractor 42 | 43 | By default, webforai provides a built-in Extractor called `takumi-extractor`. This extractor is adjusted to produce a high average quality for a typical web page. 44 | I do my best to adjust it to the best of my ability using various flags and scoring with reference to **Mozilla's readability** and other algorithms. 45 | 46 | ### Customizing the Extraction 47 | 48 | **webforai** allows you to define **multiple extractors** and chain them together. 49 | The Hast object is passed from one Extractor to the next in the order they are defined, allowing you to fine-tune the extraction process. 50 | 51 | You can also create **your own custom Extractor** to implement specific algorithms or extraction logic. 52 | 53 | ```ts twoslash 54 | import { htmlToMarkdown } from "webforai"; 55 | import { loadHtml } from "webforai/loaders/fetch"; 56 | import type { Extractor } from "webforai"; 57 | 58 | // [!code focus] 59 | const customExtractor: Extractor = (params) => {// [!code focus] 60 | const { hast, url } = params;// [!code focus] 61 | // Your custom extraction logic here // [!code focus] 62 | return hast; // [!code focus] 63 | }; // [!code focus] 64 | 65 | const html = await loadHtml("https://example.com"); 66 | const markdown = await htmlToMarkdown(html, { // [!code focus] 67 | extractors: [customExtractor], // [!code focus] 68 | }); // [!code focus] 69 | ``` 70 | 71 | -------------------------------------------------------------------------------- /site/docs/pages/index.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | layout: landing 3 | content: 4 | width: 100% 5 | --- 6 | 7 | import { HomePage } from "vocs/components"; 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | A esm-native library that converts HTML to Markdown & Useful Utilities with simple, lightweight and epic quality. 16 | 17 | 18 | Get started 19 | Cookbook 20 | GitHub 21 | 22 | 23 | 24 | 25 | 26 | :::code-group 27 | 28 | ```bash [npm] 29 | npm i webforai 30 | # or just run 31 | npx webforai@latest 32 | ``` 33 | 34 | ```bash [pnpm] 35 | pnpm i webforai 36 | # or just run 37 | pnpx webforai@latest 38 | ``` 39 | 40 | ```bash [yarn] 41 | yarn add webforai 42 | # or just run 43 | npx webforai@latest 44 | ``` 45 | 46 | ::: 47 | 48 | 49 | 50 | 51 | 52 | 53 | license 54 | 55 | 56 | Apache-2.0 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | stars 66 | 67 | 68 | 46 69 | 70 | 71 | 72 | 73 | 74 | 75 | {" <= Let's star!"} 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | # 90 | # Overview 91 | 92 | ```ts twoslash 93 | import { htmlToMarkdown, htmlToMdast } from "webforai"; 94 | import { loadHtml } from "webforai/loaders/playwright"; 95 | 96 | // Load html from url 97 | const url = "https://www.npmjs.com/package/webforai"; 98 | const html = await loadHtml(url); // [!code hl] 99 | 100 | // Convert html to markdown 101 | const markdown = htmlToMarkdown(html, { baseUrl: url }); // [!code hl] 102 | ``` 103 | 104 | # 105 | # Features 106 | 107 | - High-quality HTML to Markdown conversion with simple, customizable options 108 | - ESM-native, compatible with various environments (browser, Cloudflare Worker, Node.js, etc.) 
109 | - Lightweight (only 146kb minified and gzipped) 110 | - Flexibility because it's built on the [Syntax Tree](https://github.com/syntax-tree) ecosystem. 111 | - MathML to LaTeX with the [mathml-to-latex](https://www.npmjs.com/package/mathml-to-latex) package. 112 | - Out-of-the-box loaders such as fetch, [playwright](https://www.npmjs.com/package/playwright), [puppeteer](https://www.npmjs.com/package/puppeteer). 113 | - CLI tool `npx webforai` for quick HTML to Markdown conversion 114 | 115 | 116 | 117 | 118 | 119 | # 120 | # Sponsors 121 | 122 | ::sponsors 123 | 124 | -------------------------------------------------------------------------------- /site/docs/pages/installation.mdx: -------------------------------------------------------------------------------- 1 | --- 2 | title: Installation 3 | --- 4 | 5 | # Installation 6 | 7 | Install the `webforai` package with your preferred package manager. 8 | 9 | ## Package Manager 10 | 11 | :::code-group 12 | 13 | ```bash [npm] 14 | # install core package 15 | npm i webforai 16 | 17 | # or install with playwright browser binaries to use the playwright loader. 18 | npm i webforai@latest 19 | npx playwright install 20 | 21 | # or install with puppeteer browser binaries to use the puppeteer loader. 22 | npm i webforai@latest puppeteer 23 | ``` 24 | 25 | ```bash [pnpm] 26 | # install core package 27 | pnpm i webforai 28 | 29 | # or install with playwright browser binaries to use the playwright loader. 30 | pnpm i webforai@latest 31 | pnpm playwright install 32 | 33 | # or install with puppeteer browser binaries to use the puppeteer loader. 34 | pnpm i webforai@latest puppeteer 35 | ``` 36 | 37 | ```bash [yarn] 38 | # install core package 39 | yarn add webforai 40 | 41 | # or install with playwright browser binaries to use the playwright loader. 42 | yarn add webforai@latest 43 | npx playwright install 44 | 45 | # or install with puppeteer browser binaries to use the puppeteer loader.
46 | yarn add webforai@latest puppeteer 47 | ``` 48 | ::: 49 | 50 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/site/docs/public/images/logo-dark.png -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-pad-dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-full-pad-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /site/docs/public/images/logo-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inaridiy/webforai/2c5339a177ac6197c9a382b5d08fc973f1fc4a12/site/docs/public/images/logo-light.png -------------------------------------------------------------------------------- /site/docs/styles.css: -------------------------------------------------------------------------------- 1 | @layer vocs_preflight { 2 | @tailwind base; 3 | } 4 | 5 | @tailwind components; 6 | @tailwind utilities; 7 | 8 | #home-install .vocs_CodeGroup { 9 | display: flex; 10 | height: 100%; 11 | flex-direction: column; 12 | } 13 | 14 | #home-install .vocs_Tabs_content { 15 | flex: 1; 16 | } 17 | 18 | #home-install .vocs_Code { 19 | font-size: 18px; 20 | } -------------------------------------------------------------------------------- /site/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "site", 3 | "version": "0.1.0", 4 | "type": "module", 5 | "private": true, 6 | "scripts": { 7 | "dev": "vocs dev", 8 | "build": "vocs build", 9 | "preview": "vocs preview", 10 | "worker:dev": "wrangler dev", 11 | "worker:deploy": "wrangler deploy", 12 | "cf-typegen": "wrangler types" 13 | }, 14 | "devDependencies": { 15 | "@ai-sdk/google": "^0.0.48", 16 | "@cloudflare/pages-plugin-vercel-og": "^0.1.2", 17 | "@cloudflare/workers-types": "^4.20241018.0", 18 | "@hono/zod-validator": "^0.4.1", 19 | "@types/hast": "^3.0.2", 20 | "@types/node": "^20.14.10", 21 | "@types/react": "^18.3.11", 22 | "@types/react-dom": "^18.3.1", 23 | "ai": "^3.4.7", 24 | "autoprefixer": "^10.4.20", 25 | "hast-util-select": "^6.0.2", 26 | "postcss": "^8.4.47", 27 | "react": "^18.3.1", 28 | "react-dom": "latest", 29 | "react-wrap-balancer": "^1.1.1", 30 | "tailwindcss": "^3.4.13", 31 | "typescript": "latest", 32 | "vocs": "1.0.0-alpha.61", 33 | "webforai": "workspace:*", 34 | "wrangler": "^3.81.0", 35 | "zod": "^3.23.8", 36 | "hono": "^4.6.5" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /site/postcss.config.js: 
-------------------------------------------------------------------------------- 1 | // biome-ignore lint/style/noDefaultExport: 2 | export default { 3 | plugins: { 4 | tailwindcss: {}, 5 | autoprefixer: {}, 6 | }, 7 | }; 8 | -------------------------------------------------------------------------------- /site/tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | // biome-ignore lint/style/noDefaultExport: tailwindcss requires default export 3 | export default { 4 | content: ["./docs/**/*.{js,ts,jsx,tsx,md,mdx}"], 5 | darkMode: "class", 6 | theme: { 7 | extend: {}, 8 | }, 9 | plugins: [], 10 | }; 11 | -------------------------------------------------------------------------------- /site/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "useDefineForClassFields": true, 5 | "lib": ["ES2020", "DOM", "DOM.Iterable"], 6 | "module": "ESNext", 7 | "skipLibCheck": true, 8 | 9 | /* Bundler mode */ 10 | "moduleResolution": "bundler", 11 | "allowImportingTsExtensions": true, 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "noEmit": true, 15 | "jsx": "react-jsx", 16 | 17 | /* Linting */ 18 | "strict": true, 19 | "noUnusedLocals": true, 20 | "noUnusedParameters": true, 21 | "noFallthroughCasesInSwitch": true, 22 | "types": ["@cloudflare/workers-types"] 23 | }, 24 | "include": ["**/*.ts", "**/*.tsx"] 25 | } 26 | -------------------------------------------------------------------------------- /site/vocs.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vocs"; 2 | import { version } from "../packages/webforai/package.json"; 3 | 4 | // biome-ignore lint/style/noDefaultExport: This is a config file 5 | export default defineConfig({ 6 | title: "Webforai", 7 | description: "A esm-native library that converts HTML to Markdown.", 8 | baseUrl: "https://webforai.dev", 9 | logoUrl: { 10 | light: "/images/logo-light.png", 11 | dark: "/images/logo-dark.png", 12 | }, 13 | iconUrl: { 14 | light: "/images/logo-light.png", 15 | dark: "/images/logo-dark.png", 16 | }, 17 | editLink: { 18 | pattern: "https://github.com/inaridiy/webforai/edit/main/site/docs/pages/:path", 19 | text: "Suggest changes to this page", 20 | }, 21 | theme: { 22 | accentColor: { 23 | light: "#1f8fff", 24 | dark: "#4db8ff", 25 | }, 26 | }, 27 | ogImageUrl: { 28 | "/": "https://webforai.dev/api/ogp?logo=%logo&title=%title&description=%description", 29 | }, 30 | socials: [ 31 | { 32 | icon: "github", 33 | link: "https://github.com/inaridiy/webforai", 34 | }, 35 | { 36 | icon: "x", 37 | link: "https://twitter.com/inaridiy", 38 | }, 39 | ], 40 | topNav: [ 41 | { text: "Getting Started", link: "/getting-started" }, 42 | { text: "Cookbook", link: "/cookbook" }, 43 | { 44 | text: version, // <= should update automatically 45 | items: [ 46 | { 47 | text: "Releases", 48 | link: "https://github.com/inaridiy/webforai/releases", 49 | }, 50 | { 51 | text: "Contributing", 52 | link: "https://github.com/inaridiy/webforai", 53 | }, 54 | ], 55 | }, 56 | ], 57 | sidebar: [ 58 | { 59 | text: "Installation", 60 | link: "/installation", 61 | }, 62 | { 63 | text: "Getting Started", 64 | link: "/getting-started", 65 | }, 66 | { 67 | text: "How it works", 68 | link: "/how-it-works", 69 | }, 70 | { 71 | text: "API Reference", 72 | items: [ 73 | { 74 | text: "htmlToMarkdown", 75 | 
link: "/docs/html-to-markdown", 76 | }, 77 | { 78 | text: "htmlToMdast", 79 | link: "/docs/html-to-mdast", 80 | }, 81 | { 82 | text: "mdastToMarkdown", 83 | link: "/docs/mdast-to-markdown", 84 | }, 85 | { 86 | text: "loaders", 87 | link: "/docs/loaders", 88 | }, 89 | ], 90 | }, 91 | { 92 | text: "Cookbook", 93 | link: "/cookbook", 94 | 95 | items: [ 96 | { 97 | text: "Simple usage", 98 | link: "/cookbook/simple", 99 | }, 100 | { 101 | text: "Structured output", 102 | link: "/cookbook/structured-output", 103 | }, 104 | { 105 | text: "Translation", 106 | link: "/cookbook/translation", 107 | }, 108 | { 109 | text: "Custom extractor", 110 | link: "/cookbook/custom-extractor", 111 | }, 112 | { 113 | text: "With Cloudflare Workers", 114 | link: "/cookbook/cf-workers", 115 | }, 116 | ], 117 | }, 118 | ], 119 | sponsors: [ 120 | { 121 | name: "Personal", 122 | height: 60, 123 | items: [ 124 | [ 125 | { 126 | name: "ClankPan ∞", 127 | link: "https://x.com/ClankPan", 128 | image: "https://pbs.twimg.com/profile_images/1407277306414989315/iIZ-R1jd_400x400.jpg", 129 | }, 130 | ], 131 | ], 132 | }, 133 | ], 134 | }); 135 | -------------------------------------------------------------------------------- /site/worker-configuration.d.ts: -------------------------------------------------------------------------------- 1 | // Generated by Wrangler by running `wrangler types` 2 | 3 | interface Env { 4 | ASSETS: Fetcher; 5 | } 6 | -------------------------------------------------------------------------------- /site/workers/index.tsx: -------------------------------------------------------------------------------- 1 | import { ImageResponse } from "@cloudflare/pages-plugin-vercel-og/api"; 2 | import { zValidator } from "@hono/zod-validator"; 3 | import { Hono } from "hono"; 4 | import { z } from "zod"; 5 | 6 | const fetchImage = async (env: Env, url: string) => { 7 | const res = await env.ASSETS.fetch(url).then((r) => (r.status !== 404 ? 
r : fetch(url))); 8 | 9 | const contentType = res.headers.get("Content-Type") || "application/octet-stream"; 10 | const arrayBuffer = await res.arrayBuffer(); 11 | const base64String = btoa(String.fromCharCode(...new Uint8Array(arrayBuffer))); 12 | const dataURL = `data:${contentType};base64,${base64String}`; 13 | 14 | return dataURL; 15 | }; 16 | 17 | // biome-ignore lint/style/useNamingConvention: library definition 18 | const app = new Hono<{ Bindings: Env }>().get( 19 | "/api/ogp", 20 | zValidator( 21 | "query", 22 | z.object({ logo: z.string().optional(), title: z.string().optional(), description: z.string().optional() }), 23 | ), 24 | async (c) => { 25 | const { logo, title, description } = c.req.valid("query"); 26 | 27 | const logoDataUrl = logo && (await fetchImage(c.env, logo)); 28 | 29 | return new ImageResponse( 30 | 42 | {/* biome-ignore lint/a11y/useAltText: */} 43 | {logoDataUrl && } 44 | {title} 45 | {description && {description}} 46 | , 47 | { 48 | width: 1200, 49 | height: 630, 50 | }, 51 | ); 52 | }, 53 | ); 54 | 55 | // biome-ignore lint/style/noDefaultExport: worker 56 | export default app; 57 | 58 | // https://webforai.dev/ogp?logo=https://webforai.dev/images/logo-dark.png&title=Getting%20Started&description=hoge 59 | -------------------------------------------------------------------------------- /site/wrangler.toml: -------------------------------------------------------------------------------- 1 | #:schema node_modules/wrangler/config-schema.json 2 | name = "webforai-site" 3 | main = "workers/index.tsx" 4 | compatibility_date = "2024-10-18" 5 | compatibility_flags = ["nodejs_compat"] 6 | assets = { directory = "./docs/dist", binding = "ASSETS" } 7 | 8 | # Workers Logs 9 | # Docs: https://developers.cloudflare.com/workers/observability/logs/workers-logs/ 10 | # Configuration: https://developers.cloudflare.com/workers/observability/logs/workers-logs/#enable-workers-logs 11 | [observability] 12 | enabled = true 13 | -------------------------------------------------------------------------------- /vitest.config.ts: -------------------------------------------------------------------------------- 1 | /// 2 | import { defineConfig } from "vitest/config"; 3 | 4 | // biome-ignore lint/style/noDefaultExport: This is a configuration file 5 | export default defineConfig({ 6 | assetsInclude: ["**/*.html", "**/*.md"], 7 | test: {}, 8 | }); 9 | --------------------------------------------------------------------------------