├── .github
└── workflows
│ └── sync-upstream.yml
├── .gitignore
├── .prettierrc
├── README.md
├── api
└── extract.py
├── app
├── favicon.ico
├── globals.css
├── layout.tsx
└── page.tsx
├── eslint.config.mjs
├── next.config.js
├── package.json
├── postcss.config.js
├── public
├── file.svg
├── globe.svg
├── next.svg
├── vercel.svg
└── window.svg
├── requirements.txt
├── runtime.txt
├── tailwind.config.ts
├── tsconfig.json
├── vercel.json
└── wheels
└── magic_html-0.1.2-py3-none-any.whl
/.github/workflows/sync-upstream.yml:
--------------------------------------------------------------------------------
1 | name: Upstream Sync
2 |
3 | permissions:
4 | contents: write
5 | issues: write
6 | actions: write
7 |
8 | on:
9 | schedule:
10 | - cron: '0 */6 * * *' # 每6小时运行一次
11 | workflow_dispatch: # 允许手动触发
12 |
13 | jobs:
14 | sync_latest_from_upstream:
15 | name: Sync latest commits from upstream repo
16 | runs-on: ubuntu-latest
17 | if: ${{ github.event.repository.fork }}
18 |
19 | steps:
20 | - uses: actions/checkout@v4
21 |
22 | - name: Clean issue notice
23 | uses: actions-cool/issues-helper@v3
24 | with:
25 | actions: 'close-issues'
26 | labels: '🚨 Sync Fail'
27 |
28 | - name: Sync upstream changes
29 | id: sync
30 | uses: aormsby/Fork-Sync-With-Upstream-action@v3.4
31 | with:
32 | upstream_sync_repo: eggacheb/Magic-HTML-API # 已更新为正确的上游仓库
33 | upstream_sync_branch: main
34 | target_sync_branch: main
35 | target_repo_token: ${{ secrets.GITHUB_TOKEN }}
36 |
37 | - name: Sync check
38 | if: failure()
39 | uses: actions-cool/issues-helper@v3
40 | with:
41 | actions: 'create-issue'
42 | title: '🚨 同步失败 | Sync Fail'
43 | labels: '🚨 Sync Fail'
44 | body: |
45 | 同步过程中发生错误,请手动同步一次。
46 | An error occurred during synchronization. Please sync manually.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | env/
8 | build/
9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 |
23 | # Virtual Environment
24 | venv/
25 | ENV/
26 |
27 | # IDE
28 | .idea/
29 | .vscode/
30 | *.swp
31 | *.swo
32 |
33 | # Vercel
34 | .vercel
35 | .env
36 | .env.local
37 | .env*.local
38 |
39 | # Next.js
40 | node_modules/
41 | .next/
42 | out/
43 | .DS_Store
44 | *.pem
45 | npm-debug.log*
46 | yarn-debug.log*
47 | yarn-error.log*
48 | .pnpm-debug.log*
49 |
50 | # TypeScript
51 | *.tsbuildinfo
52 |
53 | # Testing
54 | coverage/
55 |
56 | # Production
57 | build/
58 | dist/
59 | out/
60 |
61 | # Misc
62 | .DS_Store
63 | *.pem
64 | .env.local
65 | .env.development.local
66 | .env.test.local
67 | .env.production.local
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "semi": true,
3 | "singleQuote": true,
4 | "tabWidth": 2,
5 | "trailingComma": "es5",
6 | "printWidth": 100,
7 | "bracketSpacing": true,
8 | "arrowParens": "avoid"
9 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # magic-html-api
2 |
3 | 一个智能的网页内容提取API服务,基于magic-html和jina-ai/reader。支持多种内容类型(文章/论坛/微信/知乎),多种输出格式(文本/Markdown/HTML)。只保留主要文章内容,使AI能够更好地理解和分析文本。
4 |
5 | ## 功能特点
6 |
7 | - 🔍 智能识别网页类型并提取主要内容
8 | - 📚 支持多种内容类型(文章/论坛/微信/知乎)
9 | - 📝 多种输出格式(文本/Markdown/HTML)
10 | - ⚡ 异步处理,响应迅速
11 | - 🚀 部署在Vercel上,免费使用
12 | - 🤖 自动降级处理:当默认提取失败时自动使用jina-ai/reader
13 |
14 | ## 🔗 在线演示
15 |
16 | 访问 [https://magic-html-api.vercel.app](https://magic-html-api.vercel.app) 体验在线版本。
17 |
18 | 一键部署:[![Deploy with Vercel](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/eggacheb/Magic-HTML-API)
19 |
20 |
21 | 
22 | 
23 | 
24 |
25 |
26 | ## API使用
27 |
28 | ### 内容提取
29 |
30 | ```
31 | GET /api/extract
32 | ```
33 |
34 | 参数:
35 | - `url`: 要提取内容的网页URL(必需)
36 | - `output_format`: 输出格式(可选,默认为"text")
37 | - text: 纯文本格式
38 | - markdown: Markdown格式
39 | - html: HTML格式
40 |
41 | 示例请求:
42 | ```
43 | https://your-domain.vercel.app/api/extract?url=https://example.com&output_format=markdown
44 | ```
45 |
46 | 响应格式:
47 | ```json
48 | {
49 | "url": "请求的URL",
50 | "content": "提取的内容",
51 | "format": "输出格式",
52 | "type": "内容类型",
53 | "success": true
54 | }
55 | ```
56 |
57 | 内容类型(type)包括:
58 | - article: 文章
59 | - forum: 论坛
60 | - weixin: 微信文章
61 | - jina: AI提取(使用jina-ai/reader处理)
62 |
63 | ## 技术实现
64 |
65 | - 使用[magic-html](https://github.com/opendatalab/magic-html)作为主要内容提取引擎
66 | - 集成[jina-ai/reader](https://github.com/jina-ai/reader)作为备选提取方案
67 | - 自动识别网页类型并选择最佳提取策略
68 | - 智能降级:当默认提取失败时自动切换到jina-ai/reader
69 |
70 | ## 部署
71 |
72 | 本项目使用Vercel部署,直接导入GitHub仓库即可。
73 |
74 | ### 环境要求
75 | - Python 3.9+
76 | - Node.js 16+
77 |
78 | ### 部署步骤
79 | 1. Fork本仓库
80 | 2. 在Vercel中导入项目
81 | 3. 部署完成后即可使用
82 |
83 | ## 技术栈
84 |
85 | ### 后端
86 | - FastAPI
87 | - magic-html
88 | - jina-ai/reader
89 | - Python 3.9+
90 |
91 | ### 前端
92 | - Next.js 14
93 | - React
94 | - Tailwind CSS
95 | - TypeScript
96 |
97 | ### 部署
98 | - Vercel
99 |
100 | ## 致谢
101 |
102 | - [magic-html](https://github.com/opendatalab/magic-html) - 强大的网页内容提取库
103 | - [jina-ai/reader](https://github.com/jina-ai/reader) - 优秀的AI内容提取服务
104 |
105 | ## License
106 |
107 | MIT
108 |
--------------------------------------------------------------------------------
/api/extract.py:
--------------------------------------------------------------------------------
1 | from fastapi import FastAPI, HTTPException
2 | import httpx
3 | from magic_html import GeneralExtractor
4 | from typing import Optional, Literal
5 | from markdownify import markdownify as md
6 | from bs4 import BeautifulSoup
7 | import re
8 | import chardet
9 |
10 | app = FastAPI()
11 | extractor = GeneralExtractor()
12 |
async def fetch_url(url: str) -> str:
    """
    Download a page and return its body decoded to text.

    Decoding is attempted in this order:
      1. the charset declared in the ``Content-Type`` response header
         (surrounding whitespace and quotes are removed first);
      2. UTF-8;
      3. the encoding guessed by ``chardet``, with GB2312/GBK mapped to
         GB18030.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body.

    Raises:
        HTTPException: status 400 when the request, the status check or the
            final decoding attempt fails.
    """
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(url)
            response.raise_for_status()

            raw = response.content

            # Encodings to try before falling back to detection.
            candidates = []
            content_type = response.headers.get('content-type', '').lower()
            if 'charset=' in content_type:
                declared = content_type.split('charset=')[-1].split(';')[0]
                # Header values may be quoted (charset="gbk") or padded;
                # the codec lookup needs the bare name.
                declared = declared.strip().strip('"\'')
                if declared:
                    candidates.append(declared)
            candidates.append('utf-8')

            for candidate in candidates:
                try:
                    return raw.decode(candidate)
                except (LookupError, UnicodeDecodeError):
                    # Unknown codec name or wrong charset: try the next one.
                    continue

            detected = chardet.detect(raw)
            encoding = detected['encoding']

            # Decode GB2312/GBK pages with GB18030 instead.
            if encoding and encoding.lower() in ['gb2312', 'gbk']:
                encoding = 'gb18030'

            return raw.decode(encoding or 'utf-8')

        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Error fetching URL: {str(e)}") from e
42 |
def convert_content(html: str, output_format: str) -> str:
    """
    Render extracted HTML in the requested output format.

    Args:
        html: HTML content; non-string values are converted with ``str()``.
        output_format: One of "html", "markdown" or "text".

    Returns:
        The converted content. "html" and any unrecognised format return
        the HTML unchanged.
    """
    markup = html if isinstance(html, str) else str(html)

    if output_format == "markdown":
        return md(
            markup,
            heading_style="ATX",       # "#"-style headings
            bullets="*",               # always use "*" for list items
            autolinks=True,
            code_language="",
            escape_asterisks=False,    # leave "*" in text unescaped
            escape_underscores=False,  # leave "_" in text unescaped
            newline_style="SPACES",
        )

    if output_format == "text":
        parsed = BeautifulSoup(markup, 'html.parser')
        return parsed.get_text(separator='\n', strip=True)

    # Covers "html" as well as any unknown format name.
    return markup
73 |
def extract_html_content(data: dict) -> str:
    """
    Pull the HTML payload out of the data returned by magic_html.

    Args:
        data: Result object returned by the extractor.

    Returns:
        The value stored under the ``html`` key, or an empty string when
        ``data`` is not a dict or has no such key.
    """
    if not isinstance(data, dict):
        return ''
    return data.get('html', '')
87 |
def detect_html_type(html: str, url: str) -> str:
    """
    Guess which extraction profile fits a page.

    The URL is checked first; only when it gives no answer is the markup
    parsed and its class names and ids searched for forum-related keywords.

    Args:
        html: HTML content of the page.
        url: Address of the page.

    Returns:
        One of "weixin", "jina", "forum" or "article" (the default).
    """
    lowered_url = url.lower()

    # URL-based shortcuts.
    weixin_hosts = ('mp.weixin.qq.com', 'weixin.qq.com')
    if any(host in lowered_url for host in weixin_hosts):
        return 'weixin'
    if 'zhihu.com' in lowered_url:
        return 'jina'

    # Markup-based detection: collect every class name and id on the page.
    soup = BeautifulSoup(html, 'html.parser')
    tokens = [
        class_name
        for node in soup.find_all(class_=True)
        for class_name in node.get('class', [])
    ]
    tokens += [node.get('id', '') for node in soup.find_all(id=True)]
    haystack = ' '.join(tokens).lower()

    # Keywords that suggest a forum-style page.
    forum_indicators = (
        'forum', 'topic', 'thread', 'post', 'reply', 'comment', 'discuss',
        '论坛', '帖子', '回复', '评论', '讨论',
    )
    for indicator in forum_indicators:
        if indicator in haystack:
            return 'forum'

    # Nothing matched: treat the page as an article.
    return 'article'
129 |
130 | # 添加jina.ai提取函数
async def fetch_from_jina(url: str) -> str:
    """
    Fetch the page through the r.jina.ai reader endpoint.

    The HTTP client is created with a 15 second timeout.

    Args:
        url: Address of the target page.

    Returns:
        The body of the reader response as text.
    """
    reader_endpoint = "https://r.jina.ai/{}".format(url)
    client = httpx.AsyncClient(timeout=15.0)
    async with client:
        reply = await client.get(reader_endpoint)
        reply.raise_for_status()
        return reply.text
146 |
def convert_markdown(markdown: str, output_format: str) -> str:
    """
    Convert Markdown content to the requested output format.

    Only "text" triggers a conversion. "markdown", "html" and any other
    value return the Markdown unchanged; no Markdown-to-HTML conversion is
    implemented here.

    Args:
        markdown: Markdown content.
        output_format: One of "html", "markdown" or "text".

    Returns:
        The converted content.
    """
    if output_format != "text":
        return markdown

    # Applied in order: images, then links, then leftover markup characters.
    substitutions = (
        (r'!\[.*?\]\(.*?\)', '[图片]'),      # image -> placeholder
        (r'\[([^\]]+)\]\([^\)]+\)', r'\1'),  # link -> its label
        (r'[#*`]', ''),                      # drop heading/emphasis/code marks
    )
    plain = markdown
    for pattern, replacement in substitutions:
        plain = re.sub(pattern, replacement, plain)
    return plain.strip()
171 |
async def _jina_response(url: str, output_format: str) -> dict:
    """Fetch the page through jina.ai and build the API response payload."""
    markdown_content = await fetch_from_jina(url)
    return {
        "url": url,
        "content": convert_markdown(markdown_content, output_format),
        "format": output_format,
        "type": "jina",
        "success": True
    }


async def _magic_html_response(url: str, output_format: str) -> Optional[dict]:
    """Extract the page with magic_html; return None when nothing was extracted."""
    html = await fetch_url(url)
    html_type = detect_html_type(html, url)
    extracted_data = extractor.extract(html, base_url=url, html_type=html_type)
    html_content = extract_html_content(extracted_data)

    if not html_content or html_content.isspace():
        return None

    return {
        "url": url,
        "content": convert_content(html_content, output_format),
        "format": output_format,
        "type": html_type,
        "success": True
    }


@app.get("/api/extract")
async def extract_content(
    url: str,
    output_format: Optional[Literal["html", "markdown", "text"]] = "text"
):
    """
    Extract the main content of a web page.

    Zhihu URLs are sent straight to jina.ai. Every other URL goes through
    magic_html first; jina.ai is used as a fallback when that path raises or
    produces empty content. The fallback is attempted exactly once.

    Args:
        url: Address of the target page.
        output_format: "html", "markdown" or "text" (default "text").

    Returns:
        A dict with the keys ``url``, ``content``, ``format``, ``type`` and
        ``success``.

    Raises:
        HTTPException: status 500 when the jina.ai request itself fails.
    """
    try:
        # Case-insensitive, matching the check in detect_html_type.
        if 'zhihu.com' in url.lower():
            return await _jina_response(url, output_format)

        try:
            response = await _magic_html_response(url, output_format)
        except Exception:
            # Deliberate best-effort: any failure on the primary path
            # (fetch, detection, extraction, conversion) falls back to jina.
            response = None

        if response is not None:
            return response

        return await _jina_response(url, output_format)

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
244 |
--------------------------------------------------------------------------------
/app/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eggacheb/Magic-HTML-API/b931cdd4aa6c10377026cf2b4a01acc9f8130e22/app/favicon.ico
--------------------------------------------------------------------------------
/app/globals.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 |
5 | :root {
6 | --foreground-rgb: 0, 0, 0;
7 | --background-rgb: 250, 250, 250;
8 | }
9 |
10 | body {
11 | color: rgb(var(--foreground-rgb));
12 | background: rgb(var(--background-rgb));
13 | }
14 |
--------------------------------------------------------------------------------
/app/layout.tsx:
--------------------------------------------------------------------------------
1 | import type { Metadata } from "next";
2 | import { Inter } from "next/font/google";
3 | import "./globals.css";
4 |
5 | const inter = Inter({
6 | subsets: ["latin"],
7 | variable: "--font-inter",
8 | });
9 |
10 | export const metadata: Metadata = {
11 | title: "Web Content Extractor - AI驱动的网页内容提取工具",
12 | description: "通过智能算法提取网页的核心内容,移除广告和干扰元素,为AI模型提供清晰的输入数据。",
13 | };
14 |
15 | export default function RootLayout({
16 | children,
17 | }: Readonly<{
18 | children: React.ReactNode;
19 | }>) {
20 | return (
21 |
22 |
23 | {children}
24 |
25 |
26 | );
27 | }
28 |
--------------------------------------------------------------------------------
/app/page.tsx:
--------------------------------------------------------------------------------
1 | 'use client';
2 |
3 | import { useState } from 'react';
4 | import { ArrowRight, Copy, Check, Github, MessageSquare, Volume2 } from 'lucide-react';
5 |
6 | interface CachedResult {
7 | content: string;
8 | format: string;
9 | }
10 |
11 | interface ResultCache {
12 | [key: string]: {
13 | [format: string]: CachedResult;
14 | };
15 | }
16 |
17 | type ContentType = "article" | "forum" | "weixin" | "jina";
18 |
19 | export default function Home() {
20 | const [url, setUrl] = useState('');
21 | const [outputFormat, setOutputFormat] = useState('text');
22 | const [result, setResult] = useState(null);
23 | const [loading, setLoading] = useState(false);
24 | const [error, setError] = useState('');
25 | const [copied, setCopied] = useState(false);
26 | const [cache, setCache] = useState({});
27 |
// Fetch the extraction result for the current URL in the given format,
// reusing a cached response for the same URL/format pair when one exists.
const handleExtract = async (format = outputFormat) => {
  try {
    // Cache hit: show the stored result without a network request.
    const cached = cache[url]?.[format];
    if (cached) {
      // Drop any error left over from an earlier failed attempt.
      setError('');
      setResult(cached);
      return;
    }

    setLoading(true);
    setError('');
    setResult(null);

    const response = await fetch(`/api/extract?url=${encodeURIComponent(url)}&output_format=${format}`);
    const data = await response.json();

    if (!response.ok) {
      // The backend raises FastAPI HTTPException, whose message is
      // serialised under "detail". Validation errors put a list there,
      // so only a string value is used as the message.
      const detail = typeof data?.detail === 'string' ? data.detail : data?.error;
      throw new Error(detail || '提取内容失败');
    }

    // Store the response under cache[url][format].
    setCache(prevCache => ({
      ...prevCache,
      [url]: {
        ...(prevCache[url] || {}),
        [format]: data
      }
    }));

    setResult(data);
  } catch (err: any) {
    setError(err.message);
  } finally {
    setLoading(false);
  }
};
63 |
// Switch the output format; re-run the extraction only when a result is
// already on screen.
const handleFormatChange = async (format: string) => {
  setOutputFormat(format);
  if (!result) {
    return;
  }
  await handleExtract(format);
};
70 |
// Store the new URL and reset everything derived from the previous one:
// the displayed result, the error message and the whole result cache.
const handleUrlChange = (newUrl: string) => {
  setUrl(newUrl);
  setResult(null);
  setError('');
  setCache({});
};
78 |
// Copy the extracted content to the clipboard and show the "copied" state
// for two seconds. Does nothing when there is no content.
const handleCopy = async () => {
  const text = result?.content;
  if (!text) {
    return;
  }
  await navigator.clipboard.writeText(text);
  setCopied(true);
  setTimeout(() => setCopied(false), 2000);
};
86 |
// Map a content type to its display label; unknown values are returned as-is.
const getTypeText = (type: ContentType) => {
  const labels = new Map<string, string>([
    ["article", "文章"],
    ["forum", "论坛"],
    ["weixin", "微信"],
    ["jina", "AI提取"],
  ]);
  return labels.get(type) ?? type;
};
101 |
// Map a content type to its emoji icon; unknown values get the generic icon.
const getTypeIcon = (type: ContentType) => {
  const icons = new Map<string, string>([
    ["article", "📄"],
    ["forum", "💬"],
    ["weixin", "💚"],
    ["jina", "🤖"], // robot icon marks AI extraction
  ]);
  return icons.get(type) ?? "📝";
};
116 |
117 | return (
118 |
119 | {/* 导航栏 */}
120 |
146 |
147 |
148 | {/* Hero Section */}
149 |
150 |
151 |
152 | 智能网页内容提取
153 |
154 |
155 | 使用智能算法自动识别网页类型并提取核心内容,
156 | 支持文章、论坛和微信等多种类型。
157 |
158 |
159 |
160 | {/* 功能卡片 */}
161 |
162 |
163 |
164 |
165 |
166 |
智能内容提取
167 |
168 | 自动识别网页类型和主要内容,
169 | 智能去除干扰元素。
170 |
171 |
172 |
173 |
174 |
175 |
176 |
多格式支持
177 |
178 | 支持输出为纯文本、Markdown和HTML等多种格式,
179 | 满足不同场景需求。
180 |
181 |
182 |
183 |
184 | {/* 输入部分 */}
185 |
186 |
187 |
188 |
handleUrlChange(e.target.value)}
192 | placeholder="输入网页URL,例如: https://example.com"
193 | className="w-full px-4 py-3 bg-gray-50 border border-gray-100 rounded-xl focus:ring-2 focus:ring-blue-500 focus:border-transparent outline-none transition-all text-gray-900 placeholder-gray-400"
194 | />
195 |
196 |
197 |
198 |
207 |
208 |
209 |
223 |
224 |
225 | {/* API使用说明 */}
226 |
227 |
API 快速上手
228 |
229 |
230 |
231 |
232 | GET /api/extract?url=https://example.com
233 |
234 |
242 |
243 |
244 |
参数说明:
245 |
246 | url
: 要提取内容的网页地址(必需)
247 | -
248 |
output_format
: 输出格式(可选,默认为text)
249 |
250 | - • text: 纯文本格式
251 | - • markdown: Markdown格式
252 | - • html: HTML格式
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 | {/* 错误提示 */}
263 | {error && (
264 |
265 | {error}
266 |
267 | )}
268 |
269 | {/* 结果展示 */}
270 | {result && (
271 |
272 |
273 |
274 |
提取结果
275 | {result.type && (
276 |
277 | 检测类型:{getTypeText(result.type)} {getTypeIcon(result.type)}
278 |
279 | )}
280 |
281 |
297 |
298 |
301 |
302 | {result.content}
303 |
304 |
305 |
306 | )}
307 |
308 |
309 |
310 |
311 | );
312 | }
313 |
--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------
1 | import { dirname } from "path";
2 | import { fileURLToPath } from "url";
3 | import { FlatCompat } from "@eslint/eslintrc";
4 |
5 | const __filename = fileURLToPath(import.meta.url);
6 | const __dirname = dirname(__filename);
7 |
8 | const compat = new FlatCompat({
9 | baseDirectory: __dirname,
10 | });
11 |
12 | const eslintConfig = [
13 | ...compat.extends("next/core-web-vitals", "next/typescript"),
14 | ];
15 |
16 | export default eslintConfig;
17 |
--------------------------------------------------------------------------------
/next.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('next').NextConfig} */
2 | const nextConfig = {
3 | webpack: (config, { isServer }) => {
4 | if (isServer) {
5 | config.externals.push({
6 | 'chrome-aws-lambda': 'chrome-aws-lambda',
7 | });
8 | }
9 | return config;
10 | },
11 | rewrites: async () => {
12 | return [
13 | {
14 | source: '/api/:path*',
15 | destination: '/api/:path*',
16 | },
17 | ];
18 | },
19 | }
20 |
21 | module.exports = nextConfig
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "lbl",
3 | "version": "0.1.0",
4 | "private": true,
5 | "scripts": {
6 | "dev": "next dev",
7 | "build": "next build",
8 | "start": "next start",
9 | "lint": "next lint",
10 | "clean": "rimraf .next out",
11 | "type-check": "tsc --noEmit",
12 | "lint:fix": "next lint --fix",
13 | "format": "prettier --write ."
14 | },
15 | "dependencies": {
16 | "@mozilla/readability": "^0.5.0",
17 | "@tailwindcss/typography": "^0.5.15",
18 | "@types/jsdom": "^21.1.7",
19 | "jsdom": "^21.1.2",
20 | "lucide-react": "^0.469.0",
21 | "next": "14.0.4",
22 | "react": "^18.2.0",
23 | "react-dom": "^18.2.0"
24 | },
25 | "devDependencies": {
26 | "@types/node": "^20",
27 | "@types/react": "^18",
28 | "@types/react-dom": "^18",
29 | "autoprefixer": "^10.4.20",
30 | "eslint": "^8",
31 | "eslint-config-next": "14.0.4",
32 | "postcss": "^8.4.49",
33 | "prettier": "^3.1.1",
34 | "rimraf": "^5.0.5",
35 | "tailwindcss": "^3.4.17",
36 | "typescript": "^5"
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/postcss.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | plugins: {
3 | tailwindcss: {},
4 | autoprefixer: {},
5 | },
6 | }
--------------------------------------------------------------------------------
/public/file.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/public/globe.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/public/next.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/public/vercel.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/public/window.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.95.2
2 | httpx==0.24.1
3 | uvicorn==0.22.0
4 | chardet==4.0.0
5 | ./wheels/magic_html-0.1.2-py3-none-any.whl
6 | markdownify==0.11.6
7 | beautifulsoup4==4.12.2
8 |
--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.9
--------------------------------------------------------------------------------
/tailwind.config.ts:
--------------------------------------------------------------------------------
1 | import type { Config } from "tailwindcss";
2 |
3 | const config: Config = {
4 | content: [
5 | "./pages/**/*.{js,ts,jsx,tsx,mdx}",
6 | "./components/**/*.{js,ts,jsx,tsx,mdx}",
7 | "./app/**/*.{js,ts,jsx,tsx,mdx}",
8 | ],
9 | theme: {
10 | extend: {
11 | borderRadius: {
12 | 'xl': '1rem',
13 | '2xl': '1.25rem',
14 | },
15 | typography: {
16 | DEFAULT: {
17 | css: {
18 | maxWidth: 'none',
19 | color: '#334155',
20 | p: {
21 | marginTop: '1.25em',
22 | marginBottom: '1.25em',
23 | },
24 | },
25 | },
26 | },
27 | },
28 | },
29 | plugins: [
30 | require('@tailwindcss/typography'),
31 | ],
32 | };
33 |
34 | export default config;
35 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2017",
4 | "lib": ["dom", "dom.iterable", "esnext"],
5 | "allowJs": true,
6 | "skipLibCheck": true,
7 | "strict": true,
8 | "noEmit": true,
9 | "esModuleInterop": true,
10 | "module": "esnext",
11 | "moduleResolution": "bundler",
12 | "resolveJsonModule": true,
13 | "isolatedModules": true,
14 | "jsx": "preserve",
15 | "incremental": true,
16 | "plugins": [
17 | {
18 | "name": "next"
19 | }
20 | ],
21 | "paths": {
22 | "@/*": ["./*"],
23 | "@/components/*": ["./components/*"],
24 | "@/app/*": ["./app/*"],
25 | "@/lib/*": ["./lib/*"],
26 | "@/styles/*": ["./styles/*"],
27 | "@/types/*": ["./types/*"]
28 | },
29 | "baseUrl": "."
30 | },
31 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
32 | "exclude": ["node_modules"]
33 | }
34 |
--------------------------------------------------------------------------------
/vercel.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 2,
3 | "functions": {
4 | "api/**/*.py": {
5 | "maxDuration": 60,
6 | "memory": 1024
7 | }
8 | },
9 | "routes": [
10 | {
11 | "src": "/api/extract",
12 | "dest": "api/extract.py",
13 | "methods": ["GET"],
14 | "headers": {
15 | "Access-Control-Allow-Origin": "*",
16 | "Access-Control-Allow-Methods": "GET",
17 | "Access-Control-Allow-Headers": "Content-Type",
18 | "Cache-Control": "no-cache, no-store, must-revalidate"
19 | }
20 | },
21 | {
22 | "src": "/api/(.*)",
23 | "dest": "api/extract.py"
24 | },
25 | {
26 | "handle": "filesystem"
27 | },
28 | {
29 | "src": "/(.*)",
30 | "dest": "/"
31 | }
32 | ]
33 | }
34 |
--------------------------------------------------------------------------------
/wheels/magic_html-0.1.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eggacheb/Magic-HTML-API/b931cdd4aa6c10377026cf2b4a01acc9f8130e22/wheels/magic_html-0.1.2-py3-none-any.whl
--------------------------------------------------------------------------------