├── .github
    └── workflows
    │   └── sync-upstream.yml
├── .gitignore
├── .prettierrc
├── README.md
├── api
    └── extract.py
├── app
    ├── favicon.ico
    ├── globals.css
    ├── layout.tsx
    └── page.tsx
├── eslint.config.mjs
├── next.config.js
├── package.json
├── postcss.config.js
├── public
    ├── file.svg
    ├── globe.svg
    ├── next.svg
    ├── vercel.svg
    └── window.svg
├── requirements.txt
├── runtime.txt
├── tailwind.config.ts
├── tsconfig.json
├── vercel.json
└── wheels
    └── magic_html-0.1.2-py3-none-any.whl


/.github/workflows/sync-upstream.yml:
--------------------------------------------------------------------------------
 1 | name: Upstream Sync
 2 | 
 3 | permissions:
 4 |   contents: write
 5 |   issues: write
 6 |   actions: write
 7 | 
 8 | on:
 9 |   schedule:
10 |     - cron: '0 */6 * * *'    # 每6小时运行一次
11 |   workflow_dispatch:          # 允许手动触发
12 |   
13 | jobs:
14 |   sync_latest_from_upstream:
15 |     name: Sync latest commits from upstream repo
16 |     runs-on: ubuntu-latest
17 |     if: ${{ github.event.repository.fork }}
18 |     
19 |     steps:
20 |       - uses: actions/checkout@v4
21 |       
22 |       - name: Clean issue notice
23 |         uses: actions-cool/issues-helper@v3
24 |         with:
25 |           actions: 'close-issues'
26 |           labels: '🚨 Sync Fail'
27 |           
28 |       - name: Sync upstream changes
29 |         id: sync
30 |         uses: aormsby/Fork-Sync-With-Upstream-action@v3.4
31 |         with:
32 |           upstream_sync_repo: eggacheb/Magic-HTML-API    # 已更新为正确的上游仓库
33 |           upstream_sync_branch: main
34 |           target_sync_branch: main
35 |           target_repo_token: ${{ secrets.GITHUB_TOKEN }}
36 |           
37 |       - name: Sync check
38 |         if: failure()
39 |         uses: actions-cool/issues-helper@v3
40 |         with:
41 |           actions: 'create-issue'
42 |           title: '🚨 同步失败 | Sync Fail'
43 |           labels: '🚨 Sync Fail'
44 |           body: |
45 |             同步过程中发生错误，请手动同步一次。
46 |             An error occurred during synchronization. Please sync manually. 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | env/
 8 | build/
 9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual Environment
24 | venv/
25 | ENV/
26 | 
27 | # IDE
28 | .idea/
29 | .vscode/
30 | *.swp
31 | *.swo
32 | 
33 | # Vercel
34 | .vercel
35 | .env
36 | .env.local
37 | .env*.local
38 | 
39 | # Next.js
40 | node_modules/
41 | .next/
42 | out/
43 | .DS_Store
44 | *.pem
45 | npm-debug.log*
46 | yarn-debug.log*
47 | yarn-error.log*
48 | .pnpm-debug.log*
49 | 
50 | # TypeScript
51 | *.tsbuildinfo
52 | 
53 | # Testing
54 | coverage/
55 | 
56 | # Production
57 | build/
58 | dist/
59 | out/
60 | 
61 | # Misc
62 | .DS_Store
63 | *.pem
64 | .env.local
65 | .env.development.local
66 | .env.test.local
67 | .env.production.local 


--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |   "semi": true,
3 |   "singleQuote": true,
4 |   "tabWidth": 2,
5 |   "trailingComma": "es5",
6 |   "printWidth": 100,
7 |   "bracketSpacing": true,
8 |   "arrowParens": "avoid"
9 | } 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # magic-html-api
  2 | 
  3 | 一个智能的网页内容提取API服务，基于magic-html和jina-ai/reader。支持多种内容类型（文章/论坛/微信/知乎），多种输出格式（文本/Markdown/HTML）。只保留主要文章内容，使AI能够更好地理解和分析文本。
  4 | 
  5 | ## 功能特点
  6 | 
  7 | - 🔍 智能识别网页类型并提取主要内容
  8 | - 📚 支持多种内容类型（文章/论坛/微信/知乎）
  9 | - 📝 多种输出格式（文本/Markdown/HTML）
 10 | - ⚡ 异步处理，响应迅速
 11 | - 🚀 部署在Vercel上，免费使用
 12 | - 🤖 自动降级处理：当默认提取失败时自动使用jina-ai/reader
 13 | 
 14 | ## 🔗 在线演示
 15 | 
 16 | 访问 [https://magic-html-api.vercel.app](https://magic-html-api.vercel.app) 体验在线版本。
 17 | 
 18 | 一键部署：[![Vercel Deployment](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/eggacheb/Magic-HTML-API)
 19 | 
 20 | 
 21 | ![image](https://github.com/user-attachments/assets/6ac7637c-909a-47c1-b756-9e7d93c466e3)
 22 | ![image](https://github.com/user-attachments/assets/3ddacaf3-8fbe-4ab5-a306-e81fdc2a2152)
 23 | ![image](https://github.com/user-attachments/assets/03637a58-6870-4101-b350-785a3f36bed3)
 24 | 
 25 | 
 26 | ## API使用
 27 | 
 28 | ### 内容提取
 29 | 
 30 | ```
 31 | GET /api/extract
 32 | ```
 33 | 
 34 | 参数：
 35 | - `url`: 要提取内容的网页URL（必需）
 36 | - `output_format`: 输出格式（可选，默认为"text"）
 37 |   - text: 纯文本格式
 38 |   - markdown: Markdown格式
 39 |   - html: HTML格式
 40 | 
 41 | 示例请求：
 42 | ```
 43 | https://your-domain.vercel.app/api/extract?url=https://example.com&output_format=markdown
 44 | ```
 45 | 
 46 | 响应格式：
 47 | ```json
 48 | {
 49 |     "url": "请求的URL",
 50 |     "content": "提取的内容",
 51 |     "format": "输出格式",
 52 |     "type": "内容类型",
 53 |     "success": true
 54 | }
 55 | ```
 56 | 
 57 | 内容类型（type）包括：
 58 | - article: 文章
 59 | - forum: 论坛
 60 | - weixin: 微信文章
 61 | - jina: AI提取（使用jina-ai/reader处理）
 62 | 
 63 | ## 技术实现
 64 | 
 65 | - 使用[magic-html](https://github.com/opendatalab/magic-html)作为主要内容提取引擎
 66 | - 集成[jina-ai/reader](https://github.com/jina-ai/reader)作为备选提取方案
 67 | - 自动识别网页类型并选择最佳提取策略
 68 | - 智能降级：当默认提取失败时自动切换到jina-ai/reader
 69 | 
 70 | ## 部署
 71 | 
 72 | 本项目使用Vercel部署，直接导入GitHub仓库即可。
 73 | 
 74 | ### 环境要求
 75 | - Python 3.9+
 76 | - Node.js 18.17+
 77 | 
 78 | ### 部署步骤
 79 | 1. Fork本仓库
 80 | 2. 在Vercel中导入项目
 81 | 3. 部署完成后即可使用
 82 | 
 83 | ## 技术栈
 84 | 
 85 | ### 后端
 86 | - FastAPI
 87 | - magic-html
 88 | - jina-ai/reader
 89 | - Python 3.9+
 90 | 
 91 | ### 前端
 92 | - Next.js 14
 93 | - React
 94 | - Tailwind CSS
 95 | - TypeScript
 96 | 
 97 | ### 部署
 98 | - Vercel
 99 | 
100 | ## 致谢
101 | 
102 | - [magic-html](https://github.com/opendatalab/magic-html) - 强大的网页内容提取库
103 | - [jina-ai/reader](https://github.com/jina-ai/reader) - 优秀的AI内容提取服务
104 | 
105 | ## License
106 | 
107 | MIT 
108 | 


--------------------------------------------------------------------------------
/api/extract.py:
--------------------------------------------------------------------------------
  1 | from fastapi import FastAPI, HTTPException
  2 | import httpx
  3 | from magic_html import GeneralExtractor
  4 | from typing import Optional, Literal
  5 | from markdownify import markdownify as md
  6 | from bs4 import BeautifulSoup
  7 | import re
  8 | import chardet
  9 | 
# FastAPI application object; the /api/extract route below is registered on it.
app = FastAPI()
# Module-level magic_html extractor, reused by every call to the route handler.
extractor = GeneralExtractor()
 12 | 
 13 | async def fetch_url(url: str) -> str:
 14 |     async with httpx.AsyncClient() as client:
 15 |         try:
 16 |             response = await client.get(url)
 17 |             response.raise_for_status()
 18 |             
 19 |             # 处理响应编码
 20 |             content_type = response.headers.get('content-type', '').lower()
 21 |             if 'charset=' in content_type:
 22 |                 try:
 23 |                     charset = content_type.split('charset=')[-1].split(';')[0]
 24 |                     return response.content.decode(charset)
 25 |                 except:
 26 |                     pass
 27 |             
 28 |             try:
 29 |                 return response.content.decode('utf-8')
 30 |             except UnicodeDecodeError:
 31 |                 content = response.content
 32 |                 detected = chardet.detect(content)
 33 |                 encoding = detected['encoding']
 34 |                 
 35 |                 if encoding and encoding.lower() in ['gb2312', 'gbk']:
 36 |                     encoding = 'gb18030'
 37 |                 
 38 |                 return content.decode(encoding or 'utf-8')
 39 |             
 40 |         except Exception as e:
 41 |             raise HTTPException(status_code=400, detail=f"Error fetching URL: {str(e)}")
 42 | 
 43 | def convert_content(html: str, output_format: str) -> str:
 44 |     """
 45 |     将HTML内容转换为指定格式
 46 |     
 47 |     Args:
 48 |         html: HTML内容
 49 |         output_format: 输出格式 ("html", "markdown", "text")
 50 |         
 51 |     Returns:
 52 |         转换后的内容
 53 |     """
 54 |     if not isinstance(html, str):
 55 |         html = str(html)
 56 |         
 57 |     if output_format == "html":
 58 |         return html
 59 |     elif output_format == "markdown":
 60 |         return md(html, 
 61 |                  heading_style="ATX",  # 使用 # 风格的标题
 62 |                  bullets="*",  # 统一使用 * 作为列表符号
 63 |                  autolinks=True,  # 启用自动链接
 64 |                  code_language="",  # 保持代码块的语言标记
 65 |                  escape_asterisks=False,  # 不转义文本中的星号
 66 |                  escape_underscores=False,  # 不转义下划线
 67 |                  newline_style="SPACES")  # 使用标准markdown换行方式
 68 |     elif output_format == "text":
 69 |         soup = BeautifulSoup(html, 'html.parser')
 70 |         return soup.get_text(separator='\n', strip=True)
 71 |     else:
 72 |         return html
 73 | 
 74 | def extract_html_content(data: dict) -> str:
 75 |     """
 76 |     从magic_html返回的数据中提取HTML内容
 77 |     
 78 |     Args:
 79 |         data: magic_html返回的数据
 80 |         
 81 |     Returns:
 82 |         HTML内容
 83 |     """
 84 |     if isinstance(data, dict):
 85 |         return data.get('html', '')
 86 |     return ''
 87 | 
 88 | def detect_html_type(html: str, url: str) -> str:
 89 |     """
 90 |     自动检测HTML类型
 91 |     
 92 |     Args:
 93 |         html: HTML内容
 94 |         url: 页面URL
 95 |         
 96 |     Returns:
 97 |         检测到的类型 ("article", "forum", "weixin", "jina")
 98 |     """
 99 |     # 检查URL特征
100 |     url_lower = url.lower()
101 |     if any(domain in url_lower for domain in ['mp.weixin.qq.com', 'weixin.qq.com']):
102 |         return 'weixin'
103 |     elif 'zhihu.com' in url_lower:
104 |         return 'jina'
105 |     
106 |     # 检查HTML特征
107 |     soup = BeautifulSoup(html, 'html.parser')
108 |     
109 |     # 论坛特征检测
110 |     forum_indicators = [
111 |         'forum', 'topic', 'thread', 'post', 'reply', 'comment', 'discuss',
112 |         '论坛', '帖子', '回复', '评论', '讨论'
113 |     ]
114 |     
115 |     # 检查类名和ID
116 |     classes_and_ids = []
117 |     for element in soup.find_all(class_=True):
118 |         classes_and_ids.extend(element.get('class', []))
119 |     for element in soup.find_all(id=True):
120 |         classes_and_ids.append(element.get('id', ''))
121 |     
122 |     classes_and_ids = ' '.join(classes_and_ids).lower()
123 |     
124 |     if any(indicator in classes_and_ids for indicator in forum_indicators):
125 |         return 'forum'
126 |     
127 |     # 默认为文章类型
128 |     return 'article'
129 | 
130 | # 添加jina.ai提取函数
async def fetch_from_jina(url: str) -> str:
    """
    Fetch a page through the r.jina.ai reader endpoint.

    The HTTP client is created with a 15 second timeout.

    Args:
        url: Address of the target page.

    Returns:
        The response body text returned by the reader endpoint.
    """
    async with httpx.AsyncClient(timeout=15.0) as reader:
        reply = await reader.get(f"https://r.jina.ai/{url}")
        reply.raise_for_status()
        return reply.text
146 | 
def convert_markdown(markdown: str, output_format: str) -> str:
    """
    Convert Markdown into the requested output format.

    Args:
        markdown: Markdown source text.
        output_format: One of "html", "markdown" or "text".

    Returns:
        For "text": the Markdown with images replaced by "[图片]", links
        reduced to their label, and heading / bullet / emphasis / code
        markers removed. For every other format the Markdown is returned
        unchanged ("html" is not converted because no Markdown-to-HTML
        library is wired in yet).
    """
    if output_format != "text":
        return markdown

    # Images become a placeholder, links keep only their visible label.
    text = re.sub(r'!\[.*?\]\(.*?\)', '[图片]', markdown)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # From here on, markers are removed only where they act as Markdown
    # syntax, so literal text such as "C#", "a#anchor" or "2 * 3" survives.

    # Code fence delimiter lines (``` or ~~~, with optional language tag).
    text = re.sub(r'^[ \t]*(?:`{3,}|~{3,})[^\n]*$', '', text, flags=re.MULTILINE)
    # ATX heading markers at the start of a line.
    text = re.sub(r'^[ \t]{0,3}#{1,6}[ \t]+', '', text, flags=re.MULTILINE)
    # Horizontal rules drawn with asterisks ("***", "* * *").
    text = re.sub(r'^[ \t]*(?:\*[ \t]*){3,}$', '', text, flags=re.MULTILINE)
    # "*" list bullets at the start of a line (indentation is kept).
    text = re.sub(r'^([ \t]*)\*[ \t]+', r'\1', text, flags=re.MULTILINE)
    # Emphasis: *x*, **x**, ***x*** with no space just inside the markers.
    text = re.sub(r'(\*{1,3})(?=\S)(.+?)(?<=\S)\1', r'\2', text)
    # Inline code spans.
    text = re.sub(r'`+([^`\n]+)`+', r'\1', text)

    return text.strip()
171 | 
async def _jina_response(url: str, output_format: str) -> dict:
    """Fetch `url` through the jina reader and wrap it in the API response envelope."""
    markdown_content = await fetch_from_jina(url)
    return {
        "url": url,
        "content": convert_markdown(markdown_content, output_format),
        "format": output_format,
        "type": "jina",
        "success": True
    }


@app.get("/api/extract")
async def extract_content(
    url: str,
    output_format: Optional[Literal["html", "markdown", "text"]] = "text"
):
    """
    Extract the main content of a web page.

    Args:
        url: Address of the target page.
        output_format: Output format ("html", "markdown", "text"); defaults to "text".

    Returns:
        A JSON object with the keys url, content, format, type and success.

    Raises:
        HTTPException: status 500 when the jina reader fallback also fails.
    """
    try:
        # Zhihu pages always go through the jina reader. The check is
        # case-insensitive so it agrees with detect_html_type().
        if 'zhihu.com' in url.lower():
            return await _jina_response(url, output_format)

        # Primary path: download the page and run magic_html on it.
        try:
            html = await fetch_url(url)
            html_type = detect_html_type(html, url)
            extracted_data = extractor.extract(html, base_url=url, html_type=html_type)
            html_content = extract_html_content(extracted_data)

            if html_content and not html_content.isspace():
                return {
                    "url": url,
                    "content": convert_content(html_content, output_format),
                    "format": output_format,
                    "type": html_type,
                    "success": True
                }
        except Exception:
            # Deliberate best effort: any failure of the primary path is
            # handled by the jina fallback below.
            pass

        # Reached when the primary path failed or produced no content.
        # The fallback is called exactly once; its errors surface as a 500.
        return await _jina_response(url, output_format)

    except Exception as e:
        # Some exceptions (e.g. timeouts) stringify to '', so fall back to the
        # exception class name to keep the error detail informative.
        raise HTTPException(status_code=500, detail=str(e) or type(e).__name__)
244 | 


--------------------------------------------------------------------------------
/app/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eggacheb/Magic-HTML-API/b931cdd4aa6c10377026cf2b4a01acc9f8130e22/app/favicon.ico


--------------------------------------------------------------------------------
/app/globals.css:
--------------------------------------------------------------------------------
 1 | @tailwind base;
 2 | @tailwind components;
 3 | @tailwind utilities;
 4 | 
 5 | :root {
 6 |   --foreground-rgb: 0, 0, 0;
 7 |   --background-rgb: 250, 250, 250;
 8 | }
 9 | 
10 | body {
11 |   color: rgb(var(--foreground-rgb));
12 |   background: rgb(var(--background-rgb));
13 | }
14 | 


--------------------------------------------------------------------------------
/app/layout.tsx:
--------------------------------------------------------------------------------
 1 | import type { Metadata } from "next";
 2 | import { Inter } from "next/font/google";
 3 | import "./globals.css";
 4 | 
// Load the Inter font (latin subset) and expose it as the --font-inter CSS variable.
const inter = Inter({
  subsets: ["latin"],
  variable: "--font-inter",
});

// Page title and description exported as the layout's metadata.
export const metadata: Metadata = {
  title: "Web Content Extractor - AI驱动的网页内容提取工具",
  description: "通过智能算法提取网页的核心内容，移除广告和干扰元素，为AI模型提供清晰的输入数据。",
};
14 | 
/**
 * Root layout: renders the <html> and <body> shell around every page.
 * The Inter font variable class is applied on <html>.
 */
export default function RootLayout(
  props: Readonly<{
    children: React.ReactNode;
  }>
) {
  const htmlClassName = `${inter.variable} antialiased`;

  return (
    <html lang="zh" className={htmlClassName}>
      <body className="min-h-screen bg-white text-gray-900">{props.children}</body>
    </html>
  );
}
28 | 


--------------------------------------------------------------------------------
/app/page.tsx:
--------------------------------------------------------------------------------
  1 | 'use client';
  2 | 
  3 | import { useState } from 'react';
  4 | import { ArrowRight, Copy, Check, Github, MessageSquare, Volume2 } from 'lucide-react';
  5 | 
// One cached extraction result.
// NOTE(review): the cache is filled with the whole API response object, which
// appears to carry more fields than the two declared here — confirm.
interface CachedResult {
  content: string;
  format: string;
}

// Cache layout: first key is the page URL, second key is the output format.
interface ResultCache {
  [key: string]: {
    [format: string]: CachedResult;
  };
}

// Content type labels handled by getTypeText / getTypeIcon.
type ContentType = "article" | "forum" | "weixin" | "jina";
 18 | 
 19 | export default function Home() {
 20 |   const [url, setUrl] = useState('');
 21 |   const [outputFormat, setOutputFormat] = useState('text');
 22 |   const [result, setResult] = useState<any>(null);
 23 |   const [loading, setLoading] = useState(false);
 24 |   const [error, setError] = useState('');
 25 |   const [copied, setCopied] = useState(false);
 26 |   const [cache, setCache] = useState<ResultCache>({});
 27 | 
 28 |   const handleExtract = async (format = outputFormat) => {
 29 |     try {
 30 |       // 检查缓存
 31 |       if (cache[url]?.[format]) {
 32 |         setResult(cache[url][format]);
 33 |         return;
 34 |       }
 35 | 
 36 |       setLoading(true);
 37 |       setError('');
 38 |       setResult(null);
 39 | 
 40 |       const response = await fetch(`/api/extract?url=${encodeURIComponent(url)}&output_format=${format}`);
 41 |       const data = await response.json();
 42 | 
 43 |       if (!response.ok) {
 44 |         throw new Error(data.error || '提取内容失败');
 45 |       }
 46 | 
 47 |       // 更新缓存
 48 |       setCache(prevCache => ({
 49 |         ...prevCache,
 50 |         [url]: {
 51 |           ...(prevCache[url] || {}),
 52 |           [format]: data
 53 |         }
 54 |       }));
 55 | 
 56 |       setResult(data);
 57 |     } catch (err: any) {
 58 |       setError(err.message);
 59 |     } finally {
 60 |       setLoading(false);
 61 |     }
 62 |   };
 63 | 
 64 |   const handleFormatChange = async (format: string) => {
 65 |     setOutputFormat(format);
 66 |     if (result) {
 67 |       await handleExtract(format);
 68 |     }
 69 |   };
 70 | 
 71 |   const handleUrlChange = (newUrl: string) => {
 72 |     setUrl(newUrl);
 73 |     // 当URL改变时，清除所有结果和缓存
 74 |     setResult(null);
 75 |     setError('');
 76 |     setCache({});  // 清除所有缓存
 77 |   };
 78 | 
 79 |   const handleCopy = async () => {
 80 |     if (result?.content) {
 81 |       await navigator.clipboard.writeText(result.content);
 82 |       setCopied(true);
 83 |       setTimeout(() => setCopied(false), 2000);
 84 |     }
 85 |   };
 86 | 
 87 |   const getTypeText = (type: ContentType) => {
 88 |     switch (type) {
 89 |       case "article":
 90 |         return "文章";
 91 |       case "forum":
 92 |         return "论坛";
 93 |       case "weixin":
 94 |         return "微信";
 95 |       case "jina":
 96 |         return "AI提取";  // 或者 "Jina提取"
 97 |       default:
 98 |         return type;
 99 |     }
100 |   };
101 | 
102 |   const getTypeIcon = (type: ContentType) => {
103 |     switch (type) {
104 |       case "article":
105 |         return "📄";
106 |       case "forum":
107 |         return "💬";
108 |       case "weixin":
109 |         return "💚";
110 |       case "jina":
111 |         return "🤖";  // 使用机器人图标表示AI提取
112 |       default:
113 |         return "📝";
114 |     }
115 |   };
116 | 
117 |   return (
118 |     <div className="min-h-screen bg-gradient-to-b from-blue-50 to-white">
119 |       {/* 导航栏 */}
120 |       <nav className="bg-white/80 backdrop-blur-md border-b border-blue-100 fixed top-0 w-full z-10">
121 |         <div className="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
122 |           <div className="flex justify-between h-16 items-center">
123 |             <div className="flex items-center gap-3">
124 |               <div className="w-10 h-10 bg-gradient-to-br from-blue-500 to-blue-600 rounded-xl flex items-center justify-center shadow-lg shadow-blue-200">
125 |                 <MessageSquare className="w-5 h-5 text-white" />
126 |               </div>
127 |               <span className="text-xl font-bold bg-gradient-to-r from-blue-600 to-blue-400 bg-clip-text text-transparent">
128 |                 Web Extractor
129 |               </span>
130 |             </div>
131 |             <div className="flex items-center gap-8">
132 |               <a href="#features" className="text-gray-600 hover:text-blue-600 transition-colors">功能</a>
133 |               <a href="#api" className="text-gray-600 hover:text-blue-600 transition-colors">API</a>
134 |               <a
135 |                 href="https://github.com/eggacheb/web-content-extractor"
136 |                 target="_blank"
137 |                 rel="noopener noreferrer"
138 |                 className="text-gray-600 hover:text-blue-600 transition-colors"
139 |               >
140 |                 <Github className="w-5 h-5" />
141 |               </a>
142 |             </div>
143 |           </div>
144 |         </div>
145 |       </nav>
146 | 
147 |       <main className="pt-28 pb-16">
148 |         {/* Hero Section */}
149 |         <div className="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8">
150 |           <div className="text-center mb-20">
151 |             <h1 className="text-5xl font-bold text-gray-900 mb-6 bg-gradient-to-r from-blue-600 to-blue-400 bg-clip-text text-transparent">
152 |               智能网页内容提取
153 |             </h1>
154 |             <p className="text-xl text-gray-600 max-w-2xl mx-auto">
155 |               使用智能算法自动识别网页类型并提取核心内容，
156 |               支持文章、论坛和微信等多种类型。
157 |             </p>
158 |           </div>
159 | 
160 |           {/* 功能卡片 */}
161 |           <div className="grid md:grid-cols-2 gap-8 mb-20" id="features">
162 |             <div className="bg-white rounded-2xl p-8 shadow-lg border border-blue-50 hover:shadow-xl transition-all duration-300">
163 |               <div className="w-14 h-14 bg-blue-50 rounded-xl flex items-center justify-center mb-6">
164 |                 <MessageSquare className="w-7 h-7 text-blue-600" />
165 |               </div>
166 |               <h3 className="text-xl font-semibold mb-4 text-gray-900">智能内容提取</h3>
167 |               <p className="text-gray-600">
168 |                 自动识别网页类型和主要内容，
169 |                 智能去除干扰元素。
170 |               </p>
171 |             </div>
172 |             <div className="bg-white rounded-2xl p-8 shadow-lg border border-blue-50 hover:shadow-xl transition-all duration-300">
173 |               <div className="w-14 h-14 bg-blue-50 rounded-xl flex items-center justify-center mb-6">
174 |                 <Volume2 className="w-7 h-7 text-blue-600" />
175 |               </div>
176 |               <h3 className="text-xl font-semibold mb-4 text-gray-900">多格式支持</h3>
177 |               <p className="text-gray-600">
178 |                 支持输出为纯文本、Markdown和HTML等多种格式，
179 |                 满足不同场景需求。
180 |               </p>
181 |             </div>
182 |           </div>
183 | 
184 |           {/* 输入部分 */}
185 |           <div className="max-w-3xl mx-auto">
186 |             <div className="bg-white rounded-2xl shadow-lg border border-blue-50 p-8 mb-8">
187 |               <div className="space-y-4">
188 |                 <input
189 |                   type="url"
190 |                   value={url}
191 |                   onChange={(e) => handleUrlChange(e.target.value)}
192 |                   placeholder="输入网页URL，例如: https://example.com"
193 |                   className="w-full px-4 py-3 bg-gray-50 border border-gray-100 rounded-xl focus:ring-2 focus:ring-blue-500 focus:border-transparent outline-none transition-all text-gray-900 placeholder-gray-400"
194 |                 />
195 |                 
196 |                 <div>
197 |                   <label className="block text-sm font-medium text-gray-700 mb-2">输出格式</label>
198 |                   <select
199 |                     value={outputFormat}
200 |                     onChange={(e) => handleFormatChange(e.target.value)}
201 |                     className="w-full px-4 py-3 bg-gray-50 border border-gray-100 rounded-xl focus:ring-2 focus:ring-blue-500 focus:border-transparent outline-none transition-all text-gray-900"
202 |                   >
203 |                     <option value="text">纯文本</option>
204 |                     <option value="markdown">Markdown</option>
205 |                     <option value="html">HTML</option>
206 |                   </select>
207 |                 </div>
208 |                 
209 |                 <button
210 |                   onClick={() => handleExtract()}
211 |                   disabled={loading || !url}
212 |                   className="w-full px-6 py-3 bg-gradient-to-r from-blue-600 to-blue-500 text-white rounded-xl hover:from-blue-700 hover:to-blue-600 disabled:from-gray-400 disabled:to-gray-300 disabled:cursor-not-allowed transition-all flex items-center justify-center gap-2 font-medium shadow-lg shadow-blue-200 hover:shadow-xl disabled:shadow-none"
213 |                 >
214 |                   {loading ? (
215 |                     <div className="w-5 h-5 border-2 border-white border-t-transparent rounded-full animate-spin" />
216 |                   ) : (
217 |                     <>
218 |                       开始提取
219 |                       <ArrowRight className="w-4 h-4" />
220 |                     </>
221 |                   )}
222 |                 </button>
223 |               </div>
224 | 
225 |               {/* API使用说明 */}
226 |               <div className="mt-8 pt-8 border-t border-gray-100" id="api">
227 |                 <h4 className="text-sm font-medium text-gray-900 mb-3">API 快速上手</h4>
228 |                 <div className="space-y-4">
229 |                   <div>
230 |                     <div className="bg-gray-50 p-4 rounded-xl flex items-center gap-3 font-mono text-sm border border-gray-100">
231 |                       <span className="flex-1 overflow-x-auto whitespace-nowrap text-gray-600">
232 |                         GET /api/extract?url=https://example.com
233 |                       </span>
234 |                       <button
235 |                         onClick={() => {
236 |                           navigator.clipboard.writeText('/api/extract?url=https://example.com');
237 |                         }}
238 |                         className="text-gray-400 hover:text-gray-600 transition-colors"
239 |                       >
240 |                         <Copy className="w-4 h-4" />
241 |                       </button>
242 |                     </div>
243 |                     <div className="mt-4 text-sm text-gray-600">
244 |                       <p className="mb-2">参数说明：</p>
245 |                       <ul className="space-y-2 list-disc pl-5">
246 |                         <li><code className="text-blue-600">url</code>: 要提取内容的网页地址（必需）</li>
247 |                         <li>
248 |                           <code className="text-blue-600">output_format</code>: 输出格式（可选，默认为text）
249 |                           <ul className="mt-1 space-y-1 list-none pl-5">
250 |                             <li>• text: 纯文本格式</li>
251 |                             <li>• markdown: Markdown格式</li>
252 |                             <li>• html: HTML格式</li>
253 |                           </ul>
254 |                         </li>
255 |                       </ul>
256 |                     </div>
257 |                   </div>
258 |                 </div>
259 |               </div>
260 |             </div>
261 | 
262 |             {/* 错误提示 */}
263 |             {error && (
264 |               <div className="mb-8 p-4 bg-red-50 border border-red-100 rounded-xl text-red-600 text-sm animate-shake">
265 |                 {error}
266 |               </div>
267 |             )}
268 | 
269 |             {/* 结果展示 */}
270 |             {result && (
271 |               <div className="bg-white rounded-2xl shadow-lg border border-blue-50 p-8 space-y-6">
272 |                 <div className="flex items-center justify-between gap-4">
273 |                   <div>
274 |                     <h3 className="text-xl font-semibold text-gray-900">提取结果</h3>
275 |                     {result.type && (
276 |                       <p className="text-sm text-gray-500 mt-1">
277 |                         检测类型：{getTypeText(result.type)} {getTypeIcon(result.type)}
278 |                       </p>
279 |                     )}
280 |                   </div>
281 |                   <button
282 |                     onClick={handleCopy}
283 |                     className="flex items-center gap-2 px-4 py-2 text-sm text-gray-600 hover:text-blue-600 bg-gray-50 rounded-lg hover:bg-blue-50 transition-colors"
284 |                   >
285 |                     {copied ? (
286 |                       <>
287 |                         <Check className="w-4 h-4" />
288 |                         已复制
289 |                       </>
290 |                     ) : (
291 |                       <>
292 |                         <Copy className="w-4 h-4" />
293 |                         复制内容
294 |                       </>
295 |                     )}
296 |                   </button>
297 |                 </div>
298 |                 <div className={`prose prose-gray max-w-none max-h-[600px] overflow-y-auto ${
299 |                   outputFormat === 'html' ? 'whitespace-pre-wrap font-mono text-sm' : ''
300 |                 }`}>
301 |                   <div className="break-words whitespace-pre-wrap">
302 |                     {result.content}
303 |                   </div>
304 |                 </div>
305 |               </div>
306 |             )}
307 |           </div>
308 |         </div>
309 |       </main>
310 |     </div>
311 |   );
312 | }
313 | 


--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------
 1 | import { dirname } from "path";
 2 | import { fileURLToPath } from "url";
 3 | import { FlatCompat } from "@eslint/eslintrc";
 4 | 
// ES modules have no __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// FlatCompat lets eslintrc-style shareable configs be used from a flat config file.
const compat = new FlatCompat({
  baseDirectory: __dirname,
});

// NOTE(review): package.json pins eslint-config-next 14.0.4; confirm that this
// version ships the "next/typescript" preset and that `next lint` reads this file.
const eslintConfig = [
  ...compat.extends("next/core-web-vitals", "next/typescript"),
];

export default eslintConfig;
17 | 


--------------------------------------------------------------------------------
/next.config.js:
--------------------------------------------------------------------------------
/** @type {import('next').NextConfig} */
const nextConfig = {
  // Server builds only: mark chrome-aws-lambda as an external module.
  // NOTE(review): chrome-aws-lambda is not listed in package.json — confirm this is still needed.
  webpack: (config, { isServer }) => {
    if (isServer) {
      config.externals.push({
        'chrome-aws-lambda': 'chrome-aws-lambda',
      });
    }
    return config;
  },
  // NOTE(review): source and destination are identical, so this rewrite looks
  // like a no-op — confirm whether routing to the Python function depends on it.
  rewrites: async () => {
    return [
      {
        source: '/api/:path*',
        destination: '/api/:path*',
      },
    ];
  },
}

module.exports = nextConfig


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "lbl",
 3 |   "version": "0.1.0",
 4 |   "private": true,
 5 |   "scripts": {
 6 |     "dev": "next dev",
 7 |     "build": "next build",
 8 |     "start": "next start",
 9 |     "lint": "next lint",
10 |     "clean": "rimraf .next out",
11 |     "type-check": "tsc --noEmit",
12 |     "lint:fix": "next lint --fix",
13 |     "format": "prettier --write ."
14 |   },
15 |   "dependencies": {
16 |     "@mozilla/readability": "^0.5.0",
17 |     "@tailwindcss/typography": "^0.5.15",
18 |     "@types/jsdom": "^21.1.7",
19 |     "jsdom": "^21.1.2",
20 |     "lucide-react": "^0.469.0",
21 |     "next": "14.0.4",
22 |     "react": "^18.2.0",
23 |     "react-dom": "^18.2.0"
24 |   },
25 |   "devDependencies": {
26 |     "@types/node": "^20",
27 |     "@types/react": "^18",
28 |     "@types/react-dom": "^18",
29 |     "autoprefixer": "^10.4.20",
30 |     "eslint": "^8",
31 |     "eslint-config-next": "14.0.4",
32 |     "postcss": "^8.4.49",
33 |     "prettier": "^3.1.1",
34 |     "rimraf": "^5.0.5",
35 |     "tailwindcss": "^3.4.17",
36 |     "typescript": "^5"
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/postcss.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   plugins: {
3 |     tailwindcss: {},
4 |     autoprefixer: {},
5 |   },
6 | } 


--------------------------------------------------------------------------------
/public/file.svg:
--------------------------------------------------------------------------------
1 | <svg fill="none" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg"><path d="M14.5 13.5V5.41a1 1 0 0 0-.3-.7L9.8.29A1 1 0 0 0 9.08 0H1.5v13.5A2.5 2.5 0 0 0 4 16h8a2.5 2.5 0 0 0 2.5-2.5m-1.5 0v-7H8v-5H3v12a1 1 0 0 0 1 1h8a1 1 0 0 0 1-1M9.5 5V2.12L12.38 5zM5.13 5h-.62v1.25h2.12V5zm-.62 3h7.12v1.25H4.5zm.62 3h-.62v1.25h7.12V11z" clip-rule="evenodd" fill="#666" fill-rule="evenodd"/></svg>


--------------------------------------------------------------------------------
/public/globe.svg:
--------------------------------------------------------------------------------
1 | <svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><g clip-path="url(#a)"><path fill-rule="evenodd" clip-rule="evenodd" d="M10.27 14.1a6.5 6.5 0 0 0 3.67-3.45q-1.24.21-2.7.34-.31 1.83-.97 3.1M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16m.48-1.52a7 7 0 0 1-.96 0H7.5a4 4 0 0 1-.84-1.32q-.38-.89-.63-2.08a40 40 0 0 0 3.92 0q-.25 1.2-.63 2.08a4 4 0 0 1-.84 1.31zm2.94-4.76q1.66-.15 2.95-.43a7 7 0 0 0 0-2.58q-1.3-.27-2.95-.43a18 18 0 0 1 0 3.44m-1.27-3.54a17 17 0 0 1 0 3.64 39 39 0 0 1-4.3 0 17 17 0 0 1 0-3.64 39 39 0 0 1 4.3 0m1.1-1.17q1.45.13 2.69.34a6.5 6.5 0 0 0-3.67-3.44q.65 1.26.98 3.1M8.48 1.5l.01.02q.41.37.84 1.31.38.89.63 2.08a40 40 0 0 0-3.92 0q.25-1.2.63-2.08a4 4 0 0 1 .85-1.32 7 7 0 0 1 .96 0m-2.75.4a6.5 6.5 0 0 0-3.67 3.44 29 29 0 0 1 2.7-.34q.31-1.83.97-3.1M4.58 6.28q-1.66.16-2.95.43a7 7 0 0 0 0 2.58q1.3.27 2.95.43a18 18 0 0 1 0-3.44m.17 4.71q-1.45-.12-2.69-.34a6.5 6.5 0 0 0 3.67 3.44q-.65-1.27-.98-3.1" fill="#666"/></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h16v16H0z"/></clipPath></defs></svg>


--------------------------------------------------------------------------------
/public/next.svg:
--------------------------------------------------------------------------------
1 | <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>


--------------------------------------------------------------------------------
/public/vercel.svg:
--------------------------------------------------------------------------------
1 | <svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1155 1000"><path d="m577.3 0 577.4 1000H0z" fill="#fff"/></svg>


--------------------------------------------------------------------------------
/public/window.svg:
--------------------------------------------------------------------------------
1 | <svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path fill-rule="evenodd" clip-rule="evenodd" d="M1.5 2.5h13v10a1 1 0 0 1-1 1h-11a1 1 0 0 1-1-1zM0 1h16v11.5a2.5 2.5 0 0 1-2.5 2.5h-11A2.5 2.5 0 0 1 0 12.5zm3.75 4.5a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5M7 4.75a.75.75 0 1 1-1.5 0 .75.75 0 0 1 1.5 0m1.75.75a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5" fill="#666"/></svg>


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.95.2
2 | httpx==0.24.1
3 | uvicorn==0.22.0
4 | chardet==4.0.0
5 | ./wheels/magic_html-0.1.2-py3-none-any.whl
6 | markdownify==0.11.6
7 | beautifulsoup4==4.12.2
8 | 


--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.9 


--------------------------------------------------------------------------------
/tailwind.config.ts:
--------------------------------------------------------------------------------
 1 | import type { Config } from "tailwindcss";
 2 | // Loaded with an ES import (rather than an inline require()) so the whole
 3 | // config uses a single module syntax.
 4 | import typography from "@tailwindcss/typography";
 5 | 
 6 | // Tailwind CSS configuration for the app.
 7 | const config: Config = {
 8 |   // Globs of source files Tailwind scans for class names.
 9 |   content: [
10 |     "./pages/**/*.{js,ts,jsx,tsx,mdx}",
11 |     "./components/**/*.{js,ts,jsx,tsx,mdx}",
12 |     "./app/**/*.{js,ts,jsx,tsx,mdx}",
13 |   ],
14 |   theme: {
15 |     // `extend` adds to the default theme instead of replacing it.
16 |     extend: {
17 |       // Custom radii for the `xl` and `2xl` keys.
18 |       borderRadius: {
19 |         'xl': '1rem',
20 |         '2xl': '1.25rem',
21 |       },
22 |       // Overrides for the typography plugin's DEFAULT style set.
23 |       typography: {
24 |         DEFAULT: {
25 |           css: {
26 |             maxWidth: 'none',
27 |             color: '#334155',
28 |             p: {
29 |               marginTop: '1.25em',
30 |               marginBottom: '1.25em',
31 |             },
32 |           },
33 |         },
34 |       },
35 |     },
36 |   },
37 |   plugins: [
38 |     typography,
39 |   ],
40 | };
41 | 
42 | export default config;
43 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "ES2017",
 4 |     "lib": ["dom", "dom.iterable", "esnext"],
 5 |     "allowJs": true,
 6 |     "skipLibCheck": true,
 7 |     "strict": true,
 8 |     "noEmit": true,
 9 |     "esModuleInterop": true,
10 |     "module": "esnext",
11 |     "moduleResolution": "bundler",
12 |     "resolveJsonModule": true,
13 |     "isolatedModules": true,
14 |     "jsx": "preserve",
15 |     "incremental": true,
16 |     "plugins": [
17 |       {
18 |         "name": "next"
19 |       }
20 |     ],
21 |     "paths": {
22 |       "@/*": ["./*"],
23 |       "@/components/*": ["./components/*"],
24 |       "@/app/*": ["./app/*"],
25 |       "@/lib/*": ["./lib/*"],
26 |       "@/styles/*": ["./styles/*"],
27 |       "@/types/*": ["./types/*"]
28 |     },
29 |     "baseUrl": "."
30 |   },
31 |   "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
32 |   "exclude": ["node_modules"]
33 | }
34 | 


--------------------------------------------------------------------------------
/vercel.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 2,
 3 |     "functions": {
 4 |         "api/**/*.py": {
 5 |             "maxDuration": 60,
 6 |             "memory": 1024
 7 |         }
 8 |     },
 9 |     "routes": [
10 |         {
11 |             "src": "/api/extract",
12 |             "dest": "api/extract.py",
13 |             "methods": ["GET"],
14 |             "headers": {
15 |                 "Access-Control-Allow-Origin": "*",
16 |                 "Access-Control-Allow-Methods": "GET",
17 |                 "Access-Control-Allow-Headers": "Content-Type",
18 |                 "Cache-Control": "no-cache, no-store, must-revalidate"
19 |             }
20 |         },
21 |         {
22 |             "src": "/api/(.*)",
23 |             "dest": "api/extract.py"
24 |         },
25 |         {
26 |             "handle": "filesystem"
27 |         },
28 |         {
29 |             "src": "/(.*)",
30 |             "dest": "/"
31 |         }
32 |     ]
33 | } 
34 | 


--------------------------------------------------------------------------------
/wheels/magic_html-0.1.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eggacheb/Magic-HTML-API/b931cdd4aa6c10377026cf2b4a01acc9f8130e22/wheels/magic_html-0.1.2-py3-none-any.whl


--------------------------------------------------------------------------------