├── .github └── workflows │ └── sync-upstream.yml ├── .gitignore ├── .prettierrc ├── README.md ├── api └── extract.py ├── app ├── favicon.ico ├── globals.css ├── layout.tsx └── page.tsx ├── eslint.config.mjs ├── next.config.js ├── package.json ├── postcss.config.js ├── public ├── file.svg ├── globe.svg ├── next.svg ├── vercel.svg └── window.svg ├── requirements.txt ├── runtime.txt ├── tailwind.config.ts ├── tsconfig.json ├── vercel.json └── wheels └── magic_html-0.1.2-py3-none-any.whl /.github/workflows/sync-upstream.yml: -------------------------------------------------------------------------------- 1 | name: Upstream Sync 2 | 3 | permissions: 4 | contents: write 5 | issues: write 6 | actions: write 7 | 8 | on: 9 | schedule: 10 | - cron: '0 */6 * * *' # 每6小时运行一次 11 | workflow_dispatch: # 允许手动触发 12 | 13 | jobs: 14 | sync_latest_from_upstream: 15 | name: Sync latest commits from upstream repo 16 | runs-on: ubuntu-latest 17 | if: ${{ github.event.repository.fork }} 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - name: Clean issue notice 23 | uses: actions-cool/issues-helper@v3 24 | with: 25 | actions: 'close-issues' 26 | labels: '🚨 Sync Fail' 27 | 28 | - name: Sync upstream changes 29 | id: sync 30 | uses: aormsby/Fork-Sync-With-Upstream-action@v3.4 31 | with: 32 | upstream_sync_repo: eggacheb/Magic-HTML-API # 已更新为正确的上游仓库 33 | upstream_sync_branch: main 34 | target_sync_branch: main 35 | target_repo_token: ${{ secrets.GITHUB_TOKEN }} 36 | 37 | - name: Sync check 38 | if: failure() 39 | uses: actions-cool/issues-helper@v3 40 | with: 41 | actions: 'create-issue' 42 | title: '🚨 同步失败 | Sync Fail' 43 | labels: '🚨 Sync Fail' 44 | body: | 45 | 同步过程中发生错误,请手动同步一次。 46 | An error occurred during synchronization. Please sync manually. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | env/ 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | venv/ 25 | ENV/ 26 | 27 | # IDE 28 | .idea/ 29 | .vscode/ 30 | *.swp 31 | *.swo 32 | 33 | # Vercel 34 | .vercel 35 | .env 36 | .env.local 37 | .env*.local 38 | 39 | # Next.js 40 | node_modules/ 41 | .next/ 42 | out/ 43 | .DS_Store 44 | *.pem 45 | npm-debug.log* 46 | yarn-debug.log* 47 | yarn-error.log* 48 | .pnpm-debug.log* 49 | 50 | # TypeScript 51 | *.tsbuildinfo 52 | 53 | # Testing 54 | coverage/ 55 | 56 | # Production 57 | build/ 58 | dist/ 59 | out/ 60 | 61 | # Misc 62 | .DS_Store 63 | *.pem 64 | .env.local 65 | .env.development.local 66 | .env.test.local 67 | .env.production.local -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "singleQuote": true, 4 | "tabWidth": 2, 5 | "trailingComma": "es5", 6 | "printWidth": 100, 7 | "bracketSpacing": true, 8 | "arrowParens": "avoid" 9 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # magic-html-api 2 | 3 | 一个智能的网页内容提取API服务,基于magic-html和jina-ai/reader。支持多种内容类型(文章/论坛/微信/知乎),多种输出格式(文本/Markdown/HTML)。只保留主要文章内容,使AI能够更好地理解和分析文本。 4 | 5 | ## 功能特点 6 | 7 | - 🔍 智能识别网页类型并提取主要内容 8 | - 📚 支持多种内容类型(文章/论坛/微信/知乎) 9 | - 📝 多种输出格式(文本/Markdown/HTML) 10 | - ⚡ 异步处理,响应迅速 11 | - 🚀 部署在Vercel上,免费使用 12 | - 🤖 自动降级处理:当默认提取失败时自动使用jina-ai/reader 13 | 14 | ## 🔗 在线演示 15 | 16 | 访问 [https://magic-html-api.vercel.app](https://magic-html-api.vercel.app) 体验在线版本。 17 | 18 | 一键部署:[![Vercel Deployment](https://vercel.com/button)](https://vercel.com/new/clone?repository-url=https://github.com/eggacheb/Magic-HTML-API) 19 | 20 | 21 | ![image](https://github.com/user-attachments/assets/6ac7637c-909a-47c1-b756-9e7d93c466e3) 22 | ![image](https://github.com/user-attachments/assets/3ddacaf3-8fbe-4ab5-a306-e81fdc2a2152) 23 | ![image](https://github.com/user-attachments/assets/03637a58-6870-4101-b350-785a3f36bed3) 24 | 25 | 26 | ## API使用 27 | 28 | ### 内容提取 29 | 30 | ``` 31 | GET /api/extract 32 | ``` 33 | 34 | 参数: 35 | - `url`: 要提取内容的网页URL(必需) 36 | - `output_format`: 输出格式(可选,默认为"text") 37 | - text: 纯文本格式 38 | - markdown: Markdown格式 39 | - html: HTML格式 40 | 41 | 示例请求: 42 | ``` 43 | https://your-domain.vercel.app/api/extract?url=https://example.com&output_format=markdown 44 | ``` 45 | 46 | 响应格式: 47 | ```json 48 | { 49 | "url": "请求的URL", 50 | "content": "提取的内容", 51 | "format": "输出格式", 52 | "type": "内容类型", 53 | "success": true 54 | } 55 | ``` 56 | 57 | 内容类型(type)包括: 58 | - article: 文章 59 | - forum: 论坛 60 | - weixin: 微信文章 61 | - jina: AI提取(使用jina-ai/reader处理) 62 | 63 | ## 技术实现 64 | 65 | - 使用[magic-html](https://github.com/opendatalab/magic-html)作为主要内容提取引擎 66 | - 集成[jina-ai/reader](https://github.com/jina-ai/reader)作为备选提取方案 67 | - 自动识别网页类型并选择最佳提取策略 68 | - 智能降级:当默认提取失败时自动切换到jina-ai/reader 69 | 70 | ## 部署 71 | 72 | 本项目使用Vercel部署,直接导入GitHub仓库即可。 73 | 74 | ### 环境要求 75 | - Python 3.9+ 76 | - Node.js 16+ 77 | 78 | ### 部署步骤 79 | 1. Fork本仓库 80 | 2. 在Vercel中导入项目 81 | 3. 部署完成后即可使用 82 | 83 | ## 技术栈 84 | 85 | ### 后端 86 | - FastAPI 87 | - magic-html 88 | - jina-ai/reader 89 | - Python 3.9+ 90 | 91 | ### 前端 92 | - Next.js 13 93 | - React 94 | - Tailwind CSS 95 | - TypeScript 96 | 97 | ### 部署 98 | - Vercel 99 | 100 | ## 致谢 101 | 102 | - [magic-html](https://github.com/opendatalab/magic-html) - 强大的网页内容提取库 103 | - [jina-ai/reader](https://github.com/jina-ai/reader) - 优秀的AI内容提取服务 104 | 105 | ## License 106 | 107 | MIT 108 | -------------------------------------------------------------------------------- /api/extract.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException 2 | import httpx 3 | from magic_html import GeneralExtractor 4 | from typing import Optional, Literal 5 | from markdownify import markdownify as md 6 | from bs4 import BeautifulSoup 7 | import re 8 | import chardet 9 | 10 | app = FastAPI() 11 | extractor = GeneralExtractor() 12 | 13 | async def fetch_url(url: str) -> str: 14 | async with httpx.AsyncClient() as client: 15 | try: 16 | response = await client.get(url) 17 | response.raise_for_status() 18 | 19 | # 处理响应编码 20 | content_type = response.headers.get('content-type', '').lower() 21 | if 'charset=' in content_type: 22 | try: 23 | charset = content_type.split('charset=')[-1].split(';')[0] 24 | return response.content.decode(charset) 25 | except: 26 | pass 27 | 28 | try: 29 | return response.content.decode('utf-8') 30 | except UnicodeDecodeError: 31 | content = response.content 32 | detected = chardet.detect(content) 33 | encoding = detected['encoding'] 34 | 35 | if encoding and encoding.lower() in ['gb2312', 'gbk']: 36 | encoding = 'gb18030' 37 | 38 | return content.decode(encoding or 'utf-8') 39 | 40 | except Exception as e: 41 | raise HTTPException(status_code=400, detail=f"Error fetching URL: {str(e)}") 42 | 43 | def convert_content(html: str, output_format: str) -> str: 44 | """ 45 | 将HTML内容转换为指定格式 46 | 47 | Args: 48 | html: HTML内容 49 | output_format: 输出格式 ("html", "markdown", "text") 50 | 51 | Returns: 52 | 转换后的内容 53 | """ 54 | if not isinstance(html, str): 55 | html = str(html) 56 | 57 | if output_format == "html": 58 | return html 59 | elif output_format == "markdown": 60 | return md(html, 61 | heading_style="ATX", # 使用 # 风格的标题 62 | bullets="*", # 统一使用 * 作为列表符号 63 | autolinks=True, # 启用自动链接 64 | code_language="", # 保持代码块的语言标记 65 | escape_asterisks=False, # 不转义文本中的星号 66 | escape_underscores=False, # 不转义下划线 67 | newline_style="SPACES") # 使用标准markdown换行方式 68 | elif output_format == "text": 69 | soup = BeautifulSoup(html, 'html.parser') 70 | return soup.get_text(separator='\n', strip=True) 71 | else: 72 | return html 73 | 74 | def extract_html_content(data: dict) -> str: 75 | """ 76 | 从magic_html返回的数据中提取HTML内容 77 | 78 | Args: 79 | data: magic_html返回的数据 80 | 81 | Returns: 82 | HTML内容 83 | """ 84 | if isinstance(data, dict): 85 | return data.get('html', '') 86 | return '' 87 | 88 | def detect_html_type(html: str, url: str) -> str: 89 | """ 90 | 自动检测HTML类型 91 | 92 | Args: 93 | html: HTML内容 94 | url: 页面URL 95 | 96 | Returns: 97 | 检测到的类型 ("article", "forum", "weixin", "jina") 98 | """ 99 | # 检查URL特征 100 | url_lower = url.lower() 101 | if any(domain in url_lower for domain in ['mp.weixin.qq.com', 'weixin.qq.com']): 102 | return 'weixin' 103 | elif 'zhihu.com' in url_lower: 104 | return 'jina' 105 | 106 | # 检查HTML特征 107 | soup = BeautifulSoup(html, 'html.parser') 108 | 109 | # 论坛特征检测 110 | forum_indicators = [ 111 | 'forum', 'topic', 'thread', 'post', 'reply', 'comment', 'discuss', 112 | '论坛', '帖子', '回复', '评论', '讨论' 113 | ] 114 | 115 | # 检查类名和ID 116 | classes_and_ids = [] 117 | for element in soup.find_all(class_=True): 118 | classes_and_ids.extend(element.get('class', [])) 119 | for element in soup.find_all(id=True): 120 | classes_and_ids.append(element.get('id', '')) 121 | 122 | classes_and_ids = ' '.join(classes_and_ids).lower() 123 | 124 | if any(indicator in classes_and_ids for indicator in forum_indicators): 125 | return 'forum' 126 | 127 | # 默认为文章类型 128 | return 'article' 129 | 130 | # 添加jina.ai提取函数 131 | async def fetch_from_jina(url: str) -> str: 132 | """ 133 | 使用jina.ai服务提取内容,最多等待15秒 134 | 135 | Args: 136 | url: 目标网页URL 137 | 138 | Returns: 139 | 提取的内容 140 | """ 141 | jina_url = f"https://r.jina.ai/{url}" 142 | async with httpx.AsyncClient(timeout=15.0) as client: 143 | response = await client.get(jina_url) 144 | response.raise_for_status() 145 | return response.text 146 | 147 | def convert_markdown(markdown: str, output_format: str) -> str: 148 | """ 149 | 将markdown内容转换为指定格式 150 | 151 | Args: 152 | markdown: Markdown内容 153 | output_format: 输出格式 ("html", "markdown", "text") 154 | 155 | Returns: 156 | 转换后的内容 157 | """ 158 | if output_format == "markdown": 159 | return markdown 160 | elif output_format == "text": 161 | # 移除markdown标记 162 | text = re.sub(r'!\[.*?\]\(.*?\)', '[图片]', markdown) # 替换图片 163 | text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) # 替换链接 164 | text = re.sub(r'[#*`]', '', text) # 移除特殊字符 165 | return text.strip() 166 | elif output_format == "html": 167 | # 这里可以使用markdown到html的转换库,比如markdown2或mistune 168 | # 暂时返回原始markdown 169 | return markdown 170 | return markdown 171 | 172 | @app.get("/api/extract") 173 | async def extract_content( 174 | url: str, 175 | output_format: Optional[Literal["html", "markdown", "text"]] = "text" 176 | ): 177 | """ 178 | 从URL提取内容 179 | 180 | Args: 181 | url: 目标网页URL 182 | output_format: 输出格式 ("html", "markdown", "text"),默认为text 183 | 184 | Returns: 185 | JSON格式的提取内容 186 | """ 187 | try: 188 | # 检测是否是知乎页面 189 | if 'zhihu.com' in url: 190 | markdown_content = await fetch_from_jina(url) 191 | content = convert_markdown(markdown_content, output_format) 192 | return { 193 | "url": url, 194 | "content": content, 195 | "format": output_format, 196 | "type": "jina", 197 | "success": True 198 | } 199 | 200 | # 尝试使用原有逻辑 201 | try: 202 | html = await fetch_url(url) 203 | html_type = detect_html_type(html, url) 204 | extracted_data = extractor.extract(html, base_url=url, html_type=html_type) 205 | html_content = extract_html_content(extracted_data) 206 | 207 | # 检查提取结果是否为空 208 | if not html_content or html_content.isspace(): 209 | # 如果为空,尝试使用jina.ai 210 | markdown_content = await fetch_from_jina(url) 211 | content = convert_markdown(markdown_content, output_format) 212 | return { 213 | "url": url, 214 | "content": content, 215 | "format": output_format, 216 | "type": "jina", 217 | "success": True 218 | } 219 | 220 | # 使用原有结果 221 | converted_content = convert_content(html_content, output_format) 222 | return { 223 | "url": url, 224 | "content": converted_content, 225 | "format": output_format, 226 | "type": html_type, 227 | "success": True 228 | } 229 | 230 | except Exception as e: 231 | # 如果原有逻辑失败,尝试使用jina.ai 232 | markdown_content = await fetch_from_jina(url) 233 | content = convert_markdown(markdown_content, output_format) 234 | return { 235 | "url": url, 236 | "content": content, 237 | "format": output_format, 238 | "type": "jina", 239 | "success": True 240 | } 241 | 242 | except Exception as e: 243 | raise HTTPException(status_code=500, detail=str(e)) 244 | -------------------------------------------------------------------------------- /app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eggacheb/Magic-HTML-API/b931cdd4aa6c10377026cf2b4a01acc9f8130e22/app/favicon.ico -------------------------------------------------------------------------------- /app/globals.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | :root { 6 | --foreground-rgb: 0, 0, 0; 7 | --background-rgb: 250, 250, 250; 8 | } 9 | 10 | body { 11 | color: rgb(var(--foreground-rgb)); 12 | background: rgb(var(--background-rgb)); 13 | } 14 | -------------------------------------------------------------------------------- /app/layout.tsx: -------------------------------------------------------------------------------- 1 | import type { Metadata } from "next"; 2 | import { Inter } from "next/font/google"; 3 | import "./globals.css"; 4 | 5 | const inter = Inter({ 6 | subsets: ["latin"], 7 | variable: "--font-inter", 8 | }); 9 | 10 | export const metadata: Metadata = { 11 | title: "Web Content Extractor - AI驱动的网页内容提取工具", 12 | description: "通过智能算法提取网页的核心内容,移除广告和干扰元素,为AI模型提供清晰的输入数据。", 13 | }; 14 | 15 | export default function RootLayout({ 16 | children, 17 | }: Readonly<{ 18 | children: React.ReactNode; 19 | }>) { 20 | return ( 21 | 22 | 23 | {children} 24 | 25 | 26 | ); 27 | } 28 | -------------------------------------------------------------------------------- /app/page.tsx: -------------------------------------------------------------------------------- 1 | 'use client'; 2 | 3 | import { useState } from 'react'; 4 | import { ArrowRight, Copy, Check, Github, MessageSquare, Volume2 } from 'lucide-react'; 5 | 6 | interface CachedResult { 7 | content: string; 8 | format: string; 9 | } 10 | 11 | interface ResultCache { 12 | [key: string]: { 13 | [format: string]: CachedResult; 14 | }; 15 | } 16 | 17 | type ContentType = "article" | "forum" | "weixin" | "jina"; 18 | 19 | export default function Home() { 20 | const [url, setUrl] = useState(''); 21 | const [outputFormat, setOutputFormat] = useState('text'); 22 | const [result, setResult] = useState(null); 23 | const [loading, setLoading] = useState(false); 24 | const [error, setError] = useState(''); 25 | const [copied, setCopied] = useState(false); 26 | const [cache, setCache] = useState({}); 27 | 28 | const handleExtract = async (format = outputFormat) => { 29 | try { 30 | // 检查缓存 31 | if (cache[url]?.[format]) { 32 | setResult(cache[url][format]); 33 | return; 34 | } 35 | 36 | setLoading(true); 37 | setError(''); 38 | setResult(null); 39 | 40 | const response = await fetch(`/api/extract?url=${encodeURIComponent(url)}&output_format=${format}`); 41 | const data = await response.json(); 42 | 43 | if (!response.ok) { 44 | throw new Error(data.error || '提取内容失败'); 45 | } 46 | 47 | // 更新缓存 48 | setCache(prevCache => ({ 49 | ...prevCache, 50 | [url]: { 51 | ...(prevCache[url] || {}), 52 | [format]: data 53 | } 54 | })); 55 | 56 | setResult(data); 57 | } catch (err: any) { 58 | setError(err.message); 59 | } finally { 60 | setLoading(false); 61 | } 62 | }; 63 | 64 | const handleFormatChange = async (format: string) => { 65 | setOutputFormat(format); 66 | if (result) { 67 | await handleExtract(format); 68 | } 69 | }; 70 | 71 | const handleUrlChange = (newUrl: string) => { 72 | setUrl(newUrl); 73 | // 当URL改变时,清除所有结果和缓存 74 | setResult(null); 75 | setError(''); 76 | setCache({}); // 清除所有缓存 77 | }; 78 | 79 | const handleCopy = async () => { 80 | if (result?.content) { 81 | await navigator.clipboard.writeText(result.content); 82 | setCopied(true); 83 | setTimeout(() => setCopied(false), 2000); 84 | } 85 | }; 86 | 87 | const getTypeText = (type: ContentType) => { 88 | switch (type) { 89 | case "article": 90 | return "文章"; 91 | case "forum": 92 | return "论坛"; 93 | case "weixin": 94 | return "微信"; 95 | case "jina": 96 | return "AI提取"; // 或者 "Jina提取" 97 | default: 98 | return type; 99 | } 100 | }; 101 | 102 | const getTypeIcon = (type: ContentType) => { 103 | switch (type) { 104 | case "article": 105 | return "📄"; 106 | case "forum": 107 | return "💬"; 108 | case "weixin": 109 | return "💚"; 110 | case "jina": 111 | return "🤖"; // 使用机器人图标表示AI提取 112 | default: 113 | return "📝"; 114 | } 115 | }; 116 | 117 | return ( 118 |
119 | {/* 导航栏 */} 120 | 146 | 147 |
148 | {/* Hero Section */} 149 |
150 |
151 |

152 | 智能网页内容提取 153 |

154 |

155 | 使用智能算法自动识别网页类型并提取核心内容, 156 | 支持文章、论坛和微信等多种类型。 157 |

158 |
159 | 160 | {/* 功能卡片 */} 161 |
162 |
163 |
164 | 165 |
166 |

智能内容提取

167 |

168 | 自动识别网页类型和主要内容, 169 | 智能去除干扰元素。 170 |

171 |
172 |
173 |
174 | 175 |
176 |

多格式支持

177 |

178 | 支持输出为纯文本、Markdown和HTML等多种格式, 179 | 满足不同场景需求。 180 |

181 |
182 |
183 | 184 | {/* 输入部分 */} 185 |
186 |
187 |
188 | handleUrlChange(e.target.value)} 192 | placeholder="输入网页URL,例如: https://example.com" 193 | className="w-full px-4 py-3 bg-gray-50 border border-gray-100 rounded-xl focus:ring-2 focus:ring-blue-500 focus:border-transparent outline-none transition-all text-gray-900 placeholder-gray-400" 194 | /> 195 | 196 |
197 | 198 | 207 |
208 | 209 | 223 |
224 | 225 | {/* API使用说明 */} 226 |
227 |

API 快速上手

228 |
229 |
230 |
231 | 232 | GET /api/extract?url=https://example.com 233 | 234 | 242 |
243 |
244 |

参数说明:

245 |
    246 |
  • url: 要提取内容的网页地址(必需)
  • 247 |
  • 248 | output_format: 输出格式(可选,默认为text) 249 |
      250 |
    • • text: 纯文本格式
    • 251 |
    • • markdown: Markdown格式
    • 252 |
    • • html: HTML格式
    • 253 |
    254 |
  • 255 |
256 |
257 |
258 |
259 |
260 |
261 | 262 | {/* 错误提示 */} 263 | {error && ( 264 |
265 | {error} 266 |
267 | )} 268 | 269 | {/* 结果展示 */} 270 | {result && ( 271 |
272 |
273 |
274 |

提取结果

275 | {result.type && ( 276 |

277 | 检测类型:{getTypeText(result.type)} {getTypeIcon(result.type)} 278 |

279 | )} 280 |
281 | 297 |
298 |
301 |
302 | {result.content} 303 |
304 |
305 |
306 | )} 307 |
308 |
309 |
310 |
311 | ); 312 | } 313 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import { dirname } from "path"; 2 | import { fileURLToPath } from "url"; 3 | import { FlatCompat } from "@eslint/eslintrc"; 4 | 5 | const __filename = fileURLToPath(import.meta.url); 6 | const __dirname = dirname(__filename); 7 | 8 | const compat = new FlatCompat({ 9 | baseDirectory: __dirname, 10 | }); 11 | 12 | const eslintConfig = [ 13 | ...compat.extends("next/core-web-vitals", "next/typescript"), 14 | ]; 15 | 16 | export default eslintConfig; 17 | -------------------------------------------------------------------------------- /next.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('next').NextConfig} */ 2 | const nextConfig = { 3 | webpack: (config, { isServer }) => { 4 | if (isServer) { 5 | config.externals.push({ 6 | 'chrome-aws-lambda': 'chrome-aws-lambda', 7 | }); 8 | } 9 | return config; 10 | }, 11 | rewrites: async () => { 12 | return [ 13 | { 14 | source: '/api/:path*', 15 | destination: '/api/:path*', 16 | }, 17 | ]; 18 | }, 19 | } 20 | 21 | module.exports = nextConfig -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lbl", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint", 10 | "clean": "rimraf .next out", 11 | "type-check": "tsc --noEmit", 12 | "lint:fix": "next lint --fix", 13 | "format": "prettier --write ." 14 | }, 15 | "dependencies": { 16 | "@mozilla/readability": "^0.5.0", 17 | "@tailwindcss/typography": "^0.5.15", 18 | "@types/jsdom": "^21.1.7", 19 | "jsdom": "^21.1.2", 20 | "lucide-react": "^0.469.0", 21 | "next": "14.0.4", 22 | "react": "^18.2.0", 23 | "react-dom": "^18.2.0" 24 | }, 25 | "devDependencies": { 26 | "@types/node": "^20", 27 | "@types/react": "^18", 28 | "@types/react-dom": "^18", 29 | "autoprefixer": "^10.4.20", 30 | "eslint": "^8", 31 | "eslint-config-next": "14.0.4", 32 | "postcss": "^8.4.49", 33 | "prettier": "^3.1.1", 34 | "rimraf": "^5.0.5", 35 | "tailwindcss": "^3.4.17", 36 | "typescript": "^5" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /postcss.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | tailwindcss: {}, 4 | autoprefixer: {}, 5 | }, 6 | } -------------------------------------------------------------------------------- /public/file.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/globe.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/next.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /public/window.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.95.2 2 | httpx==0.24.1 3 | uvicorn==0.22.0 4 | chardet==4.0.0 5 | ./wheels/magic_html-0.1.2-py3-none-any.whl 6 | markdownify==0.11.6 7 | beautifulsoup4==4.12.2 8 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.9 -------------------------------------------------------------------------------- /tailwind.config.ts: -------------------------------------------------------------------------------- 1 | import type { Config } from "tailwindcss"; 2 | 3 | const config: Config = { 4 | content: [ 5 | "./pages/**/*.{js,ts,jsx,tsx,mdx}", 6 | "./components/**/*.{js,ts,jsx,tsx,mdx}", 7 | "./app/**/*.{js,ts,jsx,tsx,mdx}", 8 | ], 9 | theme: { 10 | extend: { 11 | borderRadius: { 12 | 'xl': '1rem', 13 | '2xl': '1.25rem', 14 | }, 15 | typography: { 16 | DEFAULT: { 17 | css: { 18 | maxWidth: 'none', 19 | color: '#334155', 20 | p: { 21 | marginTop: '1.25em', 22 | marginBottom: '1.25em', 23 | }, 24 | }, 25 | }, 26 | }, 27 | }, 28 | }, 29 | plugins: [ 30 | require('@tailwindcss/typography'), 31 | ], 32 | }; 33 | 34 | export default config; 35 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2017", 4 | "lib": ["dom", "dom.iterable", "esnext"], 5 | "allowJs": true, 6 | "skipLibCheck": true, 7 | "strict": true, 8 | "noEmit": true, 9 | "esModuleInterop": true, 10 | "module": "esnext", 11 | "moduleResolution": "bundler", 12 | "resolveJsonModule": true, 13 | "isolatedModules": true, 14 | "jsx": "preserve", 15 | "incremental": true, 16 | "plugins": [ 17 | { 18 | "name": "next" 19 | } 20 | ], 21 | "paths": { 22 | "@/*": ["./*"], 23 | "@/components/*": ["./components/*"], 24 | "@/app/*": ["./app/*"], 25 | "@/lib/*": ["./lib/*"], 26 | "@/styles/*": ["./styles/*"], 27 | "@/types/*": ["./types/*"] 28 | }, 29 | "baseUrl": "." 30 | }, 31 | "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], 32 | "exclude": ["node_modules"] 33 | } 34 | -------------------------------------------------------------------------------- /vercel.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "functions": { 4 | "api/**/*.py": { 5 | "maxDuration": 60, 6 | "memory": 1024 7 | } 8 | }, 9 | "routes": [ 10 | { 11 | "src": "/api/extract", 12 | "dest": "api/extract.py", 13 | "methods": ["GET"], 14 | "headers": { 15 | "Access-Control-Allow-Origin": "*", 16 | "Access-Control-Allow-Methods": "GET", 17 | "Access-Control-Allow-Headers": "Content-Type", 18 | "Cache-Control": "no-cache, no-store, must-revalidate" 19 | } 20 | }, 21 | { 22 | "src": "/api/(.*)", 23 | "dest": "api/extract.py" 24 | }, 25 | { 26 | "handle": "filesystem" 27 | }, 28 | { 29 | "src": "/(.*)", 30 | "dest": "/" 31 | } 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /wheels/magic_html-0.1.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eggacheb/Magic-HTML-API/b931cdd4aa6c10377026cf2b4a01acc9f8130e22/wheels/magic_html-0.1.2-py3-none-any.whl --------------------------------------------------------------------------------