├── entrypoint.sh
├── main.py
├── readme.md
├── requirements.txt
└── src
    ├── api.py
    └── testCrawler.py

/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
python3 -m venv myenv              # create a virtual environment
source myenv/bin/activate          # activate the virtual environment

pip install -r requirements.txt    # install the Python dependencies

playwright install                 # install the browsers Playwright needs
playwright install-deps            # install Playwright's system dependencies

python3 -m main                    # run the main program
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import uvicorn
from src.api import app

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
## 🤖 Introduction

Data-source code for the AI daily-report bot; it is meant to be used together with Sealos Devbox.

For details, see https://mp.weixin.qq.com/s/C3edt0mrQ6Ql1ggNltVa9w

## 🚀 Running

1. Create a Devbox Python project and connect to it

![open_devbox](https://oss.laf.dev/lk63dw-crawl/open_devbox.jpg)

2. `git clone https://github.com/newfish-cmyk/crawl.git`, or,

   if GitHub is unreachable, use `wget https://oss.laf.dev/lk63dw-crawl/crawl-0.0.1.tar.gz`

![wget](https://oss.laf.dev/lk63dw-crawl/commond.jpg)

3. Drag the files from crawl into the root directory, replacing the existing ones

![directory](https://oss.laf.dev/lk63dw-crawl/directory.jpg)

4. Run ./entrypoint.sh and wait for the dependencies to install and the service to start

![entrypoint](https://oss.laf.dev/lk63dw-crawl/entrypoint.jpg)

5. Test through the Devbox public address: fill {{Devbox public address}}/api/test into the HTTP node and use the GET method

![link](https://oss.laf.dev/lk63dw-crawl/link.png)
![result](https://oss.laf.dev/lk63dw-crawl/result.jpg)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
uvicorn==0.32.1
fastapi==0.115.6
crawl4ai
--------------------------------------------------------------------------------
/src/api.py:
--------------------------------------------------------------------------------
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
import asyncio
from src.testCrawler import TestCrawler

app = FastAPI(
    title="Tech News API",
    description="API for crawling tech news",
    version="1.0.0"
)

# CORS configuration: allow all origins, methods, and headers
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def root():
    """Health-check endpoint"""
    return {"status": "ok"}

@app.get("/api/test", response_model=List[Dict[str, Any]])
async def get_test():
    try:
        crawler = TestCrawler(verbose=True)
        results = await asyncio.wait_for(crawler.crawl(), timeout=600.0)
        return results
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
--------------------------------------------------------------------------------
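Once the server is running (for example via `./entrypoint.sh`), the `/api/test` route above can be exercised with a short client script. The following is a minimal sketch, not part of the repository: the file name is hypothetical, the base URL assumes main.py's default host and port, and it should be replaced with the Devbox public address when testing remotely.

```python
# call_api.py - minimal sketch (not part of this repo): call the /api/test endpoint
import json
import urllib.request

# Assumption: the server was started locally via ./entrypoint.sh (main.py binds to port 8080).
# Replace with the Devbox public address when testing from outside.
BASE_URL = "http://127.0.0.1:8080"

# The endpoint can take a while; api.py allows the crawl up to 600 seconds.
with urllib.request.urlopen(f"{BASE_URL}/api/test", timeout=600) as resp:
    items = json.loads(resp.read().decode("utf-8"))

# Each item carries the title/date/content fields produced by TestCrawler.
for item in items[:3]:
    print(item["title"], "|", item["date"])
```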
/src/testCrawler.py:
--------------------------------------------------------------------------------
from crawl4ai import AsyncWebCrawler
from typing import List, Dict, Any
import json
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

class TestCrawler:
    def __init__(self, verbose=True):
        self.verbose = verbose
        self.url = "https://rsshub-ppzgvjqi.cloud.sealos.io/telegram/channel/xhqcankao"

    async def get_strategy(self):
        # CSS extraction schema: pull title / pubDate / description out of every feed item
        schema = {
            "name": "Test schema",
            "baseSelector": "item",
            "isList": True,
            "fields": [
                {
                    "name": "title",
                    "selector": "title",
                    "type": "text",
                },
                {
                    "name": "date",
                    "selector": "pubDate",
                    "type": "text",
                },
                {
                    "name": "content",
                    "selector": "description",
                    "type": "text",
                },
            ],
        }
        return JsonCssExtractionStrategy(schema, verbose=True)

    async def crawl(self) -> List[Dict[str, Any]]:
        """Crawl the feed and return the extracted items."""
        results = []

        async with AsyncWebCrawler(verbose=self.verbose) as crawler:
            result = await crawler.arun(
                url=self.url,
                magic=True,
                bypass_cache=True,
                extraction_strategy=await self.get_strategy()
            )

            if result.success:
                if self.verbose:
                    print(f"Crawl succeeded: {self.url}")
                content = json.loads(result.extracted_content)

                print(content)
                for item in content:
                    try:
                        # Keep only non-empty entries that are not images or markdown headings
                        if (item['content'].strip() and
                                not item['content'].startswith('![') and
                                not item['content'].startswith('#')):

                            results.append({
                                "title": item['title'].strip(),
                                "date": item['date'].strip(),
                                "content": item['content'].strip()
                            })
                    except Exception as e:
                        if self.verbose:
                            print(f"Error while processing an item: {str(e)}")

                return results
            else:
                if self.verbose:
                    print(f"Crawl failed: {self.url}, error: {result.error_message}")
                return []
--------------------------------------------------------------------------------
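For quick local debugging it can also be handy to run the crawler without going through FastAPI. Below is a minimal sketch, not part of the repository; the file name is hypothetical, it only reuses `TestCrawler` as defined in src/testCrawler.py, and it assumes the dependencies from requirements.txt (plus the Playwright browsers) are installed and that it is run from the project root.

```python
# debug_crawl.py - minimal sketch (not part of this repo): run TestCrawler directly
import asyncio

from src.testCrawler import TestCrawler


async def main() -> None:
    crawler = TestCrawler(verbose=True)
    items = await crawler.crawl()  # list of {"title", "date", "content"} dicts
    print(f"Fetched {len(items)} items")
    for item in items[:3]:
        print(item["title"])


if __name__ == "__main__":
    asyncio.run(main())
```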