├── entrypoint.sh
├── main.py
├── readme.md
├── requirements.txt
└── src
    ├── api.py
    └── testCrawler.py

/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
python3 -m venv myenv              # create a virtual environment
source myenv/bin/activate          # activate the virtual environment

pip install -r requirements.txt    # install the Python dependencies

playwright install                 # install the browsers Playwright needs
playwright install-deps            # install Playwright's system dependencies

python3 -m main                    # run the main program
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import uvicorn
from src.api import app

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
## 🤖 Introduction

Data-source code for the AI daily-report bot; it is meant to be used together with Sealos Devbox.

For details, see https://mp.weixin.qq.com/s/C3edt0mrQ6Ql1ggNltVa9w

## 🚀 Running

1. Create a Devbox Python project and connect to it

![open_devbox](https://oss.laf.dev/lk63dw-crawl/open_devbox.jpg)

2. `git clone https://github.com/newfish-cmyk/crawl.git`, or,

   if GitHub is unreachable, use `wget https://oss.laf.dev/lk63dw-crawl/crawl-0.0.1.tar.gz`

![wget](https://oss.laf.dev/lk63dw-crawl/commond.jpg)

3. Drag the files from crawl into the root directory, replacing the existing ones

![directory](https://oss.laf.dev/lk63dw-crawl/directory.jpg)

4. Run ./entrypoint.sh and wait for the dependencies to install and the service to start

![entrypoint](https://oss.laf.dev/lk63dw-crawl/entrypoint.jpg)

5. Test through the Devbox public address: fill {{Devbox public address}}/api/test into the HTTP node and use the GET method

![link](https://oss.laf.dev/lk63dw-crawl/link.png)
![result](https://oss.laf.dev/lk63dw-crawl/result.jpg)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
uvicorn==0.32.1
fastapi==0.115.6
crawl4ai
--------------------------------------------------------------------------------
/src/api.py:
--------------------------------------------------------------------------------
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict, Any
import asyncio
from src.testCrawler import TestCrawler

app = FastAPI(
    title="Tech News API",
    description="API for crawling tech news",
    version="1.0.0"
)

# CORS configuration: allow all origins, methods, and headers
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def root():
    """Health-check endpoint"""
    return {"status": "ok"}

@app.get("/api/test", response_model=List[Dict[str, Any]])
async def get_test():
    try:
        crawler = TestCrawler(verbose=True)
        results = await asyncio.wait_for(crawler.crawl(), timeout=600.0)
        return results
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
--------------------------------------------------------------------------------
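Once the server is running (for example via `./entrypoint.sh`), the `/api/test` route above can be exercised with a short client script. The following is a minimal sketch, not part of the repository: the file name is hypothetical, the base URL assumes main.py's default host and port, and it should be replaced with the Devbox public address when testing remotely.

```python
# call_api.py - minimal sketch (not part of this repo): call the /api/test endpoint
import json
import urllib.request

# Assumption: the server was started locally via ./entrypoint.sh (main.py binds to port 8080).
# Replace with the Devbox public address when testing from outside.
BASE_URL = "http://127.0.0.1:8080"

# The endpoint can take a while; api.py allows the crawl up to 600 seconds.
with urllib.request.urlopen(f"{BASE_URL}/api/test", timeout=600) as resp:
    items = json.loads(resp.read().decode("utf-8"))

# Each item carries the title/date/content fields produced by TestCrawler.
for item in items[:3]:
    print(item["title"], "|", item["date"])
```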
/src/testCrawler.py:
--------------------------------------------------------------------------------
from crawl4ai import AsyncWebCrawler
from typing import List, Dict, Any
import json
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

class TestCrawler:
    def __init__(self, verbose=True):
        self.verbose = verbose
        self.url = "https://rsshub-ppzgvjqi.cloud.sealos.io/telegram/channel/xhqcankao"

    async def get_strategy(self):
        # CSS extraction schema: pull title / pubDate / description out of every feed item
        schema = {
            "name": "Test schema",
            "baseSelector": "item",
            "isList": True,
            "fields": [
                {
                    "name": "title",
                    "selector": "title",
                    "type": "text",
                },
                {
                    "name": "date",
                    "selector": "pubDate",
                    "type": "text",
                },
                {
                    "name": "content",
                    "selector": "description",
                    "type": "text",
                },
            ],
        }
        return JsonCssExtractionStrategy(schema, verbose=True)

    async def crawl(self) -> List[Dict[str, Any]]:
        """Crawl the feed and return the extracted items."""
        results = []

        async with AsyncWebCrawler(verbose=self.verbose) as crawler:
            result = await crawler.arun(
                url=self.url,
                magic=True,
                bypass_cache=True,
                extraction_strategy=await self.get_strategy()
            )

            if result.success:
                if self.verbose:
                    print(f"Crawl succeeded: {self.url}")
                content = json.loads(result.extracted_content)

                print(content)
                for item in content:
                    try:
                        # Keep only non-empty entries that are not images or markdown headings
                        if (item['content'].strip() and
                                not item['content'].startswith('![') and
                                not item['content'].startswith('#')):

                            results.append({
                                "title": item['title'].strip(),
                                "date": item['date'].strip(),
                                "content": item['content'].strip()
                            })
                    except Exception as e:
                        if self.verbose:
                            print(f"Error while processing an item: {str(e)}")

                return results
            else:
                if self.verbose:
                    print(f"Crawl failed: {self.url}, error: {result.error_message}")
                return []
--------------------------------------------------------------------------------
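For quick local debugging it can also be handy to run the crawler without going through FastAPI. Below is a minimal sketch, not part of the repository; the file name is hypothetical, it only reuses `TestCrawler` as defined in src/testCrawler.py, and it assumes the dependencies from requirements.txt (plus the Playwright browsers) are installed and that it is run from the project root.

```python
# debug_crawl.py - minimal sketch (not part of this repo): run TestCrawler directly
import asyncio

from src.testCrawler import TestCrawler


async def main() -> None:
    crawler = TestCrawler(verbose=True)
    items = await crawler.crawl()  # list of {"title", "date", "content"} dicts
    print(f"Fetched {len(items)} items")
    for item in items[:3]:
        print(item["title"])


if __name__ == "__main__":
    asyncio.run(main())
```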