├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md └── workflows │ └── workflow.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── .nojekyll ├── README.md ├── _coverpage.md ├── _navbar.md ├── _sidebar.md ├── command │ └── cmdline.md ├── favicon.ico ├── feapder_platform │ ├── feaplat.md │ ├── feaplat_bak.md │ ├── question.md │ └── usage.md ├── foreword │ ├── 10分钟上手.md │ ├── 功能概览.md │ └── 架构设计.md ├── images │ └── qingguo.jpg ├── index.html ├── lib │ ├── docsify-copy-code │ │ └── docsify-copy-code.min.js │ ├── docsify │ │ └── lib │ │ │ ├── docsify.min.js │ │ │ ├── plugins │ │ │ ├── docsify-edit-on-github.js │ │ │ ├── ga.js │ │ │ └── search.js │ │ │ └── themes │ │ │ └── vue.css │ └── prismjs │ │ └── components │ │ ├── prism-bash.js │ │ ├── prism-java.js │ │ ├── prism-python.js │ │ ├── prism-sql.js │ │ └── prism-yaml.js ├── question │ ├── setting不生效问题.md │ ├── 安装问题.md │ ├── 请求问题.md │ └── 运行问题.md ├── robots.txt ├── source_code │ ├── BaseParser.md │ ├── BatchParser.md │ ├── BatchSpider进阶.md │ ├── Item.md │ ├── MongoDB.md │ ├── MysqlDB.md │ ├── RedisDB.md │ ├── Request.md │ ├── Response.md │ ├── Spider进阶.md │ ├── UpdateItem.md │ ├── UserPool.md │ ├── custom_downloader.md │ ├── dedup.md │ ├── logger.md │ ├── pipeline.md │ ├── proxy.md │ ├── tools.md │ ├── 报警及监控.md │ ├── 浏览器渲染-Playwright.md │ ├── 浏览器渲染-Selenium.md │ ├── 监控打点.md │ └── 配置文件.md └── usage │ ├── AirSpider.md │ ├── BatchSpider.md │ ├── Spider.md │ ├── TaskSpider.md │ ├── 使用前必读.md │ └── 爬虫集成.md ├── feapder ├── VERSION ├── __init__.py ├── buffer │ ├── __init__.py │ ├── item_buffer.py │ └── request_buffer.py ├── commands │ ├── __init__.py │ ├── cmdline.py │ ├── create │ │ ├── __init__.py │ │ ├── create_cookies.py │ │ ├── create_init.py │ │ ├── create_item.py │ │ ├── create_json.py │ │ ├── create_params.py │ │ ├── create_project.py │ │ ├── create_setting.py │ │ ├── create_spider.py │ │ └── create_table.py │ ├── create_builder.py │ ├── retry.py │ ├── shell.py │ └── zip.py ├── core │ ├── __init__.py │ ├── base_parser.py │ ├── collector.py │ ├── handle_failed_items.py │ ├── handle_failed_requests.py │ ├── parser_control.py │ ├── scheduler.py │ └── spiders │ │ ├── __init__.py │ │ ├── air_spider.py │ │ ├── batch_spider.py │ │ ├── spider.py │ │ └── task_spider.py ├── db │ ├── __init__.py │ ├── memorydb.py │ ├── mongodb.py │ ├── mysqldb.py │ └── redisdb.py ├── dedup │ ├── README.md │ ├── __init__.py │ ├── basefilter.py │ ├── bitarray.py │ ├── bloomfilter.py │ ├── expirefilter.py │ └── litefilter.py ├── network │ ├── __init__.py │ ├── downloader │ │ ├── __init__.py │ │ ├── _playwright.py │ │ ├── _requests.py │ │ ├── _selenium.py │ │ └── base.py │ ├── item.py │ ├── proxy_pool │ │ ├── __init__.py │ │ ├── base.py │ │ └── proxy_pool.py │ ├── proxy_pool_old.py │ ├── request.py │ ├── response.py │ ├── selector.py │ ├── user_agent.py │ └── user_pool │ │ ├── __init__.py │ │ ├── base_user_pool.py │ │ ├── gold_user_pool.py │ │ ├── guest_user_pool.py │ │ └── normal_user_pool.py ├── pipelines │ ├── __init__.py │ ├── console_pipeline.py │ ├── mongo_pipeline.py │ └── mysql_pipeline.py ├── requirements.txt ├── setting.py ├── templates │ ├── air_spider_template.tmpl │ ├── batch_spider_template.tmpl │ ├── item_template.tmpl │ ├── project_template │ │ ├── CHECK_DATA.md │ │ ├── README.md │ │ ├── items │ │ │ └── __init__.py │ │ ├── main.py │ │ ├── setting.py │ │ └── spiders │ │ │ └── __init__.py │ ├── spider_template.tmpl │ ├── task_spider_template.tmpl │ └── update_item_template.tmpl 
└── utils │ ├── __init__.py │ ├── custom_argparse.py │ ├── email_sender.py │ ├── js │ ├── intercept.js │ └── stealth.min.js │ ├── log.py │ ├── metrics.py │ ├── perfect_dict.py │ ├── redis_lock.py │ ├── tail_thread.py │ ├── tools.py │ └── webdriver │ ├── __init__.py │ ├── playwright_driver.py │ ├── selenium_driver.py │ ├── webdirver.py │ └── webdriver_pool.py ├── setup.py └── tests ├── air-spider ├── test_air_spider.py ├── test_air_spider_filter.py ├── test_air_spider_item.py └── test_render_spider.py ├── batch-spider-integration ├── batch_spider_integration_task.sql ├── items │ └── __init__.py ├── main.py ├── setting.py └── spiders │ ├── __init__.py │ ├── sina_news_parser.py │ └── tencent_news_parser.py ├── batch-spider ├── items │ ├── __init__.py │ └── spider_data_item.py ├── main.py ├── setting.py ├── spiders │ ├── __init__.py │ └── test_spider.py └── table.sql ├── db └── test_redis.py ├── jd_spider.py ├── mongo_spider.py ├── spider-integration ├── items │ └── __init__.py ├── main.py ├── setting.py └── spiders │ ├── __init__.py │ ├── sina_news_parser.py │ └── tencent_news_parser.py ├── spider ├── items │ ├── __init__.py │ └── spider_data_item.py ├── main.py ├── setting.py ├── spiders │ ├── __init__.py │ ├── test_spider.py │ └── test_spider2.py └── table.sql ├── task-spider └── test_task_spider.py ├── test-debugger ├── README.md ├── items │ └── __init__.py ├── main.py ├── setting.py └── spiders │ ├── __init__.py │ └── test_debugger.py ├── test-pipeline ├── items │ ├── __init__.py │ └── spider_data_item.py ├── main.py ├── pipeline.py ├── setting.py ├── spiders │ ├── __init__.py │ └── test_spider.py └── table.sql ├── test_dedup.py ├── test_download_midware.py ├── test_lock.py ├── test_log.py ├── test_metrics.py ├── test_mongodb.py ├── test_mysqldb.py ├── test_playwright.py ├── test_playwright2.py ├── test_rander.py ├── test_rander2.py ├── test_rander3.py ├── test_rander_xhr.py ├── test_redisdb.py ├── test_request.py ├── test_spider_params.py ├── test_task.py ├── test_template └── test_spider.py ├── test_tools.py ├── test_webdriver.py └── user_pool ├── test_gold_user_pool.py ├── test_guest_user_pool.py └── test_normal_user_pool.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **需知** 11 | 12 | 升级feapder,保证feapder是最新版,若BUG仍然存在,则详细描述问题 13 | > pip install --upgrade feapder 14 | 15 | **问题** 16 | 17 | **截图** 18 | 19 | **代码** 20 | 21 | ```python 22 | 23 | ``` 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 2 | blank_issues_allowed: false # We have a blank template which assigns labels 3 | contact_links: 4 | - name: Questions about using feapder? 
5 | url: "https://github.com/Boris-code/feapder/discussions" 6 | about: Please see our guide on how to ask questions -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/workflow.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/.github/workflows/workflow.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | files/* 2 | .DS_Store 3 | .idea/* 4 | */.idea/* 5 | venv/* 6 | venv2/* 7 | *.pyc 8 | *test.py 9 | *.log 10 | **/proxy_file 11 | build/ 12 | dist/ 13 | *.egg-info/ 14 | .vscode/ 15 | media/ 16 | .MWebMetaData/ 17 | push.sh 18 | assets/ -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # 贡献指南 2 | 感谢你的宝贵时间。你的贡献将使这个项目变得更好!在提交贡献之前,请务必花点时间阅读下面的入门指南。 3 | 4 | ## 提交 Pull Request 5 | 1. Fork [此仓库](https://github.com/Boris-code/feapder.git), 6 | 2. clone到本地,从 `develop` 创建分支,对代码进行更改。 7 | 3. 请确保进行了相应的测试。 8 | 4. 推送代码到自己Fork的仓库中。 9 | 5. 在Fork的仓库中点击 Pull request 链接 10 | 6. 点击「New pull request」按钮。 11 | 7. 填写提交说明后,「Create pull request」。提交到`develop`分支。 12 | 13 | ## License 14 | 15 | [MIT](./LICENSE) 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Modifications: 4 | 5 | Copyright (c) 2020 Boris 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | 4 | include feapder/requirements.txt 5 | include feapder/VERSION 6 | 7 | recursive-include feapder/utils/js * 8 | recursive-include feapder/templates * 9 | recursive-include tests * 10 | 11 | global-exclude __pycache__ *.py[cod] -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/docs/.nojekyll -------------------------------------------------------------------------------- /docs/_coverpage.md: -------------------------------------------------------------------------------- 1 | ![feapder](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/02/08/feapder.png) 2 | 3 | # feapder 爬虫框架文档 4 | 5 | > [ˈfiːpdə] 6 | 7 | feapder 命名源于 fast-easy-air-pro-spider 缩写 8 | 9 | 秉承着开发快速、抓取快速、简单、轻量且功能强大的原则,倾心打造。 10 | 11 | 支持轻量级爬虫、分布式爬虫、批次爬虫、爬虫集成,以及完善的报警等。 12 | 13 | 14 | [GitHub](https://github.com/Boris-code/feapder) 15 | [Get Started](README.md) 16 | -------------------------------------------------------------------------------- /docs/_navbar.md: -------------------------------------------------------------------------------- 1 | 2 | * [爬虫管理系统](feapder_platform/feaplat.md) 3 | * [爬虫工具库](https://spidertools.cn) 4 | * [知识星球](https://t.zsxq.com/mmAmAuF) 5 | * [微信公众号](https://open.weixin.qq.com/qr/code?username=gh_870ffb1242a7) 6 | * [知乎](https://www.zhihu.com/people/boris-97-17/posts) 7 | * [讨论](https://gitter.im/feapder/community?utm_source=share-link&utm_medium=link&utm_campaign=share-link) -------------------------------------------------------------------------------- /docs/_sidebar.md: -------------------------------------------------------------------------------- 1 | * 序章 2 | * [简介及安装](README.md) 3 | * [10分钟快速上手](foreword/10分钟上手.md) 4 | * [架构设计](foreword/架构设计.md) 5 | * [功能概览](foreword/功能概览.md) 6 | 7 | * 常用工具 8 | * [命令行工具](command/cmdline.md) 9 | 10 | * 使用说明 11 | * [使用前必读](usage/使用前必读.md) 12 | * [轻量爬虫-AirSpider](usage/AirSpider.md) 13 | * [分布式爬虫-Spider](usage/Spider.md) 14 | * [任务爬虫-TaskSpider](usage/TaskSpider.md) 15 | * [批次爬虫-BatchSpider](usage/BatchSpider.md) 16 | * [爬虫集成](usage/爬虫集成.md) 17 | 18 | * 使用进阶 19 | * [请求-Request](source_code/Request.md) 20 | * [响应-Response](source_code/Response.md) 21 | * [代理使用说明](source_code/proxy.md) 22 | * [用户池说明](source_code/UserPool.md) 23 | * [浏览器渲染-Selenium](source_code/浏览器渲染-Selenium.md) 24 | * [浏览器渲染-Playwright](source_code/浏览器渲染-Playwright) 25 | * [解析器-BaseParser](source_code/BaseParser.md) 26 | * [批次解析器-BatchParser](source_code/BatchParser.md) 27 | * [Spider进阶](source_code/Spider进阶.md) 28 | * [BatchSpider进阶](source_code/BatchSpider进阶.md) 29 | * [配置文件](source_code/配置文件.md) 30 | * [Item](source_code/Item.md) 31 | * [UpdateItem](source_code/UpdateItem.md) 32 | * [数据管道-pipeline](source_code/pipeline.md) 33 | * [MysqlDB](source_code/MysqlDB.md) 34 | * [MongoDB](source_code/MongoDB.md) 35 | * [RedisDB](source_code/RedisDB.md) 36 | * 
[工具库-tools](source_code/tools.md) 37 | * [日志配置及使用](source_code/logger.md) 38 | * [海量数据去重-dedup](source_code/dedup.md) 39 | * [报警及监控](source_code/报警及监控.md) 40 | * [监控打点](source_code/监控打点.md) 41 | * [自定义下载器](source_code/custom_downloader.md) 42 | 43 | * 爬虫管理系统 44 | * [简介及部署](feapder_platform/feaplat.md) 45 | * [使用说明](feapder_platform/usage.md) 46 | * [常见问题](feapder_platform/question.md) 47 | 48 | * 常见问题 49 | * [安装问题](question/安装问题.md) 50 | * [运行问题](question/运行问题.md) 51 | * [请求问题](question/请求问题.md) 52 | * [setting不生效问题](question/setting不生效问题.md) -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/docs/favicon.ico -------------------------------------------------------------------------------- /docs/feapder_platform/usage.md: -------------------------------------------------------------------------------- 1 | # FEAPLAT使用说明 2 | 3 | ## 首次运行须知 4 | 5 | 1. 管理系统默认账号密码:admin / admin 6 | 7 | ## 添加项目 8 | 9 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/17/16318800747189.jpg) 10 | 11 | 1. 使用git方式上传项目时,需要使用SSH协议,若拉取私有项目,可在feaplat的设置页面添加 SSH 密钥。使用git方式,每次运行前会拉取默认分支最新的代码 12 | 2. 项目会被放到爬虫`worker`容器的根目录下 即 `/项目文件` 13 | 3. 工作路径:是指你的项目路径,比如下面的项目结构: 14 | 15 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315322995977.jpg) 16 | 17 | 工作路径为 `/spider-project`,feaplat会进入到这个目录,后续的代码执行命令都是在这个路径下运行的 18 | 19 | 1. requirements.txt:用于安装依赖包,填写依赖包的绝对路径 20 | 21 | ## 运行 22 | 23 | 1. 启动命令:启动命令是在您添加项目时配置的工作路径下执行的 24 | 2. 定时类型: 25 | 1. cron:crontab表达式,参考:https://tool.lu/crontab/ 26 | 2. interval:时间间隔 27 | 3. date:指定日期 28 | 4. once:立即运行,且只运行一次 29 | 30 | ## 示例 31 | 32 | 1. 准备项目,项目结构如下: 33 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343707944750.jpg) 34 | 2. 压缩后上传:(推荐使用 `feapder zip` 命令压缩) 35 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343709590040.jpg) 36 | - 工作路径:上传的项目会被放到docker里的根目录下(跟你本机项目路径没关系),然后解压运行。因`feapder_demo.zip`解压后为`feapder_demo`,所以工作路径配置`/feapder_demo` 37 | - 本项目没依赖,可以不配置`requirements.txt` 38 | - 若需要第三放库,则在项目下创建requirements.txt文件,把依赖库写进去,然后路径指向这个文件即可,如`/feaplat_demo/requirements.txt` 39 | 1. 点击项目进入任务列表,添加任务 40 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343712604864.jpg) 41 | 启动命令的执行位置是在上面配置的工作路径下执行的,定时类型为once时点击确认添加会自动执行 42 | 1. 
查看任务实例: 43 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343720658671.jpg) 44 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343720862217.jpg) 45 | 46 | 可以看到已经运行完毕 47 | 48 | ## git方式拉取私有项目 49 | 50 | 拉取私有项目需在git仓库里添加如下公钥 51 | 52 | ``` 53 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCd/k/tjbcMislEunjtYQNXxz5tgEDc/fSvuLHBNUX4PtfmMQ07TuUX2XJIIzLRPaqv3nsMn3+QZrV0xQd545FG1Cq83JJB98ATTW7k5Q0eaWXkvThdFeG5+n85KeVV2W4BpdHHNZ5h9RxBUmVZPpAZacdC6OUSBYTyCblPfX9DvjOk+KfwAZVwpJSkv4YduwoR3DNfXrmK5P+wrYW9z/VHUf0hcfWEnsrrHktCKgohZn9Fe8uS3B5wTNd9GgVrLGRk85ag+CChoqg80DjgFt/IhzMCArqwLyMn7rGG4Iu2Ie0TcdMc0TlRxoBhqrfKkN83cfQ3gDf41tZwp67uM9ZN feapder@qq.com 54 | ``` 55 | 56 | 或在系统设置页面配置您的SSH私钥,然后在git仓库里添加您的公钥,例如: 57 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/19/16346353514967.jpg) 58 | 59 | 注意,公私钥加密方式为RSA,其他的可能会有问题 60 | 61 | 生成RSA公私钥方式如下: 62 | ```shell 63 | ssh-keygen -t rsa -C "备注" -f 生成路径/文件名 64 | ``` 65 | 如: 66 | `ssh-keygen -t rsa -C "feaplat" -f id_rsa` 67 | 然后一路回车,不要输密码 68 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/11/17/16371210640228.jpg) 69 | 最终生成 `id_rsa`、`id_rsa.pub` 文件,复制`id_rsa.pub`文件内容到git仓库,复制`id_rsa`文件内容到feaplat爬虫管理系统 70 | 71 | 72 | 73 | ## 爬虫监控 74 | 75 | > 若您使用的是feapder爬虫或者使用了自定义打点,监控才会有对应的数据 76 | 77 | 1. 表名:以 task_id 命名 78 | 2. 保留策略:这是influxdb的概念,监控数据默认保留180天,滚动更新,这个保留策略为`feapder_180d`,同时也被设置成了默认策略`default`。所以直接用`default`就可以。 79 | 80 | ## 系统设置 81 | 82 | 1. GIT_SSH_PRIVATE_KEY:可以在自己的笔记本上使用`cat .ssh/id_rsa`查看,然后把内容复制到进来。不了解git ssh协议的,自行查资料 83 | 84 | ## 更新版本 85 | 86 | ``` 87 | git pull 88 | docker-compose up -d 89 | ``` 90 | 依次执行以上命令即可 91 | -------------------------------------------------------------------------------- /docs/foreword/功能概览.md: -------------------------------------------------------------------------------- 1 | # FEAPDER 2 | 3 | ## 1. 支持周期性采集 4 | 5 | 周期性抓取是爬虫中常见的需求,如每日抓取一次商品的销量等,我们把每个周期称为一个批次。 6 | 7 | 本框架支持批次采集,引入了批次表的概念,详细记录了每一批次的抓取状态 8 | 9 | ![-w899](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084680404224.jpg) 10 | 11 | ## 2. 支持分布式采集 12 | 13 | 面对海量的数据,分布式采集必不可少的,本框架支持分布式,且可随时重启爬虫,任务不丢失 14 | 15 | ## 3. 支持爬虫集成 16 | 17 | 本功能可以将多个爬虫以插件的形式集成为一个爬虫,常用于采集周期一致,需求一致的,但需要采集多个数据源的项目 18 | 19 | ## 4. 支持海量数据去重 20 | 21 | 框架内置3种去重机制,通过简单的配置可对任务及数据自动去重,也可拿出来单独作为模块使用,支持批量去重。 22 | 23 | 1. 临时去重:处理一万条数据约0.26秒。 去重一亿条数据占用内存约1.43G,可指定去重的失效周期 24 | 2. 内存去重:处理一万条数据约0.5秒。 去重一亿条数据占用内存约285MB 25 | 3. 永久去重:处理一万条数据约3.5秒。去重一亿条数据占用内存约285MB 26 | 27 | ## 5. 数据采集完整性 28 | 29 | feapder对于每一条URL数据的抓取采取了强状态的控制,做到采集任务中URL抓取100%不丢失,即使多次尝试失败的URL也会进入错误队列并记录失败原因日志。这一特性对于很多强依赖采集数据的业务场景非常重要,保证数据用的放心。 30 | 31 | ## 6. 数据自动入库 32 | 33 | 只需要根据数据库表自动生成item,然后给item属性赋值,直接yield 返回即可批量入库 34 | 35 | ## 7. 支持Debug模式 36 | 37 | 爬虫支持debug模式,debug模式下默认数据不入库、不修改任务状态。可针对某个任务进行调试,方便开发 38 | 39 | ## 8. 完善的报警机制 40 | 41 | 为了保证数据的全量性、准确性、时效性,本框架内置报警机制,有了这些报警,我们可以实时掌握爬虫状态 42 | 43 | 1. 实时计算爬虫抓取速度,估算剩余时间,在指定的抓取周期内预判是否会超时 44 | 45 | ![-w657](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084718683378.jpg) 46 | 47 | 48 | 2. 爬虫卡死报警 49 | 50 | ![-w501](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084718974597.jpg) 51 | 52 | 3. 爬虫任务失败数过多报警,可能是由于网站模板改动或封堵导致 53 | 54 | ![-w416](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/29/16092335882158.jpg) 55 | 56 | ## 9. 
下载监控 57 | 58 | 框架对请求总数、成功数、失败数、解析异常数进行监控,将数据点打入到influxdb,结合Grafana面板,可方便掌握抓取情况 59 | 60 | ![-w1299](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/02/09/16128568548280.jpg) 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/foreword/架构设计.md: -------------------------------------------------------------------------------- 1 | 2 | # 框架流程图 3 | 4 | ![boris-spider -1-](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/06/08/borisspider-1.png) 5 | 6 | ## 模块说明: 7 | 8 | * spider **框架调度核心** 9 | * parser_control **模版控制器**,负责调度parser 10 | * collector **任务收集器**,负责从任务队列中批量取任务到内存,以减少爬虫对任务队列数据库的访问频率及并发量 11 | * parser **数据解析器** 12 | * start_request 初始任务下发函数 13 | * item_buffer **数据缓冲队列**,批量将数据存储到数据库中 14 | * request_buffer **请求任务缓冲队列**,批量将请求任务存储到任务队列中 15 | * request **数据下载器**,封装了requests,用于从互联网上下载数据 16 | * response **请求响应**,封装了response, 支持xpath、css、re等解析方式,自动处理中文乱码 17 | 18 | ## 流程说明: 19 | 20 | 1. spider调度**start_request**生产任务 21 | 2. **start_request**下发任务到request_buffer中 22 | 3. spider调度**request_buffer**批量将任务存储到任务队列数据库中 23 | 4. spider调度**collector**从任务队列中批量获取任务到内存队列 24 | 5. spider调度**parser_control**从collector的内存队列中获取任务 25 | 6. **parser_control**调度**request**请求数据 26 | 7. **request**请求与下载数据 27 | 8. request将下载后的数据给**response**,进一步封装 28 | 9. 将封装好的**response**返回给**parser_control**(图示为多个parser_control,表示多线程) 29 | 10. parser_control调度对应的**parser**,解析返回的response(图示多组parser表示不同的网站解析器) 30 | 11. parser_control将parser解析到的数据item及新产生的request分发到**item_buffer**与**request_buffer** 31 | 12. spider调度**item_buffer**与**request_buffer**将数据批量入库 32 | 33 | -------------------------------------------------------------------------------- /docs/images/qingguo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/docs/images/qingguo.jpg -------------------------------------------------------------------------------- /docs/lib/docsify-copy-code/docsify-copy-code.min.js: -------------------------------------------------------------------------------- 1 | /*! 
2 | * docsify-copy-code 3 | * v2.1.0 4 | * https://github.com/jperasmus/docsify-copy-code 5 | * (c) 2017-2019 JP Erasmus 6 | * MIT license 7 | */ 8 | !function(){"use strict";function r(o){return(r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(o){return typeof o}:function(o){return o&&"function"==typeof Symbol&&o.constructor===Symbol&&o!==Symbol.prototype?"symbol":typeof o})(o)}!function(o,e){void 0===e&&(e={});var t=e.insertAt;if(o&&"undefined"!=typeof document){var n=document.head||document.getElementsByTagName("head")[0],c=document.createElement("style");c.type="text/css","top"===t&&n.firstChild?n.insertBefore(c,n.firstChild):n.appendChild(c),c.styleSheet?c.styleSheet.cssText=o:c.appendChild(document.createTextNode(o))}}(".docsify-copy-code-button,.docsify-copy-code-button span{cursor:pointer;transition:all .25s ease}.docsify-copy-code-button{position:absolute;z-index:1;top:0;right:0;overflow:visible;padding:.65em .8em;border:0;border-radius:0;outline:0;font-size:1em;background:grey;background:var(--theme-color,grey);color:#fff;opacity:0}.docsify-copy-code-button span{border-radius:3px;background:inherit;pointer-events:none}.docsify-copy-code-button .error,.docsify-copy-code-button .success{position:absolute;z-index:-100;top:50%;left:0;padding:.5em .65em;font-size:.825em;opacity:0;-webkit-transform:translateY(-50%);transform:translateY(-50%)}.docsify-copy-code-button.error .error,.docsify-copy-code-button.success .success{opacity:1;-webkit-transform:translate(-115%,-50%);transform:translate(-115%,-50%)}.docsify-copy-code-button:focus,pre:hover .docsify-copy-code-button{opacity:1}"),document.querySelector('link[href*="docsify-copy-code"]')&&console.warn("[Deprecation] Link to external docsify-copy-code stylesheet is no longer necessary."),window.DocsifyCopyCodePlugin={init:function(){return function(o,e){o.ready(function(){console.warn("[Deprecation] Manually initializing docsify-copy-code using window.DocsifyCopyCodePlugin.init() is no longer necessary.")})}}},window.$docsify=window.$docsify||{},window.$docsify.plugins=[function(o,s){o.doneEach(function(){var o=Array.apply(null,document.querySelectorAll("pre[data-lang]")),c={buttonText:"Copy to clipboard",errorText:"Error",successText:"Copied"};s.config.copyCode&&Object.keys(c).forEach(function(t){var n=s.config.copyCode[t];"string"==typeof n?c[t]=n:"object"===r(n)&&Object.keys(n).some(function(o){var e=-1',''.concat(c.buttonText,""),''.concat(c.errorText,""),''.concat(c.successText,""),""].join("");o.forEach(function(o){o.insertAdjacentHTML("beforeend",e)})}),o.mounted(function(){document.querySelector(".content").addEventListener("click",function(o){if(o.target.classList.contains("docsify-copy-code-button")){var e="BUTTON"===o.target.tagName?o.target:o.target.parentNode,t=document.createRange(),n=e.parentNode.querySelector("code"),c=window.getSelection();t.selectNode(n),c.removeAllRanges(),c.addRange(t);try{document.execCommand("copy")&&(e.classList.add("success"),setTimeout(function(){e.classList.remove("success")},1e3))}catch(o){console.error("docsify-copy-code: ".concat(o)),e.classList.add("error"),setTimeout(function(){e.classList.remove("error")},1e3)}"function"==typeof(c=window.getSelection()).removeRange?c.removeRange(t):"function"==typeof c.removeAllRanges&&c.removeAllRanges()}})})}].concat(window.$docsify.plugins||[])}(); 9 | //# sourceMappingURL=docsify-copy-code.min.js.map 10 | -------------------------------------------------------------------------------- 
/docs/lib/docsify/lib/plugins/docsify-edit-on-github.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Minified by jsDelivr using Terser v3.14.1. 3 | * Original file: /npm/docsify-edit-on-github@1.0.3/index.js 4 | * 5 | * Do NOT use SRI with dynamically generated files! More information: https://www.jsdelivr.com/using-sri-with-dynamic-files 6 | */ 7 | ! function(t) { 8 | t.EditOnGithubPlugin = {}, t.EditOnGithubPlugin.create = function(n, i, e) { 9 | function u(t) { 10 | return header = ['
', 11 | '

', 'memo  ', t, "

", "
" 13 | ].join("") 14 | } 15 | return e = e || "Edit on github", i = i || n.replace(/\/blob\//, "/edit/"), t.EditOnGithubPlugin.editDoc = 16 | function(t, n) { 17 | var e = n.route.file; 18 | if (e) { 19 | var u = i + e; 20 | return window.open(u), t.preventDefault(), !1 21 | } 22 | return !0 23 | }, 24 | function(n, i) { 25 | if (t.EditOnGithubPlugin.onClick = function(t) { 26 | EditOnGithubPlugin.editDoc(t, i) 27 | }, (r = e) && "[object Function]" === {}.toString.call(r)) n.afterEach(function(t) { 28 | return u(e(i.route.file)) + t 29 | }); 30 | else { 31 | var o = u(e); 32 | n.afterEach(function(t) { 33 | return o + t 34 | }) 35 | } 36 | var r 37 | } 38 | } 39 | }(window); 40 | //# sourceMappingURL=/sm/eef821f4877f09e27be373326100cefe923735a9bb303de51b16f9079d063a86.map -------------------------------------------------------------------------------- /docs/lib/docsify/lib/plugins/ga.js: -------------------------------------------------------------------------------- 1 | (function () { 2 | // From https://github.com/egoist/vue-ga/blob/master/src/index.js 3 | function appendScript() { 4 | var script = document.createElement('script'); 5 | script.async = true; 6 | script.src = 'https://www.google-analytics.com/analytics.js'; 7 | document.body.appendChild(script); 8 | } 9 | 10 | function init(id) { 11 | appendScript(); 12 | window.ga = 13 | window.ga || 14 | function () { 15 | (window.ga.q = window.ga.q || []).push(arguments); 16 | }; 17 | window.ga.l = Number(new Date()); 18 | window.ga('create', id, 'auto'); 19 | } 20 | 21 | function collect() { 22 | if (!window.ga) { 23 | init($docsify.ga); 24 | } 25 | 26 | window.ga('set', 'page', location.hash); 27 | window.ga('send', 'pageview'); 28 | } 29 | 30 | var install = function (hook) { 31 | if (!$docsify.ga) { 32 | console.error('[Docsify] ga is required.'); 33 | return 34 | } 35 | 36 | hook.beforeEach(collect); 37 | }; 38 | 39 | $docsify.plugins = [].concat(install, $docsify.plugins); 40 | 41 | }()); 42 | -------------------------------------------------------------------------------- /docs/lib/prismjs/components/prism-java.js: -------------------------------------------------------------------------------- 1 | (function (Prism) { 2 | 3 | var keywords = /\b(?:abstract|assert|boolean|break|byte|case|catch|char|class|const|continue|default|do|double|else|enum|exports|extends|final|finally|float|for|goto|if|implements|import|instanceof|int|interface|long|module|native|new|null|open|opens|package|private|protected|provides|public|requires|return|short|static|strictfp|super|switch|synchronized|this|throw|throws|to|transient|transitive|try|uses|var|void|volatile|while|with|yield)\b/; 4 | 5 | // based on the java naming conventions 6 | var className = /\b[A-Z](?:\w*[a-z]\w*)?\b/; 7 | 8 | Prism.languages.java = Prism.languages.extend('clike', { 9 | 'class-name': [ 10 | className, 11 | 12 | // variables and parameters 13 | // this to support class names (or generic parameters) which do not contain a lower case letter (also works for methods) 14 | /\b[A-Z]\w*(?=\s+\w+\s*[;,=())])/ 15 | ], 16 | 'keyword': keywords, 17 | 'function': [ 18 | Prism.languages.clike.function, 19 | { 20 | pattern: /(\:\:)[a-z_]\w*/, 21 | lookbehind: true 22 | } 23 | ], 24 | 'number': /\b0b[01][01_]*L?\b|\b0x[\da-f_]*\.?[\da-f_p+-]+\b|(?:\b\d[\d_]*\.?[\d_]*|\B\.\d[\d_]*)(?:e[+-]?\d[\d_]*)?[dfl]?/i, 25 | 'operator': { 26 | pattern: /(^|[^.])(?:<<=?|>>>?=?|->|--|\+\+|&&|\|\||::|[?:~]|[-+*/%&|^!=<>]=?)/m, 27 | lookbehind: true 28 | } 29 | }); 30 | 31 | 
Prism.languages.insertBefore('java', 'string', { 32 | 'triple-quoted-string': { 33 | // http://openjdk.java.net/jeps/355#Description 34 | pattern: /"""[ \t]*[\r\n](?:(?:"|"")?(?:\\.|[^"\\]))*"""/, 35 | greedy: true, 36 | alias: 'string' 37 | } 38 | }); 39 | 40 | Prism.languages.insertBefore('java', 'class-name', { 41 | 'annotation': { 42 | alias: 'punctuation', 43 | pattern: /(^|[^.])@\w+/, 44 | lookbehind: true 45 | }, 46 | 'namespace': { 47 | pattern: /(\b(?:exports|import(?:\s+static)?|module|open|opens|package|provides|requires|to|transitive|uses|with)\s+)[a-z]\w*(?:\.[a-z]\w*)+/, 48 | lookbehind: true, 49 | inside: { 50 | 'punctuation': /\./, 51 | } 52 | }, 53 | 'generics': { 54 | pattern: /<(?:[\w\s,.&?]|<(?:[\w\s,.&?]|<(?:[\w\s,.&?]|<[\w\s,.&?]*>)*>)*>)*>/, 55 | inside: { 56 | 'class-name': className, 57 | 'keyword': keywords, 58 | 'punctuation': /[<>(),.:]/, 59 | 'operator': /[?&|]/ 60 | } 61 | } 62 | }); 63 | }(Prism)); 64 | -------------------------------------------------------------------------------- /docs/lib/prismjs/components/prism-python.js: -------------------------------------------------------------------------------- 1 | Prism.languages.python={comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},"string-interpolation":{pattern:/(?:f|rf|fr)(?:("""|''')[\s\S]*?\1|("|')(?:\\.|(?!\2)[^\\\r\n])*\2)/i,greedy:!0,inside:{interpolation:{pattern:/((?:^|[^{])(?:{{)*){(?!{)(?:[^{}]|{(?!{)(?:[^{}]|{(?!{)(?:[^{}])+})+})+}/,lookbehind:!0,inside:{"format-spec":{pattern:/(:)[^:(){}]+(?=}$)/,lookbehind:!0},"conversion-option":{pattern:/![sra](?=[:}]$)/,alias:"punctuation"},rest:null}},string:/[\s\S]+/}},"triple-quoted-string":{pattern:/(?:[rub]|rb|br)?("""|''')[\s\S]*?\1/i,greedy:!0,alias:"string"},string:{pattern:/(?:[rub]|rb|br)?("|')(?:\\.|(?!\1)[^\\\r\n])*\1/i,greedy:!0},function:{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_]\w*(?=\s*\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)\w+/i,lookbehind:!0},decorator:{pattern:/(^\s*)@\w+(?:\.\w+)*/im,lookbehind:!0,alias:["annotation","punctuation"],inside:{punctuation:/\./}},keyword:/\b(?:and|as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|print|raise|return|try|while|with|yield)\b/,builtin:/\b(?:__import__|abs|all|any|apply|ascii|basestring|bin|bool|buffer|bytearray|bytes|callable|chr|classmethod|cmp|coerce|compile|complex|delattr|dict|dir|divmod|enumerate|eval|execfile|file|filter|float|format|frozenset|getattr|globals|hasattr|hash|help|hex|id|input|int|intern|isinstance|issubclass|iter|len|list|locals|long|map|max|memoryview|min|next|object|oct|open|ord|pow|property|range|raw_input|reduce|reload|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|unichr|unicode|vars|xrange|zip)\b/,boolean:/\b(?:True|False|None)\b/,number:/(?:\b(?=\d)|\B(?=\.))(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*(?:\.\d*)?|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]/,punctuation:/[{}[\];(),.:]/},Prism.languages.python["string-interpolation"].inside.interpolation.inside.rest=Prism.languages.python,Prism.languages.py=Prism.languages.python; -------------------------------------------------------------------------------- /docs/lib/prismjs/components/prism-sql.js: -------------------------------------------------------------------------------- 1 | Prism.languages.sql = { 2 | 'comment': { 3 | pattern: /(^|[^\\])(?:\/\*[\s\S]*?\*\/|(?:--|\/\/|#).*)/, 4 | lookbehind: true 5 | }, 6 | 'variable': [ 7 | { 8 | pattern: 
/@(["'`])(?:\\[\s\S]|(?!\1)[^\\])+\1/, 9 | greedy: true 10 | }, 11 | /@[\w.$]+/ 12 | ], 13 | 'string': { 14 | pattern: /(^|[^@\\])("|')(?:\\[\s\S]|(?!\2)[^\\]|\2\2)*\2/, 15 | greedy: true, 16 | lookbehind: true 17 | }, 18 | 'function': /\b(?:AVG|COUNT|FIRST|FORMAT|LAST|LCASE|LEN|MAX|MID|MIN|MOD|NOW|ROUND|SUM|UCASE)(?=\s*\()/i, // Should we highlight user defined functions too? 19 | 'keyword': /\b(?:ACTION|ADD|AFTER|ALGORITHM|ALL|ALTER|ANALYZE|ANY|APPLY|AS|ASC|AUTHORIZATION|AUTO_INCREMENT|BACKUP|BDB|BEGIN|BERKELEYDB|BIGINT|BINARY|BIT|BLOB|BOOL|BOOLEAN|BREAK|BROWSE|BTREE|BULK|BY|CALL|CASCADED?|CASE|CHAIN|CHAR(?:ACTER|SET)?|CHECK(?:POINT)?|CLOSE|CLUSTERED|COALESCE|COLLATE|COLUMNS?|COMMENT|COMMIT(?:TED)?|COMPUTE|CONNECT|CONSISTENT|CONSTRAINT|CONTAINS(?:TABLE)?|CONTINUE|CONVERT|CREATE|CROSS|CURRENT(?:_DATE|_TIME|_TIMESTAMP|_USER)?|CURSOR|CYCLE|DATA(?:BASES?)?|DATE(?:TIME)?|DAY|DBCC|DEALLOCATE|DEC|DECIMAL|DECLARE|DEFAULT|DEFINER|DELAYED|DELETE|DELIMITERS?|DENY|DESC|DESCRIBE|DETERMINISTIC|DISABLE|DISCARD|DISK|DISTINCT|DISTINCTROW|DISTRIBUTED|DO|DOUBLE|DROP|DUMMY|DUMP(?:FILE)?|DUPLICATE|ELSE(?:IF)?|ENABLE|ENCLOSED|END|ENGINE|ENUM|ERRLVL|ERRORS|ESCAPED?|EXCEPT|EXEC(?:UTE)?|EXISTS|EXIT|EXPLAIN|EXTENDED|FETCH|FIELDS|FILE|FILLFACTOR|FIRST|FIXED|FLOAT|FOLLOWING|FOR(?: EACH ROW)?|FORCE|FOREIGN|FREETEXT(?:TABLE)?|FROM|FULL|FUNCTION|GEOMETRY(?:COLLECTION)?|GLOBAL|GOTO|GRANT|GROUP|HANDLER|HASH|HAVING|HOLDLOCK|HOUR|IDENTITY(?:_INSERT|COL)?|IF|IGNORE|IMPORT|INDEX|INFILE|INNER|INNODB|INOUT|INSERT|INT|INTEGER|INTERSECT|INTERVAL|INTO|INVOKER|ISOLATION|ITERATE|JOIN|KEYS?|KILL|LANGUAGE|LAST|LEAVE|LEFT|LEVEL|LIMIT|LINENO|LINES|LINESTRING|LOAD|LOCAL|LOCK|LONG(?:BLOB|TEXT)|LOOP|MATCH(?:ED)?|MEDIUM(?:BLOB|INT|TEXT)|MERGE|MIDDLEINT|MINUTE|MODE|MODIFIES|MODIFY|MONTH|MULTI(?:LINESTRING|POINT|POLYGON)|NATIONAL|NATURAL|NCHAR|NEXT|NO|NONCLUSTERED|NULLIF|NUMERIC|OFF?|OFFSETS?|ON|OPEN(?:DATASOURCE|QUERY|ROWSET)?|OPTIMIZE|OPTION(?:ALLY)?|ORDER|OUT(?:ER|FILE)?|OVER|PARTIAL|PARTITION|PERCENT|PIVOT|PLAN|POINT|POLYGON|PRECEDING|PRECISION|PREPARE|PREV|PRIMARY|PRINT|PRIVILEGES|PROC(?:EDURE)?|PUBLIC|PURGE|QUICK|RAISERROR|READS?|REAL|RECONFIGURE|REFERENCES|RELEASE|RENAME|REPEAT(?:ABLE)?|REPLACE|REPLICATION|REQUIRE|RESIGNAL|RESTORE|RESTRICT|RETURNS?|REVOKE|RIGHT|ROLLBACK|ROUTINE|ROW(?:COUNT|GUIDCOL|S)?|RTREE|RULE|SAVE(?:POINT)?|SCHEMA|SECOND|SELECT|SERIAL(?:IZABLE)?|SESSION(?:_USER)?|SET(?:USER)?|SHARE|SHOW|SHUTDOWN|SIMPLE|SMALLINT|SNAPSHOT|SOME|SONAME|SQL|START(?:ING)?|STATISTICS|STATUS|STRIPED|SYSTEM_USER|TABLES?|TABLESPACE|TEMP(?:ORARY|TABLE)?|TERMINATED|TEXT(?:SIZE)?|THEN|TIME(?:STAMP)?|TINY(?:BLOB|INT|TEXT)|TOP?|TRAN(?:SACTIONS?)?|TRIGGER|TRUNCATE|TSEQUAL|TYPES?|UNBOUNDED|UNCOMMITTED|UNDEFINED|UNION|UNIQUE|UNLOCK|UNPIVOT|UNSIGNED|UPDATE(?:TEXT)?|USAGE|USE|USER|USING|VALUES?|VAR(?:BINARY|CHAR|CHARACTER|YING)|VIEW|WAITFOR|WARNINGS|WHEN|WHERE|WHILE|WITH(?: ROLLUP|IN)?|WORK|WRITE(?:TEXT)?|YEAR)\b/i, 20 | 'boolean': /\b(?:TRUE|FALSE|NULL)\b/i, 21 | 'number': /\b0x[\da-f]+\b|\b\d+\.?\d*|\B\.\d+\b/i, 22 | 'operator': /[-+*\/=%^~]|&&?|\|\|?|!=?|<(?:=>?|<|>)?|>[>=]?|\b(?:AND|BETWEEN|IN|LIKE|NOT|OR|IS|DIV|REGEXP|RLIKE|SOUNDS LIKE|XOR)\b/i, 23 | 'punctuation': /[;[\]()`,.]/ 24 | }; 25 | -------------------------------------------------------------------------------- /docs/lib/prismjs/components/prism-yaml.js: -------------------------------------------------------------------------------- 1 | Prism.languages.yaml = { 2 | 'scalar': { 3 | pattern: /([\-:]\s*(?:![^\s]+)?[ \t]*[|>])[ \t]*(?:((?:\r?\n|\r)[ \t]+)[^\r\n]+(?:\2[^\r\n]+)*)/, 4 | 
lookbehind: true, 5 | alias: 'string' 6 | }, 7 | 'comment': /#.*/, 8 | 'key': { 9 | pattern: /(\s*(?:^|[:\-,[{\r\n?])[ \t]*(?:![^\s]+)?[ \t]*)[^\r\n{[\]},#\s]+?(?=\s*:\s)/, 10 | lookbehind: true, 11 | alias: 'atrule' 12 | }, 13 | 'directive': { 14 | pattern: /(^[ \t]*)%.+/m, 15 | lookbehind: true, 16 | alias: 'important' 17 | }, 18 | 'datetime': { 19 | pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)(?:\d{4}-\d\d?-\d\d?(?:[tT]|[ \t]+)\d\d?:\d{2}:\d{2}(?:\.\d*)?[ \t]*(?:Z|[-+]\d\d?(?::\d{2})?)?|\d{4}-\d{2}-\d{2}|\d\d?:\d{2}(?::\d{2}(?:\.\d*)?)?)(?=[ \t]*(?:$|,|]|}))/m, 20 | lookbehind: true, 21 | alias: 'number' 22 | }, 23 | 'boolean': { 24 | pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)(?:true|false)[ \t]*(?=$|,|]|})/im, 25 | lookbehind: true, 26 | alias: 'important' 27 | }, 28 | 'null': { 29 | pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)(?:null|~)[ \t]*(?=$|,|]|})/im, 30 | lookbehind: true, 31 | alias: 'important' 32 | }, 33 | 'string': { 34 | pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)("|')(?:(?!\2)[^\\\r\n]|\\.)*\2(?=[ \t]*(?:$|,|]|}|\s*#))/m, 35 | lookbehind: true, 36 | greedy: true 37 | }, 38 | 'number': { 39 | pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)[+-]?(?:0x[\da-f]+|0o[0-7]+|(?:\d+\.?\d*|\.?\d+)(?:e[+-]?\d+)?|\.inf|\.nan)[ \t]*(?=$|,|]|})/im, 40 | lookbehind: true 41 | }, 42 | 'tag': /![^\s]+/, 43 | 'important': /[&*][\w]+/, 44 | 'punctuation': /---|[:[\]{}\-,|>?]|\.\.\./ 45 | }; 46 | 47 | Prism.languages.yml = Prism.languages.yaml; -------------------------------------------------------------------------------- /docs/question/setting不生效问题.md: -------------------------------------------------------------------------------- 1 | # setting不生效问题 2 | 3 | ## 问题 4 | 5 | 以下面这个项目结构为例,在`spiders`目录下运行`spider_test.py`读取不到`setting.py`,所以`setting`的配置不生效。 6 | 7 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2022/11/01/16672715088563.jpg) 8 | 9 | 读取不到是因为python的环境变量问题,在spiders目录下运行,只会找spides目录下的文件 10 | 11 | ## 解决方式 12 | 13 | ### 方法1:在setting同级目录下运行 14 | 15 | 在main.py中导入spider_test, 然后运行main.py 16 | 17 | ### 方法2:设置工作区间 18 | 19 | 设置工作区间方式(以pycharm为例):项目->右键->Mark Directory as -> Sources Root 20 | 21 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2022/11/01/16672717483410.jpg) 22 | 23 | ### 方法3:设置PYTHONPATH 24 | 25 | 以mac或linux举例,执行如下命令 26 | 27 | ```shell 28 | export PYTHONPATH=$PYTHONPATH:/绝对路径/spider-project 29 | ``` 30 | 注:这个命令设置的环境变量只在当前终端有效 31 | 32 | 然后即可在spiders目录下运行 33 | 34 | ```shell 35 | python spider_test.py 36 | ``` 37 | 38 | window如何添加环境变量大家自行探索,搞定了可在评论区留言 -------------------------------------------------------------------------------- /docs/question/安装问题.md: -------------------------------------------------------------------------------- 1 | # 安装问题 2 | 3 | ## 1. bitarray问题 4 | 5 | > window下pip 安装报错 6 | 7 | 8 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/02/09/16128685646774.jpg) 9 | 10 | 解决办法:安装 Microsoft Visual C++ 工具,工具下载地址如下所示: 11 | https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe 12 | 13 | ## 2. 
AttributeError 'str' object has no attribute 'decode' 14 | 15 | > window下pip 安装报错 16 | 17 | ![670479264](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/670479264.jpg) 18 | 19 | 下载bitarray离线包,版本要求`bitarray>=1.5.3` 20 | 21 | https://www.lfd.uci.edu/~gohlke/pythonlibs/#bitarray 22 | 23 | ![-w722](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/16158992617537.jpg) 24 | 25 | 26 | 解压,进入目录下执行: 27 | 28 | python setup.py install 29 | -------------------------------------------------------------------------------- /docs/question/请求问题.md: -------------------------------------------------------------------------------- 1 | # 请求问题 2 | 3 | ## ValueError: check_hostname requires server_hostname 4 | 5 | pip install urllib3==1.25.8 6 | 7 | 参考:https://stackoverflow.com/questions/66642705/why-requests-raise-this-exception-check-hostname-requires-server-hostname -------------------------------------------------------------------------------- /docs/question/运行问题.md: -------------------------------------------------------------------------------- 1 | # 运行问题 2 | 3 | ## 1. 二次运行时卡住,不继续抓取 4 | 5 | ![1779423237](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/11/1779423237.jpg) 6 | 7 | **原因:** 8 | 9 | 因爬虫支持分布式和任务防丢,为防止任务抢占和任务丢失,巧妙地利用了redis有序集合来存储任务。 10 | 11 | 策略:有序集合有个分数,爬虫取任务时,只取小于当前时间戳分数的任务,同时将任务分数修改为当前时间戳+10分钟,当任务做完时,再主动将任务删除。 12 | 13 | 目的:将取到的任务分数修改成10分钟后,可防止其他爬虫节点取到同样的任务,同时当爬虫意外退出后,任务也不会丢失,10分钟后还可以取到。但也会导致有时爬虫启动时,明明有任务,却处于等待任务的情况。 14 | 15 | 应对等待情况: 16 | 17 | 1. 可将任务清空,重新抓取,可直接操作redis清空,或通过传参方式 18 | 19 | spider = test_spider.TestSpider(redis_key="feapder:test_spider", delete_keys="*z_requests") 20 | spider.start() 21 | 22 | delete_keys为需要删除的key,类型: 元组/bool/string,支持正则; 常用于清空任务队列,否则重启时会断点续爬,如写成`delete_keys=True`也是可以的 23 | 24 | 1. 手动修改任务分数为小于当前时间戳的分数 25 | 26 | ![-w917](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/11/16154327722622.jpg) 27 | 28 | 1. 等10分钟就好了 29 | 30 | 2. 
用debug模式开发 31 | -------------------------------------------------------------------------------- /docs/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * -------------------------------------------------------------------------------- /docs/source_code/BaseParser.md: -------------------------------------------------------------------------------- 1 | 2 | # BaseParser 3 | 4 | BaseParser为Spider的基类,用来定义任务下发与数据解析,是面向用户提供的接口 5 | 6 | ## 源码 7 | 8 | 9 | ```python 10 | class BaseParser(object): 11 | def start_requests(self): 12 | """ 13 | @summary: 添加初始url 14 | --------- 15 | --------- 16 | @result: yield Request() 17 | """ 18 | 19 | pass 20 | 21 | def download_midware(self, request): 22 | """ 23 | @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response 24 | --------- 25 | @param request: 26 | --------- 27 | @result: return request / request, response 28 | """ 29 | 30 | pass 31 | 32 | def validate(self, request, response): 33 | """ 34 | @summary: 校验函数, 可用于校验response是否正确 35 | 若函数内抛出异常,则重试请求 36 | 若返回True 或 None,则进入解析函数 37 | 若返回False,则抛弃当前请求 38 | 可通过request.callback_name 区分不同的回调函数,编写不同的校验逻辑 39 | --------- 40 | @param request: 41 | @param response: 42 | --------- 43 | @result: True / None / False 44 | """ 45 | 46 | pass 47 | 48 | def parse(self, request, response): 49 | """ 50 | @summary: 默认的解析函数 51 | --------- 52 | @param request: 53 | @param response: 54 | --------- 55 | @result: 56 | """ 57 | 58 | pass 59 | 60 | def exception_request(self, request, response): 61 | """ 62 | @summary: 请求或者parser里解析出异常的request 63 | --------- 64 | @param request: 65 | @param response: 66 | --------- 67 | @result: request / callback / None (返回值必须可迭代) 68 | """ 69 | 70 | pass 71 | 72 | def failed_request(self, request, response): 73 | """ 74 | @summary: 超过最大重试次数的request 75 | 可返回修改后的request 若不返回request,则将传进来的request直接人redis的failed表。否则将修改后的request入failed表 76 | --------- 77 | @param request: 78 | --------- 79 | @result: request / item / callback / None (返回值必须可迭代) 80 | """ 81 | 82 | pass 83 | 84 | def start_callback(self): 85 | """ 86 | @summary: 程序开始的回调 87 | --------- 88 | --------- 89 | @result: None 90 | """ 91 | 92 | pass 93 | 94 | def end_callback(self): 95 | """ 96 | @summary: 程序结束的回调 97 | --------- 98 | --------- 99 | @result: None 100 | """ 101 | 102 | pass 103 | 104 | @property 105 | def name(self): 106 | return self.__class__.__name__ 107 | 108 | def close(self): 109 | pass 110 | ``` 111 | 112 | ## 使用 113 | 114 | 以程序开始结束回调举例: 115 | 116 | ```python 117 | import feapder 118 | 119 | 120 | class TestSpider(feapder.Spider): 121 | def start_callback(self): 122 | print("爬虫开始了") 123 | 124 | def end_callback(self): 125 | print("爬虫结束了") 126 | ``` -------------------------------------------------------------------------------- /docs/source_code/BatchParser.md: -------------------------------------------------------------------------------- 1 | # BatchParser 2 | 3 | BaseParser为BatchSpider的基类,用来定义任务下发与数据解析,是面向用户提供的接口 4 | 5 | 除了提供[BaseParser](source_code/BaseParser)所有接口外,还提供以下方法 6 | 7 | ## 方法详解 8 | 9 | ### 1. 添加任务 add_task 10 | 11 | add_task, 每次执行start_monitor都会调用,且在init_task之前调用, 用于在批次爬虫启动前添加任务到数据库 12 | 13 | ``` 14 | class TestSpider(feapder.BatchSpider): 15 | def add_task(self): 16 | pass 17 | ``` 18 | 19 | ### 2. 
更新任务 20 | 21 | #### 方法一: 22 | 23 | 一条条更新 24 | 25 | ```python 26 | def update_task_state(self, task_id, state=1, **kwargs): 27 | """ 28 | @summary: 更新任务表中任务状态,做完每个任务时代码逻辑中要主动调用 29 | 调用方法为 yield lambda : self.update_task_state(task_id, state) 30 | --------- 31 | @param task_id: 任务id 32 | @param state: 任务状态 33 | --------- 34 | @result: 35 | """ 36 | ``` 37 | 38 | 举例说明 39 | 40 | ``` 41 | def parse(self, request, response): 42 | yield item # 返回item, item会自动批量入库 43 | yield lambda : self.update_task_state(request.task_id, 1) 44 | ``` 45 | 46 | 在`yield item`后,调用`self.update_task_state`函数实现任务状态更新。 47 | 48 | 这里为什么使用`yield lambda`方式呢?因为`yield item`后,item不会马上入库,会存在一个buffer中,批量入库,如果我们直接调用`self.update_task_state`更新任务状态,可能这时item还并未入库,如果此时程序意外退出,那么缓存中的这一部分item数据将会丢失,但是此时任务状态已更新,任务不会重做,这便会导致这个任务所对应的数据丢失 49 | 50 | `yield lambda`返回的是一个回调函数,这个函数并不会马上执行,系统会保证item入库后再执行,因此这么写的用意在于item入库后再更新任务状态 51 | 52 | #### 方法二: 53 | 54 | 批量更新 55 | 56 | ```python 57 | def update_task_batch(self, task_id, state=1, **kwargs): 58 | """ 59 | 批量更新任务 多处调用,更新的字段必须一致 60 | 注意:需要 写成 yield update_task_batch(...) 否则不会更新 61 | @param task_id: 62 | @param state: 63 | @param kwargs: 64 | @return: 65 | """ 66 | ``` 67 | 68 | 举例说明 69 | 70 | ```python 71 | def parse(self, request, response): 72 | yield item # 返回item, item会自动批量入库 73 | yield self.update_task_batch(request.task_id, 1) # 更新任务状态为1 74 | ``` 75 | 76 | 在`yield item`后调用`self.update_task_batch`实现批量更新 77 | 78 | 注意,批量更新必须使用 `yield`, 因为`update_task_batch`函数并未实现更新逻辑,只是返回了`UpdateItem`, `UpdateItem`与`Item`类似,只不过带有更新功能,框架会在Item入库后在调用`UpdateItem`实现批量更新。关于`UpdateItem`详解,请参考[UpdateItem]() 79 | 80 | #### 两种方式选取 81 | 82 | 同一张表,若更新字段相同,推荐使用批量更新的方式,效率更高,若字段不同,用一条条更新的方式。因为批量更新,这一批的更新字段必须一致 83 | 84 | 比如当请求失败时,将任务更新为-1,同时标记失败原因,成功时将任务更新为1,写法如下: 85 | 86 | ```python 87 | def parse(self, request, response): 88 | yield self.update_task_batch(request.task_id, 1) # 更新任务状态为1 89 | 90 | def failed_request(self, request, response): 91 | """ 92 | @summary: 超过最大重试次数的request 93 | --------- 94 | @param request: 95 | --------- 96 | @result: request / item / callback / None (返回值必须可迭代) 97 | """ 98 | 99 | yield request 100 | yield lambda : self.update_task_state(request.task_id, -1, remark="失败原因") # 更新任务状态为-1 101 | ``` 102 | 103 | 因任务失败时多更新了个remark字段,与任务成功时只更新state字段不同,因此需要将此更新操作单独拆出来,用`update_task_state`方式更新 104 | 105 | ### 3. 获取批次时间 106 | 107 | 示例: 108 | 109 | def parse(self, request, response): 110 | item = SpiderDataItem() # 声明一个item 111 | item.batch_data = self.batch_date 112 | item.title = title # 给item属性赋值 113 | yield item # 返回item, item会自动批量入库 114 | 115 | 使用`self.batch_date`可获取当前批次时间,然后拼接到item入库 116 | 117 | 数据示例 118 | 119 | | id | title | batch_date | 120 | | --- | --- | --- | 121 | | 1 | 百度一下 | 2021-01-01 | -------------------------------------------------------------------------------- /docs/source_code/Item.md: -------------------------------------------------------------------------------- 1 | # Item 2 | 3 | 有关Item的简介及创建,可参考[命令行工具](command/cmdline?id=_3-创建-item) 4 | 5 | ## 数据入库 6 | 7 | 数据自动入库,除了根据mysql表生产item外,也可以直接给item赋值,示例如下: 8 | 9 | ``` 10 | from feapder import Item 11 | 12 | item = Item() 13 | item.table_name = "spider_data" # 表名 14 | item.title = title 15 | yield item 16 | ``` 17 | 18 | 等价于: 19 | 20 | 1. 生成item 21 | 22 | ``` 23 | from feapder import Item 24 | 25 | class SpiderDataItem(Item): 26 | """ 27 | This class was generated by feapder. 28 | command: feapder create -i spider_data. 
29 | """ 30 | 31 | def __init__(self, *args, **kwargs): 32 | # self.id = None 33 | self.title = None 34 | ``` 35 | 36 | 1. 使用 37 | 38 | ``` 39 | item = SpiderDataItem() 40 | item.title = title 41 | yield item 42 | ``` 43 | 44 | ## Item指纹 45 | 46 | item指纹用于数据入库前的去重,默认为所有字段值排序后计算的md5,但当数据中有采集时间时,这种指纹计算方式明显不合理。因此我们可以通过如下方法指定参与去重的key 47 | 48 | ``` 49 | from feapder import Item 50 | 51 | 52 | class SpiderDataItem(Item): 53 | 54 | __unique_key__ = ["title", "url"] # 指定去重的key为 title、url,最后的指纹为title与url值联合计算的md5 55 | 56 | def __init__(self, *args, **kwargs): 57 | # self.id = None 58 | self.title = None 59 | self.url = None 60 | self.crawl_time = None 61 | ``` 62 | 63 | 或可通过如下方式指定`__unique_key__` 64 | 65 | ``` 66 | item = SpiderDataItem() 67 | item.unique_key = ["title", "url"] # 支持列表、元组、字符串 68 | ``` 69 | 70 | 或者重写指纹函数 71 | 72 | ``` 73 | from feapder import Item 74 | 75 | 76 | class SpiderDataItem(Item): 77 | ... 78 | 79 | @property 80 | def fingerprint(self): 81 | return "我是指纹" 82 | ``` 83 | 84 | ## 入库前对item进行处理 85 | 86 | pre_to_db函数为每个item入库前的回调函数,可通过此函数对数据进行处理 87 | 88 | ```python 89 | from feapder import Item 90 | 91 | 92 | class SpiderDataItem(Item): 93 | 94 | def __init__(self, *args, **kwargs): 95 | # self.id = None 96 | self.title = None 97 | 98 | def pre_to_db(self): 99 | """ 100 | 入库前的处理 101 | """ 102 | self.title = self.title.strip() 103 | ``` 104 | 105 | ## 更新数据 106 | 107 | 采集过程中,往往会有些数据漏采或解析出错,如果我们想更新已入库的数据,可将Item转为UpdateItem 108 | 109 | item = SpiderDataItem.to_UpdateItem() 110 | 111 | 或直接修改继承类 112 | 113 | ``` 114 | from feapder import Item, UpdateItem 115 | 116 | class SpiderDataItem(UpdateItem): 117 | ... 118 | ``` 119 | 120 | 关于UpdateItem使用,详见[UpdateItem](source_code/UpdateItem) 121 | -------------------------------------------------------------------------------- /docs/source_code/MongoDB.md: -------------------------------------------------------------------------------- 1 | # MongoDB 2 | 3 | ## 数据自动入Mongo库使用须知 4 | 5 | - 使用`MongoDb`存储数据,需要使用`MongoPipeline` 6 | 7 | 示例: 8 | 9 | ```python 10 | import feapder 11 | from feapder import Item 12 | 13 | 14 | class TestMongo(feapder.AirSpider): 15 | __custom_setting__ = dict( 16 | ITEM_PIPELINES=["feapder.pipelines.mongo_pipeline.MongoPipeline"], 17 | MONGO_IP="localhost", 18 | MONGO_PORT=27017, 19 | MONGO_DB="feapder", 20 | MONGO_USER_NAME="", 21 | MONGO_USER_PASS="", 22 | ) 23 | 24 | def start_requests(self): 25 | yield feapder.Request("https://www.baidu.com") 26 | 27 | def parse(self, request, response): 28 | title = response.xpath("//title/text()").extract_first() # 取标题 29 | item = Item() # 声明一个item 30 | item.table_name = "test_mongo" # 指定存储的表名 31 | item.title = title # 给item属性赋值 32 | yield item # 返回item, item会自动批量入库 33 | 34 | 35 | if __name__ == "__main__": 36 | TestMongo().start() 37 | ``` 38 | 39 | 40 | ## 直接使用 41 | 42 | ### 连接 43 | 44 | ```python 45 | from feapder.db.mongodb import MongoDB 46 | 47 | 48 | db = MongoDB( 49 | ip="localhost", port=27017, db="feapder", user_name="feapder", user_pass="feapder123" 50 | ) 51 | ``` 52 | 53 | 若环境变量中配置了数据库连接方式或者setting中已配置,则可不传参 54 | 55 | ```python 56 | db = MongoDB() 57 | ``` 58 | 59 | 或者可以根据url连接 60 | 61 | ```python 62 | db = MongoDB.from_url("mongodb://username:password@ip:port/db") 63 | ``` 64 | 65 | ### 方法 66 | 67 | > MongoDB封装了增删改查等方法,方便使用 68 | 69 | #### 查 70 | 71 | ```python 72 | def find(self, table, limit=0) -> List[Dict]: 73 | """ 74 | @summary: 75 | 无数据: 返回() 76 | 有数据: 若limit == 1 则返回 (data1, data2) 77 | 否则返回 ((data1, data2),) 78 | --------- 79 | @param table: 80 | @param limit: 81 
| --------- 82 | @result: 83 | """ 84 | ``` 85 | 86 | 87 | #### 增 88 | 89 | ```python 90 | def add(self, table, data, **kwargs): 91 | """ 92 | 93 | Args: 94 | table: 95 | data: 96 | kwargs: 97 | auto_update: 覆盖更新,将替换唯一索引重复的数据,默认False 98 | update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"] 99 | insert_ignore: 唯一索引冲突时是否忽略,默认为False 100 | condition_fields: 用于条件查找的字段,默认以`_id`作为查找条件,默认:['_id'] 101 | exception_callfunc: 异常回调 102 | 103 | Returns: 添加行数 104 | 105 | """ 106 | ``` 107 | 108 | ```python 109 | def add_batch(self, table: str, datas: List[Dict], **kwargs): 110 | """ 111 | @summary: 批量添加数据 112 | --------- 113 | @param command: 字典 114 | @param datas: 列表 [[..], [...]] 115 | @param **kwargs: 116 | auto_update: 覆盖更新,将替换唯一索引重复的数据,默认False 117 | update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"] 118 | update_columns_value: 指定更新的字段对应的值 119 | condition_fields: 用于条件查找的字段,默认以`_id`作为查找条件,默认:['_id'] 120 | --------- 121 | @result: 添加行数 122 | """ 123 | ``` 124 | 125 | #### 更新 126 | 127 | ```python 128 | def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False): 129 | """ 130 | 更新 131 | Args: 132 | coll_name: 集合名 133 | data: 单条数据 {"xxx":"xxx"} 134 | condition: 更新条件 {"_id": "xxxx"} 135 | upsert: 数据不存在则插入,默认为 False 136 | 137 | Returns: True / False 138 | """ 139 | ``` 140 | 141 | #### 删除 142 | 143 | ```python 144 | def delete(self, table, condition: Dict): 145 | """ 146 | 删除 147 | Args: 148 | table: 149 | condition: 查找条件 150 | Returns: True / False 151 | """ 152 | ``` 153 | -------------------------------------------------------------------------------- /docs/source_code/MysqlDB.md: -------------------------------------------------------------------------------- 1 | # MysqlDB 2 | 3 | MysqlDB具有断开自动重连特性,支持多线程下操作,内置连接池,最大连接数100 4 | 5 | ## 连接 6 | 7 | ```python 8 | from feapder.db.mysqldb import MysqlDB 9 | 10 | 11 | db = MysqlDB( 12 | ip="localhost", port=3306, db="feapder", user_name="feapder", user_pass="feapder123" 13 | ) 14 | ``` 15 | 16 | 若环境变量中配置了数据库连接方式或者setting中已配置,则可不传参 17 | 18 | ```python 19 | db = MysqlDB() 20 | ``` 21 | 22 | 或者可以根据url连接 23 | 24 | ```python 25 | db = MysqlDB.from_url("mysql://username:password@ip:port/db?charset=utf8mb4") 26 | ``` 27 | 28 | ## 方法 29 | 30 | > MysqlDB封装了增删改查等方法,方便使用 31 | 32 | ### 查 33 | 34 | ```python 35 | def find(self, sql, limit=0, to_json=False): 36 | """ 37 | @summary: 38 | 无数据: 返回() 39 | 有数据: 若limit == 1 则返回 (data1, data2) 40 | 否则返回 ((data1, data2),) 41 | --------- 42 | @param sql: 43 | @param limit: 44 | @param to_json 是否将查询结果转为json 45 | --------- 46 | @result: 47 | """ 48 | ``` 49 | 50 | 51 | ### 增 52 | 53 | ```python 54 | def add(self, sql, exception_callfunc=None): 55 | """ 56 | Args: 57 | sql: 58 | exception_callfunc: 异常回调 59 | 60 | Returns:添加行数 61 | 62 | """ 63 | ``` 64 | 65 | ```python 66 | def add_smart(self, table, data: Dict, **kwargs): 67 | """ 68 | 添加数据, 直接传递json格式的数据,不用拼sql 69 | Args: 70 | table: 表名 71 | data: 字典 {"xxx":"xxx"} 72 | **kwargs: 73 | 74 | Returns:添加行数 75 | 76 | """ 77 | ``` 78 | 79 | 80 | ```python 81 | def add_batch(self, sql, datas: List[Dict]): 82 | """ 83 | @summary: 批量添加数据 84 | --------- 85 | @ param sql: insert ignore into (xxx, xxx) values (%s, %s, %s) 86 | @ param datas: 列表 [{}, {}, {}] 87 | --------- 88 | @result:添加行数 89 | """ 90 | ``` 91 | 92 | ```python 93 | def add_batch_smart(self, table, datas: List[Dict], **kwargs): 94 | """ 95 | 批量添加数据, 直接传递list格式的数据,不用拼sql 96 | Args: 97 | table: 表名 98 | datas: 列表 [{}, {}, {}] 99 | **kwargs: 100 | 101 | 
Returns: 添加行数 102 | 103 | """ 104 | ``` 105 | 106 | ### 更新 107 | 108 | ```python 109 | def update(self, sql): 110 | pass 111 | ``` 112 | 113 | ```python 114 | def update_smart(self, table, data: Dict, condition): 115 | """ 116 | 更新, 不用拼sql 117 | Args: 118 | table: 表名 119 | data: 数据 {"xxx":"xxx"} 120 | condition: 更新条件 where后面的条件,如 condition='status=1' 121 | 122 | Returns: True / False 123 | 124 | """ 125 | ``` 126 | 127 | ### 删除 128 | 129 | ```python 130 | def delete(self, sql): 131 | """ 132 | 删除 133 | Args: 134 | sql: 135 | 136 | Returns: True / False 137 | 138 | """ 139 | ``` 140 | 141 | ### 执行其他sql 142 | 143 | ```python 144 | def execute(self, sql): 145 | pass 146 | ``` -------------------------------------------------------------------------------- /docs/source_code/RedisDB.md: -------------------------------------------------------------------------------- 1 | # RedisDB 2 | 3 | RedisDB支持**哨兵模式**、**集群模式**与单节点的**普通模式**,封装了操作redis的常用的方法 4 | 5 | ## 连接 6 | 7 | > 若环境变量中配置了数据库连接方式或者setting中已配置,则可不传参 8 | 9 | ### 普通模式 10 | 11 | ```python 12 | from feapder.db.redisdb import RedisDB 13 | 14 | db = RedisDB(ip_ports="localhost:6379", db=0, user_pass=None) 15 | ``` 16 | 17 | 使用地址连接 18 | 19 | ```python 20 | from feapder.db.redisdb import RedisDB 21 | 22 | db = RedisDB.from_url("redis://[[username]:[password]]@[host]:[port]/[db]") 23 | ``` 24 | 25 | ### 哨兵模式 26 | 27 | ```python 28 | from feapder.db.redisdb import RedisDB 29 | 30 | db = RedisDB(ip_ports="172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379", db=0, user_pass=None, service_name="my_master") 31 | ``` 32 | 33 | 注意:多个地址用逗号分隔,需传递`service_name` 34 | 35 | 对应setting配置文件,配置方式为: 36 | 37 | ```python 38 | REDISDB_IP_PORTS = "172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379" 39 | REDISDB_USER_PASS = "" 40 | REDISDB_DB = 0 41 | REDISDB_SERVICE_NAME = "my_master" 42 | ``` 43 | 44 | ### 集群模式 45 | 46 | ```python 47 | from feapder.db.redisdb import RedisDB 48 | 49 | db = RedisDB(ip_ports="172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379", db=0, user_pass=None) 50 | ``` 51 | 52 | 注意:多个地址用逗号分隔,不用传递`service_name` 53 | 54 | 对应setting配置文件,配置方式为: 55 | 56 | ```python 57 | REDISDB_IP_PORTS = "172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379" 58 | REDISDB_USER_PASS = "" 59 | REDISDB_DB = 0 60 | ``` 61 | 62 | ## 方法: 63 | 64 | 详见源码,此处不一一列举, 源码:`feapder.db.redisdb` -------------------------------------------------------------------------------- /docs/source_code/UpdateItem.md: -------------------------------------------------------------------------------- 1 | # UpdateItem 2 | 3 | UpdateItem用于更新数据,继承至Item,所以使用方式基本与Item一致,下载只说不同之处 4 | 5 | ## 更新逻辑 6 | 7 | 更新逻辑借助了数据库的唯一索引,即插入数据时发现数据已存在,则更新。因此要求数据表必须存在唯一索引,才能使用UpdateItem 8 | 9 | 比如将title设置唯一,要求每条数据的title都不能重复 10 | 11 | ![-w781](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/16158245077159.jpg) 12 | 13 | 或联合索引,要求title与url不能同时重复 14 | 15 | ![-w761](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/16158245648750.jpg) 16 | 17 | 18 | ## 指定更新的字段 19 | 20 | 方式1:指定`__update_key__` 21 | 22 | ```python 23 | from feapder import UpdateItem 24 | 25 | 26 | class SpiderDataItem(UpdateItem): 27 | 28 | __update_key__ = ["title"] # 更新title字段 29 | 30 | def __init__(self, *args, **kwargs): 31 | # self.id = None 32 | self.title = None 33 | self.url = None 34 | ``` 35 | 36 | 方式2:赋值`update_key` 37 | 38 | ```python 39 | from feapder import UpdateItem 40 | 41 | 42 | class SpiderDataItem(UpdateItem): 43 | 44 | 45 | def __init__(self, *args, **kwargs): 46 | # self.id = None 47 | self.title = None 48 | 
self.url = None 49 | 50 | item = SpiderDataItem() 51 | item.update_key = "title" # 支持列表、元组、字符串 52 | ``` 53 | 54 | 方式3:将普通的item转为UpdateItem,然后再指定更新的key 55 | 56 | ```python 57 | from feapder import Item 58 | 59 | 60 | class SpiderDataItem(Item): 61 | 62 | 63 | def __init__(self, *args, **kwargs): 64 | # self.id = None 65 | self.title = None 66 | self.url = None 67 | 68 | item = SpiderDataItem() 69 | item = item.to_UpdateItem() 70 | item.update_key = "title" 71 | ``` 72 | 73 | **推荐方式1,直接改Item类,不用修改爬虫代码** -------------------------------------------------------------------------------- /docs/source_code/dedup.md: -------------------------------------------------------------------------------- 1 | # Dedup 2 | 3 | Dedup是feapder大数据去重模块,不同于BloomFilter,去重受槽位数量影响,Dedup使用了弹性的去重机制,可容纳海量的数据去重。 4 | 5 | 6 | ## 去重方式 7 | 8 | ### 临时去重 9 | 10 | > 基于redis,支持批量,去重有时效性。去重一万条数据约0.26秒,一亿条数据占用内存约1.43G 11 | 12 | ```python 13 | from feapder.dedup import Dedup 14 | 15 | data = {"xxx": 123, "xxxx": "xxxx"} 16 | datas = ["xxx", "bbb"] 17 | 18 | def test_ExpireFilter(): 19 | dedup = Dedup( 20 | Dedup.ExpireFilter, expire_time=10, redis_url="redis://@localhost:6379/0" 21 | ) 22 | 23 | # 逐条去重 24 | assert dedup.add(data) == 1 25 | assert dedup.get(data) == 1 26 | 27 | # 批量去重 28 | assert dedup.add(datas) == [1, 1] 29 | assert dedup.get(datas) == [1, 1] 30 | ``` 31 | 32 | 33 | ### 内存去重 34 | 35 | > 基于内存,支持批量。去重一万条数据约0.5秒,一亿条数据占用内存约285MB 36 | 37 | ```python 38 | from feapder.dedup import Dedup 39 | 40 | data = {"xxx": 123, "xxxx": "xxxx"} 41 | datas = ["xxx", "bbb"] 42 | 43 | def test_MemoryFilter(): 44 | dedup = Dedup(Dedup.MemoryFilter) # 表名为test 历史数据3秒有效期 45 | 46 | # 逐条去重 47 | assert dedup.add(data) == 1 48 | assert dedup.get(data) == 1 49 | 50 | # 批量去重 51 | assert dedup.add(datas) == [1, 1] 52 | assert dedup.get(datas) == [1, 1] 53 | ``` 54 | 55 | ### 永久去重 56 | 57 | > 基于redis,支持批量,永久去重。 去重一万条数据约3.5秒,一亿条数据占用内存约285MB 58 | 59 | ```python 60 | from feapder.dedup import Dedup 61 | 62 | def test_BloomFilter(): 63 | dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0") 64 | 65 | # 逐条去重 66 | assert dedup.add(data) == 1 67 | assert dedup.get(data) == 1 68 | 69 | # 批量去重 70 | assert dedup.add(datas) == [1, 1] 71 | assert dedup.get(datas) == [1, 1] 72 | ``` 73 | 74 | ## 过滤数据 75 | 76 | Dedup可以通过如下方法,过滤掉已存在的数据 77 | 78 | 79 | ```python 80 | from feapder.dedup import Dedup 81 | 82 | def test_filter(): 83 | dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0") 84 | 85 | # 制造已存在数据 86 | datas = ["xxx", "bbb"] 87 | dedup.add(datas) 88 | 89 | # 过滤掉已存在数据 "xxx", "bbb" 90 | datas = ["xxx", "bbb", "ccc"] 91 | dedup.filter_exist_data(datas) 92 | assert datas == ["ccc"] 93 | ``` 94 | 95 | ## Dedup参数 96 | 97 | - **filter_type**:去重类型,支持BloomFilter、MemoryFilter、ExpireFilter三种 98 | - **redis_url**不是必须传递的,若项目中存在setting.py文件,且已配置redis连接方式,则可以不传递redis_url 99 | 100 | ![-w294](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/07/16151133801599.jpg) 101 | 102 | ``` 103 | import feapder 104 | from feapder.dedup import Dedup 105 | 106 | class TestSpider(feapder.Spider): 107 | def __init__(self, *args, **kwargs): 108 | self.dedup = Dedup() # 默认是永久去重 109 | ``` 110 | 111 | - **name**: 过滤器名称 该名称会默认以dedup作为前缀 `dedup:expire_set:[name]`或`dedup:bloomfilter:[name]`。 默认ExpireFilter name=过期时间,BloomFilter name=`dedup:bloomfilter:bloomfilter` 112 | 113 | ![-w499](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/07/16151136442498.jpg) 114 | 115 | 若对不同数据源去重,可通过name参数来指定不同去重库 116 | 117 | - **absolute_name**:过滤器绝对名称 不会加dedup前缀 
118 | - **expire_time**:ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定 119 | - **error_rate**:BloomFilter/MemoryFilter的误判率 默认为0.00001 120 | - **to_md5**:去重前是否将数据转为MD5,默认是 121 | 122 | ## 爬虫中使用 123 | 124 | 框架支持对请求和入库的数据进行去重,仅需要在[配置文件](source_code/配置文件)中进行配置即可 125 | 126 | ```python 127 | ITEM_FILTER_ENABLE = False # item 去重 128 | REQUEST_FILTER_ENABLE = False # request 去重 129 | ``` 130 | 131 | 或者可以直接导入此去重模块使用 132 | 133 | ```python 134 | from feapder.dedup import Dedup 135 | ``` 136 | 137 | -------------------------------------------------------------------------------- /docs/source_code/logger.md: -------------------------------------------------------------------------------- 1 | # 日志配置及使用 2 | 3 | ## 日志配置 4 | 5 | 见配置文件,相关配置如下: 6 | 7 | ```python 8 | LOG_NAME = os.path.basename(os.getcwd()) 9 | LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 10 | LOG_LEVEL = "DEBUG" 11 | LOG_COLOR = True # 是否带有颜色 12 | LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台 13 | LOG_IS_WRITE_TO_FILE = False # 是否写文件 14 | LOG_MODE = "w" # 写文件的模式 15 | LOG_MAX_BYTES = 10 * 1024 * 1024 # 每个日志文件的最大字节数 16 | LOG_BACKUP_COUNT = 20 # 日志文件保留数量 17 | LOG_ENCODING = "utf8" # 日志文件编码 18 | OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 19 | ``` 20 | 21 | 框架屏蔽了requests、selenium等一些第三方库的日志,OTHERS_LOG_LEVAL是用来控制这些第三库日志等级的。 22 | 23 | ## 使用日志工具 24 | 25 | 26 | ```python 27 | from feapder.utils.log import log 28 | 29 | log.debug("xxx") 30 | log.info("xxx") 31 | log.warning("xxx") 32 | log.error("xxx") 33 | log.critical("xxx") 34 | ``` 35 | 36 | 默认是带有颜色的日志: 37 | 38 | ![-w583](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/08/06/16282311862710.jpg) 39 | 40 | 日志等级:CRITICAL > ERROR > WARNING > INFO > DEBUG 41 | -------------------------------------------------------------------------------- /docs/source_code/pipeline.md: -------------------------------------------------------------------------------- 1 | # Pipeline 2 | 3 | Pipeline是数据入库时流经的管道,用户可自定义,以便对接其他数据库。 4 | 5 | 框架已内置mysql及mongo管道,其他管道作为扩展方式提供,可从[feapder_pipelines](https://github.com/Boris-code/feapder_pipelines)项目中按需安装 6 | 7 | 项目地址:https://github.com/Boris-code/feapder_pipelines 8 | 9 | ## 使用方式 10 | 11 | 注:item会被聚合成多条一起流经pipeline,方便批量入库 12 | 13 | ### 1. 编写pipeline 14 | 15 | ```python 16 | from feapder.pipelines import BasePipeline 17 | from typing import Dict, List, Tuple 18 | 19 | 20 | class Pipeline(BasePipeline): 21 | """ 22 | pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等 23 | """ 24 | 25 | def save_items(self, table, items: List[Dict]) -> bool: 26 | """ 27 | 保存数据 28 | Args: 29 | table: 表名 30 | items: 数据,[{},{},...] 31 | 32 | Returns: 是否保存成功 True / False 33 | 若False,不会将本批数据入到去重库,以便再次入库 34 | 35 | """ 36 | 37 | print("自定义pipeline, 保存数据 >>>>", table, items) 38 | 39 | return True 40 | 41 | def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool: 42 | """ 43 | 更新数据, 与UpdateItem配合使用,若爬虫中没使用UpdateItem,则可不实现此接口 44 | Args: 45 | table: 表名 46 | items: 数据,[{},{},...] 47 | update_keys: 更新的字段, 如 ("title", "publish_time") 48 | 49 | Returns: 是否更新成功 True / False 50 | 若False,不会将本批数据入到去重库,以便再次入库 51 | 52 | """ 53 | 54 | print("自定义pipeline, 更新数据 >>>>", table, items, update_keys) 55 | 56 | return True 57 | ``` 58 | 59 | `Pipeline`需继承`BasePipeline`,类名和存放位置随意,需要实现`save_items`接口。一定要有返回值,返回`False`表示数据没保存成功,会触发重试逻辑 60 | 61 | `update_items`接口与`UpdateItem`配合使用,更新数据时使用,若爬虫中没使用UpdateItem,则可不实现此接口 62 | 63 | ### 2. 
编写配置文件 64 | 65 | ```python 66 | # 数据入库的pipeline,支持多个 67 | ITEM_PIPELINES = [ 68 | "pipeline.Pipeline" 69 | ] 70 | ``` 71 | 72 | 将编写好的pipeline配置进来,值为类的模块路径,需要指定到具体的类名 73 | 74 | ## 示例 75 | 76 | 地址:https://github.com/Boris-code/feapder/tree/master/tests/test-pipeline 77 | -------------------------------------------------------------------------------- /docs/source_code/proxy.md: -------------------------------------------------------------------------------- 1 | # 代理使用说明 2 | 3 | 代理使用有三种方式 4 | 1. 使用框架内置代理池 5 | 2. 自定义代理池 6 | 3. 请求中直接指定 7 | 8 | ## 方式1. 使用框架内置代理池 9 | 10 | ### 配置代理 11 | 12 | 在配置文件中配置代理提取接口 13 | 14 | ```python 15 | # 设置代理 16 | PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 17 | PROXY_ENABLE = True 18 | PROXY_MAX_FAILED_TIMES = 5 # 代理最大失败次数,超过则不使用,自动删除 19 | ``` 20 | 21 | 要求API返回的代理格式为使用 /r/n 分隔: 22 | 23 | ``` 24 | ip:port 25 | ip:port 26 | ip:port 27 | ``` 28 | 29 | 这样feapder在请求时会自动随机使用上面的代理请求了 30 | 31 | ## 管理代理 32 | 33 | 1. 删除代理(默认是请求异常连续5次,再删除代理) 34 | 35 | 例如在发生异常时删除代理 36 | 37 | ```python 38 | import feapder 39 | class TestProxy(feapder.AirSpider): 40 | def start_requests(self): 41 | yield feapder.Request("https://www.baidu.com") 42 | 43 | def parse(self, request, response): 44 | print(response) 45 | 46 | def exception_request(self, request, response): 47 | request.del_proxy() 48 | 49 | ``` 50 | 51 | ## 方式2. 自定义代理池 52 | 53 | 1. 编写代理池:例如在你的项目下创建个my_proxypool.py,实现下面的函数 54 | 55 | ```python 56 | from feapder.network.proxy_pool import BaseProxyPool 57 | 58 | class MyProxyPool(BaseProxyPool): 59 | def get_proxy(self): 60 | """ 61 | 获取代理 62 | Returns: 63 | {"http": "xxx", "https": "xxx"} 64 | """ 65 | pass 66 | 67 | def del_proxy(self, proxy): 68 | """ 69 | @summary: 删除代理 70 | --------- 71 | @param proxy: xxx 72 | """ 73 | pass 74 | ``` 75 | 76 | 3. 修改setting的代理配置 77 | 78 | ``` 79 | PROXY_POOL = "my_proxypool.MyProxyPool" # 代理池 80 | ``` 81 | 82 | 将编写好的代理池配置进来,值为类的模块路径,需要指定到具体的类名 83 | 84 | 85 | 86 | ## 方式3. 
不使用代理池,直接给请求指定代理 87 | 88 | 直接给request.proxies赋值即可,例如在下载中间件里使用 89 | 90 | ```python 91 | import feapder 92 | 93 | class TestProxy(feapder.AirSpider): 94 | def start_requests(self): 95 | yield feapder.Request("https://www.baidu.com") 96 | 97 | def download_midware(self, request): 98 | # 这里使用代理使用即可 99 | request.proxies = {"https": "https://ip:port", "http": "http://ip:port"} 100 | return request 101 | 102 | def parse(self, request, response): 103 | print(response) 104 | ``` -------------------------------------------------------------------------------- /docs/source_code/tools.md: -------------------------------------------------------------------------------- 1 | 2 | # tools 3 | 4 | `feapder.utils.tools`里封装了爬虫中常用的函数,目前共计**129**个,可通过阅读源码了解使用 5 | 6 | ## 举例 7 | 8 | ### 时间格式化 9 | 10 | ```python 11 | from feapder.utils import tools 12 | 13 | time = "昨天" 14 | 15 | date = tools.format_time(time) 16 | assert date == "2021-03-15 00:00:00" 17 | ``` 18 | -------------------------------------------------------------------------------- /docs/source_code/报警及监控.md: -------------------------------------------------------------------------------- 1 | # 报警及监控 2 | 3 | 支持钉钉、飞书、企业微信、邮件报警 4 | 5 | ## 钉钉报警 6 | 7 | 条件:需要有钉钉群,需要获取钉钉机器人的Webhook地址 8 | 9 | 获取方式参考官方文档:https://developers.dingtalk.com/document/app/custom-robot-access 10 | 11 | 安全设置选择自定义关键词,填入**feapder** 12 | 13 | ![-w547](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167753030324.jpg) 14 | 15 | 或使用加签方式,然后在setting中设置密钥 16 | 17 | 相关配置: 18 | 19 | ```python 20 | # 钉钉报警 21 | DINGDING_WARNING_URL = "" # 钉钉机器人api 22 | DINGDING_WARNING_PHONE = "" # 报警人 支持列表,可指定多个 23 | DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False 24 | DINGDING_WARNING_SECRET = None # 加签密钥 25 | ``` 26 | 27 | ## 企业微信报警 28 | 29 | 条件:需要企业微信群,并获取企业微信机器人的Webhook地址 30 | 31 | 获取方式:https://weibanzhushou.com/blog/330 32 | 33 | 报警简介: 34 | 35 | - 仅支持文本模式 36 | - 当用户手机号码为空字符串或`WECHAT_WARNING_ALL`为`True`时将会`@全体成员` 37 | 38 | 39 | 相关设置: 40 | 41 | ```python 42 | # 企业微信报警 43 | WECHAT_WARNING_URL = "" # 企业微信机器人api 44 | WECHAT_WARNING_PHONE = "" # 报警人 将会在群内@此人, 支持列表,可指定多人 45 | WECHAT_WARNING_ALL = False # 是否提示所有人, 默认为False 46 | ``` 47 | 48 | ## 飞书报警 49 | 50 | 可参考文档设置机器人:https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN#e1cdee9f 51 | 52 | 然后在feapder的setting文件中修改如下配置 53 | 54 | ``` 55 | FEISHU_WARNING_URL = "" # 飞书机器人api 56 | FEISHU_WARNING_USER = None # 报警人 {"open_id":"ou_xxxxx", "name":"xxxx"} 或 [{"open_id":"ou_xxxxx", "name":"xxxx"}] 57 | FEISHU_WARNING_ALL = False # 是否提示所有人, 默认为False 58 | ``` 59 | 60 | ## 邮件报警 61 | 62 | 相关配置: 63 | 64 | ``` 65 | # 邮件报警 66 | EMAIL_SENDER = "" # 发件人 67 | EMAIL_PASSWORD = "" # 授权码 68 | EMAIL_RECEIVER = "" # 收件人 支持列表,可指定多个 69 | ``` 70 | 71 | 邮件报警目前支持163邮箱作为发送者,`EMAIL_SENDER`为邮箱账号,如`feapder@163.com`, `EMAIL_PASSWORD`为授权码,不是登录密码,获取授权码的流程如下: 72 | 73 | 1. 设置 -> POP3/SMTP/IMAP 74 | 75 | ![-w258](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167719328720.jpg) 76 | 77 | 2. 开启SMTP服务 78 | 79 | ![-w444](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167719490656.jpg) 80 | 81 | 开启后,会弹出授权码,该授权码即为EMAIL_PASSWORD 82 | 83 | 3. 设置反垃圾规则为高级 84 | 85 | ![-w1112](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167719655644.jpg) 86 | 87 | 4. 
将本邮箱账号添加到白名单中 88 | 89 | ## 报警间隔及报警级别 90 | 91 | 框架会对相同的报警进行过滤,防止刷屏,默认的报警时间间隔为1小时,可通过以下配置修改: 92 | 93 | ```python 94 | WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏 95 | WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / ERROR 96 | ``` 97 | 98 | DEBUG级别的报警包含一些运行信息,ERROR级别的报警都是有问题的报警,需要及时处理 99 | 100 | 101 | ## 可视化监控 102 | 103 | 支持对爬虫运行情况进行监控,除了数据监控和请求监控外,用户还可自定义监控内容,详情参考[自定义监控](source_code/监控打点?id=自定义监控) 104 | 105 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/14/16316112326191.jpg) 106 | 107 | 需 feapder>=1.6.6, 需配合feaplat爬虫管理平台 -------------------------------------------------------------------------------- /docs/source_code/监控打点.md: -------------------------------------------------------------------------------- 1 | # 监控打点 2 | 3 | 需配合爬虫管理系统 **feaplat** 4 | 5 | 监控数据默认保留180天,滚动删除 6 | 7 | ## 爬虫中使用 8 | 9 | > 需feapder>=1.6.6 10 | 11 | feapder内置了监控打点,只需要部署到feaplat爬虫管理系统即可实现对请求和数据监控 12 | 13 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/14/16316112326191.jpg) 14 | 15 | - 注意使用 `yield item` 的方式入库的数据,才能看到数据监控的指标,图表的title是表名,折线图展示了每个字段是否有值的情况以及数据总量(total count) 16 | 17 | - document为下载情况 18 | 19 | 若想监控些其他的指标,参考自定义监控: 20 | 21 | 22 | ## 自定义监控 23 | 24 | 举例:编写`test_metrics.py`代码如下: 25 | 26 | ```python 27 | from feapder.utils import metrics 28 | 29 | # 初始化打点系统 30 | metrics.init() 31 | 32 | metrics.emit_counter("key", count=1, classify="test") 33 | 34 | metrics.close() 35 | ``` 36 | 37 | 部署到feaplat: 38 | 39 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315065474223.jpg) 40 | 41 | 查看监控: 42 | 43 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315067391666.jpg) 44 | 45 | 再来解释下 46 | ``` 47 | metrics.emit_counter("key", count=1, classify="test") 48 | ``` 49 | - key 对应上图中的折线 50 | - count 对应上图中的点数 51 | - classify 对应上图中的图表标题 52 | 53 | 若代码如下: 54 | ```python 55 | from feapder.utils import metrics 56 | 57 | # 初始化打点系统 58 | metrics.init() 59 | 60 | metrics.emit_counter("key", count=1, classify="test") 61 | metrics.emit_counter("key2", count=1, classify="test") 62 | metrics.emit_counter("key3", count=1, classify="test") 63 | 64 | metrics.emit_counter("哈哈", count=1, classify="test2") 65 | 66 | metrics.close() 67 | ``` 68 | 69 | 应该生成两张图表,第一个图表3条折线,实际生成如下: 70 | 71 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315071385604.jpg) 72 | 73 | 74 | 如在feapder爬虫中使用,示例如下: 75 | 76 | ```python 77 | import feapder 78 | from feapder.utils import metrics 79 | 80 | 81 | class TestSpider(feapder.AirSpider): 82 | def start_requests(self): 83 | yield feapder.Request("https://www.baidu.com") 84 | 85 | def parse(self, request, response): 86 | # 自定义监控 87 | metrics.emit_counter("success", count=1, classify="自定义的监控指标") 88 | 89 | 90 | if __name__ == "__main__": 91 | TestSpider().start() 92 | ``` 93 | 94 | 我们只需要导包,然后`metrics.emit_counter`即可,不需要关心 `metrics.init`和`metrics.close`, 若在scrapy或其他python脚本中使用,必须调用`metrics.init`和`metrics.close` 95 | -------------------------------------------------------------------------------- /docs/usage/使用前必读.md: -------------------------------------------------------------------------------- 1 | # 使用前必读 2 | 3 | ## 爬虫种类简介 4 | 5 | feapder爬虫框架内置三种爬虫 6 | 7 | 1. AirSpider - 轻量级爬虫 8 | 2. Spider - 分布式爬虫 9 | 3. 
BatchSpider - 分布式批次爬虫 10 | 11 | **一、AirSpider :** 12 | 13 | 轻量爬虫,学习成本低。面对一些数据量较少,无需断点续爬,无需分布式采集的需求,可采用此爬虫。 14 | 15 | **二、Spider :** 16 | 17 | 分布式爬虫,适用于海量数据采集,支持断点续爬、爬虫报警、数据自动入库等功能 18 | 19 | 20 | **三、BatchSpider** 21 | 22 | 分布式批次爬虫,对于需要周期性采集的数据,优先考虑使用本爬虫。 23 | 24 | 本爬虫会自动维护个批次信息表,详细的记录了每个批次时间、任务完成情况、批次周期等信息,示例数据如下 25 | ![-w899](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084680404224.jpg) 26 | 27 | 另外本爬虫与其他爬虫最大的区别是,会维护个批次时间信息,本批次未完成下一批次不会开始。 28 | 29 | 举个例子 30 | 31 | > 需求:每7天全量抓取一次商品价格信息。表结构需要包含每个批次信息 32 | 33 | 表设计如下: 34 | 35 | | 字段 | 说明 | 36 | | --- | --- | 37 | | id | 主键 | 38 | | item_id | 商品id | 39 | | price | 价格 | 40 | | crawl_time | 采集时间 | 41 | | batch_date | 批次时间 | 42 | 43 | 数据示例 44 | 45 | | id | item_id | price | crawl_time | batch_date | 46 | | --- | --- | --- | --- | --- | 47 | | 1 | 3213 | 99 | 2021-01-01 | 2021-01-01 | 48 | | 2 | 3214 | 90 | 2021-01-05 | 2021-01-01 | 49 | | 3 | 3213 | 95 | 2021-01-08 | 2021-01-08 | 50 | | 4 | 3214 | 92 | 2021-01-20| 2021-01-08 | 51 | 52 | 从数据示例中可以看到 53 | - id(1,2) 两条数据虽然是不同天采集的,但都归属于2021-01-01这个批次。 54 | - id(3,4) 为7天后抓取的新一批数据,归属于2021-01-08这个批次。 55 | - id为4的数琚,采集时间为20号,虽然已经超出了7天这个维度,但因是采集超时等某种原因导致,为了保证每个批次数据的完整性,仍会归属于2021-01-08这个批次。 56 | 57 | BatchSpider爬虫会自动维护这个batch_date, 有了这个batch_date,方便业务做时序数据展示 58 | 59 | 并且在采集过程中,可随时重启爬虫,若本批次还有剩余任务,会继续抓取,若本批次结束了,下一批次未到时,爬虫会自动退出 60 | 61 | ## 学习路线 62 | 63 | feapder虽然内置三种爬虫,但对于开发者暴露的接口一致。只需要继承不同的类即可,使用方式雷同。 64 | 65 | 建议学习路线为 AirSpider->Spider->BatchSpider。因为后一个爬虫是基于前一个爬虫丰富而来的,与我们读书 小学->初中->高中这个路线类似 66 | -------------------------------------------------------------------------------- /feapder/VERSION: -------------------------------------------------------------------------------- 1 | 1.9.2 -------------------------------------------------------------------------------- /feapder/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/21 10:41 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | import os 11 | import re 12 | import sys 13 | 14 | sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd())) 15 | 16 | __all__ = [ 17 | "AirSpider", 18 | "Spider", 19 | "TaskSpider", 20 | "BatchSpider", 21 | "BaseParser", 22 | "TaskParser", 23 | "BatchParser", 24 | "Request", 25 | "Response", 26 | "Item", 27 | "UpdateItem", 28 | "ArgumentParser", 29 | ] 30 | 31 | from feapder.core.spiders import AirSpider, Spider, TaskSpider, BatchSpider 32 | from feapder.core.base_parser import BaseParser, TaskParser, BatchParser 33 | from feapder.network.request import Request 34 | from feapder.network.response import Response 35 | from feapder.network.item import Item, UpdateItem 36 | from feapder.utils.custom_argparse import ArgumentParser 37 | -------------------------------------------------------------------------------- /feapder/buffer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on 2020/4/23 12:09 AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | ''' -------------------------------------------------------------------------------- /feapder/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/commands/__init__.py 
-------------------------------------------------------------------------------- /feapder/commands/create/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "CreateProject", 3 | "CreateSpider", 4 | "CreateItem", 5 | "CreateInit", 6 | "CreateJson", 7 | "CreateTable", 8 | "CreateCookies", 9 | "CreateSetting", 10 | "CreateParams", 11 | ] 12 | 13 | from .create_table import CreateTable 14 | from .create_json import CreateJson 15 | from .create_spider import CreateSpider 16 | from .create_init import CreateInit 17 | from .create_item import CreateItem 18 | from .create_project import CreateProject 19 | from .create_cookies import CreateCookies 20 | from .create_setting import CreateSetting 21 | from .create_params import CreateParams 22 | -------------------------------------------------------------------------------- /feapder/commands/create/create_cookies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/4/25 10:22 上午 4 | --------- 5 | @summary: 将浏览器的cookie转为request的cookie 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import json 12 | 13 | import pyperclip 14 | 15 | from feapder.utils.tools import get_cookies_from_str, print_pretty 16 | 17 | 18 | class CreateCookies: 19 | def get_data(self): 20 | """ 21 | @summary: 从剪切板中读取内容 22 | --------- 23 | --------- 24 | @result: 25 | """ 26 | input("请复制浏览器cookie (列表或字符串格式), 复制后按任意键读取剪切板内容\n") 27 | 28 | text = pyperclip.paste() 29 | print(text + "\n") 30 | 31 | return text 32 | 33 | def create(self): 34 | data = self.get_data() 35 | cookies = {} 36 | try: 37 | data_json = json.loads(data) 38 | 39 | for data in data_json: 40 | cookies[data.get("name")] = data.get("value") 41 | 42 | except: 43 | cookies = get_cookies_from_str(data) 44 | 45 | print_pretty(cookies) 46 | -------------------------------------------------------------------------------- /feapder/commands/create/create_init.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018-08-28 17:38:43 4 | --------- 5 | @summary: 创建__init__.py 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder.utils.tools import dumps_json 12 | 13 | 14 | class CreateInit: 15 | def create(self): 16 | __all__ = [] 17 | 18 | import os 19 | 20 | path = os.getcwd() 21 | for file in os.listdir(path): 22 | if file.endswith(".py") and not file.startswith("__init__"): 23 | model = file.split(".")[0] 24 | __all__.append(model) 25 | 26 | del os 27 | 28 | with open("__init__.py", "w", encoding="utf-8") as file: 29 | text = "__all__ = %s" % dumps_json(__all__) 30 | file.write(text) 31 | -------------------------------------------------------------------------------- /feapder/commands/create/create_json.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018-08-28 17:38:43 4 | --------- 5 | @summary: 字符串转json 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import pyperclip 12 | 13 | import feapder.utils.tools as tools 14 | 15 | 16 | class CreateJson: 17 | def get_data(self): 18 | """ 19 | @summary: 从控制台读取多行 20 | --------- 21 | --------- 22 | @result: 23 | """ 24 | input("请复制需要转换的内容(xxx:xxx格式,支持多行),复制后按任意键读取剪切板内容\n") 25 | 26 | text = pyperclip.paste() 27 | print(text + "\n") 28 | 29 | data = [] 30 
| for line in text.split("\n"): 31 | line = line.strip().replace("\t", " " * 4) 32 | if not line: 33 | break 34 | 35 | data.append(line) 36 | 37 | return data 38 | 39 | def create(self, sort_keys=False): 40 | contents = self.get_data() 41 | 42 | json = {} 43 | for content in contents: 44 | content = content.strip() 45 | if not content or content.startswith(":"): 46 | continue 47 | 48 | regex = "([^:\s]*)[:|\s]*(.*)" 49 | 50 | result = tools.get_info(content, regex, fetch_one=True) 51 | if result[0] in json: 52 | json[result[0]] = json[result[0]] + "&" + result[1] 53 | else: 54 | json[result[0]] = result[1].strip() 55 | 56 | print(tools.dumps_json(json, sort_keys=sort_keys)) 57 | -------------------------------------------------------------------------------- /feapder/commands/create/create_params.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/4/25 10:22 上午 4 | --------- 5 | @summary: 将浏览器的cookie转为request的cookie 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import sys 12 | 13 | from feapder.utils.tools import dumps_json 14 | 15 | 16 | class CreateParams: 17 | def get_data(self): 18 | """ 19 | @summary: 从控制台读取多行 20 | --------- 21 | --------- 22 | @result: 23 | """ 24 | print("请输入请求地址") 25 | data = [] 26 | while True: 27 | line = sys.stdin.readline().strip() 28 | if not line: 29 | break 30 | 31 | data.append(line) 32 | 33 | return "".join(data) 34 | 35 | def get_params(self, url): 36 | params_json = {} 37 | params = url.split("?")[-1].split("&") 38 | for param in params: 39 | key_value = param.split("=", 1) 40 | params_json[key_value[0]] = key_value[1] 41 | 42 | return params_json 43 | 44 | def create(self): 45 | data = self.get_data() 46 | 47 | params = self.get_params(data) 48 | url = data.split("?")[0] 49 | 50 | print(f'url = "{url}"') 51 | print(f"params = {dumps_json(params)}") 52 | -------------------------------------------------------------------------------- /feapder/commands/create/create_project.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018-08-28 17:38:43 4 | --------- 5 | @summary: 创建项目 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import getpass 12 | import os 13 | import shutil 14 | 15 | import feapder.utils.tools as tools 16 | 17 | 18 | def deal_file_info(file): 19 | file = file.replace("{DATE}", tools.get_current_date()) 20 | file = file.replace("{USER}", os.getenv("FEAPDER_USER") or getpass.getuser()) 21 | 22 | return file 23 | 24 | 25 | class CreateProject: 26 | def copy_callback(self, src, dst, *, follow_symlinks=True): 27 | if src.endswith(".py"): 28 | with open(src, "r", encoding="utf-8") as src_file, open( 29 | dst, "w", encoding="utf8" 30 | ) as dst_file: 31 | content = src_file.read() 32 | content = deal_file_info(content) 33 | dst_file.write(content) 34 | 35 | else: 36 | shutil.copy2(src, dst, follow_symlinks=follow_symlinks) 37 | 38 | def create(self, project_name): 39 | if os.path.exists(project_name): 40 | print("%s 项目已经存在" % project_name) 41 | else: 42 | template_path = os.path.abspath( 43 | os.path.join(__file__, "../../../templates/project_template") 44 | ) 45 | shutil.copytree( 46 | template_path, project_name, copy_function=self.copy_callback 47 | ) 48 | 49 | print("\n%s 项目生成成功" % project_name) 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- 
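# 用法示意(假设的直接调用方式,仅供参考):通常由命令行 `feapder create -p <项目名>` 触发,
# 等价于 CreateProject().create("<项目名>"),会将 templates/project_template 拷贝为新项目,
# 并通过 deal_file_info 替换模板中的 {DATE}、{USER} 占位符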
/feapder/commands/create/create_setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/4/23 13:20 4 | --------- 5 | @summary: 生成配置文件 6 | --------- 7 | @author: mkdir700 8 | @email: mkdir700@gmail.com 9 | """ 10 | 11 | import os 12 | import shutil 13 | 14 | 15 | class CreateSetting: 16 | def create(self): 17 | if os.path.exists("setting.py"): 18 | confirm = input("配置文件已存在 是否覆盖 (y/n). ") 19 | if confirm != "y": 20 | print("取消覆盖 退出") 21 | return 22 | 23 | template_file_path = os.path.abspath( 24 | os.path.join(__file__, "../../../templates/project_template/setting.py") 25 | ) 26 | shutil.copy(template_file_path, "./", follow_symlinks=False) 27 | print("配置文件生成成功") 28 | -------------------------------------------------------------------------------- /feapder/commands/retry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/11/18 12:33 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | import argparse 11 | 12 | from feapder.core.handle_failed_items import HandleFailedItems 13 | from feapder.core.handle_failed_requests import HandleFailedRequests 14 | 15 | 16 | def retry_failed_requests(redis_key): 17 | handle_failed_requests = HandleFailedRequests(redis_key) 18 | handle_failed_requests.reput_failed_requests_to_requests() 19 | 20 | 21 | def retry_failed_items(redis_key): 22 | handle_failed_items = HandleFailedItems(redis_key) 23 | handle_failed_items.reput_failed_items_to_db() 24 | handle_failed_items.close() 25 | 26 | 27 | def parse_args(): 28 | parser = argparse.ArgumentParser( 29 | description="重试失败的请求或入库失败的item", 30 | usage="usage: feapder retry [options] [args]", 31 | ) 32 | parser.add_argument( 33 | "-r", 34 | "--request", 35 | help="重试失败的request 如 feapder retry --request ", 36 | metavar="", 37 | ) 38 | parser.add_argument( 39 | "-i", "--item", help="重试失败的item 如 feapder retry --item ", metavar="" 40 | ) 41 | args = parser.parse_args() 42 | return args 43 | 44 | 45 | def main(): 46 | args = parse_args() 47 | if args.request: 48 | retry_failed_requests(args.request) 49 | if args.item: 50 | retry_failed_items(args.item) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /feapder/commands/zip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/2/13 12:59 上午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import argparse 12 | import os 13 | import re 14 | import zipfile 15 | 16 | 17 | def is_ignore_file(ignore_files: list, filename): 18 | for ignore_file in ignore_files: 19 | if re.search(ignore_file, filename): 20 | return True 21 | return False 22 | 23 | 24 | def zip(dir_path, zip_name, ignore_dirs: list = None, ignore_files: list = None): 25 | print(f"正在压缩 {dir_path} >> {zip_name}") 26 | ignore_files.append(os.path.basename(zip_name)) 27 | with zipfile.ZipFile(zip_name, "w") as file: 28 | dir_name = os.path.basename(dir_path) 29 | parent_dir = os.path.dirname(dir_path) 30 | if parent_dir: 31 | os.chdir(parent_dir) 32 | for path, dirs, filenames in os.walk(dir_name): 33 | # 修改原dirs,方式遍历忽略文件夹里的文件 34 | if ignore_dirs: 35 | dirs[:] = [d for d in dirs if d not in ignore_dirs] 36 | for filename in filenames: 37 | if 
ignore_files and is_ignore_file(ignore_files, filename): 38 | continue 39 | 40 | filepath = os.path.join(path, filename) 41 | print(f" adding {filepath}") 42 | file.write(filepath) 43 | 44 | print(f"压缩成功 {dir_path} >> {zip_name}") 45 | 46 | 47 | def parse_args(): 48 | parser = argparse.ArgumentParser( 49 | description="压缩文件夹, 默认排除以下文件夹及文件 .git,__pycache__,.idea,venv,.DS_Store", 50 | usage="feapder zip dir_path [zip_name]", 51 | ) 52 | parser.add_argument("dir_path", type=str, help="文件夹路径") 53 | parser.add_argument("zip_name", type=str, nargs="?", help="压缩后的文件名,默认为文件夹名.zip") 54 | parser.add_argument("-i", help="忽略文件,逗号分隔,支持正则", metavar="") 55 | parser.add_argument("-I", help="忽略文件夹,逗号分隔,支持正则 ", metavar="") 56 | parser.add_argument("-o", help="输出路径,默认为当前目录", metavar="") 57 | 58 | args = parser.parse_args() 59 | return args 60 | 61 | 62 | def main(): 63 | ignore_dirs = [".git", "__pycache__", ".idea", "venv", "env"] 64 | ignore_files = [".DS_Store"] 65 | args = parse_args() 66 | if args.i: 67 | ignore_files.extend(args.i.split(",")) 68 | if args.I: 69 | ignore_dirs.extend(args.I.split(",")) 70 | dir_path = args.dir_path 71 | zip_name = args.zip_name or os.path.basename(dir_path) + ".zip" 72 | if args.o: 73 | zip_name = os.path.join(args.o, os.path.basename(zip_name)) 74 | 75 | zip(dir_path, zip_name, ignore_dirs=ignore_dirs, ignore_files=ignore_files) 76 | -------------------------------------------------------------------------------- /feapder/core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on 2020/4/23 12:09 AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | ''' -------------------------------------------------------------------------------- /feapder/core/handle_failed_items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/11/18 11:33 AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | import feapder.setting as setting 11 | from feapder.buffer.item_buffer import ItemBuffer 12 | from feapder.db.redisdb import RedisDB 13 | from feapder.network.item import Item, UpdateItem 14 | from feapder.utils.log import log 15 | 16 | 17 | class HandleFailedItems: 18 | def __init__(self, redis_key, task_table=None, item_buffer=None): 19 | if redis_key.endswith(":s_failed_items"): 20 | redis_key = redis_key.replace(":s_failed_items", "") 21 | 22 | self._redisdb = RedisDB() 23 | self._item_buffer = item_buffer or ItemBuffer(redis_key, task_table=task_table) 24 | 25 | self._table_failed_items = setting.TAB_FAILED_ITEMS.format(redis_key=redis_key) 26 | 27 | def get_failed_items(self, count=1): 28 | failed_items = self._redisdb.sget( 29 | self._table_failed_items, count=count, is_pop=False 30 | ) 31 | return failed_items 32 | 33 | def reput_failed_items_to_db(self): 34 | log.debug("正在重新写入失败的items...") 35 | total_count = 0 36 | while True: 37 | try: 38 | failed_items = self.get_failed_items() 39 | if not failed_items: 40 | break 41 | 42 | for data_str in failed_items: 43 | data = eval(data_str) 44 | 45 | for add in data.get("add"): 46 | table = add.get("table") 47 | datas = add.get("datas") 48 | for _data in datas: 49 | item = Item(**_data) 50 | item.table_name = table 51 | self._item_buffer.put_item(item) 52 | total_count += 1 53 | 54 | for update in data.get("update"): 55 | table = update.get("table") 56 | datas = 
update.get("datas") 57 | update_keys = update.get("update_keys") 58 | for _data in datas: 59 | item = UpdateItem(**_data) 60 | item.table_name = table 61 | item.update_key = update_keys 62 | self._item_buffer.put_item(item) 63 | total_count += 1 64 | 65 | # 入库成功后删除 66 | def delete_item(): 67 | self._redisdb.srem(self._table_failed_items, data_str) 68 | 69 | self._item_buffer.put_item(delete_item) 70 | self._item_buffer.flush() 71 | 72 | except Exception as e: 73 | log.exception(e) 74 | 75 | if total_count: 76 | log.debug("导入%s条失败item到数库" % total_count) 77 | else: 78 | log.debug("没有失败的item") 79 | 80 | def close(self): 81 | self._item_buffer.close() 82 | -------------------------------------------------------------------------------- /feapder/core/handle_failed_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018-08-13 11:43:01 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | import feapder.setting as setting 11 | from feapder.buffer.request_buffer import RequestBuffer 12 | from feapder.db.redisdb import RedisDB 13 | from feapder.network.request import Request 14 | from feapder.utils.log import log 15 | 16 | 17 | class HandleFailedRequests: 18 | def __init__(self, redis_key): 19 | if redis_key.endswith(":z_failed_requests"): 20 | redis_key = redis_key.replace(":z_failed_requests", "") 21 | 22 | self._redisdb = RedisDB() 23 | self._request_buffer = RequestBuffer(redis_key) 24 | 25 | self._table_failed_request = setting.TAB_FAILED_REQUESTS.format( 26 | redis_key=redis_key 27 | ) 28 | 29 | def get_failed_requests(self, count=10000): 30 | failed_requests = self._redisdb.zget(self._table_failed_request, count=count) 31 | failed_requests = [eval(failed_request) for failed_request in failed_requests] 32 | return failed_requests 33 | 34 | def reput_failed_requests_to_requests(self): 35 | log.debug("正在重置失败的requests...") 36 | total_count = 0 37 | while True: 38 | try: 39 | failed_requests = self.get_failed_requests() 40 | if not failed_requests: 41 | break 42 | 43 | for request in failed_requests: 44 | request["retry_times"] = 0 45 | request_obj = Request.from_dict(request) 46 | self._request_buffer.put_request(request_obj) 47 | 48 | total_count += 1 49 | except Exception as e: 50 | log.exception(e) 51 | 52 | self._request_buffer.flush() 53 | 54 | log.debug("重置%s条失败requests为待抓取requests" % total_count) 55 | -------------------------------------------------------------------------------- /feapder/core/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/22 12:08 AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | __all__ = ["AirSpider", "TaskSpider", "Spider", "BatchSpider"] 12 | 13 | from feapder.core.spiders.air_spider import AirSpider 14 | from feapder.core.spiders.spider import Spider 15 | from feapder.core.spiders.task_spider import TaskSpider 16 | from feapder.core.spiders.batch_spider import BatchSpider 17 | -------------------------------------------------------------------------------- /feapder/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/23 12:09 AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 
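# 本包汇集各数据库封装:memorydb(基于内存的优先级队列)、mysqldb、mongodb、redisdb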
-------------------------------------------------------------------------------- /feapder/db/memorydb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/21 11:42 PM 4 | --------- 5 | @summary: 基于内存的队列,代替redis 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | from queue import PriorityQueue 11 | 12 | from feapder import setting 13 | 14 | 15 | class MemoryDB: 16 | def __init__(self): 17 | self.priority_queue = PriorityQueue(maxsize=setting.TASK_MAX_CACHED_SIZE) 18 | 19 | def add(self, item, ignore_max_size=False): 20 | """ 21 | 添加任务 22 | :param item: 数据: 支持小于号比较的类 或者 (priority, item) 23 | :param ignore_max_size: queue满时是否等待,为True时无视队列的maxsize,直接往里塞 24 | :return: 25 | """ 26 | if ignore_max_size: 27 | self.priority_queue._put(item) 28 | self.priority_queue.unfinished_tasks += 1 29 | else: 30 | self.priority_queue.put(item) 31 | 32 | def get(self): 33 | """ 34 | 获取任务 35 | :return: 36 | """ 37 | try: 38 | item = self.priority_queue.get(timeout=1) 39 | return item 40 | except: 41 | return 42 | 43 | def empty(self): 44 | return self.priority_queue.empty() 45 | -------------------------------------------------------------------------------- /feapder/dedup/README.md: -------------------------------------------------------------------------------- 1 | # Dedup 2 | 3 | Dedup是feapder大数据去重模块,内置3种去重机制,使用方式一致,可容纳的去重数据量与内存有关。不同于BloomFilter,去重受槽位数量影响,Dedup使用了弹性的去重机制,可容纳海量的数据去重。 4 | 5 | 6 | ## 去重方式 7 | 8 | ### 临时去重 9 | 10 | > 基于redis,支持批量,去重有时效性。去重一万条数据约0.26秒,一亿条数据占用内存约1.43G 11 | 12 | ``` 13 | from feapder.dedup import Dedup 14 | 15 | data = {"xxx": 123, "xxxx": "xxxx"} 16 | datas = ["xxx", "bbb"] 17 | 18 | def test_ExpireFilter(): 19 | dedup = Dedup( 20 | Dedup.ExpireFilter, expire_time=10, redis_url="redis://@localhost:6379/0" 21 | ) 22 | 23 | # 逐条去重 24 | assert dedup.add(data) == 1 25 | assert dedup.get(data) == 1 26 | 27 | # 批量去重 28 | assert dedup.add(datas) == [1, 1] 29 | assert dedup.get(datas) == [1, 1] 30 | ``` 31 | 32 | 33 | ### 内存去重 34 | 35 | > 基于内存,支持批量。去重一万条数据约0.5秒,一亿条数据占用内存约285MB 36 | 37 | ``` 38 | from feapder.dedup import Dedup 39 | 40 | data = {"xxx": 123, "xxxx": "xxxx"} 41 | datas = ["xxx", "bbb"] 42 | 43 | def test_MemoryFilter(): 44 | dedup = Dedup(Dedup.MemoryFilter) # 表名为test 历史数据3秒有效期 45 | 46 | # 逐条去重 47 | assert dedup.add(data) == 1 48 | assert dedup.get(data) == 1 49 | 50 | # 批量去重 51 | assert dedup.add(datas) == [1, 1] 52 | assert dedup.get(datas) == [1, 1] 53 | ``` 54 | 55 | ### 永久去重 56 | 57 | > 基于redis,支持批量,永久去重。 去重一万条数据约3.5秒,一亿条数据占用内存约285MB 58 | 59 | from feapder.dedup import Dedup 60 | 61 | datas = { 62 | "xxx": xxx, 63 | "xxxx": "xxxx", 64 | } 65 | 66 | dedup = Dedup() 67 | 68 | print(dedup) # 69 | print(dedup.add(datas)) # 0 不存在 70 | print(dedup.get(datas)) # 1 存在 71 | 72 | ## 过滤数据 73 | 74 | Dedup可以通过如下方法,过滤掉已存在的数据 75 | 76 | 77 | ```python 78 | from feapder.dedup import Dedup 79 | 80 | def test_filter(): 81 | dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0") 82 | 83 | # 制造已存在数据 84 | datas = ["xxx", "bbb"] 85 | dedup.add(datas) 86 | 87 | # 过滤掉已存在数据 "xxx", "bbb" 88 | datas = ["xxx", "bbb", "ccc"] 89 | dedup.filter_exist_data(datas) 90 | assert datas == ["ccc"] 91 | ``` 92 | 93 | 94 | -------------------------------------------------------------------------------- /feapder/dedup/basefilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/9/21 11:17 
AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | import abc 11 | from typing import List, Union 12 | 13 | 14 | class BaseFilter: 15 | @abc.abstractmethod 16 | def add( 17 | self, keys: Union[List[str], str], *args, **kwargs 18 | ) -> Union[List[bool], bool]: 19 | """ 20 | 21 | Args: 22 | keys: list / 单个值 23 | *args: 24 | **kwargs: 25 | 26 | Returns: 27 | list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功) 28 | """ 29 | pass 30 | 31 | @abc.abstractmethod 32 | def get(self, keys: Union[List[str], str]) -> Union[List[bool], bool]: 33 | """ 34 | 检查数据是否存在 35 | Args: 36 | keys: list / 单个值 37 | 38 | Returns: 39 | list / 单个值 (如果数据已存在 返回 1 否则返回 0) 40 | """ 41 | pass 42 | -------------------------------------------------------------------------------- /feapder/dedup/expirefilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018/12/13 9:44 PM 4 | --------- 5 | @summary: 带有有效期的去重集合 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import time 12 | 13 | from feapder.db.redisdb import RedisDB 14 | from feapder.dedup.basefilter import BaseFilter 15 | 16 | 17 | class ExpireFilter(BaseFilter): 18 | redis_db = None 19 | 20 | def __init__( 21 | self, name: str, expire_time: int, expire_time_record_key=None, redis_url=None 22 | ): 23 | if not name: 24 | raise ValueError("name cant't be None") 25 | if not expire_time: 26 | raise ValueError("please set expire time, units is seconds") 27 | 28 | if not self.__class__.redis_db: 29 | self.__class__.redis_db = RedisDB(url=redis_url) 30 | 31 | self.name = name 32 | self.expire_time = expire_time 33 | self.expire_time_record_key = expire_time_record_key 34 | self.del_expire_key_time = None 35 | 36 | self.record_expire_time() 37 | 38 | self.del_expire_key() 39 | 40 | def __repr__(self): 41 | return "".format(self.name) 42 | 43 | @property 44 | def current_timestamp(self): 45 | return int(time.time()) 46 | 47 | def add(self, keys, *args, **kwargs): 48 | """ 49 | @param keys: 检查关键词在zset中是否存在,支持列表批量 50 | @return: list / 单个值 51 | """ 52 | if self.current_timestamp - self.del_expire_key_time > self.expire_time: 53 | self.del_expire_key() 54 | 55 | is_added = self.redis_db.zadd(self.name, keys, self.current_timestamp) 56 | return is_added 57 | 58 | def get(self, keys): 59 | is_exist = self.redis_db.zexists(self.name, keys) 60 | if isinstance(keys, list): 61 | # 判断数据本身是否重复 62 | temp_set = set() 63 | for i, key in enumerate(keys): 64 | if key in temp_set: 65 | is_exist[i] = 1 66 | else: 67 | temp_set.add(key) 68 | 69 | return is_exist 70 | 71 | def del_expire_key(self): 72 | self.redis_db.zremrangebyscore( 73 | self.name, "-inf", self.current_timestamp - self.expire_time 74 | ) 75 | self.del_expire_key_time = self.current_timestamp 76 | 77 | def record_expire_time(self): 78 | if self.expire_time_record_key: 79 | self.redis_db.hset( 80 | self.expire_time_record_key, key=self.name, value=self.expire_time 81 | ) 82 | -------------------------------------------------------------------------------- /feapder/dedup/litefilter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/9/21 11:28 AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | from typing import List, Union, Set 11 | 12 | from feapder.dedup.basefilter import BaseFilter 13 | 14 | 15 | class 
LiteFilter(BaseFilter): 16 | def __init__(self): 17 | self.datas: Set[str] = set() 18 | 19 | def add( 20 | self, keys: Union[List[str], str], *args, **kwargs 21 | ) -> Union[List[int], int]: 22 | """ 23 | 24 | Args: 25 | keys: list / 单个值 26 | *args: 27 | **kwargs: 28 | 29 | Returns: 30 | list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功) 31 | """ 32 | if isinstance(keys, list): 33 | is_add = [] 34 | for key in keys: 35 | if key not in self.datas: 36 | self.datas.add(key) 37 | is_add.append(1) 38 | else: 39 | is_add.append(0) 40 | else: 41 | if keys not in self.datas: 42 | is_add = 1 43 | self.datas.add(keys) 44 | else: 45 | is_add = 0 46 | return is_add 47 | 48 | def get(self, keys: Union[List[str], str]) -> Union[List[int], int]: 49 | """ 50 | 检查数据是否存在 51 | Args: 52 | keys: list / 单个值 53 | 54 | Returns: 55 | list / 单个值 (如果数据已存在 返回 1 否则返回 0) 56 | """ 57 | if isinstance(keys, list): 58 | temp_set = set() 59 | is_exist = [] 60 | for key in keys: 61 | # 数据本身重复或者数据在去重库里 62 | if key in temp_set or key in self.datas: 63 | is_exist.append(1) 64 | else: 65 | is_exist.append(0) 66 | temp_set.add(key) 67 | 68 | return is_exist 69 | else: 70 | return int(keys in self.datas) 71 | -------------------------------------------------------------------------------- /feapder/network/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/network/__init__.py -------------------------------------------------------------------------------- /feapder/network/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | from ._requests import RequestsDownloader 2 | from ._requests import RequestsSessionDownloader 3 | 4 | # 下面是非必要依赖 5 | try: 6 | from ._selenium import SeleniumDownloader 7 | except ModuleNotFoundError: 8 | pass 9 | try: 10 | from ._playwright import PlaywrightDownloader 11 | except ModuleNotFoundError: 12 | pass 13 | -------------------------------------------------------------------------------- /feapder/network/downloader/_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/4/10 5:57 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import requests 12 | from requests.adapters import HTTPAdapter 13 | 14 | from feapder.network.downloader.base import Downloader 15 | from feapder.network.response import Response 16 | 17 | 18 | class RequestsDownloader(Downloader): 19 | def download(self, request) -> Response: 20 | response = requests.request( 21 | request.method, request.url, **request.requests_kwargs 22 | ) 23 | response = Response(response) 24 | return response 25 | 26 | 27 | class RequestsSessionDownloader(Downloader): 28 | session = None 29 | 30 | @property 31 | def _session(self): 32 | if not self.__class__.session: 33 | self.__class__.session = requests.Session() 34 | # pool_connections – 缓存的 urllib3 连接池个数 pool_maxsize – 连接池中保存的最大连接数 35 | http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000) 36 | # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。 37 | self.__class__.session.mount("http", http_adapter) 38 | 39 | return self.__class__.session 40 | 41 | def download(self, request) -> Response: 42 | response = self._session.request( 43 | request.method, request.url, **request.requests_kwargs 44 | ) 45 | response = Response(response) 46 | 
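        # 返回的 Response 为 feapder 封装对象,支持 xpath / css 等解析方法(见 network/response.py)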
return response 47 | -------------------------------------------------------------------------------- /feapder/network/downloader/_selenium.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/7/26 4:28 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import feapder.setting as setting 12 | import feapder.utils.tools as tools 13 | from feapder.network.downloader.base import RenderDownloader 14 | from feapder.network.response import Response 15 | from feapder.utils.webdriver import WebDriverPool, SeleniumDriver 16 | 17 | 18 | class SeleniumDownloader(RenderDownloader): 19 | webdriver_pool: WebDriverPool = None 20 | 21 | @property 22 | def _webdriver_pool(self): 23 | if not self.__class__.webdriver_pool: 24 | self.__class__.webdriver_pool = WebDriverPool( 25 | **setting.WEBDRIVER, driver=SeleniumDriver 26 | ) 27 | 28 | return self.__class__.webdriver_pool 29 | 30 | def download(self, request) -> Response: 31 | # 代理优先级 自定义 > 配置文件 > 随机 32 | if request.custom_proxies: 33 | proxy = request.get_proxy() 34 | elif setting.WEBDRIVER.get("proxy"): 35 | proxy = setting.WEBDRIVER.get("proxy") 36 | else: 37 | proxy = request.get_proxy() 38 | 39 | # user_agent优先级 自定义 > 配置文件 > 随机 40 | if request.custom_ua: 41 | user_agent = request.get_user_agent() 42 | elif setting.WEBDRIVER.get("user_agent"): 43 | user_agent = setting.WEBDRIVER.get("user_agent") 44 | else: 45 | user_agent = request.get_user_agent() 46 | 47 | cookies = request.get_cookies() 48 | url = request.url 49 | render_time = request.render_time or setting.WEBDRIVER.get("render_time") 50 | if request.get_params(): 51 | url = tools.joint_url(url, request.get_params()) 52 | 53 | browser: SeleniumDriver = self._webdriver_pool.get( 54 | user_agent=user_agent, proxy=proxy 55 | ) 56 | try: 57 | browser.get(url) 58 | if cookies: 59 | browser.cookies = cookies 60 | # 刷新使cookie生效 61 | browser.get(url) 62 | 63 | if render_time: 64 | tools.delay_time(render_time) 65 | 66 | html = browser.page_source 67 | response = Response.from_dict( 68 | { 69 | "url": browser.current_url, 70 | "cookies": browser.cookies, 71 | "_content": html.encode(), 72 | "status_code": 200, 73 | "elapsed": 666, 74 | "headers": { 75 | "User-Agent": browser.user_agent, 76 | "Cookie": tools.cookies2str(browser.cookies), 77 | }, 78 | } 79 | ) 80 | 81 | response.driver = browser 82 | response.browser = browser 83 | return response 84 | except Exception as e: 85 | self._webdriver_pool.remove(browser) 86 | raise e 87 | 88 | def close(self, driver): 89 | if driver: 90 | self._webdriver_pool.remove(driver) 91 | 92 | def put_back(self, driver): 93 | """ 94 | 释放浏览器对象 95 | """ 96 | self._webdriver_pool.put(driver) 97 | 98 | def close_all(self): 99 | """ 100 | 关闭所有浏览器 101 | """ 102 | self._webdriver_pool.close() 103 | -------------------------------------------------------------------------------- /feapder/network/downloader/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import ABC 3 | 4 | from feapder.network.response import Response 5 | 6 | 7 | class Downloader: 8 | @abc.abstractmethod 9 | def download(self, request) -> Response: 10 | """ 11 | 12 | Args: 13 | request: feapder.Request 14 | 15 | Returns: feapder.Response 16 | 17 | """ 18 | raise NotImplementedError 19 | 20 | def close(self, response: Response): 21 | pass 22 | 23 | 24 | class RenderDownloader(Downloader, ABC): 25 
| def put_back(self, driver): 26 | """ 27 | 释放浏览器对象 28 | """ 29 | pass 30 | 31 | def close(self, driver): 32 | """ 33 | 关闭浏览器 34 | """ 35 | pass 36 | 37 | def close_all(self): 38 | """ 39 | 关闭所有浏览器 40 | """ 41 | pass 42 | -------------------------------------------------------------------------------- /feapder/network/proxy_pool/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023/7/25 10:16 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | from .base import BaseProxyPool 11 | from .proxy_pool import ProxyPool 12 | -------------------------------------------------------------------------------- /feapder/network/proxy_pool/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023/7/25 10:03 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import abc 12 | 13 | from feapder.utils.log import log 14 | 15 | 16 | class BaseProxyPool: 17 | @abc.abstractmethod 18 | def get_proxy(self): 19 | """ 20 | 获取代理 21 | Returns: 22 | {"http": "xxx", "https": "xxx"} 23 | """ 24 | raise NotImplementedError 25 | 26 | @abc.abstractmethod 27 | def del_proxy(self, proxy): 28 | """ 29 | @summary: 删除代理 30 | --------- 31 | @param proxy: ip:port 32 | """ 33 | raise NotImplementedError 34 | 35 | def tag_proxy(self, **kwargs): 36 | """ 37 | @summary: 标记代理 38 | --------- 39 | @param kwargs: 40 | @return: 41 | """ 42 | log.warning("暂不支持标记代理") 43 | pass 44 | -------------------------------------------------------------------------------- /feapder/network/proxy_pool/proxy_pool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/10/19 10:40 AM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | from queue import Queue 11 | 12 | import requests 13 | 14 | import feapder.setting as setting 15 | from feapder.network.proxy_pool.base import BaseProxyPool 16 | from feapder.utils import metrics 17 | from feapder.utils import tools 18 | 19 | 20 | class ProxyPool(BaseProxyPool): 21 | """ 22 | 通过API提取代理,存储在内存中,无代理时会自动提取 23 | API返回的代理以 \r\n 分隔 24 | """ 25 | 26 | def __init__(self, proxy_api=None, **kwargs): 27 | self.proxy_api = proxy_api or setting.PROXY_EXTRACT_API 28 | self.proxy_queue = Queue() 29 | 30 | def format_proxy(self, proxy): 31 | return {"http": "http://" + proxy, "https": "http://" + proxy} 32 | 33 | @tools.retry(3, interval=5) 34 | def pull_proxies(self): 35 | resp = requests.get(self.proxy_api) 36 | proxies = resp.text.strip() 37 | resp.close() 38 | if "{" in proxies or not proxies: 39 | raise Exception("获取代理失败", proxies) 40 | # 使用 /r/n 分隔 41 | return proxies.split("\r\n") 42 | 43 | def get_proxy(self): 44 | try: 45 | if self.proxy_queue.empty(): 46 | proxies = self.pull_proxies() 47 | for proxy in proxies: 48 | self.proxy_queue.put_nowait(proxy) 49 | metrics.emit_counter("total", 1, classify="proxy") 50 | 51 | proxy = self.proxy_queue.get_nowait() 52 | self.proxy_queue.put_nowait(proxy) 53 | 54 | metrics.emit_counter("used_times", 1, classify="proxy") 55 | 56 | return self.format_proxy(proxy) 57 | except Exception as e: 58 | tools.send_msg("获取代理失败", level="error") 59 | raise Exception("获取代理失败", e) 60 | 61 | def del_proxy(self, proxy): 62 | """ 63 | @summary: 删除代理 64 | 
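        代理将从内存队列中移除,并通过 metrics 打点记录 invalid 次数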
--------- 65 | @param proxy: ip:port 66 | """ 67 | if proxy in self.proxy_queue.queue: 68 | self.proxy_queue.queue.remove(proxy) 69 | metrics.emit_counter("invalid", 1, classify="proxy") 70 | -------------------------------------------------------------------------------- /feapder/network/user_pool/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "GuestUserPool", 3 | "GuestUser", 4 | "NormalUserPool", 5 | "NormalUser", 6 | "GoldUserPool", 7 | "GoldUser", 8 | "GoldUserStatus", 9 | ] 10 | 11 | from .gold_user_pool import GoldUserPool, GoldUser, GoldUserStatus 12 | from .guest_user_pool import GuestUserPool, GuestUser 13 | from .normal_user_pool import NormalUserPool, NormalUser 14 | -------------------------------------------------------------------------------- /feapder/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/3/17 10:57 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import abc 12 | from typing import Dict, List, Tuple 13 | 14 | 15 | class BasePipeline(metaclass=abc.ABCMeta): 16 | """ 17 | pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等 18 | """ 19 | 20 | @abc.abstractmethod 21 | def save_items(self, table, items: List[Dict]) -> bool: 22 | """ 23 | 保存数据 24 | Args: 25 | table: 表名 26 | items: 数据,[{},{},...] 27 | 28 | Returns: 是否保存成功 True / False 29 | 若False,不会将本批数据入到去重库,以便再次入库 30 | 31 | """ 32 | 33 | return True 34 | 35 | def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool: 36 | """ 37 | 更新数据, 与UpdateItem配合使用,若爬虫中没使用UpdateItem,则可不实现此接口 38 | Args: 39 | table: 表名 40 | items: 数据,[{},{},...] 41 | update_keys: 更新的字段, 如 ("title", "publish_time") 42 | 43 | Returns: 是否更新成功 True / False 44 | 若False,不会将本批数据入到去重库,以便再次入库 45 | 46 | """ 47 | 48 | return True 49 | 50 | def close(self): 51 | """ 52 | 关闭,爬虫结束时调用 53 | Returns: 54 | 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /feapder/pipelines/console_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/3/18 12:39 上午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder.pipelines import BasePipeline 12 | from typing import Dict, List, Tuple 13 | from feapder.utils.log import log 14 | 15 | 16 | class ConsolePipeline(BasePipeline): 17 | """ 18 | pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等 19 | """ 20 | 21 | def save_items(self, table, items: List[Dict]) -> bool: 22 | """ 23 | 保存数据 24 | Args: 25 | table: 表名 26 | items: 数据,[{},{},...] 27 | 28 | Returns: 是否保存成功 True / False 29 | 若False,不会将本批数据入到去重库,以便再次入库 30 | 31 | """ 32 | log.info("【调试输出】共导出 %s 条数据 到 %s" % (len(items), table)) 33 | return True 34 | 35 | def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool: 36 | """ 37 | 更新数据 38 | Args: 39 | table: 表名 40 | items: 数据,[{},{},...] 
41 | update_keys: 更新的字段, 如 ("title", "publish_time") 42 | 43 | Returns: 是否更新成功 True / False 44 | 若False,不会将本批数据入到去重库,以便再次入库 45 | 46 | """ 47 | log.info("【调试输出】共导出 %s 条数据 到 %s" % (len(items), table)) 48 | return True 49 | -------------------------------------------------------------------------------- /feapder/pipelines/mongo_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-04-18 14:12:21 4 | --------- 5 | @summary: 导出数据 6 | --------- 7 | @author: Mkdir700 8 | @email: mkdir700@gmail.com 9 | """ 10 | from typing import Dict, List, Tuple 11 | 12 | from feapder.db.mongodb import MongoDB 13 | from feapder.pipelines import BasePipeline 14 | from feapder.utils.log import log 15 | 16 | 17 | class MongoPipeline(BasePipeline): 18 | def __init__(self): 19 | self._to_db = None 20 | 21 | @property 22 | def to_db(self): 23 | if not self._to_db: 24 | self._to_db = MongoDB() 25 | 26 | return self._to_db 27 | 28 | def save_items(self, table, items: List[Dict]) -> bool: 29 | """ 30 | 保存数据 31 | Args: 32 | table: 表名 33 | items: 数据,[{},{},...] 34 | 35 | Returns: 是否保存成功 True / False 36 | 若False,不会将本批数据入到去重库,以便再次入库 37 | 38 | """ 39 | try: 40 | add_count = self.to_db.add_batch(coll_name=table, datas=items) 41 | datas_size = len(items) 42 | log.info( 43 | "共导出 %s 条数据到 %s, 新增 %s条, 重复 %s 条" 44 | % (datas_size, table, add_count, datas_size - add_count) 45 | ) 46 | return True 47 | except Exception as e: 48 | log.exception(e) 49 | return False 50 | 51 | def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool: 52 | """ 53 | 更新数据 54 | Args: 55 | table: 表名 56 | items: 数据,[{},{},...] 57 | update_keys: 更新的字段, 如 ("title", "publish_time") 58 | 59 | Returns: 是否更新成功 True / False 60 | 若False,不会将本批数据入到去重库,以便再次入库 61 | 62 | """ 63 | try: 64 | add_count = self.to_db.add_batch( 65 | coll_name=table, 66 | datas=items, 67 | update_columns=update_keys or list(items[0].keys()), 68 | ) 69 | datas_size = len(items) 70 | update_count = datas_size - add_count 71 | msg = "共导出 %s 条数据到 %s, 新增 %s 条, 更新 %s 条" % ( 72 | datas_size, 73 | table, 74 | add_count, 75 | update_count, 76 | ) 77 | if update_keys: 78 | msg += " 更新字段为 {}".format(update_keys) 79 | log.info(msg) 80 | 81 | return True 82 | except Exception as e: 83 | log.exception(e) 84 | return False 85 | -------------------------------------------------------------------------------- /feapder/pipelines/mysql_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018-07-29 22:48:30 4 | --------- 5 | @summary: 导出数据 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | from typing import Dict, List, Tuple 11 | 12 | import feapder.utils.tools as tools 13 | from feapder.db.mysqldb import MysqlDB 14 | from feapder.pipelines import BasePipeline 15 | from feapder.utils.log import log 16 | 17 | 18 | class MysqlPipeline(BasePipeline): 19 | def __init__(self): 20 | self._to_db = None 21 | 22 | @property 23 | def to_db(self): 24 | if not self._to_db: 25 | self._to_db = MysqlDB() 26 | 27 | return self._to_db 28 | 29 | def save_items(self, table, items: List[Dict]) -> bool: 30 | """ 31 | 保存数据 32 | Args: 33 | table: 表名 34 | items: 数据,[{},{},...] 
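# ---------------------------------------------------------------------------
# Illustrative note (not in the original source): the MongoPipeline above is
# switched on through settings rather than instantiated directly. A spider can
# enable it via __custom_setting__, mirroring tests/mongo_spider.py in this
# repo:
#
# __custom_setting__ = dict(
#     ITEM_PIPELINES=["feapder.pipelines.mongo_pipeline.MongoPipeline"],
#     MONGO_IP="localhost",
#     MONGO_PORT=27017,
#     MONGO_DB="feapder",
#     MONGO_USER_NAME="",
#     MONGO_USER_PASS="",
# )
#
# Yielded Item objects are then batched and written through
# MongoPipeline.save_items / update_items instead of the MySQL pipeline.
# ---------------------------------------------------------------------------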
35 | 36 | Returns: 是否保存成功 True / False 37 | 若False,不会将本批数据入到去重库,以便再次入库 38 | 39 | """ 40 | 41 | sql, datas = tools.make_batch_sql(table, items) 42 | add_count = self.to_db.add_batch(sql, datas) 43 | datas_size = len(datas) 44 | if add_count: 45 | log.info( 46 | "共导出 %s 条数据 到 %s, 重复 %s 条" % (datas_size, table, datas_size - add_count) 47 | ) 48 | 49 | return add_count != None 50 | 51 | def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool: 52 | """ 53 | 更新数据 54 | Args: 55 | table: 表名 56 | items: 数据,[{},{},...] 57 | update_keys: 更新的字段, 如 ("title", "publish_time") 58 | 59 | Returns: 是否更新成功 True / False 60 | 若False,不会将本批数据入到去重库,以便再次入库 61 | 62 | """ 63 | 64 | sql, datas = tools.make_batch_sql( 65 | table, items, update_columns=update_keys or list(items[0].keys()) 66 | ) 67 | update_count = self.to_db.add_batch(sql, datas) 68 | if update_count: 69 | msg = "共更新 %s 条数据 到 %s" % (update_count // 2, table) 70 | if update_keys: 71 | msg += " 更新字段为 {}".format(update_keys) 72 | log.info(msg) 73 | 74 | return update_count != None 75 | -------------------------------------------------------------------------------- /feapder/requirements.txt: -------------------------------------------------------------------------------- 1 | better-exceptions>=0.2.2 2 | DBUtils>=2.0 3 | parsel>=1.5.2 4 | PyExecJS>=1.5.1 5 | pymongo>=3.10.1 6 | PyMySQL>=0.9.3 7 | redis>=2.10.6,<4.0.0 8 | requests>=2.22.0 9 | selenium>=3.141.0 10 | bs4>=0.0.1 11 | ipython>=7.14.0 12 | bitarray>=1.5.3 13 | redis-py-cluster>=2.1.0 14 | cryptography>=3.3.2 15 | urllib3>=1.25.8 16 | loguru>=0.5.3 17 | influxdb>=5.3.1 18 | pyperclip>=1.8.2 19 | webdriver-manager>=4.0.0 20 | terminal-layout>=2.1.3 21 | playwright -------------------------------------------------------------------------------- /feapder/templates/air_spider_template.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on {DATE} 4 | --------- 5 | @summary: 6 | --------- 7 | @author: {USER} 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class ${spider_name}(feapder.AirSpider): 14 | def start_requests(self): 15 | yield feapder.Request("https://spidertools.cn") 16 | 17 | def parse(self, request, response): 18 | # 提取网站title 19 | print(response.xpath("//title/text()").extract_first()) 20 | # 提取网站描述 21 | print(response.xpath("//meta[@name='description']/@content").extract_first()) 22 | print("网站地址: ", response.url) 23 | 24 | 25 | if __name__ == "__main__": 26 | ${spider_name}().start() -------------------------------------------------------------------------------- /feapder/templates/batch_spider_template.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on {DATE} 4 | --------- 5 | @summary: 6 | --------- 7 | @author: {USER} 8 | """ 9 | 10 | import feapder 11 | from feapder import ArgumentParser 12 | 13 | 14 | class ${spider_name}(feapder.BatchSpider): 15 | # 自定义数据库,若项目中有setting.py文件,此自定义可删除 16 | __custom_setting__ = dict( 17 | REDISDB_IP_PORTS="localhost:6379", 18 | REDISDB_USER_PASS="", 19 | REDISDB_DB=0, 20 | MYSQL_IP="localhost", 21 | MYSQL_PORT=3306, 22 | MYSQL_DB="", 23 | MYSQL_USER_NAME="", 24 | MYSQL_USER_PASS="", 25 | ) 26 | 27 | def start_requests(self, task): 28 | yield feapder.Request("https://spidertools.cn") 29 | 30 | def parse(self, request, response): 31 | # 提取网站title 32 | print(response.xpath("//title/text()").extract_first()) 33 | # 提取网站描述 34 | 
print(response.xpath("//meta[@name='description']/@content").extract_first()) 35 | print("网站地址: ", response.url) 36 | 37 | 38 | if __name__ == "__main__": 39 | spider = ${spider_name}( 40 | redis_key="xxx:xxxx", # 分布式爬虫调度信息存储位置 41 | task_table="", # mysql中的任务表 42 | task_keys=["id", "xxx"], # 需要获取任务表里的字段名,可添加多个 43 | task_state="state", # mysql中任务状态字段 44 | batch_record_table="xxx_batch_record", # mysql中的批次记录表 45 | batch_name="xxx(周全)", # 批次名字 46 | batch_interval=7, # 批次周期 天为单位 若为小时 可写 1 / 24 47 | ) 48 | 49 | parser = ArgumentParser(description="${spider_name}爬虫") 50 | 51 | parser.add_argument( 52 | "--start_master", 53 | action="store_true", 54 | help="添加任务", 55 | function=spider.start_monitor_task, 56 | ) 57 | parser.add_argument( 58 | "--start_worker", action="store_true", help="启动爬虫", function=spider.start 59 | ) 60 | 61 | parser.start() 62 | 63 | # 直接启动 64 | # spider.start() # 启动爬虫 65 | # spider.start_monitor_task() # 添加任务 66 | 67 | # 通过命令行启动 68 | # python ${file_name} --start_master # 添加任务 69 | # python ${file_name} --start_worker # 启动爬虫 70 | -------------------------------------------------------------------------------- /feapder/templates/item_template.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on {DATE} 4 | --------- 5 | @summary: 6 | --------- 7 | @author: {USER} 8 | """ 9 | 10 | from feapder import Item 11 | 12 | 13 | class ${item_name}Item(Item): 14 | """ 15 | This class was generated by feapder 16 | command: feapder create -i ${command} 17 | """ 18 | 19 | __table_name__ = "${table_name}" 20 | 21 | def __init__(self, *args, **kwargs): 22 | ${propertys} 23 | -------------------------------------------------------------------------------- /feapder/templates/project_template/CHECK_DATA.md: -------------------------------------------------------------------------------- 1 | # 数据审核 2 | ## 表说明: 3 | 4 | > 表名 含义(更新策略) 5 | 6 | ## 一、准确性 7 | 8 | **字段设计是否满足需求? 表之间的关联字段是否满足要求? (需要人工检查)** 9 | 10 | > 注意:是否设计了自增 id,id 的类型是否设置为 bigint? 11 | > 注意:unique index 是否需要设计? 12 | > 注意:各张表之间是否需要设计关联字段; 13 | 14 | * [ ] 是 15 | * [ ] 否 16 | 17 | **各字段采集内容及存储格式是否满足要求?是否与网页一致?是否有信息缺失?** 18 | 19 | > 备注:可尝试对每个字段进行升降序排列,然后抽样检查; 20 | 21 | **是否考虑了网站同一类数据可能出现的数据格式不一致情况?** 22 | 23 | > 建议:代码对各个字段不做兼容性处理、数据不一致则抛出异常并记录 24 | 25 | * [ ] 是 26 | * [ ] 否 27 | 28 | ## 二、全量性 29 | 30 | **如果是增量采集,是否最早信息和最晚信息都采集了,同时条目总数是否正确;** 31 | **如果是批次采集,是否每个批次都有?** 32 | 33 | >备注:需要去网页端评估单个批次的总量; 34 | >参考sql语句:SELECT count(1), batch_date from [table_name] GROUP BY batch_date; 35 | 36 | **如果与另外一张表有关联关系,是否信息关联完整?** 37 | 38 | ## 三、稳定性 39 | 40 | * [ ] 是否能够长期稳定采集? 41 | * [ ] 是否加IP代理? 42 | * [ ] 是否支持断点续跑? 43 | * [ ] 是否能确保按时启动,定期采集? 44 | * [ ] 是否已开启报警? 45 | 46 | ## 四、采集频次、类型、存储方式 47 | 48 | * [ ] 采集频次是否满足要求? 49 | * [ ] 采集类型是否满足要求:增量采集 or 批次采集? 
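
The item template shown above (feapder/templates/item_template.tmpl) expands into a concrete Item subclass. A minimal sketch of the generated class and how a parser uses it, modeled on tests/batch-spider/items/spider_data_item.py and table.sql found later in this repo (the "spider_data" table and "title" field come from those files, not from new assumptions):

from feapder import Item


class SpiderDataItem(Item):
    # Generated items bind to a database table via __table_name__
    __table_name__ = "spider_data"

    def __init__(self, *args, **kwargs):
        # One attribute per column; values stay None until the parser fills them
        self.title = None


# Inside a parser, yielding the item hands it to the configured pipeline:
# item = SpiderDataItem()
# item.title = response.xpath("//title/text()").extract_first()
# yield item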
50 | -------------------------------------------------------------------------------- /feapder/templates/project_template/README.md: -------------------------------------------------------------------------------- 1 | # xxx爬虫文档 2 | ## 调研 3 | 4 | ## 数据库设计 5 | 6 | ## 爬虫逻辑 7 | 8 | ## 项目架构 -------------------------------------------------------------------------------- /feapder/templates/project_template/items/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/templates/project_template/items/__init__.py -------------------------------------------------------------------------------- /feapder/templates/project_template/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on {DATE} 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: {USER} 8 | """ 9 | 10 | from feapder import ArgumentParser 11 | 12 | from spiders import * 13 | 14 | def crawl_xxx(): 15 | """ 16 | AirSpider爬虫 17 | """ 18 | spider = xxx.XXXSpider() 19 | spider.start() 20 | 21 | def crawl_xxx(): 22 | """ 23 | Spider爬虫 24 | """ 25 | spider = xxx.XXXSpider(redis_key="xxx:xxx") 26 | spider.start() 27 | 28 | 29 | def crawl_xxx(args): 30 | """ 31 | BatchSpider爬虫 32 | """ 33 | spider = xxx_spider.XXXSpider( 34 | task_table="", # mysql中的任务表 35 | batch_record_table="", # mysql中的批次记录表 36 | batch_name="xxx(周全)", # 批次名字 37 | batch_interval=7, # 批次时间 天为单位 若为小时 可写 1 / 24 38 | task_keys=["id", "xxx"], # 需要获取任务表里的字段名,可添加多个 39 | redis_key="xxx:xxxx", # redis中存放request等信息的根key 40 | task_state="state", # mysql中任务状态字段 41 | ) 42 | 43 | if args == 1: 44 | spider.start_monitor_task() 45 | elif args == 2: 46 | spider.start() 47 | elif args == 3: 48 | spider.init_task() 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = ArgumentParser(description="xxx爬虫") 53 | 54 | parser.add_argument( 55 | "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx 56 | ) 57 | parser.add_argument( 58 | "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx 59 | ) 60 | parser.add_argument( 61 | "--crawl_xxx", 62 | type=int, 63 | nargs=1, 64 | help="xxx爬虫", 65 | choices=[1, 2, 3], 66 | function=crawl_xxx, 67 | ) 68 | 69 | parser.start() 70 | 71 | # main.py作为爬虫启动的统一入口,提供命令行的方式启动多个爬虫,若只有一个爬虫,可不编写main.py 72 | # 将上面的xxx修改为自己实际的爬虫名 73 | # 查看运行命令 python main.py --help 74 | # AirSpider与Spider爬虫运行方式 python main.py --crawl_xxx 75 | # BatchSpider运行方式 76 | # 1. 下发任务:python main.py --crawl_xxx 1 77 | # 2. 采集:python main.py --crawl_xxx 2 78 | # 3. 
重置任务:python main.py --crawl_xxx 3 79 | 80 | -------------------------------------------------------------------------------- /feapder/templates/project_template/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/templates/project_template/spiders/__init__.py -------------------------------------------------------------------------------- /feapder/templates/spider_template.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on {DATE} 4 | --------- 5 | @summary: 6 | --------- 7 | @author: {USER} 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class ${spider_name}(feapder.Spider): 14 | # 自定义数据库,若项目中有setting.py文件,此自定义可删除 15 | __custom_setting__ = dict( 16 | REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0 17 | ) 18 | 19 | def start_requests(self): 20 | yield feapder.Request("https://spidertools.cn") 21 | 22 | def parse(self, request, response): 23 | # 提取网站title 24 | print(response.xpath("//title/text()").extract_first()) 25 | # 提取网站描述 26 | print(response.xpath("//meta[@name='description']/@content").extract_first()) 27 | print("网站地址: ", response.url) 28 | 29 | 30 | if __name__ == "__main__": 31 | ${spider_name}(redis_key="xxx:xxx").start() 32 | -------------------------------------------------------------------------------- /feapder/templates/task_spider_template.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on {DATE} 4 | --------- 5 | @summary: 6 | --------- 7 | @author: {USER} 8 | """ 9 | 10 | import feapder 11 | from feapder import ArgumentParser 12 | 13 | 14 | class ${spider_name}(feapder.TaskSpider): 15 | # 自定义数据库,若项目中有setting.py文件,此自定义可删除 16 | __custom_setting__ = dict( 17 | REDISDB_IP_PORTS="localhost:6379", 18 | REDISDB_USER_PASS="", 19 | REDISDB_DB=0, 20 | MYSQL_IP="localhost", 21 | MYSQL_PORT=3306, 22 | MYSQL_DB="", 23 | MYSQL_USER_NAME="", 24 | MYSQL_USER_PASS="", 25 | ) 26 | 27 | def start_requests(self, task): 28 | task_id = task.id 29 | url = task.url 30 | yield feapder.Request(url, task_id=task_id) 31 | 32 | def parse(self, request, response): 33 | # 提取网站title 34 | print(response.xpath("//title/text()").extract_first()) 35 | # 提取网站描述 36 | print(response.xpath("//meta[@name='description']/@content").extract_first()) 37 | print("网站地址: ", response.url) 38 | 39 | # mysql 需要更新任务状态为做完 即 state=1 40 | yield self.update_task_batch(request.task_id) 41 | 42 | 43 | if __name__ == "__main__": 44 | # 用mysql做任务表,需要先建好任务任务表 45 | spider = ${spider_name}( 46 | redis_key="xxx:xxx", # 分布式爬虫调度信息存储位置 47 | task_table="", # mysql中的任务表 48 | task_keys=["id", "url"], # 需要获取任务表里的字段名,可添加多个 49 | task_state="state", # mysql中任务状态字段 50 | ) 51 | 52 | # 用redis做任务表 53 | # spider = ${spider_name}( 54 | # redis_key="xxx:xxxx", # 分布式爬虫调度信息存储位置 55 | # task_table="", # 任务表名 56 | # task_table_type="redis", # 任务表类型为redis 57 | # ) 58 | 59 | parser = ArgumentParser(description="${spider_name}爬虫") 60 | 61 | parser.add_argument( 62 | "--start_master", 63 | action="store_true", 64 | help="添加任务", 65 | function=spider.start_monitor_task, 66 | ) 67 | parser.add_argument( 68 | "--start_worker", action="store_true", help="启动爬虫", function=spider.start 69 | ) 70 | 71 | parser.start() 72 | 73 | # 直接启动 74 | # spider.start() # 启动爬虫 75 | # spider.start_monitor_task() # 添加任务 76 | 77 | # 通过命令行启动 78 | 
# python ${file_name} --start_master # 添加任务 79 | # python ${file_name} --start_worker # 启动爬虫 -------------------------------------------------------------------------------- /feapder/templates/update_item_template.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on {DATE} 4 | --------- 5 | @summary: 6 | --------- 7 | @author: {USER} 8 | """ 9 | 10 | from feapder import UpdateItem 11 | 12 | 13 | class ${item_name}Item(UpdateItem): 14 | """ 15 | This class was generated by feapder 16 | command: feapder create -i ${command} 17 | """ 18 | 19 | __table_name__ = "${table_name}" 20 | 21 | def __init__(self, *args, **kwargs): 22 | ${propertys} 23 | -------------------------------------------------------------------------------- /feapder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Created on 2019/11/5 4:41 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | ''' -------------------------------------------------------------------------------- /feapder/utils/custom_argparse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2018-10-15 14:32:12 4 | --------- 5 | @summary: 封装ArgumentParser, 使其支持function, 调用start自动执行 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import argparse 12 | 13 | 14 | class ArgumentParser(argparse.ArgumentParser): 15 | def __init__(self, *args, **kwargs): 16 | self.functions = {} 17 | 18 | super(ArgumentParser, self).__init__(*args, **kwargs) 19 | 20 | def add_argument(self, *args, **kwargs): 21 | function = kwargs.pop("function") if "function" in kwargs else None 22 | key = self._get_optional_kwargs(*args, **kwargs).get("dest") 23 | self.functions[key] = function 24 | 25 | return super(ArgumentParser, self).add_argument(*args, **kwargs) 26 | 27 | def start(self, args=None, namespace=None): 28 | args = self.parse_args(args=args, namespace=namespace) 29 | for key, value in vars(args).items(): # vars() 函数返回对象object的属性和属性值的字典对象 30 | if value not in (None, False): 31 | if callable(self.functions[key]): 32 | if value != True: 33 | if isinstance(value, list) and len(value) == 1: 34 | value = value[0] 35 | self.functions[key](value) 36 | else: 37 | self.functions[key]() 38 | 39 | def run(self, args, values=None): 40 | if args in self.functions: 41 | if values: 42 | self.functions[args](values) 43 | else: 44 | self.functions[args]() 45 | 46 | else: 47 | raise Exception(f"无此方法: {args}") 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | def test(): 53 | print("test not args func") 54 | 55 | def test2(args): 56 | print("test args func", args) 57 | 58 | parser = ArgumentParser(description="测试") 59 | 60 | parser.add_argument("--test2", type=int, nargs=1, help="(1|2)", function=test2) 61 | parser.add_argument("--test", action="store_true", help="", function=test) 62 | 63 | parser.start() 64 | -------------------------------------------------------------------------------- /feapder/utils/email_sender.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/2/19 12:57 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import os 12 | import smtplib 13 | from email.header import Header 14 | from 
email.mime.multipart import MIMEMultipart 15 | from email.mime.text import MIMEText 16 | from email.utils import formataddr 17 | 18 | from feapder.utils.log import log 19 | 20 | 21 | class EmailSender(object): 22 | SENDER = "feapder报警系统" 23 | 24 | def __init__(self, username, password, smtpserver="smtp.163.com"): 25 | self.username = username 26 | self.password = password 27 | self.smtpserver = smtpserver 28 | self.smtp_client = smtplib.SMTP_SSL(smtpserver) 29 | self.sender = EmailSender.SENDER 30 | 31 | def __enter__(self): 32 | self.login() 33 | return self 34 | 35 | def __exit__(self, exc_type, exc_val, exc_tb): 36 | self.quit() 37 | 38 | def quit(self): 39 | self.smtp_client.quit() 40 | 41 | def login(self): 42 | self.smtp_client.connect(self.smtpserver) 43 | self.smtp_client.login(self.username, self.password) 44 | 45 | def send( 46 | self, 47 | receivers: list, 48 | title: str, 49 | content: str, 50 | content_type: str = "plain", 51 | filepath: str = None, 52 | ): 53 | """ 54 | 55 | Args: 56 | receivers: 57 | title: 58 | content: 59 | content_type: html / plain 60 | filepath: 61 | 62 | Returns: 63 | 64 | """ 65 | # 创建一个带附件的实例 66 | message = MIMEMultipart() 67 | message["From"] = formataddr( 68 | (self.sender, self.username) 69 | ) # 括号里的对应发件人邮箱昵称、发件人邮箱账号 70 | message["To"] = ",".join( 71 | [formataddr((receiver, receiver)) for receiver in receivers] 72 | ) 73 | 74 | message["Subject"] = Header(title, "utf-8") 75 | 76 | content = MIMEText(content, content_type, "utf-8") 77 | message.attach(content) 78 | 79 | # 构造附件 80 | if filepath: 81 | attach = MIMEText(open(filepath, "rb").read(), "base64", "utf-8") 82 | attach.add_header( 83 | "content-disposition", 84 | "attachment", 85 | filename=("utf-8", "", os.path.basename(filepath)), 86 | ) 87 | message.attach(attach) 88 | 89 | msg = message.as_string() 90 | # 此处直接发送多个邮箱有问题,改成一个个发送 91 | for receiver in receivers: 92 | log.debug("发送邮件到 {}".format(receiver)) 93 | self.smtp_client.sendmail(self.username, receiver, msg) 94 | log.debug("邮件发送成功!!!") 95 | return True 96 | -------------------------------------------------------------------------------- /feapder/utils/perfect_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/4/8 11:32 上午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | 12 | def ensure_value(value): 13 | if isinstance(value, (list, tuple)): 14 | _value = [] 15 | for v in value: 16 | _value.append(ensure_value(v)) 17 | 18 | if isinstance(value, tuple): 19 | value = tuple(_value) 20 | else: 21 | value = _value 22 | 23 | if isinstance(value, dict): 24 | return PerfectDict(value) 25 | else: 26 | return value 27 | 28 | 29 | class PerfectDict(dict): 30 | """ 31 | >>> data = PerfectDict({"id":1, "url":"xxx"}) 32 | >>> data 33 | {'id': 1, 'url': 'xxx'} 34 | >>> data = PerfectDict(id=1, url="xxx") 35 | >>> data 36 | {'id': 1, 'url': 'xxx'} 37 | >>> data.id 38 | 1 39 | >>> data.get("id") 40 | 1 41 | >>> data["id"] 42 | 1 43 | >>> id, url = data 44 | >>> id 45 | 1 46 | >>> url 47 | 'xxx' 48 | >>> data[0] 49 | 1 50 | >>> data[1] 51 | 'xxx' 52 | >>> data = PerfectDict({"a": 1, "b": {"b1": 2}, "c": [{"c1": [{"d": 1}]}]}) 53 | >>> data.b.b1 54 | 2 55 | >>> data[1].b1 56 | 2 57 | >>> data.get("b").b1 58 | 2 59 | >>> data.c[0].c1 60 | [{'d': 1}] 61 | >>> data.c[0].c1[0] 62 | {'d': 1} 63 | """ 64 | 65 | def __init__(self, _dict: dict = None, _values: list = None, **kwargs): 66 | self.__dict__ = 
_dict or kwargs or {} 67 | self.__dict__.pop("__values__", None) 68 | super().__init__(self.__dict__, **kwargs) 69 | self.__values__ = _values or list(self.__dict__.values()) 70 | 71 | def __getitem__(self, key): 72 | if isinstance(key, int): 73 | value = self.__values__[key] 74 | else: 75 | value = self.__dict__[key] 76 | 77 | return ensure_value(value) 78 | 79 | def __iter__(self, *args, **kwargs): 80 | for value in self.__values__: 81 | yield ensure_value(value) 82 | 83 | def __getattribute__(self, item): 84 | value = object.__getattribute__(self, item) 85 | if item == "__dict__" or item == "__values__": 86 | return value 87 | return ensure_value(value) 88 | 89 | def get(self, key, default=None): 90 | if key in self.__dict__: 91 | value = self.__dict__[key] 92 | return ensure_value(value) 93 | 94 | return default 95 | -------------------------------------------------------------------------------- /feapder/utils/tail_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2024/3/19 20:00 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | import sys 11 | import threading 12 | 13 | 14 | class TailThread(threading.Thread): 15 | """ 16 | 所有子线程结束后,主线程才会退出 17 | """ 18 | 19 | def start(self) -> None: 20 | """ 21 | 解决python3.12 RuntimeError: cannot join thread before it is started的报错 22 | """ 23 | super().start() 24 | 25 | if sys.version_info.minor >= 12 and sys.version_info.major >= 3: 26 | for thread in threading.enumerate(): 27 | if ( 28 | thread.daemon 29 | or thread is threading.current_thread() 30 | or not thread.is_alive() 31 | ): 32 | continue 33 | thread.join() 34 | -------------------------------------------------------------------------------- /feapder/utils/webdriver/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/9/7 4:39 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | from .playwright_driver import PlaywrightDriver 11 | from .selenium_driver import SeleniumDriver 12 | from .webdirver import InterceptRequest, InterceptResponse 13 | from .webdriver_pool import WebDriverPool 14 | 15 | # 为了兼容老代码 16 | WebDriver = SeleniumDriver 17 | -------------------------------------------------------------------------------- /feapder/utils/webdriver/webdirver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/9/7 4:27 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | import abc 11 | 12 | from feapder import setting 13 | 14 | 15 | class InterceptRequest: 16 | def __init__(self, url, data, headers): 17 | self.url = url 18 | self.data = data 19 | self.headers = headers 20 | 21 | 22 | class InterceptResponse: 23 | def __init__(self, request: InterceptRequest, url, headers, content, status_code): 24 | self.request = request 25 | self.url = url 26 | self.headers = headers 27 | self.content = content 28 | self.status_code = status_code 29 | 30 | 31 | class WebDriver: 32 | def __init__( 33 | self, 34 | load_images=True, 35 | user_agent=None, 36 | proxy=None, 37 | headless=False, 38 | driver_type=None, 39 | timeout=16, 40 | window_size=(1024, 800), 41 | executable_path=None, 42 | custom_argument=None, 43 | download_path=None, 44 | auto_install_driver=True, 
45 | use_stealth_js=True, 46 | **kwargs, 47 | ): 48 | """ 49 | webdirver 封装,支持chrome、phantomjs 和 firefox 50 | Args: 51 | load_images: 是否加载图片 52 | user_agent: 字符串 或 无参函数,返回值为user_agent 53 | proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址 54 | headless: 是否启用无头模式 55 | driver_type: CHROME,EDGE 或 PHANTOMJS,FIREFOX 56 | timeout: 请求超时时间 57 | window_size: # 窗口大小 58 | executable_path: 浏览器路径,默认为默认路径 59 | custom_argument: 自定义参数 用于webdriver.Chrome(options=chrome_options, **kwargs) 60 | download_path: 文件下载保存路径;如果指定,不再出现“保留”“放弃”提示,仅对Chrome有效 61 | auto_install_driver: 自动下载浏览器驱动 支持chrome 和 firefox 62 | use_stealth_js: 使用stealth.min.js隐藏浏览器特征 63 | **kwargs: 64 | """ 65 | self._load_images = load_images 66 | self._user_agent = user_agent or setting.DEFAULT_USERAGENT 67 | self._proxy = proxy 68 | self._headless = headless 69 | self._timeout = timeout 70 | self._window_size = window_size 71 | self._executable_path = executable_path 72 | self._custom_argument = custom_argument 73 | self._download_path = download_path 74 | self._auto_install_driver = auto_install_driver 75 | self._use_stealth_js = use_stealth_js 76 | self._driver_type = driver_type 77 | self._kwargs = kwargs 78 | 79 | @abc.abstractmethod 80 | def quit(self): 81 | pass 82 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/22 10:45 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from os.path import dirname, join 12 | from sys import version_info 13 | 14 | import setuptools 15 | 16 | if version_info < (3, 6, 0): 17 | raise SystemExit("Sorry! feapder requires python 3.6.0 or later.") 18 | 19 | with open(join(dirname(__file__), "feapder/VERSION"), "rb") as fh: 20 | version = fh.read().decode("ascii").strip() 21 | 22 | with open("README.md", "r", encoding="utf8") as fh: 23 | long_description = fh.read() 24 | 25 | packages = setuptools.find_packages() 26 | packages.extend( 27 | [ 28 | "feapder", 29 | "feapder.templates", 30 | "feapder.templates.project_template", 31 | "feapder.templates.project_template.spiders", 32 | "feapder.templates.project_template.items", 33 | ] 34 | ) 35 | 36 | requires = [ 37 | "better-exceptions>=0.2.2", 38 | "DBUtils>=2.0", 39 | "parsel>=1.5.2", 40 | "PyMySQL>=0.9.3", 41 | "redis>=2.10.6,<4.0.0", 42 | "requests>=2.22.0", 43 | "bs4>=0.0.1", 44 | "ipython>=7.14.0", 45 | "cryptography>=3.3.2", 46 | "urllib3>=1.25.8", 47 | "loguru>=0.5.3", 48 | "influxdb>=5.3.1", 49 | "pyperclip>=1.8.2", 50 | "terminal-layout>=2.1.3", 51 | ] 52 | 53 | render_requires = [ 54 | "webdriver-manager>=4.0.0", 55 | "playwright", 56 | "selenium>=3.141.0", 57 | ] 58 | 59 | all_requires = [ 60 | "bitarray>=1.5.3", 61 | "PyExecJS>=1.5.1", 62 | "pymongo>=3.10.1", 63 | "redis-py-cluster>=2.1.0", 64 | ] + render_requires 65 | 66 | setuptools.setup( 67 | name="feapder", 68 | version=version, 69 | author="Boris", 70 | license="MIT", 71 | author_email="feapder@qq.com", 72 | python_requires=">=3.6", 73 | description="feapder是一款支持分布式、批次采集、数据防丢、报警丰富的python爬虫框架", 74 | long_description=long_description, 75 | long_description_content_type="text/markdown", 76 | install_requires=requires, 77 | extras_require={"all": all_requires, "render": render_requires}, 78 | entry_points={"console_scripts": ["feapder = feapder.commands.cmdline:execute"]}, 79 | url="https://github.com/Boris-code/feapder.git", 80 | packages=packages, 81 | 
include_package_data=True, 82 | classifiers=["Programming Language :: Python :: 3"], 83 | ) 84 | -------------------------------------------------------------------------------- /tests/air-spider/test_air_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/22 10:41 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import feapder 12 | 13 | 14 | class TestAirSpider(feapder.AirSpider): 15 | __custom_setting__ = dict( 16 | USE_SESSION=True, 17 | TASK_MAX_CACHED_SIZE=10, 18 | ) 19 | 20 | def start_callback(self): 21 | print("爬虫开始") 22 | 23 | def end_callback(self): 24 | print("爬虫结束") 25 | 26 | def start_requests(self, *args, **kws): 27 | for i in range(1): 28 | print(i) 29 | yield feapder.Request("https://www.baidu.com") 30 | 31 | def download_midware(self, request): 32 | # request.headers = {'User-Agent': ""} 33 | # request.proxies = {"https":"https://12.12.12.12:6666"} 34 | # request.cookies = {} 35 | return request 36 | 37 | def validate(self, request, response): 38 | if response.status_code != 200: 39 | raise Exception("response code not 200") # 重试 40 | 41 | # if "哈哈" not in response.text: 42 | # return False # 抛弃当前请求 43 | 44 | def parse(self, request, response): 45 | print(response.bs4().title) 46 | print(response.xpath("//title").extract_first()) 47 | 48 | 49 | if __name__ == "__main__": 50 | TestAirSpider(thread_count=1).start() 51 | -------------------------------------------------------------------------------- /tests/air-spider/test_air_spider_filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/22 10:41 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import feapder 12 | 13 | 14 | class TestAirSpider(feapder.AirSpider): 15 | __custom_setting__ = dict( 16 | REQUEST_FILTER_ENABLE=True, # request 去重 17 | # REQUEST_FILTER_SETTING=dict( 18 | # filter_type=3, # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4 19 | # expire_time=2592000, # 过期时间1个月 20 | # ), 21 | REQUEST_FILTER_SETTING=dict( 22 | filter_type=4, # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4 23 | ), 24 | ) 25 | 26 | def start_requests(self, *args, **kws): 27 | for i in range(200): 28 | yield feapder.Request("https://www.baidu.com") 29 | 30 | def parse(self, request, response): 31 | print(response.bs4().title) 32 | 33 | 34 | if __name__ == "__main__": 35 | TestAirSpider(thread_count=1).start() 36 | -------------------------------------------------------------------------------- /tests/air-spider/test_air_spider_item.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-30 10:27:21 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from feapder import Item 12 | 13 | 14 | class TestAirSpiderItem(feapder.AirSpider): 15 | __custom_setting__ = dict( 16 | MYSQL_IP="localhost", 17 | MYSQL_PORT=3306, 18 | MYSQL_DB="feapder", 19 | MYSQL_USER_NAME="feapder", 20 | MYSQL_USER_PASS="feapder123", 21 | ITEM_FILTER_ENABLE=True, # item 去重 22 | ITEM_FILTER_SETTING = dict( 23 | filter_type=4 # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、轻量去重(LiteFilter)= 4 24 | ) 25 | ) 26 | 27 | 
def start_requests(self): 28 | yield feapder.Request("https://www.baidu.com") 29 | 30 | def parse(self, request, response): 31 | title = response.xpath("string(//title)").extract_first() 32 | for i in range(3): 33 | item = Item() 34 | item.table_name = "spider_data" 35 | item.url = request.url 36 | item.title = title 37 | yield item 38 | 39 | 40 | if __name__ == "__main__": 41 | TestAirSpiderItem().start() 42 | -------------------------------------------------------------------------------- /tests/air-spider/test_render_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2020/4/22 10:41 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import feapder 12 | 13 | 14 | class TestAirSpider(feapder.AirSpider): 15 | def start_requests(self, *args, **kws): 16 | yield feapder.Request("https://www.baidu.com", render=True) 17 | 18 | # def download_midware(self, request): 19 | # request.proxies = { 20 | # "http": "http://xxx.xxx.xxx.xxx:8888", 21 | # "https": "http://xxx.xxx.xxx.xxx:8888", 22 | # } 23 | 24 | def parse(self, request, response): 25 | print(response.bs4().title) 26 | 27 | 28 | if __name__ == "__main__": 29 | TestAirSpider(thread_count=1).start() 30 | -------------------------------------------------------------------------------- /tests/batch-spider-integration/batch_spider_integration_task.sql: -------------------------------------------------------------------------------- 1 | -- ---------------------------- 2 | -- Table structure for batch_spider_integration_task 3 | -- ---------------------------- 4 | CREATE TABLE `batch_spider_integration_task` ( 5 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 6 | `url` varchar(255) DEFAULT NULL, 7 | `parser_name` varchar(255) DEFAULT NULL, 8 | `state` int(11) DEFAULT '0', 9 | PRIMARY KEY (`id`) 10 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 11 | 12 | -- ---------------------------- 13 | -- Records of batch_spider_integration_task 14 | -- ---------------------------- 15 | INSERT INTO `batch_spider_integration_task` VALUES (1, 'https://news.sina.com.cn/', 'SinaNewsParser', 0); 16 | INSERT INTO `batch_spider_integration_task` VALUES (2, 'https://news.qq.com/', 'TencentNewsParser', 0); -------------------------------------------------------------------------------- /tests/batch-spider-integration/items/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/tests/batch-spider-integration/items/__init__.py -------------------------------------------------------------------------------- /tests/batch-spider-integration/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-02 23:38:24 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from feapder import ArgumentParser 11 | from feapder import BatchSpider 12 | 13 | from spiders import * 14 | 15 | 16 | def batch_spider_integration_test(args): 17 | """ 18 | BatchSpider集成测试 19 | """ 20 | 21 | spider = BatchSpider( 22 | task_table="batch_spider_integration_task", # mysql中的任务表 23 | batch_record_table="batch_spider_integration_batch_record", # mysql中的批次记录表 24 | batch_name="批次爬虫集成测试", # 批次名字 25 | batch_interval=7, # 批次时间 天为单位 若为小时 可写 1 / 24 26 | 
task_keys=["id", "url", "parser_name"], # 集成批次爬虫,需要将批次爬虫的名字取出来,任务分发时才知道分发到哪个模板上 27 | redis_key="feapder:test_batch_spider_integration", # redis中存放request等信息的根key 28 | task_state="state", # mysql中任务状态字段 29 | ) 30 | 31 | # 集成 32 | spider.add_parser(sina_news_parser.SinaNewsParser) 33 | spider.add_parser(tencent_news_parser.TencentNewsParser) 34 | 35 | if args == 1: 36 | spider.start_monitor_task() 37 | elif args == 2: 38 | spider.start() 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = ArgumentParser(description="批次爬虫集成测试") 43 | 44 | parser.add_argument( 45 | "--batch_spider_integration_test", 46 | type=int, 47 | nargs=1, 48 | help="批次爬虫集成测试(1|2)", 49 | function=batch_spider_integration_test, 50 | ) 51 | 52 | parser.start() 53 | 54 | # 运行 55 | # 下发任务及监控进度 python3 main.py --batch_spider_integration_test 1 56 | # 采集 python3 main.py --batch_spider_integration_test 2 57 | -------------------------------------------------------------------------------- /tests/batch-spider-integration/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | # MYSQL 6 | MYSQL_IP = "localhost" 7 | MYSQL_PORT = 3306 8 | MYSQL_DB = "feapder" 9 | MYSQL_USER_NAME = "feapder" 10 | MYSQL_USER_PASS = "feapder123" 11 | 12 | # REDIS 13 | # IP:PORT 14 | REDISDB_IP_PORTS = "localhost:6379" 15 | REDISDB_USER_PASS = "" 16 | # 默认 0 到 15 共16个数据库 17 | REDISDB_DB = 0 18 | 19 | # # 爬虫相关 20 | # # COLLECTOR 21 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 22 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量 23 | # 24 | # # SPIDER 25 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数 26 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间) 27 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 28 | 29 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败 30 | # RETRY_FAILED_REQUESTS = False 31 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒 32 | # REQUEST_LOST_TIMEOUT = 600 # 10分钟 33 | # # 保存失败的request 34 | # SAVE_FAILED_REQUEST = True 35 | # 36 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用 37 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 38 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 39 | # RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True 40 | # 41 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 42 | # 43 | # # 爬虫初始化工作 44 | # # 爬虫是否常驻 45 | # KEEP_ALIVE = False 46 | # 47 | # 48 | # # 设置代理 49 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 50 | # PROXY_ENABLE = True 51 | # 52 | # # 随机headers 53 | # RANDOM_HEADERS = True 54 | # # requests 使用session 55 | # USE_SESSION = False 56 | # 57 | # # 去重 58 | # ITEM_FILTER_ENABLE = False # item 去重 59 | # REQUEST_FILTER_ENABLE = False # request 去重 60 | # 61 | # # 报警 62 | # DINGDING_WARNING_URL = "" # 钉钉机器人api 63 | # DINGDING_WARNING_PHONE = "" # 报警人 64 | # LINGXI_TOKEN = "" # 灵犀报警token 65 | # 66 | # LOG_NAME = os.path.basename(os.getcwd()) 67 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 68 | # LOG_LEVEL = "DEBUG" 69 | # LOG_IS_WRITE_TO_FILE = False 70 | # OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 71 | -------------------------------------------------------------------------------- /tests/batch-spider-integration/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "sina_news_parser", 3 | "tencent_news_parser" 4 | ] -------------------------------------------------------------------------------- /tests/batch-spider-integration/spiders/sina_news_parser.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-02 23:40:37 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class SinaNewsParser(feapder.BatchParser): 14 | """ 15 | 注意 这里继承的是BatchParser,而不是BatchSpider 16 | """ 17 | 18 | def start_requests(self, task): 19 | task_id = task[0] 20 | url = task[1] 21 | yield feapder.Request(url, task_id=task_id) 22 | 23 | def parse(self, request, response): 24 | title = response.xpath("//title/text()").extract_first() 25 | print(self.name, title) 26 | yield self.update_task_batch(request.task_id, 1) 27 | -------------------------------------------------------------------------------- /tests/batch-spider-integration/spiders/tencent_news_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-02 23:42:40 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class TencentNewsParser(feapder.BatchParser): 14 | """ 15 | 注意 这里继承的是BatchParser,而不是BatchSpider 16 | """ 17 | 18 | def start_requests(self, task): 19 | task_id = task[0] 20 | url = task[1] 21 | yield feapder.Request(url, task_id=task_id) 22 | 23 | def parse(self, request, response): 24 | title = response.xpath("//title/text()").extract_first() 25 | print(self.name, title) 26 | yield self.update_task_batch(request.task_id, 1) 27 | -------------------------------------------------------------------------------- /tests/batch-spider/items/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "spider_data_item" 3 | ] -------------------------------------------------------------------------------- /tests/batch-spider/items/spider_data_item.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:39:27 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from feapder import Item 11 | 12 | 13 | class SpiderDataItem(Item): 14 | """ 15 | This class was generated by feapder. 16 | command: feapder create -i spider_data. 
17 | """ 18 | 19 | def __init__(self, *args, **kwargs): 20 | # self.id = None # type : int(10) unsigned | allow_null : NO | key : PRI | default_value : None | extra : auto_increment | column_comment : 21 | self.title = None # type : varchar(255) | allow_null : YES | key : | default_value : None | extra : | column_comment : 22 | -------------------------------------------------------------------------------- /tests/batch-spider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:02:02 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from spiders import * 11 | from feapder import ArgumentParser 12 | 13 | 14 | def crawl_test(args): 15 | spider = test_spider.TestSpider( 16 | redis_key="feapder:test_batch_spider", # 分布式爬虫调度信息存储位置 17 | task_table="batch_spider_task", # mysql中的任务表 18 | task_keys=["id", "url"], # 需要获取任务表里的字段名,可添加多个 19 | task_state="state", # mysql中任务状态字段 20 | batch_record_table="batch_spider_batch_record", # mysql中的批次记录表 21 | batch_name="批次爬虫测试(周全)", # 批次名字 22 | batch_interval=7, # 批次周期 天为单位 若为小时 可写 1 / 24 23 | ) 24 | 25 | if args == 1: 26 | spider.start_monitor_task() # 下发及监控任务 27 | else: 28 | spider.start() # 采集 29 | 30 | def test_debug(): 31 | spider = test_spider.TestSpider.to_DebugBatchSpider( 32 | task_id=1, 33 | redis_key="feapder:test_batch_spider", # 分布式爬虫调度信息存储位置 34 | task_table="batch_spider_task", # mysql中的任务表 35 | task_keys=["id", "url"], # 需要获取任务表里的字段名,可添加多个 36 | task_state="state", # mysql中任务状态字段 37 | batch_record_table="batch_spider_batch_record", # mysql中的批次记录表 38 | batch_name="批次爬虫测试(周全)", # 批次名字 39 | batch_interval=7, # 批次周期 天为单位 若为小时 可写 1 / 24 40 | ) 41 | 42 | 43 | spider.start() # 采集 44 | 45 | 46 | if __name__ == "__main__": 47 | 48 | parser = ArgumentParser(description="批次爬虫测试") 49 | 50 | parser.add_argument( 51 | "--crawl_test", type=int, nargs=1, help="(1|2)", function=crawl_test 52 | ) 53 | parser.add_argument("--test_debug", action="store_true", help="测试debug", function=test_debug) 54 | 55 | parser.start() 56 | 57 | # 运行 58 | # 下发任务及监控进度 python3 main.py --crawl_test 1 59 | # 采集 python3 main.py --crawl_test 2 -------------------------------------------------------------------------------- /tests/batch-spider/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | 6 | # MYSQL 7 | MYSQL_IP = "localhost" 8 | MYSQL_PORT = 3306 9 | MYSQL_DB = "feapder" 10 | MYSQL_USER_NAME = "feapder" 11 | MYSQL_USER_PASS = "feapder123" 12 | 13 | # REDIS 14 | # IP:PORT 15 | REDISDB_IP_PORTS = "localhost:6379" 16 | REDISDB_USER_PASS = "" 17 | # 默认 0 到 15 共16个数据库 18 | REDISDB_DB = 0 19 | 20 | # # 爬虫相关 21 | # # COLLECTOR 22 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 23 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量 24 | # 25 | # # SPIDER 26 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数 27 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间) 28 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 29 | 30 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败 31 | # RETRY_FAILED_REQUESTS = False 32 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒 33 | # REQUEST_LOST_TIMEOUT = 600 # 10分钟 34 | # # 保存失败的request 35 | # SAVE_FAILED_REQUEST = True 36 | # 37 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用 38 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 39 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 40 | # RESPONSE_CACHED_USED = False # 是否使用缓存 
补采数据时可设置为True 41 | # 42 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 43 | # 44 | # # 爬虫初始化工作 45 | # # 爬虫是否常驻 46 | # KEEP_ALIVE = False 47 | # 48 | # 49 | # # 设置代理 50 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 51 | # PROXY_ENABLE = True 52 | # 53 | # # 随机headers 54 | # RANDOM_HEADERS = True 55 | # # requests 使用session 56 | # USE_SESSION = False 57 | # 58 | # # 去重 59 | # ITEM_FILTER_ENABLE = False # item 去重 60 | # REQUEST_FILTER_ENABLE = False # request 去重 61 | # 62 | # # 报警 63 | # DINGDING_WARNING_URL = "" # 钉钉机器人api 64 | # DINGDING_WARNING_PHONE = "" # 报警人 65 | # LINGXI_TOKEN = "" # 灵犀报警token 66 | # 67 | # LOG_NAME = os.path.basename(os.getcwd()) 68 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 69 | # LOG_LEVEL = "DEBUG" 70 | # LOG_IS_WRITE_TO_FILE = False 71 | # OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 72 | -------------------------------------------------------------------------------- /tests/batch-spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "test_spider" 3 | ] -------------------------------------------------------------------------------- /tests/batch-spider/spiders/test_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:09:47 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from items import * 12 | 13 | 14 | class TestSpider(feapder.BatchSpider): 15 | # def init_task(self): 16 | # pass 17 | 18 | def start_requests(self, task): 19 | # task 为在任务表中取出的每一条任务 20 | id, url = task # id, url为所取的字段,main函数中指定的 21 | yield feapder.Request(url, task_id=id, render=True) # task_id为任务id,用于更新任务状态 22 | 23 | def parse(self, request, response): 24 | title = response.xpath('//title/text()').extract_first() # 取标题 25 | item = spider_data_item.SpiderDataItem() # 声明一个item 26 | item.title = title # 给item属性赋值 27 | yield item # 返回item, item会自动批量入库 28 | yield self.update_task_batch(request.task_id, 1) # 更新任务状态为1 29 | 30 | def exception_request(self, request, response): 31 | """ 32 | @summary: 请求或者parser里解析出异常的request 33 | --------- 34 | @param request: 35 | @param response: 36 | --------- 37 | @result: request / callback / None (返回值必须可迭代) 38 | """ 39 | 40 | pass 41 | 42 | def failed_request(self, request, response): 43 | """ 44 | @summary: 超过最大重试次数的request 45 | --------- 46 | @param request: 47 | --------- 48 | @result: request / item / callback / None (返回值必须可迭代) 49 | """ 50 | 51 | yield request 52 | yield self.update_task_batch(request.task_id, -1) # 更新任务状态为-1 53 | -------------------------------------------------------------------------------- /tests/batch-spider/table.sql: -------------------------------------------------------------------------------- 1 | -- ---------------------------- 2 | -- Table structure for spider_data 3 | -- ---------------------------- 4 | CREATE TABLE `spider_data` ( 5 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 6 | `title` varchar(255) DEFAULT NULL, 7 | PRIMARY KEY (`id`) 8 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 9 | 10 | -- ---------------------------- 11 | -- Table structure for batch_spider_task 12 | -- ---------------------------- 13 | CREATE TABLE `batch_spider_task` ( 14 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 15 | `url` varchar(255) DEFAULT NULL, 16 | `state` int(11) DEFAULT '0', 17 | PRIMARY KEY (`id`) 18 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT 
CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 19 | 20 | -- ---------------------------- 21 | -- Records of batch_spider_task 22 | -- ---------------------------- 23 | INSERT INTO `batch_spider_task` VALUES (1, 'https://www.baidu.com', 0); 24 | -------------------------------------------------------------------------------- /tests/db/test_redis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/3/4 11:01 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder.db.redisdb import RedisDB 12 | 13 | redis = RedisDB(ip_ports="localhost:6379", db=0) 14 | 15 | redis.lpush("l_test", 2) 16 | redis.lpush("l_test", 3) 17 | 18 | print(redis.lrange("l_test")) 19 | print(redis.lrem("l_test", 2)) 20 | print(redis.lrange("l_test")) 21 | -------------------------------------------------------------------------------- /tests/jd_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-09 20:45:36 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from feapder import Item 12 | from feapder.utils import tools 13 | 14 | 15 | class JdSpider(feapder.BatchSpider): 16 | # 自定义数据库,若项目中有setting.py文件,此自定义可删除 17 | __custom_setting__ = dict( 18 | REDISDB_IP_PORTS="localhost:6379", 19 | REDISDB_DB=0, 20 | MYSQL_IP="localhost", 21 | MYSQL_PORT=3306, 22 | MYSQL_DB="feapder", 23 | MYSQL_USER_NAME="feapder", 24 | MYSQL_USER_PASS="feapder123", 25 | ) 26 | 27 | def start_requests(self, task): 28 | task_id, item_id = task 29 | url = "https://item.jd.com/{}.html".format(item_id) 30 | yield feapder.Request(url, task_id=task_id) # 携带task_id字段 31 | 32 | def parse(self, request, response): 33 | title = response.xpath("string(//div[@class='sku-name'])").extract_first(default="").strip() 34 | 35 | item = Item() 36 | item.table_name = "jd_item" # 指定入库的表名 37 | item.title = title 38 | item.batch_date = self.batch_date # 获取批次信息,批次信息框架自己维护 39 | item.crawl_time = tools.get_current_date() # 获取当前时间 40 | yield item # 自动批量入库 41 | yield self.update_task_batch(request.task_id, 1) # 更新任务状态 42 | 43 | 44 | if __name__ == "__main__": 45 | spider = JdSpider( 46 | redis_key="feapder:jd_item", # redis中存放任务等信息key前缀 47 | task_table="jd_item_task", # mysql中的任务表 48 | task_keys=["id", "item_id"], # 需要获取任务表里的字段名,可添加多个 49 | task_state="state", # mysql中任务状态字段 50 | batch_record_table="jd_item_batch_record", # mysql中的批次记录表,自动生成 51 | batch_name="京东商品爬虫(周度全量)", # 批次名字 52 | batch_interval=7, # 批次周期 天为单位 若为小时 可写 1 / 24 53 | ) 54 | 55 | # 下面两个启动函数 相当于 master、worker。需要分开运行 56 | # spider.start_monitor_task() # maser: 下发及监控任务 57 | spider.start() # worker: 采集 58 | -------------------------------------------------------------------------------- /tests/mongo_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:06:12 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from feapder import Item, UpdateItem 12 | 13 | 14 | class TestMongo(feapder.AirSpider): 15 | __custom_setting__ = dict( 16 | ITEM_PIPELINES=["feapder.pipelines.mongo_pipeline.MongoPipeline"], 17 | MONGO_IP="localhost", 18 | MONGO_PORT=27017, 19 | MONGO_DB="feapder", 20 | MONGO_USER_NAME="", 21 | MONGO_USER_PASS="", 22 | ) 23 | 24 | def start_requests(self): 25 | yield 
feapder.Request("https://www.baidu.com") 26 | 27 | def parse(self, request, response): 28 | title = response.xpath("//title/text()").extract_first() # 取标题 29 | for i in range(10): 30 | item = Item() # 声明一个item 31 | item.table_name = "test_mongo" 32 | item.title = title + str(666) # 给item属性赋值 33 | item.i = i + 5 34 | item.c = "777" 35 | yield item # 返回item, item会自动批量入库 36 | 37 | 38 | if __name__ == "__main__": 39 | TestMongo().start() 40 | -------------------------------------------------------------------------------- /tests/spider-integration/items/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/tests/spider-integration/items/__init__.py -------------------------------------------------------------------------------- /tests/spider-integration/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-02 23:38:24 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from feapder import Spider 11 | 12 | from spiders import * 13 | 14 | 15 | def spider_integration_test(): 16 | """ 17 | Spider集成测试 18 | """ 19 | spider = Spider(redis_key="feapder:test_spider_integration") 20 | # 集成 21 | spider.add_parser(sina_news_parser.SinaNewsParser) 22 | spider.add_parser(tencent_news_parser.TencentNewsParser) 23 | 24 | spider.start() 25 | 26 | 27 | if __name__ == "__main__": 28 | spider_integration_test() 29 | -------------------------------------------------------------------------------- /tests/spider-integration/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | # MYSQL 6 | MYSQL_IP = "localhost" 7 | MYSQL_PORT = 3306 8 | MYSQL_DB = "feapder" 9 | MYSQL_USER_NAME = "feapder" 10 | MYSQL_USER_PASS = "feapder123" 11 | 12 | # REDIS 13 | # IP:PORT 14 | REDISDB_IP_PORTS = "localhost:6379" 15 | REDISDB_USER_PASS = "" 16 | # 默认 0 到 15 共16个数据库 17 | REDISDB_DB = 0 18 | 19 | # # 爬虫相关 20 | # # COLLECTOR 21 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 22 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量 23 | # 24 | # # SPIDER 25 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数 26 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间) 27 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 28 | 29 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败 30 | # RETRY_FAILED_REQUESTS = False 31 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒 32 | # REQUEST_LOST_TIMEOUT = 600 # 10分钟 33 | # # 保存失败的request 34 | # SAVE_FAILED_REQUEST = True 35 | # 36 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用 37 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 38 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 39 | # RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True 40 | # 41 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 42 | # 43 | # # 爬虫初始化工作 44 | # # 爬虫是否常驻 45 | # KEEP_ALIVE = False 46 | # 47 | # 48 | # # 设置代理 49 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 50 | # PROXY_ENABLE = True 51 | # 52 | # # 随机headers 53 | # RANDOM_HEADERS = True 54 | # # requests 使用session 55 | # USE_SESSION = False 56 | # 57 | # # 去重 58 | # ITEM_FILTER_ENABLE = False # item 去重 59 | # REQUEST_FILTER_ENABLE = False # request 去重 60 | # 61 | # # 报警 62 | # DINGDING_WARNING_URL = "" # 钉钉机器人api 63 | # DINGDING_WARNING_PHONE = "" # 报警人 64 | # LINGXI_TOKEN = "" # 灵犀报警token 65 | # 66 | # 
LOG_NAME = os.path.basename(os.getcwd()) 67 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 68 | # LOG_LEVEL = "DEBUG" 69 | # LOG_IS_WRITE_TO_FILE = False 70 | # OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 71 | -------------------------------------------------------------------------------- /tests/spider-integration/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "sina_news_parser", 3 | "tencent_news_parser" 4 | ] -------------------------------------------------------------------------------- /tests/spider-integration/spiders/sina_news_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-02 23:40:37 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class SinaNewsParser(feapder.BaseParser): 14 | def start_requests(self): 15 | """ 16 | 注意 这里继承的是BaseParser,而不是Spider 17 | """ 18 | yield feapder.Request("https://news.sina.com.cn/") 19 | 20 | def parse(self, request, response): 21 | title = response.xpath("//title/text()").extract_first() 22 | print(title) 23 | -------------------------------------------------------------------------------- /tests/spider-integration/spiders/tencent_news_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-02 23:42:40 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class TencentNewsParser(feapder.BaseParser): 14 | """ 15 | 注意 这里继承的是BaseParser,而不是Spider 16 | """ 17 | def start_requests(self): 18 | yield feapder.Request("https://news.qq.com/") 19 | 20 | def parse(self, request, response): 21 | title = response.xpath("//title/text()").extract_first() 22 | print(title) 23 | -------------------------------------------------------------------------------- /tests/spider/items/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "spider_data_item" 3 | ] -------------------------------------------------------------------------------- /tests/spider/items/spider_data_item.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-10 17:28:36 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from feapder import Item 11 | 12 | 13 | class SpiderDataItem(Item): 14 | """ 15 | This class was generated by feapder. 16 | command: feapder create -i spider_data. 
17 | """ 18 | 19 | def __init__(self, *args, **kwargs): 20 | # self.id = None 21 | self.title = None 22 | -------------------------------------------------------------------------------- /tests/spider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:01:50 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from spiders import * 11 | 12 | if __name__ == "__main__": 13 | spider = test_spider.TestSpider(redis_key="feapder3:test_spider", thread_count=100, keep_alive=False) 14 | spider.start() -------------------------------------------------------------------------------- /tests/spider/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | 6 | # MYSQL 7 | MYSQL_IP = "localhost" 8 | MYSQL_PORT = 3306 9 | MYSQL_DB = "feapder" 10 | MYSQL_USER_NAME = "feapder" 11 | MYSQL_USER_PASS = "feapder123" 12 | 13 | # REDIS 14 | # IP:PORT 15 | REDISDB_IP_PORTS = "localhost:6379" 16 | REDISDB_USER_PASS = "" 17 | REDISDB_DB = 0 18 | 19 | # # 爬虫相关 20 | # # COLLECTOR 21 | COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 22 | COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量 23 | # 24 | # # SPIDER 25 | SPIDER_THREAD_COUNT = 100 # 爬虫并发数 26 | SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间) 27 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 28 | 29 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败 30 | # RETRY_FAILED_REQUESTS = False 31 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒 32 | # REQUEST_LOST_TIMEOUT = 600 # 10分钟 33 | # # 保存失败的request 34 | # SAVE_FAILED_REQUEST = True 35 | # 36 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用 37 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 38 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 39 | # RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True 40 | # 41 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 42 | # 43 | # # 爬虫初始化工作 44 | # # 爬虫是否常驻 45 | # KEEP_ALIVE = True 46 | # 47 | # # 设置代理 48 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 49 | # PROXY_ENABLE = True 50 | # 51 | # # 随机headers 52 | # RANDOM_HEADERS = True 53 | # # requests 使用session 54 | # USE_SESSION = False 55 | # 56 | # # 去重 57 | # ITEM_FILTER_ENABLE = False # item 去重 58 | # REQUEST_FILTER_ENABLE = False # request 去重 59 | # 60 | # # 报警 61 | # DINGDING_WARNING_URL = "" # 钉钉机器人api 62 | # DINGDING_WARNING_PHONE = "" # 报警人 63 | # LINGXI_TOKEN = "" # 灵犀报警token 64 | # 65 | # LOG_NAME = os.path.basename(os.getcwd()) 66 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 67 | # LOG_LEVEL = "DEBUG" 68 | # LOG_IS_WRITE_TO_FILE = False 69 | # OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 70 | REQUEST_FILTER_ENABLE=True # request 去重 71 | # REQUEST_FILTER_SETTING=dict( 72 | # filter_type=3, # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4 73 | # expire_time=2592000, # 过期时间1个月 74 | # ), 75 | REQUEST_FILTER_SETTING=dict( 76 | filter_type=4, # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4 77 | ) -------------------------------------------------------------------------------- /tests/spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["test_spider", "test_spider2"] 2 | -------------------------------------------------------------------------------- /tests/spider/spiders/test_spider.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:06:12 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from items import * 12 | 13 | 14 | class TestSpider(feapder.Spider): 15 | def start_requests(self): 16 | for i in range(1): 17 | yield feapder.Request(f"https://www.baidu.com#{i}", callback=self.parse) 18 | 19 | def validate(self, request, response): 20 | if response.status_code != 200: 21 | raise Exception("response code not 200") # 重试 22 | 23 | # if "哈哈" not in response.text: 24 | # return False # 抛弃当前请求 25 | 26 | def parse(self, request, response): 27 | title = response.xpath("//title/text()").extract_first() # 取标题 28 | item = spider_data_item.SpiderDataItem() # 声明一个item 29 | item.title = title # 给item属性赋值 30 | yield item # 返回item, item会自动批量入库 31 | 32 | 33 | if __name__ == '__main__': 34 | spider = TestSpider(redis_key="feapder3:test_spider", thread_count=100) 35 | spider.start() -------------------------------------------------------------------------------- /tests/spider/spiders/test_spider2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:06:12 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from items import * 12 | 13 | 14 | class TestSpider2(feapder.Spider): 15 | def start_requests(self): 16 | for i in range(100): 17 | yield feapder.Request("https://www.baidu.com#{}".format(i)) 18 | 19 | def parse(self, request, response): 20 | title = response.xpath("//title/text()").extract_first() # 取标题 21 | item = spider_data_item.SpiderDataItem() # 声明一个item 22 | item.title = title # 给item属性赋值 23 | yield item # 返回item, item会自动批量入库 24 | -------------------------------------------------------------------------------- /tests/spider/table.sql: -------------------------------------------------------------------------------- 1 | -- ---------------------------- 2 | -- Table structure for spider_data 3 | -- ---------------------------- 4 | CREATE TABLE `spider_data` ( 5 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 6 | `title` varchar(255) DEFAULT NULL, 7 | PRIMARY KEY (`id`) 8 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 9 | -------------------------------------------------------------------------------- /tests/task-spider/test_task_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022-06-10 14:30:54 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from feapder import ArgumentParser 12 | 13 | 14 | class TestTaskSpider(feapder.TaskSpider): 15 | def add_task(self): 16 | # 加种子任务 框架会调用这个函数,方便往redis里塞任务,但不能写成死循环。实际业务中可以自己写个脚本往redis里塞任务 17 | self._redisdb.zadd(self._task_table, {"id": 1, "url": "https://www.baidu.com"}) 18 | 19 | def start_requests(self, task): 20 | task_id, url = task 21 | yield feapder.Request(url, task_id=task_id) 22 | 23 | def parse(self, request, response): 24 | # 提取网站title 25 | print(response.xpath("//title/text()").extract_first()) 26 | # 提取网站描述 27 | print(response.xpath("//meta[@name='description']/@content").extract_first()) 28 | print("网站地址: ", response.url) 29 | 30 | # mysql 需要更新任务状态为做完 即 state=1 31 | # yield self.update_task_batch(request.task_id) 32 | 33 | 34 | def start(args): 35 | """ 36 | 
用mysql做种子表 37 | """ 38 | spider = TestTaskSpider( 39 | task_table="spider_task", 40 | task_keys=["id", "url"], 41 | redis_key="test:task_spider", 42 | keep_alive=True, 43 | ) 44 | if args == 1: 45 | spider.start_monitor_task() 46 | else: 47 | spider.start() 48 | 49 | 50 | def start2(args): 51 | """ 52 | 用redis做种子表 53 | """ 54 | spider = TestTaskSpider( 55 | task_table="spider_task2", 56 | task_table_type="redis", 57 | redis_key="test:task_spider", 58 | keep_alive=True, 59 | use_mysql=False, 60 | ) 61 | if args == 1: 62 | spider.start_monitor_task() 63 | else: 64 | spider.start() 65 | 66 | 67 | if __name__ == "__main__": 68 | parser = ArgumentParser(description="测试TaskSpider") 69 | 70 | parser.add_argument( 71 | "--start", type=int, nargs=1, help="用mysql做种子表 (1|2)", function=start 72 | ) 73 | parser.add_argument( 74 | "--start2", type=int, nargs=1, help="用redis做种子表 (1|2)", function=start2 75 | ) 76 | 77 | parser.start() 78 | 79 | # 下发任务 python3 test_task_spider.py --start 1 80 | # 采集 python3 test_task_spider.py --start 2 81 | -------------------------------------------------------------------------------- /tests/test-debugger/README.md: -------------------------------------------------------------------------------- 1 | # xxx爬虫文档 2 | ## 调研 3 | 4 | ## 数据库设计 5 | 6 | ## 爬虫逻辑 7 | 8 | ## 项目架构 -------------------------------------------------------------------------------- /tests/test-debugger/items/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/tests/test-debugger/items/__init__.py -------------------------------------------------------------------------------- /tests/test-debugger/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023-06-09 20:26:29 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | 12 | from spiders import * 13 | 14 | 15 | if __name__ == "__main__": 16 | test_debugger.TestDebugger.to_DebugSpider( 17 | request=feapder.Request("https://spidertools.cn", render=True), 18 | redis_key="test:xxx", 19 | ).start() 20 | -------------------------------------------------------------------------------- /tests/test-debugger/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "test_debugger" 3 | ] -------------------------------------------------------------------------------- /tests/test-debugger/spiders/test_debugger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023-06-09 20:26:47 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class TestDebugger(feapder.Spider): 14 | def start_requests(self): 15 | yield feapder.Request("https://spidertools.cn", render=True) 16 | 17 | def parse(self, request, response): 18 | # 提取网站title 19 | print(response.xpath("//title/text()").extract_first()) 20 | # 提取网站描述 21 | print(response.xpath("//meta[@name='description']/@content").extract_first()) 22 | print("网站地址: ", response.url) 23 | 24 | 25 | if __name__ == "__main__": 26 | TestDebugger.to_DebugSpider( 27 | request=feapder.Request("https://spidertools.cn", render=True), redis_key="test:xxx" 28 | ).start() 29 | -------------------------------------------------------------------------------- 
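The `add_task` hook in `tests/task-spider/test_task_spider.py` above notes that, in practice, seed tasks are pushed into redis by a separate script rather than from inside the spider. A minimal sketch of such a script follows; it is illustrative only, assuming the same localhost redis and the `spider_task2` table used by the redis-mode example (`start2`), and it simply mirrors the `zadd` call already made in `add_task`.

```python
# Hypothetical seeding script (not a file in this repo): pushes seed tasks into
# the redis task table consumed by TestTaskSpider when task_table_type="redis".
from feapder.db.redisdb import RedisDB

redis = RedisDB(ip_ports="localhost:6379", db=0)

# Same member format as add_task() in test_task_spider.py: a dict with id and url.
seed_tasks = [
    {"id": 1, "url": "https://www.baidu.com"},
    {"id": 2, "url": "https://news.qq.com/"},
]
for task in seed_tasks:
    redis.zadd("spider_task2", task)  # "spider_task2" matches the start2() example above
```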
/tests/test-pipeline/items/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "spider_data_item" 3 | ] -------------------------------------------------------------------------------- /tests/test-pipeline/items/spider_data_item.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:39:27 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from feapder import Item 11 | 12 | 13 | class SpiderDataItem(Item): 14 | """ 15 | This class was generated by feapder. 16 | command: feapder create -i spider_data. 17 | """ 18 | 19 | def __init__(self, *args, **kwargs): 20 | # self.id = None # type : int(10) unsigned | allow_null : NO | key : PRI | default_value : None | extra : auto_increment | column_comment : 21 | self.title = None # type : varchar(255) | allow_null : YES | key : | default_value : None | extra : | column_comment : 22 | -------------------------------------------------------------------------------- /tests/test-pipeline/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:02:02 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | from spiders import * 11 | from feapder import ArgumentParser 12 | 13 | 14 | def crawl_test(args): 15 | spider = test_spider.TestSpider( 16 | redis_key="feapder:test_batch_spider", # 分布式爬虫调度信息存储位置 17 | task_table="batch_spider_task", # mysql中的任务表 18 | task_keys=["id", "url"], # 需要获取任务表里的字段名,可添加多个 19 | task_state="state", # mysql中任务状态字段 20 | batch_record_table="batch_spider_batch_record", # mysql中的批次记录表 21 | batch_name="批次爬虫测试(周全)", # 批次名字 22 | batch_interval=7, # 批次周期 天为单位 若为小时 可写 1 / 24 23 | ) 24 | 25 | if args == 1: 26 | spider.start_monitor_task() # 下发及监控任务 27 | else: 28 | spider.start() # 采集 29 | 30 | 31 | if __name__ == "__main__": 32 | 33 | parser = ArgumentParser(description="批次爬虫测试") 34 | 35 | parser.add_argument( 36 | "--crawl_test", type=int, nargs=1, help="(1|2)", function=crawl_test 37 | ) 38 | 39 | parser.start() 40 | 41 | # 运行 42 | # 下发任务及监控进度 python3 main.py --crawl_test 1 43 | # 采集 python3 main.py --crawl_test 2 -------------------------------------------------------------------------------- /tests/test-pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/3/18 12:39 上午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder.pipelines import BasePipeline 12 | from typing import Dict, List, Tuple 13 | 14 | 15 | class Pipeline(BasePipeline): 16 | """ 17 | pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等 18 | """ 19 | 20 | def save_items(self, table, items: List[Dict]) -> bool: 21 | """ 22 | 保存数据 23 | Args: 24 | table: 表名 25 | items: 数据,[{},{},...] 26 | 27 | Returns: 是否保存成功 True / False 28 | 若False,不会将本批数据入到去重库,以便再次入库 29 | 30 | """ 31 | 32 | print("自定义pipeline, 保存数据 >>>>", table, items) 33 | 34 | return True 35 | 36 | def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool: 37 | """ 38 | 更新数据, 与UpdateItem配合使用,若爬虫中没使用UpdateItem,则可不实现此接口 39 | Args: 40 | table: 表名 41 | items: 数据,[{},{},...] 
42 | update_keys: 更新的字段, 如 ("title", "publish_time") 43 | 44 | Returns: 是否更新成功 True / False 45 | 若False,不会将本批数据入到去重库,以便再次入库 46 | 47 | """ 48 | 49 | print("自定义pipeline, 更新数据 >>>>", table, items, update_keys) 50 | 51 | return True 52 | -------------------------------------------------------------------------------- /tests/test-pipeline/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | 6 | # MYSQL 7 | MYSQL_IP = "localhost" 8 | MYSQL_PORT = 3306 9 | MYSQL_DB = "feapder" 10 | MYSQL_USER_NAME = "feapder" 11 | MYSQL_USER_PASS = "feapder123" 12 | 13 | # REDIS 14 | # IP:PORT 15 | REDISDB_IP_PORTS = "localhost:6379" 16 | REDISDB_USER_PASS = "" 17 | # 默认 0 到 15 共16个数据库 18 | REDISDB_DB = 0 19 | 20 | # 数据入库的pipeline,可自定义,默认MysqlPipeline 21 | ITEM_PIPELINES = [ 22 | "pipeline.Pipeline" 23 | ] 24 | 25 | # # 爬虫相关 26 | # # COLLECTOR 27 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 28 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量 29 | # 30 | # # SPIDER 31 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数 32 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间) 33 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 34 | 35 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败 36 | # RETRY_FAILED_REQUESTS = False 37 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒 38 | # REQUEST_LOST_TIMEOUT = 600 # 10分钟 39 | # # 保存失败的request 40 | # SAVE_FAILED_REQUEST = True 41 | # 42 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用 43 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 44 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 45 | # RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True 46 | # 47 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 48 | # 49 | # # 爬虫初始化工作 50 | # # 爬虫是否常驻 51 | # KEEP_ALIVE = False 52 | # 53 | # 54 | # # 设置代理 55 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 56 | # PROXY_ENABLE = True 57 | # 58 | # # 随机headers 59 | # RANDOM_HEADERS = True 60 | # # requests 使用session 61 | # USE_SESSION = False 62 | # 63 | # # 去重 64 | # ITEM_FILTER_ENABLE = False # item 去重 65 | # REQUEST_FILTER_ENABLE = False # request 去重 66 | # 67 | # # 报警 68 | # DINGDING_WARNING_URL = "" # 钉钉机器人api 69 | # DINGDING_WARNING_PHONE = "" # 报警人 70 | # LINGXI_TOKEN = "" # 灵犀报警token 71 | # 72 | # LOG_NAME = os.path.basename(os.getcwd()) 73 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 74 | # LOG_LEVEL = "DEBUG" 75 | # LOG_IS_WRITE_TO_FILE = False 76 | # OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 77 | -------------------------------------------------------------------------------- /tests/test-pipeline/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "test_spider" 3 | ] -------------------------------------------------------------------------------- /tests/test-pipeline/spiders/test_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-02-08 16:09:47 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from items import * 12 | 13 | 14 | class TestSpider(feapder.BatchSpider): 15 | # def init_task(self): 16 | # pass 17 | 18 | def start_requests(self, task): 19 | # task 为在任务表中取出的每一条任务 20 | id, url = task # id, url为所取的字段,main函数中指定的 21 | yield feapder.Request(url, task_id=id) 22 | 23 | def parse(self, request, response): 24 | title = response.xpath('//title/text()').extract_first() # 取标题 25 | item = 
spider_data_item.SpiderDataItem() # 声明一个item 26 | item.title = title # 给item属性赋值 27 | yield item # 返回item, item会自动批量入库 28 | yield self.update_task_batch(request.task_id, 1) # 更新任务状态为1 29 | 30 | def exception_request(self, request, response): 31 | """ 32 | @summary: 请求或者parser里解析出异常的request 33 | --------- 34 | @param request: 35 | @param response: 36 | --------- 37 | @result: request / callback / None (返回值必须可迭代) 38 | """ 39 | 40 | pass 41 | 42 | def failed_request(self, request, response): 43 | """ 44 | @summary: 超过最大重试次数的request 45 | --------- 46 | @param request: 47 | --------- 48 | @result: request / item / callback / None (返回值必须可迭代) 49 | """ 50 | 51 | yield request 52 | yield self.update_task_batch(request.task_id, -1) # 更新任务状态为-1 53 | 54 | 55 | -------------------------------------------------------------------------------- /tests/test-pipeline/table.sql: -------------------------------------------------------------------------------- 1 | -- ---------------------------- 2 | -- Table structure for spider_data 3 | -- ---------------------------- 4 | CREATE TABLE `spider_data` ( 5 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 6 | `title` varchar(255) DEFAULT NULL, 7 | PRIMARY KEY (`id`) 8 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 9 | 10 | -- ---------------------------- 11 | -- Table structure for batch_spider_task 12 | -- ---------------------------- 13 | CREATE TABLE `batch_spider_task` ( 14 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT, 15 | `url` varchar(255) DEFAULT NULL, 16 | `state` int(11) DEFAULT '0', 17 | PRIMARY KEY (`id`) 18 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; 19 | 20 | -- ---------------------------- 21 | -- Records of batch_spider_task 22 | -- ---------------------------- 23 | INSERT INTO `batch_spider_task` VALUES (1, 'https://www.baidu.com', 0); 24 | -------------------------------------------------------------------------------- /tests/test_dedup.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from redis import Redis 4 | 5 | from feapder.dedup import Dedup 6 | 7 | 8 | class TestDedup(unittest.TestCase): 9 | def clear(self): 10 | self.absolute_name = "test_dedup" 11 | redis = Redis.from_url("redis://@localhost:6379/0", decode_responses=True) 12 | keys = redis.keys(self.absolute_name + "*") 13 | if keys: 14 | redis.delete(*keys) 15 | 16 | def setUp(self) -> None: 17 | self.clear() 18 | self.mock_data() 19 | 20 | def tearDown(self) -> None: 21 | self.clear() 22 | 23 | def mock_data(self): 24 | self.data = {"xxx": 123, "xxxx": "xxxx"} 25 | self.datas = ["xxx", "bbb", "xxx"] 26 | 27 | def test_MemoryFilter(self): 28 | dedup = Dedup( 29 | Dedup.MemoryFilter, absolute_name=self.absolute_name 30 | ) # 表名为test 历史数据3秒有效期 31 | 32 | # 逐条去重 33 | self.assertEqual(dedup.add(self.data), 1) 34 | self.assertEqual(dedup.get(self.data), 1) 35 | 36 | # 批量去重 37 | self.assertEqual(dedup.get(self.datas), [0, 0, 1]) 38 | self.assertEqual(dedup.add(self.datas), [1, 1, 0]) 39 | self.assertEqual(dedup.get(self.datas), [1, 1, 1]) 40 | 41 | def test_ExpireFilter(self): 42 | dedup = Dedup( 43 | Dedup.ExpireFilter, 44 | expire_time=10, 45 | redis_url="redis://@localhost:6379/0", 46 | absolute_name=self.absolute_name, 47 | ) 48 | 49 | # 逐条去重 50 | self.assertEqual(dedup.add(self.data), 1) 51 | self.assertEqual(dedup.get(self.data), 1) 52 | 53 | # 批量去重 54 | self.assertEqual(dedup.get(self.datas), [0, 0, 1]) 55 | 
self.assertEqual(dedup.add(self.datas), [1, 1, 0]) 56 | self.assertEqual(dedup.get(self.datas), [1, 1, 1]) 57 | 58 | def test_BloomFilter(self): 59 | dedup = Dedup( 60 | Dedup.BloomFilter, 61 | redis_url="redis://@localhost:6379/0", 62 | absolute_name=self.absolute_name, 63 | ) 64 | 65 | # 逐条去重 66 | self.assertEqual(dedup.add(self.data), 1) 67 | self.assertEqual(dedup.get(self.data), 1) 68 | 69 | # 批量去重 70 | self.assertEqual(dedup.get(self.datas), [0, 0, 1]) 71 | self.assertEqual(dedup.add(self.datas), [1, 1, 0]) 72 | self.assertEqual(dedup.get(self.datas), [1, 1, 1]) 73 | 74 | def test_LiteFilter(self): 75 | dedup = Dedup( 76 | Dedup.LiteFilter, 77 | ) 78 | 79 | # 逐条去重 80 | self.assertEqual(dedup.add(self.data), 1) 81 | self.assertEqual(dedup.get(self.data), 1) 82 | 83 | # 批量去重 84 | self.assertEqual(dedup.get(self.datas), [0, 0, 1]) 85 | self.assertEqual(dedup.add(self.datas), [1, 1, 0]) 86 | self.assertEqual(dedup.get(self.datas), [1, 1, 1]) 87 | 88 | def test_filter(self): 89 | dedup = Dedup( 90 | Dedup.BloomFilter, 91 | redis_url="redis://@localhost:6379/0", 92 | to_md5=True, 93 | absolute_name=self.absolute_name, 94 | ) 95 | 96 | # 制造已存在数据 97 | self.datas = ["xxx", "bbb"] 98 | result = dedup.add(self.datas) 99 | self.assertEqual(result, [1, 1]) 100 | 101 | # 过滤掉已存在数据 "xxx", "bbb" 102 | self.datas = ["xxx", "bbb", "ccc"] 103 | dedup.filter_exist_data(self.datas) 104 | self.assertEqual(self.datas, ["ccc"]) 105 | -------------------------------------------------------------------------------- /tests/test_download_midware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023/9/21 13:59 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import feapder 12 | 13 | 14 | def download_midware(request): 15 | print("outter download_midware") 16 | return request 17 | 18 | 19 | class TestAirSpider(feapder.AirSpider): 20 | def start_requests(self): 21 | yield feapder.Request( 22 | "https://www.baidu.com", download_midware=download_midware 23 | ) 24 | 25 | def parse(self, request, response): 26 | print(request, response) 27 | 28 | 29 | class TestSpiderSpider(feapder.Spider): 30 | def start_requests(self): 31 | yield feapder.Request( 32 | "https://www.baidu.com", download_midware=[download_midware, self.download_midware] 33 | ) 34 | 35 | def download_midware(self, request): 36 | print("class download_midware") 37 | return request 38 | 39 | def parse(self, request, response): 40 | print(request, response) 41 | 42 | 43 | if __name__ == "__main__": 44 | # TestAirSpider().start() 45 | TestSpiderSpider(redis_key="test").start() 46 | -------------------------------------------------------------------------------- /tests/test_lock.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/7/15 5:00 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder.utils.redis_lock import RedisLock 12 | from feapder.db.redisdb import RedisDB 13 | import time 14 | 15 | def test_lock(): 16 | with RedisLock(key="test", redis_cli=RedisDB().get_redis_obj(), wait_timeout=10) as _lock: 17 | if _lock.locked: 18 | print(1) 19 | time.sleep(100) 20 | 21 | if __name__ == '__main__': 22 | test_lock() -------------------------------------------------------------------------------- /tests/test_log.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/6/18 10:36 上午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder.utils.log import log 12 | 13 | log.debug("debug") 14 | log.info("info") 15 | log.success("success") 16 | log.warning("warning") 17 | log.error("error") 18 | log.critical("critical") 19 | log.exception("exception") -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from feapder.utils import metrics 4 | 5 | # 初始化打点系统 6 | metrics.init( 7 | influxdb_host="localhost", 8 | influxdb_port="8086", 9 | influxdb_udp_port="8089", 10 | influxdb_database="feapder", 11 | influxdb_user="***", 12 | influxdb_password="***", 13 | influxdb_measurement="test_metrics", 14 | debug=True, 15 | ) 16 | 17 | 18 | async def test_counter_async(): 19 | for i in range(100): 20 | await metrics.aemit_counter("total count", count=100, classify="test5") 21 | for j in range(100): 22 | await metrics.aemit_counter("key", count=1, classify="test5") 23 | 24 | 25 | def test_counter(): 26 | for i in range(100): 27 | metrics.emit_counter("total count", count=100, classify="test5") 28 | for j in range(100): 29 | metrics.emit_counter("key", count=1, classify="test5") 30 | 31 | 32 | def test_store(): 33 | metrics.emit_store("total", 100, classify="cookie_count") 34 | 35 | 36 | def test_time(): 37 | metrics.emit_timer("total", 100, classify="time") 38 | 39 | 40 | def test_any(): 41 | metrics.emit_any( 42 | tags={"_key": "total", "_type": "any"}, fields={"_value": 100}, classify="time" 43 | ) 44 | 45 | 46 | if __name__ == "__main__": 47 | asyncio.run(test_counter_async()) 48 | test_counter_async() 49 | test_store() 50 | test_time() 51 | test_any() 52 | metrics.close() 53 | -------------------------------------------------------------------------------- /tests/test_mysqldb.py: -------------------------------------------------------------------------------- 1 | from feapder.db.mysqldb import MysqlDB 2 | 3 | 4 | db = MysqlDB( 5 | ip="localhost", port=3306, db="feapder", user_name="feapder", user_pass="feapder123" 6 | ) 7 | 8 | MysqlDB.from_url("mysql://feapder:feapder123@localhost:3306/feapder?charset=utf8mb4") -------------------------------------------------------------------------------- /tests/test_playwright.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/9/15 8:47 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import time 12 | 13 | from playwright.sync_api import Page 14 | 15 | import feapder 16 | from feapder.utils.webdriver import PlaywrightDriver 17 | 18 | 19 | class TestPlaywright(feapder.AirSpider): 20 | __custom_setting__ = dict( 21 | RENDER_DOWNLOADER="feapder.network.downloader.PlaywrightDownloader", 22 | ) 23 | 24 | def start_requests(self): 25 | yield feapder.Request("https://www.baidu.com", render=True) 26 | 27 | def parse(self, reqeust, response): 28 | driver: PlaywrightDriver = response.driver 29 | page: Page = driver.page 30 | 31 | page.type("#kw", "feapder") 32 | page.click("#su") 33 | page.wait_for_load_state("networkidle") 34 | time.sleep(1) 35 | 36 | html = page.content() 37 | response.text = html # 使response加载最新的页面 38 | for 
data_container in response.xpath("//div[@class='c-container']"): 39 | print(data_container.xpath("string(.//h3)").extract_first()) 40 | 41 | 42 | if __name__ == "__main__": 43 | TestPlaywright(thread_count=1).run() 44 | -------------------------------------------------------------------------------- /tests/test_playwright2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022/9/15 8:47 PM 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from playwright.sync_api import Response 12 | from feapder.utils.webdriver import ( 13 | PlaywrightDriver, 14 | InterceptResponse, 15 | InterceptRequest, 16 | ) 17 | 18 | import feapder 19 | 20 | 21 | def on_response(response: Response): 22 | print(response.url) 23 | 24 | 25 | class TestPlaywright(feapder.AirSpider): 26 | __custom_setting__ = dict( 27 | RENDER_DOWNLOADER="feapder.network.downloader.PlaywrightDownloader", 28 | PLAYWRIGHT=dict( 29 | user_agent=None, # 字符串 或 无参函数,返回值为user_agent 30 | proxy=None, # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址 31 | headless=False, # 是否为无头浏览器 32 | driver_type="chromium", # chromium、firefox、webkit 33 | timeout=30, # 请求超时时间 34 | window_size=(1024, 800), # 窗口大小 35 | executable_path=None, # 浏览器路径,默认为默认路径 36 | download_path=None, # 下载文件的路径 37 | render_time=0, # 渲染时长,即打开网页等待指定时间后再获取源码 38 | wait_until="networkidle", # 等待页面加载完成的事件,可选值:"commit", "domcontentloaded", "load", "networkidle" 39 | use_stealth_js=False, # 使用stealth.min.js隐藏浏览器特征 40 | # page_on_event_callback=dict(response=on_response), # 监听response事件 41 | # page.on() 事件的回调 如 page_on_event_callback={"dialog": lambda dialog: dialog.accept()} 42 | storage_state_path=None, # 保存浏览器状态的路径 43 | url_regexes=["wallpaper/list"], # 拦截接口,支持正则,数组类型 44 | save_all=True, # 是否保存所有拦截的接口 45 | ), 46 | ) 47 | 48 | def start_requests(self): 49 | yield feapder.Request( 50 | "http://www.soutushenqi.com/image/search/?searchWord=%E6%A0%91%E5%8F%B6", 51 | render=True, 52 | ) 53 | 54 | def parse(self, reqeust, response): 55 | driver: PlaywrightDriver = response.driver 56 | 57 | intercept_response: InterceptResponse = driver.get_response("wallpaper/list") 58 | intercept_request: InterceptRequest = intercept_response.request 59 | 60 | req_url = intercept_request.url 61 | req_header = intercept_request.headers 62 | req_data = intercept_request.data 63 | print("请求url", req_url) 64 | print("请求header", req_header) 65 | print("请求data", req_data) 66 | 67 | data = driver.get_json("wallpaper/list") 68 | print("接口返回的数据", data) 69 | 70 | print("------ 测试save_all=True ------- ") 71 | 72 | # 测试save_all=True 73 | all_intercept_response: list = driver.get_all_response("wallpaper/list") 74 | for intercept_response in all_intercept_response: 75 | intercept_request: InterceptRequest = intercept_response.request 76 | req_url = intercept_request.url 77 | req_header = intercept_request.headers 78 | req_data = intercept_request.data 79 | print("请求url", req_url) 80 | print("请求header", req_header) 81 | print("请求data", req_data) 82 | 83 | all_intercept_json = driver.get_all_json("wallpaper/list") 84 | for intercept_json in all_intercept_json: 85 | print("接口返回的数据", intercept_json) 86 | 87 | # 千万别忘了 88 | driver.clear_cache() 89 | 90 | 91 | if __name__ == "__main__": 92 | TestPlaywright(thread_count=1).run() 93 | -------------------------------------------------------------------------------- /tests/test_rander.py: 
-------------------------------------------------------------------------------- 1 | import feapder 2 | 3 | 4 | class XueQiuSpider(feapder.AirSpider): 5 | def start_requests(self): 6 | for i in range(10): 7 | yield feapder.Request("https://baidu.com/#{}".format(i), render=True) 8 | 9 | def parse(self, request, response): 10 | print(response.cookies.get_dict()) 11 | print(response.headers) 12 | print(response.browser) 13 | print("response.url ", response.url) 14 | 15 | # article_list = response.xpath('//div[@class="detail"]') 16 | # for article in article_list: 17 | # title = article.xpath("string(.//a)").extract_first() 18 | # print(title) 19 | 20 | 21 | if __name__ == "__main__": 22 | XueQiuSpider(thread_count=1).start() 23 | -------------------------------------------------------------------------------- /tests/test_rander2.py: -------------------------------------------------------------------------------- 1 | import feapder 2 | 3 | 4 | class XueQiuSpider(feapder.Spider): 5 | __custom_setting__ = dict( 6 | REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0 7 | ) 8 | 9 | def start_requests(self): 10 | for i in range(10): 11 | yield feapder.Request("https://news.qq.com/#{}".format(i), render=True) 12 | 13 | def parse(self, request, response): 14 | print(response.cookies.get_dict()) 15 | print("response.url ", response.url) 16 | 17 | article_list = response.xpath('//div[@class="detail"]') 18 | for article in article_list: 19 | title = article.xpath("string(.//a)").extract_first() 20 | print(title) 21 | 22 | 23 | if __name__ == "__main__": 24 | XueQiuSpider( 25 | thread_count=10, redis_key="feapter:test_rander", delete_keys=True 26 | ).start() 27 | -------------------------------------------------------------------------------- /tests/test_rander3.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import feapder 4 | from feapder.utils.webdriver import WebDriver 5 | 6 | 7 | class TestRender(feapder.AirSpider): 8 | def start_requests(self): 9 | yield feapder.Request("http://www.baidu.com", render=True) 10 | 11 | def parse(self, request, response): 12 | browser: WebDriver = response.browser 13 | browser.find_element_by_id("kw").send_keys("feapder") 14 | browser.find_element_by_id("su").click() 15 | time.sleep(5) 16 | print(browser.page_source) 17 | 18 | 19 | if __name__ == "__main__": 20 | TestRender().start() 21 | -------------------------------------------------------------------------------- /tests/test_rander_xhr.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import feapder 4 | from feapder.utils.webdriver import WebDriver 5 | 6 | 7 | class TestRender(feapder.AirSpider): 8 | __custom_setting__ = dict( 9 | WEBDRIVER=dict( 10 | pool_size=1, # 浏览器的数量 11 | load_images=True, # 是否加载图片 12 | user_agent=None, # 字符串 或 无参函数,返回值为user_agent 13 | proxy=None, # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址 14 | headless=False, # 是否为无头浏览器 15 | driver_type="CHROME", # CHROME、EDGE、PHANTOMJS、FIREFOX 16 | timeout=30, # 请求超时时间 17 | window_size=(1024, 800), # 窗口大小 18 | executable_path=None, # 浏览器路径,默认为默认路径 19 | render_time=0, # 渲染时长,即打开网页等待指定时间后再获取源码 20 | custom_argument=["--ignore-certificate-errors"], # 自定义浏览器渲染参数 21 | xhr_url_regexes=[ 22 | "/ad", 23 | ], # 拦截 http://www.spidertools.cn/spidertools/ad 接口 24 | ) 25 | ) 26 | 27 | def start_requests(self): 28 | yield feapder.Request("http://www.spidertools.cn", render=True) 29 | 30 | def parse(self, request, response): 31 | browser: 
WebDriver = response.browser 32 | time.sleep(3) 33 | 34 | # 获取接口数据 文本类型 35 | ad = browser.xhr_text("/ad") 36 | print(ad) 37 | 38 | # 获取接口数据 转成json,本例因为返回的接口是文本,所以不转了 39 | # browser.xhr_json("/ad") 40 | 41 | xhr_response = browser.xhr_response("/ad") 42 | print("请求接口", xhr_response.request.url) 43 | # 请求头目前获取的不完整 44 | print("请求头", xhr_response.request.headers) 45 | print("请求体", xhr_response.request.data) 46 | print("返回头", xhr_response.headers) 47 | print("返回地址", xhr_response.url) 48 | print("返回内容", xhr_response.content) 49 | 50 | 51 | if __name__ == "__main__": 52 | TestRender().start() 53 | -------------------------------------------------------------------------------- /tests/test_redisdb.py: -------------------------------------------------------------------------------- 1 | from feapder.db.redisdb import RedisDB 2 | import time 3 | db = RedisDB.from_url("redis://localhost:6379") 4 | 5 | # db.clear("test") 6 | db.zincrby("test", 1.0, "a") 7 | -------------------------------------------------------------------------------- /tests/test_request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/3/4 11:26 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder import Request, Response 12 | 13 | 14 | def test_selector(): 15 | request = Request("https://www.baidu.com?a=1&b=2", data={}, params=None) 16 | response = request.get_response() 17 | print(response) 18 | 19 | print(response.xpath("//a/@href")) 20 | print(response.css("a::attr(href)")) 21 | print(response.css("a::attr(href)").extract_first()) 22 | 23 | content = response.re(" 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | """ 38 | 39 | resp = Response.from_text(text=text, url="http://feapder.com/#/README") 40 | print(resp.text) 41 | print(resp) 42 | print(resp.xpath("//script")) 43 | 44 | def test_to_dict(): 45 | request = Request("https://www.baidu.com?a=1&b=2", data={"a":1}, params="k=1", callback="test", task_id=1, cookies={"a":1}) 46 | print(request.to_dict) -------------------------------------------------------------------------------- /tests/test_spider_params.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-03-07 21:27:00 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | 12 | 13 | class TestSpiderParams(feapder.Spider): 14 | # 自定义数据库,若项目中有setting.py文件,此自定义可删除 15 | __custom_setting__ = dict( 16 | REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0 17 | ) 18 | 19 | def start_requests(self): 20 | yield feapder.Request(f"https://www.baidu.com") 21 | 22 | def parse(self, request, response): 23 | print(request.url) 24 | 25 | 26 | if __name__ == "__main__": 27 | spider = TestSpiderParams(redis_key="feapder:test_spider_params") 28 | spider.start() 29 | -------------------------------------------------------------------------------- /tests/test_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/4/8 1:06 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | from feapder.utils.perfect_dict import PerfectDict 12 | 13 | 14 | task_key = ["id", "url"] 15 | task = [1, "http://www.badu.com"] 16 | task = PerfectDict(_dict=dict(zip(task_key, task)), _values=task) 17 | 18 | task = 
PerfectDict(id=1, url="http://www.badu.com") 19 | task = PerfectDict({"id":"1", "url":"http://www.badu.com"}) 20 | 21 | print(task) 22 | id, url = task 23 | print(id, url) 24 | print(task[0], task[1]) 25 | print(task.id, task.url) 26 | print(task["id"], task["url"]) 27 | print(task.get("id"), task.get("url")) 28 | -------------------------------------------------------------------------------- /tests/test_template/test_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2022-08-04 17:58:45 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | """ 9 | 10 | import feapder 11 | from feapder import ArgumentParser 12 | 13 | 14 | class TestSpider(feapder.TaskSpider): 15 | # 自定义数据库,若项目中有setting.py文件,此自定义可删除 16 | __custom_setting__ = dict( 17 | REDISDB_IP_PORTS="localhost:6379", 18 | REDISDB_USER_PASS="", 19 | REDISDB_DB=0, 20 | MYSQL_IP="localhost", 21 | MYSQL_PORT=3306, 22 | MYSQL_DB="", 23 | MYSQL_USER_NAME="", 24 | MYSQL_USER_PASS="", 25 | ) 26 | 27 | def start_requests(self, task): 28 | task_id = task.id 29 | url = task.url 30 | yield feapder.Request(url, task_id=task_id) 31 | 32 | def parse(self, request, response): 33 | # 提取网站title 34 | print(response.xpath("//title/text()").extract_first()) 35 | # 提取网站描述 36 | print(response.xpath("//meta[@name='description']/@content").extract_first()) 37 | print("网站地址: ", response.url) 38 | 39 | # mysql 需要更新任务状态为做完 即 state=1 40 | yield self.update_task_batch(request.task_id) 41 | 42 | 43 | if __name__ == "__main__": 44 | # 用mysql做任务表,需要先建好任务任务表 45 | spider = TestSpider( 46 | redis_key="xxx:xxx", # 分布式爬虫调度信息存储位置 47 | task_table="", # mysql中的任务表 48 | task_keys=["id", "url"], # 需要获取任务表里的字段名,可添加多个 49 | task_state="state", # mysql中任务状态字段 50 | ) 51 | 52 | # 用redis做任务表 53 | # spider = TestSpider( 54 | # redis_key="xxx:xxxx", # 分布式爬虫调度信息存储位置 55 | # task_table="", # 任务表名 56 | # task_table_type="redis", # 任务表类型为redis 57 | # ) 58 | 59 | parser = ArgumentParser(description="TestSpider爬虫") 60 | 61 | parser.add_argument( 62 | "--start_master", 63 | action="store_true", 64 | help="添加任务", 65 | function=spider.start_monitor_task, 66 | ) 67 | parser.add_argument( 68 | "--start_worker", action="store_true", help="启动爬虫", function=spider.start 69 | ) 70 | 71 | parser.start() 72 | 73 | # 直接启动 74 | # spider.start() # 启动爬虫 75 | # spider.start_monitor_task() # 添加任务 76 | 77 | # 通过命令行启动 78 | # python test_spider.py --start_master # 添加任务 79 | # python test_spider.py --start_worker # 启动爬虫 -------------------------------------------------------------------------------- /tests/test_tools.py: -------------------------------------------------------------------------------- 1 | from feapder.utils import tools 2 | from datetime import datetime 3 | 4 | 5 | date = tools.format_time("昨天3:10") 6 | print(date) 7 | 8 | print(tools.format_date("2017年4月17日 3时27分12秒")) 9 | 10 | date = tools.format_time("昨天") 11 | print(date) 12 | 13 | date = tools.format_time("2021-11-05 14:18:10") 14 | print(date) 15 | 16 | date = tools.format_time("1 年前") 17 | print(date) 18 | 19 | 20 | class C: 21 | pass 22 | 23 | 24 | data = {"date": datetime.now(), "c": C()} 25 | print(tools.dumps_json(data)) 26 | -------------------------------------------------------------------------------- /tests/test_webdriver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/3/18 7:05 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 
8 | @email: boris_liu@foxmail.com 9 | """ 10 | from feapder.utils.webdriver import WebDriverPool, WebDriver 11 | import threading 12 | 13 | 14 | def test_webdirver_pool(): 15 | 16 | webdriver_pool = WebDriverPool( 17 | pool_size=2, load_images=False, driver_type=WebDriver.FIREFOX, timeout=30 18 | ) 19 | 20 | def request(): 21 | try: 22 | browser = webdriver_pool.get() 23 | browser.get("https://baidu.com") 24 | print(browser.title) 25 | webdriver_pool.put(browser) 26 | except: 27 | print("失败") 28 | 29 | for i in range(5): 30 | threading.Thread(target=request).start() 31 | 32 | 33 | def test_webdriver(): 34 | with WebDriver( 35 | load_images=True, driver_type=WebDriver.CHROME, timeout=30 36 | ) as browser: 37 | browser.get("https://httpbin.org/get") 38 | html = browser.page_source 39 | print(html) 40 | print(browser.user_agent) 41 | 42 | import time 43 | time.sleep(1000) 44 | 45 | test_webdriver() -------------------------------------------------------------------------------- /tests/user_pool/test_gold_user_pool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/9/13 2:33 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import unittest 12 | 13 | from feapder.network.user_pool import GoldUser 14 | from feapder.network.user_pool import GoldUserPool 15 | 16 | 17 | class TestUserPool(unittest.TestCase): 18 | def setUp(self) -> None: 19 | users = [ 20 | GoldUser( 21 | username="zhangsan", 22 | password="1234", 23 | max_use_times=10, 24 | use_interval=5, 25 | ), 26 | GoldUser( 27 | username="lisi", 28 | password="1234", 29 | max_use_times=10, 30 | use_interval=5, 31 | login_interval=50, 32 | ), 33 | ] 34 | 35 | class CustomGoldUserPool(GoldUserPool): 36 | def login(self, user: GoldUser) -> GoldUser: 37 | # 此处为假数据,正常需通过登录网站获取cookie 38 | username = user.username 39 | password = user.password 40 | 41 | # 登录获取cookie 42 | cookie = "zzzz" 43 | user.cookies = cookie 44 | 45 | return user 46 | 47 | self.user_pool = CustomGoldUserPool( 48 | "test:user_pool", 49 | users=users, 50 | keep_alive=True, 51 | ) 52 | 53 | def test_run(self): 54 | self.user_pool.run() 55 | 56 | def test_get_user(self): 57 | user = self.user_pool.get_user() 58 | print(user) 59 | 60 | user = self.user_pool.get_user(username="zhangsan") 61 | print(user) 62 | 63 | def test_del_user(self): 64 | self.user_pool.del_user("lisi") 65 | 66 | def test_delay_user(self): 67 | user = self.user_pool.get_user(username="lisi") 68 | print(user) 69 | self.user_pool.delay_use("lisi", 60) 70 | user = self.user_pool.get_user(username="lisi") 71 | print(user) 72 | 73 | def test_exclusive(self): 74 | """ 75 | 测试独占 76 | """ 77 | # 用户lisi被test_spider爬虫独占 78 | user = self.user_pool.get_user( 79 | username="lisi", used_for_spider_name="test_spider" 80 | ) 81 | print(user) 82 | 83 | # test_spider爬虫可以正常使用 84 | user = self.user_pool.get_user( 85 | username="lisi", used_for_spider_name="test_spider" 86 | ) 87 | print(user) 88 | 89 | # 其他的爬虫需要在独占的间隔后使用 90 | user = self.user_pool.get_user(username="lisi") 91 | print(user) 92 | 93 | 94 | if __name__ == "__main__": 95 | unittest.main() 96 | -------------------------------------------------------------------------------- /tests/user_pool/test_guest_user_pool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/9/13 2:33 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: 
Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import time 12 | import unittest 13 | from typing import Optional 14 | 15 | from feapder.network.user_pool import GuestUser 16 | from feapder.network.user_pool import GuestUserPool 17 | 18 | 19 | class TestUserPool(unittest.TestCase): 20 | def setUp(self) -> None: 21 | # 默认的用户池,使用webdriver访问page_url生产cookie 22 | self.user_pool = GuestUserPool( 23 | "test:user_pool", page_url="https://www.baidu.com" 24 | ) 25 | 26 | # 自定义生产cookie的方法 27 | class CustomGuestUserPool(GuestUserPool): 28 | def login(self) -> Optional[GuestUser]: 29 | # 此处为假数据,正常需通过网站获取cookie 30 | user = GuestUser( 31 | user_agent="xxx", 32 | proxies="yyy", 33 | cookies={"some_key": "some_value{}".format(time.time())}, 34 | ) 35 | return user 36 | 37 | self.custom_user_pool = CustomGuestUserPool( 38 | "test:custom_user_pool", min_users=10, keep_alive=True 39 | ) 40 | 41 | def test_get_user(self): 42 | """ 43 | 测试直接获取游客用户 44 | Returns: 45 | 46 | """ 47 | user = self.custom_user_pool.get_user(block=True) 48 | print("取到user:", user) 49 | print("cookie:", user.cookies) 50 | print("user_agent:", user.user_agent) 51 | print("proxies:", user.proxies) 52 | 53 | def test_del_user(self): 54 | user = GuestUser( 55 | **{ 56 | "user_id": "9f1654ba654e12adfea548eae89a8f6f", 57 | "user_agent": "xxx", 58 | "proxies": "yyy", 59 | "cookies": {"some_key": "some_value1640006728.908013"}, 60 | } 61 | ) 62 | print(user.user_id) 63 | self.custom_user_pool.del_user(user.user_id) 64 | 65 | def test_keep_alive(self): 66 | """ 67 | 测试生产游客用户,面对需要大量cookie,需要单独起个进程维护cookie的场景 68 | Returns: 69 | 70 | """ 71 | 72 | self.custom_user_pool.run() 73 | 74 | 75 | if __name__ == "__main__": 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /tests/user_pool/test_normal_user_pool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021/9/13 2:33 下午 4 | --------- 5 | @summary: 6 | --------- 7 | @author: Boris 8 | @email: boris_liu@foxmail.com 9 | """ 10 | 11 | import unittest 12 | 13 | from feapder.network.user_pool import NormalUser 14 | from feapder.network.user_pool import NormalUserPool 15 | 16 | 17 | class TestUserPool(unittest.TestCase): 18 | def setUp(self) -> None: 19 | class CustomNormalUserPool(NormalUserPool): 20 | def login(self, user: NormalUser) -> NormalUser: 21 | # 此处为假数据,正常需通过登录网站获取cookie 22 | username = user.username 23 | password = user.password 24 | 25 | # 登录获取cookie 26 | cookie = "xxx" 27 | user.cookies = cookie 28 | 29 | return user 30 | 31 | self.user_pool = CustomNormalUserPool( 32 | "test:user_pool", 33 | table_userbase="test_userbase", 34 | login_retry_times=0, 35 | keep_alive=True, 36 | ) 37 | 38 | def test_get_user(self): 39 | user = self.user_pool.get_user() 40 | print("取到user:", user) 41 | print("cookie:", user.cookies) 42 | print("user_agent:", user.user_agent) 43 | print("proxies:", user.proxies) 44 | 45 | def test_del_user(self): 46 | self.user_pool.del_user(1) 47 | 48 | def test_tag_user_locked(self): 49 | self.user_pool.tag_user_locked(2) 50 | 51 | def test_keep_alive(self): 52 | self.user_pool.run() 53 | 54 | 55 | if __name__ == "__main__": 56 | unittest.main() 57 | --------------------------------------------------------------------------------
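The user pool tests above exercise the pools directly; the sketch below shows how one of them might be wired into a spider. It is illustrative only and not a file in this repo: the pool key and `page_url` are copied from `tests/user_pool/test_guest_user_pool.py`, while attaching the pooled cookies and user agent inside `download_midware` is an assumed usage pattern rather than something these tests demonstrate.

```python
# Illustrative sketch (not part of the repo): consuming a GuestUserPool from a spider.
import feapder
from feapder.network.user_pool import GuestUserPool

# Same pool key and page_url as tests/user_pool/test_guest_user_pool.py
user_pool = GuestUserPool("test:user_pool", page_url="https://www.baidu.com")


class UserPoolDemoSpider(feapder.AirSpider):
    def start_requests(self):
        yield feapder.Request("https://www.baidu.com")

    def download_midware(self, request):
        # block=True waits until the pool has produced a user (see test_get_user above)
        user = user_pool.get_user(block=True)
        # Assumed attach points; the fields mirror what the tests print
        request.cookies = user.cookies
        request.headers = {"User-Agent": user.user_agent}
        return request

    def parse(self, request, response):
        print(response.xpath("//title/text()").extract_first())


if __name__ == "__main__":
    UserPoolDemoSpider(thread_count=1).start()
```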