├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── config.yml
    │   └── feature_request.md
    └── workflows
    │   └── workflow.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
    ├── .nojekyll
    ├── README.md
    ├── _coverpage.md
    ├── _navbar.md
    ├── _sidebar.md
    ├── command
    │   └── cmdline.md
    ├── favicon.ico
    ├── feapder_platform
    │   ├── feaplat.md
    │   ├── feaplat_bak.md
    │   ├── question.md
    │   └── usage.md
    ├── foreword
    │   ├── 10分钟上手.md
    │   ├── 功能概览.md
    │   └── 架构设计.md
    ├── images
    │   └── qingguo.jpg
    ├── index.html
    ├── lib
    │   ├── docsify-copy-code
    │   │   └── docsify-copy-code.min.js
    │   ├── docsify
    │   │   └── lib
    │   │   │   ├── docsify.min.js
    │   │   │   ├── plugins
    │   │   │       ├── docsify-edit-on-github.js
    │   │   │       ├── ga.js
    │   │   │       └── search.js
    │   │   │   └── themes
    │   │   │       └── vue.css
    │   └── prismjs
    │   │   └── components
    │   │       ├── prism-bash.js
    │   │       ├── prism-java.js
    │   │       ├── prism-python.js
    │   │       ├── prism-sql.js
    │   │       └── prism-yaml.js
    ├── question
    │   ├── setting不生效问题.md
    │   ├── 安装问题.md
    │   ├── 请求问题.md
    │   └── 运行问题.md
    ├── robots.txt
    ├── source_code
    │   ├── BaseParser.md
    │   ├── BatchParser.md
    │   ├── BatchSpider进阶.md
    │   ├── Item.md
    │   ├── MongoDB.md
    │   ├── MysqlDB.md
    │   ├── RedisDB.md
    │   ├── Request.md
    │   ├── Response.md
    │   ├── Spider进阶.md
    │   ├── UpdateItem.md
    │   ├── UserPool.md
    │   ├── custom_downloader.md
    │   ├── dedup.md
    │   ├── logger.md
    │   ├── pipeline.md
    │   ├── proxy.md
    │   ├── tools.md
    │   ├── 报警及监控.md
    │   ├── 浏览器渲染-Playwright.md
    │   ├── 浏览器渲染-Selenium.md
    │   ├── 监控打点.md
    │   └── 配置文件.md
    └── usage
    │   ├── AirSpider.md
    │   ├── BatchSpider.md
    │   ├── Spider.md
    │   ├── TaskSpider.md
    │   ├── 使用前必读.md
    │   └── 爬虫集成.md
├── feapder
    ├── VERSION
    ├── __init__.py
    ├── buffer
    │   ├── __init__.py
    │   ├── item_buffer.py
    │   └── request_buffer.py
    ├── commands
    │   ├── __init__.py
    │   ├── cmdline.py
    │   ├── create
    │   │   ├── __init__.py
    │   │   ├── create_cookies.py
    │   │   ├── create_init.py
    │   │   ├── create_item.py
    │   │   ├── create_json.py
    │   │   ├── create_params.py
    │   │   ├── create_project.py
    │   │   ├── create_setting.py
    │   │   ├── create_spider.py
    │   │   └── create_table.py
    │   ├── create_builder.py
    │   ├── retry.py
    │   ├── shell.py
    │   └── zip.py
    ├── core
    │   ├── __init__.py
    │   ├── base_parser.py
    │   ├── collector.py
    │   ├── handle_failed_items.py
    │   ├── handle_failed_requests.py
    │   ├── parser_control.py
    │   ├── scheduler.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── air_spider.py
    │   │   ├── batch_spider.py
    │   │   ├── spider.py
    │   │   └── task_spider.py
    ├── db
    │   ├── __init__.py
    │   ├── memorydb.py
    │   ├── mongodb.py
    │   ├── mysqldb.py
    │   └── redisdb.py
    ├── dedup
    │   ├── README.md
    │   ├── __init__.py
    │   ├── basefilter.py
    │   ├── bitarray.py
    │   ├── bloomfilter.py
    │   ├── expirefilter.py
    │   └── litefilter.py
    ├── network
    │   ├── __init__.py
    │   ├── downloader
    │   │   ├── __init__.py
    │   │   ├── _playwright.py
    │   │   ├── _requests.py
    │   │   ├── _selenium.py
    │   │   └── base.py
    │   ├── item.py
    │   ├── proxy_pool
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   └── proxy_pool.py
    │   ├── proxy_pool_old.py
    │   ├── request.py
    │   ├── response.py
    │   ├── selector.py
    │   ├── user_agent.py
    │   └── user_pool
    │   │   ├── __init__.py
    │   │   ├── base_user_pool.py
    │   │   ├── gold_user_pool.py
    │   │   ├── guest_user_pool.py
    │   │   └── normal_user_pool.py
    ├── pipelines
    │   ├── __init__.py
    │   ├── console_pipeline.py
    │   ├── mongo_pipeline.py
    │   └── mysql_pipeline.py
    ├── requirements.txt
    ├── setting.py
    ├── templates
    │   ├── air_spider_template.tmpl
    │   ├── batch_spider_template.tmpl
    │   ├── item_template.tmpl
    │   ├── project_template
    │   │   ├── CHECK_DATA.md
    │   │   ├── README.md
    │   │   ├── items
    │   │   │   └── __init__.py
    │   │   ├── main.py
    │   │   ├── setting.py
    │   │   └── spiders
    │   │   │   └── __init__.py
    │   ├── spider_template.tmpl
    │   ├── task_spider_template.tmpl
    │   └── update_item_template.tmpl
    └── utils
    │   ├── __init__.py
    │   ├── custom_argparse.py
    │   ├── email_sender.py
    │   ├── js
    │       ├── intercept.js
    │       └── stealth.min.js
    │   ├── log.py
    │   ├── metrics.py
    │   ├── perfect_dict.py
    │   ├── redis_lock.py
    │   ├── tail_thread.py
    │   ├── tools.py
    │   └── webdriver
    │       ├── __init__.py
    │       ├── playwright_driver.py
    │       ├── selenium_driver.py
    │       ├── webdirver.py
    │       └── webdriver_pool.py
├── setup.py
└── tests
    ├── air-spider
        ├── test_air_spider.py
        ├── test_air_spider_filter.py
        ├── test_air_spider_item.py
        └── test_render_spider.py
    ├── batch-spider-integration
        ├── batch_spider_integration_task.sql
        ├── items
        │   └── __init__.py
        ├── main.py
        ├── setting.py
        └── spiders
        │   ├── __init__.py
        │   ├── sina_news_parser.py
        │   └── tencent_news_parser.py
    ├── batch-spider
        ├── items
        │   ├── __init__.py
        │   └── spider_data_item.py
        ├── main.py
        ├── setting.py
        ├── spiders
        │   ├── __init__.py
        │   └── test_spider.py
        └── table.sql
    ├── db
        └── test_redis.py
    ├── jd_spider.py
    ├── mongo_spider.py
    ├── spider-integration
        ├── items
        │   └── __init__.py
        ├── main.py
        ├── setting.py
        └── spiders
        │   ├── __init__.py
        │   ├── sina_news_parser.py
        │   └── tencent_news_parser.py
    ├── spider
        ├── items
        │   ├── __init__.py
        │   └── spider_data_item.py
        ├── main.py
        ├── setting.py
        ├── spiders
        │   ├── __init__.py
        │   ├── test_spider.py
        │   └── test_spider2.py
        └── table.sql
    ├── task-spider
        └── test_task_spider.py
    ├── test-debugger
        ├── README.md
        ├── items
        │   └── __init__.py
        ├── main.py
        ├── setting.py
        └── spiders
        │   ├── __init__.py
        │   └── test_debugger.py
    ├── test-pipeline
        ├── items
        │   ├── __init__.py
        │   └── spider_data_item.py
        ├── main.py
        ├── pipeline.py
        ├── setting.py
        ├── spiders
        │   ├── __init__.py
        │   └── test_spider.py
        └── table.sql
    ├── test_dedup.py
    ├── test_download_midware.py
    ├── test_lock.py
    ├── test_log.py
    ├── test_metrics.py
    ├── test_mongodb.py
    ├── test_mysqldb.py
    ├── test_playwright.py
    ├── test_playwright2.py
    ├── test_rander.py
    ├── test_rander2.py
    ├── test_rander3.py
    ├── test_rander_xhr.py
    ├── test_redisdb.py
    ├── test_request.py
    ├── test_spider_params.py
    ├── test_task.py
    ├── test_template
        └── test_spider.py
    ├── test_tools.py
    ├── test_webdriver.py
    └── user_pool
        ├── test_gold_user_pool.py
        ├── test_guest_user_pool.py
        └── test_normal_user_pool.py


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Before you report**
11 | 
12 | Upgrade feapder first and make sure you are on the latest version. If the bug still exists, describe the problem in detail.
13 | > pip install --upgrade feapder
14 | 
15 | **Problem**
16 | 
17 | **Screenshots**
18 | 
19 | **Code**
20 | 
21 | ```python
22 | 
23 | ```
24 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | # https://docs.github.com/en/github/building-a-strong-community/configuring-issue-templates-for-your-repository#configuring-the-template-chooser
2 | blank_issues_allowed: false  # We have a blank template which assigns labels
3 | contact_links:
4 |   - name: Questions about using feapder?
5 |     url: "https://github.com/Boris-code/feapder/discussions"
6 |     about: Please see our guide on how to ask questions


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/workflows/workflow.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/.github/workflows/workflow.yml


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | files/*
 2 | .DS_Store
 3 | .idea/*
 4 | */.idea/*
 5 | venv/*
 6 | venv2/*
 7 | *.pyc
 8 | *test.py
 9 | *.log
10 | **/proxy_file
11 | build/
12 | dist/
13 | *.egg-info/
14 | .vscode/
15 | media/
16 | .MWebMetaData/
17 | push.sh
18 | assets/


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guide
 2 | Thank you for your time. Your contribution will make this project better! Before submitting a contribution, please take a moment to read the guide below.
 3 | 
 4 | ## Submitting a Pull Request
 5 | 1. Fork [this repository](https://github.com/Boris-code/feapder.git),
 6 | 2. clone it locally, create a branch from `develop`, and make your changes.
 7 | 3. Make sure the corresponding tests have been done.
 8 | 4. Push the code to your forked repository.
 9 | 5. Click the Pull request link in your forked repository
10 | 6. Click the "New pull request" button.
11 | 7. Fill in the description, then "Create pull request". Submit against the `develop` branch.
12 | 
13 | ## License
14 | 
15 | [MIT](./LICENSE)
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Modifications:
 4 | 
 5 | Copyright (c) 2020 Boris <boris_liu@foxmail.com>
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include README.md
 2 | include LICENSE
 3 | 
 4 | include feapder/requirements.txt
 5 | include feapder/VERSION
 6 | 
 7 | recursive-include feapder/utils/js *
 8 | recursive-include feapder/templates *
 9 | recursive-include tests *
10 | 
11 | global-exclude __pycache__ *.py[cod]


--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/docs/.nojekyll


--------------------------------------------------------------------------------
/docs/_coverpage.md:
--------------------------------------------------------------------------------
 1 | ![feapder](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/02/08/feapder.png)
 2 | 
 3 | # feapder Crawler Framework Documentation
 4 | 
 5 | > [ˈfiːpdə]
 6 | 
 7 | The name feapder comes from the abbreviation of fast-easy-air-pro-spider
 8 | 
 9 | Built with care around the principles of fast development, fast crawling, simplicity, light weight and powerful features.
10 | 
11 | Supports lightweight spiders, distributed spiders, batch spiders, spider integration, as well as complete alerting.
12 | 
13 | 
14 | [GitHub](https://github.com/Boris-code/feapder)
15 | [Get Started](README.md)
16 | 


--------------------------------------------------------------------------------
/docs/_navbar.md:
--------------------------------------------------------------------------------
1 | 
2 |   * [Spider Management Platform](feapder_platform/feaplat.md)
3 |   * [Spider Toolbox](https://spidertools.cn)
4 |   * [Knowledge Planet](https://t.zsxq.com/mmAmAuF)
5 |   * [WeChat Official Account](https://open.weixin.qq.com/qr/code?username=gh_870ffb1242a7)
6 |   * [Zhihu](https://www.zhihu.com/people/boris-97-17/posts)
7 |   * [Discussion](https://gitter.im/feapder/community?utm_source=share-link&utm_medium=link&utm_campaign=share-link)


--------------------------------------------------------------------------------
/docs/_sidebar.md:
--------------------------------------------------------------------------------
 1 | * Getting Started
 2 |   * [Introduction & Installation](README.md)
 3 |   * [Quick Start in 10 Minutes](foreword/10分钟上手.md)
 4 |   * [Architecture](foreword/架构设计.md)
 5 |   * [Feature Overview](foreword/功能概览.md)
 6 | 
 7 | * Common Tools
 8 |   * [Command Line Tool](command/cmdline.md)
 9 | 
10 | * Usage
11 |   * [Read Before Use](usage/使用前必读.md)
12 |   * [Lightweight Spider - AirSpider](usage/AirSpider.md)
13 |   * [Distributed Spider - Spider](usage/Spider.md)
14 |   * [Task Spider - TaskSpider](usage/TaskSpider.md)
15 |   * [Batch Spider - BatchSpider](usage/BatchSpider.md)
16 |   * [Spider Integration](usage/爬虫集成.md)
17 | 
18 | * Advanced Usage
19 |   * [Request](source_code/Request.md)
20 |   * [Response](source_code/Response.md)
21 |   * [Proxy Usage](source_code/proxy.md)
22 |   * [User Pool](source_code/UserPool.md)
23 |   * [Browser Rendering - Selenium](source_code/浏览器渲染-Selenium.md)
24 |   * [Browser Rendering - Playwright](source_code/浏览器渲染-Playwright.md)
25 |   * [Parser - BaseParser](source_code/BaseParser.md)
26 |   * [Batch Parser - BatchParser](source_code/BatchParser.md)
27 |   * [Advanced Spider](source_code/Spider进阶.md)
28 |   * [Advanced BatchSpider](source_code/BatchSpider进阶.md)
29 |   * [Configuration File](source_code/配置文件.md)
30 |   * [Item](source_code/Item.md)
31 |   * [UpdateItem](source_code/UpdateItem.md)
32 |   * [Data Pipeline - pipeline](source_code/pipeline.md)
33 |   * [MysqlDB](source_code/MysqlDB.md)
34 |   * [MongoDB](source_code/MongoDB.md)
35 |   * [RedisDB](source_code/RedisDB.md)
36 |   * [Utility Library - tools](source_code/tools.md)
37 |   * [Logging Configuration & Usage](source_code/logger.md)
38 |   * [Massive Data Deduplication - dedup](source_code/dedup.md)
39 |   * [Alerts & Monitoring](source_code/报警及监控.md)
40 |   * [Metrics](source_code/监控打点.md)
41 |   * [Custom Downloader](source_code/custom_downloader.md)
42 | 
43 | * Spider Management Platform
44 |   * [Introduction & Deployment](feapder_platform/feaplat.md)
45 |   * [Usage](feapder_platform/usage.md)
46 |   * [FAQ](feapder_platform/question.md)
47 | 
48 | * FAQ
49 |   * [Installation Issues](question/安装问题.md)
50 |   * [Runtime Issues](question/运行问题.md)
51 |   * [Request Issues](question/请求问题.md)
52 |   * [setting Not Taking Effect](question/setting不生效问题.md)


--------------------------------------------------------------------------------
/docs/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/docs/favicon.ico


--------------------------------------------------------------------------------
/docs/feapder_platform/usage.md:
--------------------------------------------------------------------------------
 1 | # FEAPLAT Usage Guide
 2 | 
 3 | ## Before the First Run
 4 | 
 5 | 1. Default account / password for the management system: admin / admin
 6 | 
 7 | ## Adding a Project
 8 | 
 9 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/17/16318800747189.jpg)
10 | 
11 | 1. When uploading a project via git, the SSH protocol is required. To pull a private repository, add an SSH key on the feaplat settings page. With the git method, the latest code on the default branch is pulled before every run
12 | 2. The project is placed in the root directory of the spider `worker` container, i.e. `/<project files>`
13 | 3. Working directory: the path of your project. For example, with the project structure below:
14 |     
15 |     ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315322995977.jpg)
16 |     
17 |     the working directory is `/spider-project`. feaplat changes into this directory, and all subsequent code execution commands run from this path
18 |     
19 | 1. requirements.txt: used to install dependency packages; fill in the absolute path of the requirements file
20 | 
21 | ## Running
22 | 
23 | 1. Start command: the start command is executed in the working directory configured when the project was added
24 | 2. Schedule type:
25 |     1. cron: crontab expression, see: https://tool.lu/crontab/
26 |     2. interval: time interval
27 |     3. date: a specific date
28 |     4. once: run immediately, and only once
29 | 
30 | ## Example
31 | 
32 | 1. Prepare the project; the project structure is as follows:
33 |     ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343707944750.jpg)
34 | 2. Compress and upload it (the `feapder zip` command is recommended for compression):
35 |     ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343709590040.jpg)
36 |    - Working directory: the uploaded project is placed in the root directory inside docker (this has nothing to do with the project path on your own machine), then unpacked and run. Since `feapder_demo.zip` unpacks to `feapder_demo`, set the working directory to `/feapder_demo`
37 |    - This project has no dependencies, so `requirements.txt` can be left unset
38 |    - If third-party libraries are needed, create a requirements.txt file in the project, write the dependencies into it, and point the path at this file, e.g. `/feaplat_demo/requirements.txt`
39 | 1. Click the project to open the task list and add a task
40 |     ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343712604864.jpg)
41 |    The start command is executed in the working directory configured above; when the schedule type is once, the task runs as soon as you confirm adding it
42 | 1. View the task instances:
43 |     ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343720658671.jpg)
44 |     ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/16/16343720862217.jpg)
45 |     
46 |    You can see that the run has finished
47 |    
48 | ## Pulling a Private Project via git
49 | 
50 | To pull a private project, add the following public key to your git repository
51 | 
52 | ```
53 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCd/k/tjbcMislEunjtYQNXxz5tgEDc/fSvuLHBNUX4PtfmMQ07TuUX2XJIIzLRPaqv3nsMn3+QZrV0xQd545FG1Cq83JJB98ATTW7k5Q0eaWXkvThdFeG5+n85KeVV2W4BpdHHNZ5h9RxBUmVZPpAZacdC6OUSBYTyCblPfX9DvjOk+KfwAZVwpJSkv4YduwoR3DNfXrmK5P+wrYW9z/VHUf0hcfWEnsrrHktCKgohZn9Fe8uS3B5wTNd9GgVrLGRk85ag+CChoqg80DjgFt/IhzMCArqwLyMn7rGG4Iu2Ie0TcdMc0TlRxoBhqrfKkN83cfQ3gDf41tZwp67uM9ZN feapder@qq.com
54 | ```
55 | 
56 | Or configure your SSH private key on the system settings page, then add your public key to the git repository, for example:
57 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/10/19/16346353514967.jpg)
58 | 
59 | Note: the key pair must use RSA encryption; other algorithms may cause problems
60 | 
61 | An RSA key pair can be generated as follows:
62 | ```shell
63 | ssh-keygen -t rsa -C "comment" -f <output path>/<file name>
64 | ```
65 | For example:
66 | `ssh-keygen -t rsa -C "feaplat" -f id_rsa`
67 | Then press Enter through all prompts and do not set a password
68 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/11/17/16371210640228.jpg)
69 | This finally produces the files `id_rsa` and `id_rsa.pub`. Copy the content of `id_rsa.pub` into the git repository and the content of `id_rsa` into the feaplat spider management system
70 | 
71 | 
72 | 
73 | ## Spider Monitoring
74 | 
75 | > Monitoring only has data if you use a feapder spider or report custom metrics
76 | 
77 | 1. Table name: named after the task_id
78 | 2. Retention policy: this is an influxdb concept. Monitoring data is kept for 180 days by default on a rolling basis. The retention policy is named `feapder_180d` and is also set as the `default` policy, so just use `default`.
79 | 
80 | ## System Settings
81 | 
82 | 1. GIT_SSH_PRIVATE_KEY: you can view it on your own machine with `cat .ssh/id_rsa` and copy the content in. If you are not familiar with the git SSH protocol, look it up yourself
83 | 
84 | ## Updating the Version
85 | 
86 | ```
87 | git pull
88 | docker-compose up -d
89 | ```
90 | Run the commands above in order
91 | 


--------------------------------------------------------------------------------
/docs/foreword/功能概览.md:
--------------------------------------------------------------------------------
 1 | # FEAPDER
 2 | 
 3 | ## 1. Periodic Crawling Support
 4 | 
 5 | Periodic crawling is a common spider requirement, for example fetching a product's sales once a day; we call each period a batch.
 6 | 
 7 | The framework supports batch crawling and introduces the concept of a batch table, which records the crawl status of every batch in detail
 8 | 
 9 | ![-w899](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084680404224.jpg)
10 | 
11 | ## 2. Distributed Crawling Support
12 | 
13 | Faced with massive amounts of data, distributed crawling is essential. The framework supports distribution, and spiders can be restarted at any time without losing tasks
14 | 
15 | ## 3. Spider Integration Support
16 | 
17 | Multiple spiders can be integrated into one spider as plugins, typically for projects with the same crawl period and requirements that need to collect from several data sources
18 | 
19 | ## 4. Massive-Scale Deduplication Support
20 | 
21 | The framework has 3 built-in deduplication mechanisms. With simple configuration, tasks and data are deduplicated automatically; the module can also be taken out and used standalone, and supports batch deduplication (see the sketch after this list).
22 | 
23 | 1. Temporary dedup: about 0.26 s per 10,000 records. Deduplicating 100 million records uses about 1.43 GB of memory; an expiry period can be specified
24 | 2. In-memory dedup: about 0.5 s per 10,000 records. Deduplicating 100 million records uses about 285 MB of memory
25 | 3. Permanent dedup: about 3.5 s per 10,000 records. Deduplicating 100 million records uses about 285 MB of memory
26 | 
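As a rough illustration of the standalone usage mentioned above, here is a minimal sketch based on `feapder.dedup.Dedup`; the filter constants and the 0/1 return convention are assumptions to verify against `feapder/dedup`.

```python
from feapder.dedup import Dedup

dedup = Dedup(Dedup.MemoryFilter)  # assumed: MemoryFilter / BloomFilter / ExpireFilter variants

datas = ["https://example.com/1", "https://example.com/2"]
dedup.add(datas)         # record the fingerprints of these values
print(dedup.get(datas))  # assumed to return 1 for already-seen values, 0 otherwise
```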
27 | ## 5. Data Collection Completeness
28 | 
29 | feapder keeps strict state for every crawled URL, so that 100% of the URLs in a crawl task are accounted for: even URLs that fail after multiple attempts go into a failed queue with the failure reason logged. This matters a great deal for business scenarios that depend heavily on the collected data and lets the data be used with confidence.
30 | 
31 | ## 6. Automatic Data Storage
32 | 
33 | Simply generate an item from the database table, assign values to the item's attributes and yield it; items are then stored in batches (see the sketch below)
34 | 
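A minimal sketch of what this looks like inside a parse method; the table name `spider_data` and the `title` field are placeholders (the same pattern appears in the MongoDB example later in these docs).

```python
from feapder import Item

def parse(self, request, response):
    item = Item()                    # dict-like container for one row
    item.table_name = "spider_data"  # target table (placeholder name)
    item.title = response.xpath("//title/text()").extract_first()
    yield item                       # buffered and written to the database in batches
```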
35 | ## 7. Debug Mode Support
36 | 
37 | Spiders support a debug mode; in debug mode data is not stored and task states are not modified by default. A single task can be debugged, which makes development easier
38 | 
39 | ## 8. Complete Alerting
40 | 
41 | To guarantee that data is complete, accurate and timely, the framework has built-in alerting. With these alerts we can keep track of spider status in real time
42 | 
43 | 1. Crawl speed is computed in real time, the remaining time is estimated, and the framework predicts whether the configured crawl period will be exceeded
44 | 
45 |     ![-w657](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084718683378.jpg)
46 | 
47 | 
48 | 2. Spider-stalled alert
49 | 
50 |     ![-w501](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084718974597.jpg)
51 | 
52 | 3. Alert when too many spider tasks fail, which may be caused by changes to the site's templates or by blocking
53 | 
54 |     ![-w416](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/29/16092335882158.jpg)
55 | 
56 | ## 9. Download Monitoring
57 | 
58 | The framework monitors the total number of requests, successes, failures and parse exceptions, writes the data points to influxdb and, combined with a Grafana dashboard, makes it easy to keep track of the crawl
59 | 
60 | ![-w1299](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/02/09/16128568548280.jpg)
61 | 
62 | 
63 | 


--------------------------------------------------------------------------------
/docs/foreword/架构设计.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Framework Flowchart
 3 | 
 4 | ![boris-spider -1-](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/06/08/borisspider-1.png)
 5 | 
 6 | ## Module Descriptions:
 7 | 
 8 | * spider **core scheduler of the framework**
 9 | * parser_control **parser controller**, responsible for scheduling parsers
10 | * collector **task collector**, pulls tasks from the task queue into memory in batches to reduce how often and how heavily the spider hits the task-queue database
11 | * parser **data parser**
12 | * start_request function that seeds the initial tasks
13 | * item_buffer **data buffer queue**, writes data to the database in batches
14 | * request_buffer **request buffer queue**, writes request tasks to the task queue in batches
15 | * request **downloader**, wraps requests and is used to download data from the internet
16 | * response **response wrapper**, wraps the response and supports xpath, css, re and other parsing methods, with automatic handling of garbled Chinese encodings
17 | 
18 | ## Flow Description:
19 | 
20 | 1. spider schedules **start_request** to produce tasks
21 | 2. **start_request** hands the tasks to request_buffer
22 | 3. spider schedules **request_buffer** to write the tasks to the task-queue database in batches
23 | 4. spider schedules **collector** to fetch tasks from the task queue into an in-memory queue in batches
24 | 5. spider schedules **parser_control** to take tasks from collector's in-memory queue
25 | 6. **parser_control** schedules **request** to fetch the data
26 | 7. **request** sends the request and downloads the data
27 | 8. request hands the downloaded data to **response** for further wrapping
28 | 9. the wrapped **response** is returned to **parser_control** (multiple parser_controls in the figure indicate multithreading)
29 | 10. parser_control schedules the matching **parser** to parse the returned response (multiple groups of parsers in the figure indicate parsers for different sites)
30 | 11. parser_control dispatches the items parsed by the parser and the newly generated requests to **item_buffer** and **request_buffer**
31 | 12. spider schedules **item_buffer** and **request_buffer** to write data to the database in batches
32 | 
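To make the flow concrete from the user's side, here is a minimal sketch (the URL, table name and field are placeholders): start_requests seeds the initial tasks, parse receives the wrapped response, and the items and new requests it yields are dispatched to item_buffer and request_buffer for batched storage.

```python
import feapder
from feapder import Item


class DemoSpider(feapder.AirSpider):
    def start_requests(self):
        # steps 1-2: seed the initial tasks
        yield feapder.Request("https://www.baidu.com")

    def parse(self, request, response):
        # steps 9-10: the wrapped response supports xpath/css/re selectors
        title = response.xpath("//title/text()").extract_first()
        item = Item()
        item.table_name = "spider_data"  # placeholder table name
        item.title = title
        yield item  # steps 11-12: buffered, then stored in batches


if __name__ == "__main__":
    DemoSpider().start()
```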
33 | 


--------------------------------------------------------------------------------
/docs/images/qingguo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/docs/images/qingguo.jpg


--------------------------------------------------------------------------------
/docs/lib/docsify-copy-code/docsify-copy-code.min.js:
--------------------------------------------------------------------------------
 1 | /*!
 2 |  * docsify-copy-code
 3 |  * v2.1.0
 4 |  * https://github.com/jperasmus/docsify-copy-code
 5 |  * (c) 2017-2019 JP Erasmus <jperasmus11@gmail.com>
 6 |  * MIT license
 7 |  */
 8 | !function(){"use strict";function r(o){return(r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(o){return typeof o}:function(o){return o&&"function"==typeof Symbol&&o.constructor===Symbol&&o!==Symbol.prototype?"symbol":typeof o})(o)}!function(o,e){void 0===e&&(e={});var t=e.insertAt;if(o&&"undefined"!=typeof document){var n=document.head||document.getElementsByTagName("head")[0],c=document.createElement("style");c.type="text/css","top"===t&&n.firstChild?n.insertBefore(c,n.firstChild):n.appendChild(c),c.styleSheet?c.styleSheet.cssText=o:c.appendChild(document.createTextNode(o))}}(".docsify-copy-code-button,.docsify-copy-code-button span{cursor:pointer;transition:all .25s ease}.docsify-copy-code-button{position:absolute;z-index:1;top:0;right:0;overflow:visible;padding:.65em .8em;border:0;border-radius:0;outline:0;font-size:1em;background:grey;background:var(--theme-color,grey);color:#fff;opacity:0}.docsify-copy-code-button span{border-radius:3px;background:inherit;pointer-events:none}.docsify-copy-code-button .error,.docsify-copy-code-button .success{position:absolute;z-index:-100;top:50%;left:0;padding:.5em .65em;font-size:.825em;opacity:0;-webkit-transform:translateY(-50%);transform:translateY(-50%)}.docsify-copy-code-button.error .error,.docsify-copy-code-button.success .success{opacity:1;-webkit-transform:translate(-115%,-50%);transform:translate(-115%,-50%)}.docsify-copy-code-button:focus,pre:hover .docsify-copy-code-button{opacity:1}"),document.querySelector('link[href*="docsify-copy-code"]')&&console.warn("[Deprecation] Link to external docsify-copy-code stylesheet is no longer necessary."),window.DocsifyCopyCodePlugin={init:function(){return function(o,e){o.ready(function(){console.warn("[Deprecation] Manually initializing docsify-copy-code using window.DocsifyCopyCodePlugin.init() is no longer necessary.")})}}},window.$docsify=window.$docsify||{},window.$docsify.plugins=[function(o,s){o.doneEach(function(){var o=Array.apply(null,document.querySelectorAll("pre[data-lang]")),c={buttonText:"Copy to clipboard",errorText:"Error",successText:"Copied"};s.config.copyCode&&Object.keys(c).forEach(function(t){var n=s.config.copyCode[t];"string"==typeof n?c[t]=n:"object"===r(n)&&Object.keys(n).some(function(o){var e=-1<location.href.indexOf(o);return c[t]=e?n[o]:c[t],e})});var e=['<button class="docsify-copy-code-button">','<span class="label">'.concat(c.buttonText,"</span>"),'<span class="error">'.concat(c.errorText,"</span>"),'<span class="success">'.concat(c.successText,"</span>"),"</button>"].join("");o.forEach(function(o){o.insertAdjacentHTML("beforeend",e)})}),o.mounted(function(){document.querySelector(".content").addEventListener("click",function(o){if(o.target.classList.contains("docsify-copy-code-button")){var e="BUTTON"===o.target.tagName?o.target:o.target.parentNode,t=document.createRange(),n=e.parentNode.querySelector("code"),c=window.getSelection();t.selectNode(n),c.removeAllRanges(),c.addRange(t);try{document.execCommand("copy")&&(e.classList.add("success"),setTimeout(function(){e.classList.remove("success")},1e3))}catch(o){console.error("docsify-copy-code: ".concat(o)),e.classList.add("error"),setTimeout(function(){e.classList.remove("error")},1e3)}"function"==typeof(c=window.getSelection()).removeRange?c.removeRange(t):"function"==typeof c.removeAllRanges&&c.removeAllRanges()}})})}].concat(window.$docsify.plugins||[])}();
 9 | //# sourceMappingURL=docsify-copy-code.min.js.map
10 | 


--------------------------------------------------------------------------------
/docs/lib/docsify/lib/plugins/docsify-edit-on-github.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Minified by jsDelivr using Terser v3.14.1.
 3 |  * Original file: /npm/docsify-edit-on-github@1.0.3/index.js
 4 |  *
 5 |  * Do NOT use SRI with dynamically generated files! More information: https://www.jsdelivr.com/using-sri-with-dynamic-files
 6 |  */
 7 | ! function(t) {
 8 |     t.EditOnGithubPlugin = {}, t.EditOnGithubPlugin.create = function(n, i, e) {
 9 |         function u(t) {
10 |             return header = ['<div style="overflow: auto">',
11 |                 '<p style="float: right"><a style="text-decoration: underline; cursor: pointer"',
12 |                 'onclick="EditOnGithubPlugin.onClick(event)">', '<img alt="memo" src="https://github.githubassets.com/images/icons/emoji/memo.png" class="emoji"> &nbsp', t, "</a></p>", "</div>"
13 |             ].join("")
14 |         }
15 |         return e = e || "Edit on github", i = i || n.replace(/\/blob\//, "/edit/"), t.EditOnGithubPlugin.editDoc =
16 |             function(t, n) {
17 |                 var e = n.route.file;
18 |                 if (e) {
19 |                     var u = i + e;
20 |                     return window.open(u), t.preventDefault(), !1
21 |                 }
22 |                 return !0
23 |             },
24 |             function(n, i) {
25 |                 if (t.EditOnGithubPlugin.onClick = function(t) {
26 |                         EditOnGithubPlugin.editDoc(t, i)
27 |                     }, (r = e) && "[object Function]" === {}.toString.call(r)) n.afterEach(function(t) {
28 |                     return u(e(i.route.file)) + t
29 |                 });
30 |                 else {
31 |                     var o = u(e);
32 |                     n.afterEach(function(t) {
33 |                         return o + t
34 |                     })
35 |                 }
36 |                 var r
37 |             }
38 |     }
39 | }(window);
40 | //# sourceMappingURL=/sm/eef821f4877f09e27be373326100cefe923735a9bb303de51b16f9079d063a86.map


--------------------------------------------------------------------------------
/docs/lib/docsify/lib/plugins/ga.js:
--------------------------------------------------------------------------------
 1 | (function () {
 2 |   // From https://github.com/egoist/vue-ga/blob/master/src/index.js
 3 |   function appendScript() {
 4 |     var script = document.createElement('script');
 5 |     script.async = true;
 6 |     script.src = 'https://www.google-analytics.com/analytics.js';
 7 |     document.body.appendChild(script);
 8 |   }
 9 | 
10 |   function init(id) {
11 |     appendScript();
12 |     window.ga =
13 |       window.ga ||
14 |       function () {
15 |         (window.ga.q = window.ga.q || []).push(arguments);
16 |       };
17 |     window.ga.l = Number(new Date());
18 |     window.ga('create', id, 'auto');
19 |   }
20 | 
21 |   function collect() {
22 |     if (!window.ga) {
23 |       init($docsify.ga);
24 |     }
25 | 
26 |     window.ga('set', 'page', location.hash);
27 |     window.ga('send', 'pageview');
28 |   }
29 | 
30 |   var install = function (hook) {
31 |     if (!$docsify.ga) {
32 |       console.error('[Docsify] ga is required.');
33 |       return
34 |     }
35 | 
36 |     hook.beforeEach(collect);
37 |   };
38 | 
39 |   $docsify.plugins = [].concat(install, $docsify.plugins);
40 | 
41 | }());
42 | 


--------------------------------------------------------------------------------
/docs/lib/prismjs/components/prism-java.js:
--------------------------------------------------------------------------------
 1 | (function (Prism) {
 2 | 
 3 | 	var keywords = /\b(?:abstract|assert|boolean|break|byte|case|catch|char|class|const|continue|default|do|double|else|enum|exports|extends|final|finally|float|for|goto|if|implements|import|instanceof|int|interface|long|module|native|new|null|open|opens|package|private|protected|provides|public|requires|return|short|static|strictfp|super|switch|synchronized|this|throw|throws|to|transient|transitive|try|uses|var|void|volatile|while|with|yield)\b/;
 4 | 
 5 | 	// based on the java naming conventions
 6 | 	var className = /\b[A-Z](?:\w*[a-z]\w*)?\b/;
 7 | 
 8 | 	Prism.languages.java = Prism.languages.extend('clike', {
 9 | 		'class-name': [
10 | 			className,
11 | 
12 | 			// variables and parameters
13 | 			// this to support class names (or generic parameters) which do not contain a lower case letter (also works for methods)
14 | 			/\b[A-Z]\w*(?=\s+\w+\s*[;,=())])/
15 | 		],
16 | 		'keyword': keywords,
17 | 		'function': [
18 | 			Prism.languages.clike.function,
19 | 			{
20 | 				pattern: /(\:\:)[a-z_]\w*/,
21 | 				lookbehind: true
22 | 			}
23 | 		],
24 | 		'number': /\b0b[01][01_]*L?\b|\b0x[\da-f_]*\.?[\da-f_p+-]+\b|(?:\b\d[\d_]*\.?[\d_]*|\B\.\d[\d_]*)(?:e[+-]?\d[\d_]*)?[dfl]?/i,
25 | 		'operator': {
26 | 			pattern: /(^|[^.])(?:<<=?|>>>?=?|->|--|\+\+|&&|\|\||::|[?:~]|[-+*/%&|^!=<>]=?)/m,
27 | 			lookbehind: true
28 | 		}
29 | 	});
30 | 
31 | 	Prism.languages.insertBefore('java', 'string', {
32 | 		'triple-quoted-string': {
33 | 			// http://openjdk.java.net/jeps/355#Description
34 | 			pattern: /"""[ \t]*[\r\n](?:(?:"|"")?(?:\\.|[^"\\]))*"""/,
35 | 			greedy: true,
36 | 			alias: 'string'
37 | 		}
38 | 	});
39 | 
40 | 	Prism.languages.insertBefore('java', 'class-name', {
41 | 		'annotation': {
42 | 			alias: 'punctuation',
43 | 			pattern: /(^|[^.])@\w+/,
44 | 			lookbehind: true
45 | 		},
46 | 		'namespace': {
47 | 			pattern: /(\b(?:exports|import(?:\s+static)?|module|open|opens|package|provides|requires|to|transitive|uses|with)\s+)[a-z]\w*(?:\.[a-z]\w*)+/,
48 | 			lookbehind: true,
49 | 			inside: {
50 | 				'punctuation': /\./,
51 | 			}
52 | 		},
53 | 		'generics': {
54 | 			pattern: /<(?:[\w\s,.&?]|<(?:[\w\s,.&?]|<(?:[\w\s,.&?]|<[\w\s,.&?]*>)*>)*>)*>/,
55 | 			inside: {
56 | 				'class-name': className,
57 | 				'keyword': keywords,
58 | 				'punctuation': /[<>(),.:]/,
59 | 				'operator': /[?&|]/
60 | 			}
61 | 		}
62 | 	});
63 | }(Prism));
64 | 


--------------------------------------------------------------------------------
/docs/lib/prismjs/components/prism-python.js:
--------------------------------------------------------------------------------
1 | Prism.languages.python={comment:{pattern:/(^|[^\\])#.*/,lookbehind:!0},"string-interpolation":{pattern:/(?:f|rf|fr)(?:("""|''')[\s\S]*?\1|("|')(?:\\.|(?!\2)[^\\\r\n])*\2)/i,greedy:!0,inside:{interpolation:{pattern:/((?:^|[^{])(?:{{)*){(?!{)(?:[^{}]|{(?!{)(?:[^{}]|{(?!{)(?:[^{}])+})+})+}/,lookbehind:!0,inside:{"format-spec":{pattern:/(:)[^:(){}]+(?=}$)/,lookbehind:!0},"conversion-option":{pattern:/![sra](?=[:}]$)/,alias:"punctuation"},rest:null}},string:/[\s\S]+/}},"triple-quoted-string":{pattern:/(?:[rub]|rb|br)?("""|''')[\s\S]*?\1/i,greedy:!0,alias:"string"},string:{pattern:/(?:[rub]|rb|br)?("|')(?:\\.|(?!\1)[^\\\r\n])*\1/i,greedy:!0},function:{pattern:/((?:^|\s)def[ \t]+)[a-zA-Z_]\w*(?=\s*\()/g,lookbehind:!0},"class-name":{pattern:/(\bclass\s+)\w+/i,lookbehind:!0},decorator:{pattern:/(^\s*)@\w+(?:\.\w+)*/im,lookbehind:!0,alias:["annotation","punctuation"],inside:{punctuation:/\./}},keyword:/\b(?:and|as|assert|async|await|break|class|continue|def|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|nonlocal|not|or|pass|print|raise|return|try|while|with|yield)\b/,builtin:/\b(?:__import__|abs|all|any|apply|ascii|basestring|bin|bool|buffer|bytearray|bytes|callable|chr|classmethod|cmp|coerce|compile|complex|delattr|dict|dir|divmod|enumerate|eval|execfile|file|filter|float|format|frozenset|getattr|globals|hasattr|hash|help|hex|id|input|int|intern|isinstance|issubclass|iter|len|list|locals|long|map|max|memoryview|min|next|object|oct|open|ord|pow|property|range|raw_input|reduce|reload|repr|reversed|round|set|setattr|slice|sorted|staticmethod|str|sum|super|tuple|type|unichr|unicode|vars|xrange|zip)\b/,boolean:/\b(?:True|False|None)\b/,number:/(?:\b(?=\d)|\B(?=\.))(?:0[bo])?(?:(?:\d|0x[\da-f])[\da-f]*(?:\.\d*)?|\.\d+)(?:e[+-]?\d+)?j?\b/i,operator:/[-+%=]=?|!=|\*\*?=?|\/\/?=?|<[<=>]?|>[=>]?|[&|^~]/,punctuation:/[{}[\];(),.:]/},Prism.languages.python["string-interpolation"].inside.interpolation.inside.rest=Prism.languages.python,Prism.languages.py=Prism.languages.python;


--------------------------------------------------------------------------------
/docs/lib/prismjs/components/prism-sql.js:
--------------------------------------------------------------------------------
 1 | Prism.languages.sql = {
 2 | 	'comment': {
 3 | 		pattern: /(^|[^\\])(?:\/\*[\s\S]*?\*\/|(?:--|\/\/|#).*)/,
 4 | 		lookbehind: true
 5 | 	},
 6 | 	'variable': [
 7 | 		{
 8 | 			pattern: /@(["'`])(?:\\[\s\S]|(?!\1)[^\\])+\1/,
 9 | 			greedy: true
10 | 		},
11 | 		/@[\w.$]+/
12 | 	],
13 | 	'string': {
14 | 		pattern: /(^|[^@\\])("|')(?:\\[\s\S]|(?!\2)[^\\]|\2\2)*\2/,
15 | 		greedy: true,
16 | 		lookbehind: true
17 | 	},
18 | 	'function': /\b(?:AVG|COUNT|FIRST|FORMAT|LAST|LCASE|LEN|MAX|MID|MIN|MOD|NOW|ROUND|SUM|UCASE)(?=\s*\()/i, // Should we highlight user defined functions too?
19 | 	'keyword': /\b(?:ACTION|ADD|AFTER|ALGORITHM|ALL|ALTER|ANALYZE|ANY|APPLY|AS|ASC|AUTHORIZATION|AUTO_INCREMENT|BACKUP|BDB|BEGIN|BERKELEYDB|BIGINT|BINARY|BIT|BLOB|BOOL|BOOLEAN|BREAK|BROWSE|BTREE|BULK|BY|CALL|CASCADED?|CASE|CHAIN|CHAR(?:ACTER|SET)?|CHECK(?:POINT)?|CLOSE|CLUSTERED|COALESCE|COLLATE|COLUMNS?|COMMENT|COMMIT(?:TED)?|COMPUTE|CONNECT|CONSISTENT|CONSTRAINT|CONTAINS(?:TABLE)?|CONTINUE|CONVERT|CREATE|CROSS|CURRENT(?:_DATE|_TIME|_TIMESTAMP|_USER)?|CURSOR|CYCLE|DATA(?:BASES?)?|DATE(?:TIME)?|DAY|DBCC|DEALLOCATE|DEC|DECIMAL|DECLARE|DEFAULT|DEFINER|DELAYED|DELETE|DELIMITERS?|DENY|DESC|DESCRIBE|DETERMINISTIC|DISABLE|DISCARD|DISK|DISTINCT|DISTINCTROW|DISTRIBUTED|DO|DOUBLE|DROP|DUMMY|DUMP(?:FILE)?|DUPLICATE|ELSE(?:IF)?|ENABLE|ENCLOSED|END|ENGINE|ENUM|ERRLVL|ERRORS|ESCAPED?|EXCEPT|EXEC(?:UTE)?|EXISTS|EXIT|EXPLAIN|EXTENDED|FETCH|FIELDS|FILE|FILLFACTOR|FIRST|FIXED|FLOAT|FOLLOWING|FOR(?: EACH ROW)?|FORCE|FOREIGN|FREETEXT(?:TABLE)?|FROM|FULL|FUNCTION|GEOMETRY(?:COLLECTION)?|GLOBAL|GOTO|GRANT|GROUP|HANDLER|HASH|HAVING|HOLDLOCK|HOUR|IDENTITY(?:_INSERT|COL)?|IF|IGNORE|IMPORT|INDEX|INFILE|INNER|INNODB|INOUT|INSERT|INT|INTEGER|INTERSECT|INTERVAL|INTO|INVOKER|ISOLATION|ITERATE|JOIN|KEYS?|KILL|LANGUAGE|LAST|LEAVE|LEFT|LEVEL|LIMIT|LINENO|LINES|LINESTRING|LOAD|LOCAL|LOCK|LONG(?:BLOB|TEXT)|LOOP|MATCH(?:ED)?|MEDIUM(?:BLOB|INT|TEXT)|MERGE|MIDDLEINT|MINUTE|MODE|MODIFIES|MODIFY|MONTH|MULTI(?:LINESTRING|POINT|POLYGON)|NATIONAL|NATURAL|NCHAR|NEXT|NO|NONCLUSTERED|NULLIF|NUMERIC|OFF?|OFFSETS?|ON|OPEN(?:DATASOURCE|QUERY|ROWSET)?|OPTIMIZE|OPTION(?:ALLY)?|ORDER|OUT(?:ER|FILE)?|OVER|PARTIAL|PARTITION|PERCENT|PIVOT|PLAN|POINT|POLYGON|PRECEDING|PRECISION|PREPARE|PREV|PRIMARY|PRINT|PRIVILEGES|PROC(?:EDURE)?|PUBLIC|PURGE|QUICK|RAISERROR|READS?|REAL|RECONFIGURE|REFERENCES|RELEASE|RENAME|REPEAT(?:ABLE)?|REPLACE|REPLICATION|REQUIRE|RESIGNAL|RESTORE|RESTRICT|RETURNS?|REVOKE|RIGHT|ROLLBACK|ROUTINE|ROW(?:COUNT|GUIDCOL|S)?|RTREE|RULE|SAVE(?:POINT)?|SCHEMA|SECOND|SELECT|SERIAL(?:IZABLE)?|SESSION(?:_USER)?|SET(?:USER)?|SHARE|SHOW|SHUTDOWN|SIMPLE|SMALLINT|SNAPSHOT|SOME|SONAME|SQL|START(?:ING)?|STATISTICS|STATUS|STRIPED|SYSTEM_USER|TABLES?|TABLESPACE|TEMP(?:ORARY|TABLE)?|TERMINATED|TEXT(?:SIZE)?|THEN|TIME(?:STAMP)?|TINY(?:BLOB|INT|TEXT)|TOP?|TRAN(?:SACTIONS?)?|TRIGGER|TRUNCATE|TSEQUAL|TYPES?|UNBOUNDED|UNCOMMITTED|UNDEFINED|UNION|UNIQUE|UNLOCK|UNPIVOT|UNSIGNED|UPDATE(?:TEXT)?|USAGE|USE|USER|USING|VALUES?|VAR(?:BINARY|CHAR|CHARACTER|YING)|VIEW|WAITFOR|WARNINGS|WHEN|WHERE|WHILE|WITH(?: ROLLUP|IN)?|WORK|WRITE(?:TEXT)?|YEAR)\b/i,
20 | 	'boolean': /\b(?:TRUE|FALSE|NULL)\b/i,
21 | 	'number': /\b0x[\da-f]+\b|\b\d+\.?\d*|\B\.\d+\b/i,
22 | 	'operator': /[-+*\/=%^~]|&&?|\|\|?|!=?|<(?:=>?|<|>)?|>[>=]?|\b(?:AND|BETWEEN|IN|LIKE|NOT|OR|IS|DIV|REGEXP|RLIKE|SOUNDS LIKE|XOR)\b/i,
23 | 	'punctuation': /[;[\]()`,.]/
24 | };
25 | 


--------------------------------------------------------------------------------
/docs/lib/prismjs/components/prism-yaml.js:
--------------------------------------------------------------------------------
 1 | Prism.languages.yaml = {
 2 | 	'scalar': {
 3 | 		pattern: /([\-:]\s*(?:![^\s]+)?[ \t]*[|>])[ \t]*(?:((?:\r?\n|\r)[ \t]+)[^\r\n]+(?:\2[^\r\n]+)*)/,
 4 | 		lookbehind: true,
 5 | 		alias: 'string'
 6 | 	},
 7 | 	'comment': /#.*/,
 8 | 	'key': {
 9 | 		pattern: /(\s*(?:^|[:\-,[{\r\n?])[ \t]*(?:![^\s]+)?[ \t]*)[^\r\n{[\]},#\s]+?(?=\s*:\s)/,
10 | 		lookbehind: true,
11 | 		alias: 'atrule'
12 | 	},
13 | 	'directive': {
14 | 		pattern: /(^[ \t]*)%.+/m,
15 | 		lookbehind: true,
16 | 		alias: 'important'
17 | 	},
18 | 	'datetime': {
19 | 		pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)(?:\d{4}-\d\d?-\d\d?(?:[tT]|[ \t]+)\d\d?:\d{2}:\d{2}(?:\.\d*)?[ \t]*(?:Z|[-+]\d\d?(?::\d{2})?)?|\d{4}-\d{2}-\d{2}|\d\d?:\d{2}(?::\d{2}(?:\.\d*)?)?)(?=[ \t]*(?:$|,|]|}))/m,
20 | 		lookbehind: true,
21 | 		alias: 'number'
22 | 	},
23 | 	'boolean': {
24 | 		pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)(?:true|false)[ \t]*(?=$|,|]|})/im,
25 | 		lookbehind: true,
26 | 		alias: 'important'
27 | 	},
28 | 	'null': {
29 | 		pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)(?:null|~)[ \t]*(?=$|,|]|})/im,
30 | 		lookbehind: true,
31 | 		alias: 'important'
32 | 	},
33 | 	'string': {
34 | 		pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)("|')(?:(?!\2)[^\\\r\n]|\\.)*\2(?=[ \t]*(?:$|,|]|}|\s*#))/m,
35 | 		lookbehind: true,
36 | 		greedy: true
37 | 	},
38 | 	'number': {
39 | 		pattern: /([:\-,[{]\s*(?:![^\s]+)?[ \t]*)[+-]?(?:0x[\da-f]+|0o[0-7]+|(?:\d+\.?\d*|\.?\d+)(?:e[+-]?\d+)?|\.inf|\.nan)[ \t]*(?=$|,|]|})/im,
40 | 		lookbehind: true
41 | 	},
42 | 	'tag': /![^\s]+/,
43 | 	'important': /[&*][\w]+/,
44 | 	'punctuation': /---|[:[\]{}\-,|>?]|\.\.\./
45 | };
46 | 
47 | Prism.languages.yml = Prism.languages.yaml;


--------------------------------------------------------------------------------
/docs/question/setting不生效问题.md:
--------------------------------------------------------------------------------
 1 | # setting Not Taking Effect
 2 | 
 3 | ## Problem
 4 | 
 5 | Take the project structure below as an example: running `spider_test.py` from inside the `spiders` directory cannot read `setting.py`, so the `setting` configuration does not take effect.
 6 | 
 7 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2022/11/01/16672715088563.jpg)
 8 | 
 9 | It cannot be read because of Python's path/environment-variable lookup: when running from the spiders directory, only files under the spiders directory are found
10 | 
11 | ## Solutions
12 | 
13 | ### Method 1: run from the directory that contains setting.py
14 | 
15 | Import spider_test in main.py, then run main.py (a sketch follows)
16 | 
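A minimal sketch of such a main.py, assuming the spider module lives in a `spiders` package and exposes a spider class named `TestSpider` (both names are hypothetical placeholders):

```python
# main.py, located next to setting.py
from spiders import spider_test  # hypothetical module path

if __name__ == "__main__":
    spider_test.TestSpider().start()  # hypothetical spider class name
```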
17 | ### Method 2: mark the source root
18 | 
19 | How to mark the source root (PyCharm as an example): project -> right click -> Mark Directory as -> Sources Root
20 | 
21 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2022/11/01/16672717483410.jpg)
22 | 
23 | ### Method 3: set PYTHONPATH
24 | 
25 | Taking mac or linux as an example, run the following command
26 | 
27 | ```shell
28 | export PYTHONPATH=$PYTHONPATH:/absolute_path/spider-project
29 | ```
30 | Note: the environment variable set by this command is only valid in the current terminal
31 | 
32 | You can then run the spider from the spiders directory
33 | 
34 | ```shell
35 | python spider_test.py
36 | ```
37 | 
38 | How to add the environment variable on Windows is left for you to explore; if you work it out, feel free to leave a comment


--------------------------------------------------------------------------------
/docs/question/安装问题.md:
--------------------------------------------------------------------------------
 1 | # Installation Issues
 2 | 
 3 | ## 1. bitarray issue
 4 | 
 5 | > pip install fails on Windows
 6 | 
 7 | 
 8 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/02/09/16128685646774.jpg)
 9 | 
10 | Fix: install the Microsoft Visual C++ build tools, download link below:
11 | https://download.microsoft.com/download/5/f/7/5f7acaeb-8363-451f-9425-68a90f98b238/visualcppbuildtools_full.exe
12 | 
13 | ## 2. AttributeError 'str' object has no attribute 'decode'
14 | 
15 | > pip install fails on Windows
16 | 
17 | ![670479264](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/670479264.jpg)
18 | 
19 | Download the bitarray offline package; the required version is `bitarray>=1.5.3`
20 | 
21 | https://www.lfd.uci.edu/~gohlke/pythonlibs/#bitarray
22 | 
23 | ![-w722](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/16158992617537.jpg)
24 | 
25 | 
26 | Unpack it, enter the directory and run:
27 | 
28 |     python setup.py install
29 | 


--------------------------------------------------------------------------------
/docs/question/请求问题.md:
--------------------------------------------------------------------------------
1 | # Request Issues
2 | 
3 | ## ValueError: check_hostname requires server_hostname
4 | 
5 |     pip install urllib3==1.25.8
6 |     
7 | Reference: https://stackoverflow.com/questions/66642705/why-requests-raise-this-exception-check-hostname-requires-server-hostname


--------------------------------------------------------------------------------
/docs/question/运行问题.md:
--------------------------------------------------------------------------------
 1 | # Runtime Issues
 2 | 
 3 | ## 1. The spider hangs on a second run and does not continue crawling
 4 | 
 5 | ![1779423237](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/11/1779423237.jpg)
 6 | 
 7 | **Cause:**
 8 | 
 9 | Because the spider supports distribution and task-loss protection, it cleverly uses a redis sorted set to store tasks, preventing task contention and task loss.
10 | 
11 | Strategy: a sorted set carries a score. When the spider takes tasks, it only takes those whose score is less than the current timestamp, and at the same time changes the task's score to the current timestamp + 10 minutes. When a task is finished, it is actively deleted.
12 | 
13 | Purpose: pushing a claimed task's score 10 minutes into the future prevents other spider nodes from taking the same task, and if the spider exits unexpectedly the task is not lost and becomes available again after 10 minutes. But it also means that sometimes the spider starts, there clearly are tasks, yet it sits waiting for tasks.
14 | 
15 | Handling the waiting situation:
16 | 
17 | 1. Clear the tasks and re-crawl, either by clearing redis directly or by passing a parameter
18 | 
19 |         spider = test_spider.TestSpider(redis_key="feapder:test_spider", delete_keys="*z_requsets")
20 |         spider.start()
21 |         
22 |     delete_keys is the keys to delete; type: tuple/bool/string, regex supported. It is commonly used to empty the task queue, otherwise the crawl resumes from the last break on restart. Writing `delete_keys=True` also works
23 | 
24 | 1. Manually change the task scores to a value smaller than the current timestamp (a sketch is given after this list)
25 | 
26 |     ![-w917](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/11/16154327722622.jpg)
27 | 
28 | 1. Or just wait 10 minutes
29 | 
30 | 2. Develop using debug mode
31 | 
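A rough sketch of the "lower the scores manually" option using redis-py; the sorted-set name below (`feapder:test_spider:z_requsets`) is an assumption based on the `delete_keys="*z_requsets"` hint above, so check the actual key in your redis first.

```python
import time

import redis

r = redis.Redis(host="localhost", port=6379, db=0)
task_key = "feapder:test_spider:z_requsets"  # assumed key name, verify in redis

now = time.time()
# tasks whose score was pushed into the future (claimed but not finished)
for member, score in r.zrangebyscore(task_key, now, "+inf", withscores=True):
    r.zadd(task_key, {member: now - 1})  # make them claimable again immediately
```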


--------------------------------------------------------------------------------
/docs/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *


--------------------------------------------------------------------------------
/docs/source_code/BaseParser.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # BaseParser
  3 | 
  4 | BaseParser is the base class of Spider. It defines task dispatch and data parsing and is the interface exposed to the user
  5 | 
  6 | ## Source
  7 | 
  8 | 
  9 | ```python
 10 | class BaseParser(object):
 11 |     def start_requests(self):
 12 |         """
 13 |         @summary: add the initial urls
 14 |         ---------
 15 |         ---------
 16 |         @result: yield Request()
 17 |         """
 18 | 
 19 |         pass
 20 | 
 21 |     def download_midware(self, request):
 22 |         """
 23 |         @summary: download middleware; request parameters may be modified here, or a custom download can be done, returning request, response
 24 |         ---------
 25 |         @param request:
 26 |         ---------
 27 |         @result: return request / request, response
 28 |         """
 29 | 
 30 |         pass
 31 | 
 32 |     def validate(self, request, response):
 33 |         """
 34 |         @summary: validation function, can be used to check whether the response is correct
 35 |         If an exception is raised inside this function, the request is retried
 36 |         If it returns True or None, the parse function is entered
 37 |         If it returns False, the current request is discarded
 38 |         request.callback_name can be used to distinguish callbacks and write different validation logic for each
 39 |         ---------
 40 |         @param request:
 41 |         @param response:
 42 |         ---------
 43 |         @result: True / None / False
 44 |         """
 45 | 
 46 |         pass
 47 | 
 48 |     def parse(self, request, response):
 49 |         """
 50 |         @summary: the default parse function
 51 |         ---------
 52 |         @param request:
 53 |         @param response:
 54 |         ---------
 55 |         @result:
 56 |         """
 57 | 
 58 |         pass
 59 | 
 60 |     def exception_request(self, request, response):
 61 |         """
 62 |         @summary: requests that raised an exception during the request or while parsing in the parser
 63 |         ---------
 64 |         @param request:
 65 |         @param response:
 66 |         ---------
 67 |         @result: request / callback / None (the return value must be iterable)
 68 |         """
 69 | 
 70 |         pass
 71 | 
 72 |     def failed_request(self, request, response):
 73 |         """
 74 |         @summary: requests that exceeded the maximum retry count
 75 |         A modified request may be returned. If no request is returned, the incoming request is written to redis's failed table as-is; otherwise the modified request is written to the failed table
 76 |         ---------
 77 |         @param request:
 78 |         ---------
 79 |         @result: request / item / callback / None (the return value must be iterable)
 80 |         """
 81 | 
 82 |         pass
 83 | 
 84 |     def start_callback(self):
 85 |         """
 86 |         @summary: callback invoked when the program starts
 87 |         ---------
 88 |         ---------
 89 |         @result: None
 90 |         """
 91 | 
 92 |         pass
 93 | 
 94 |     def end_callback(self):
 95 |         """
 96 |         @summary: callback invoked when the program ends
 97 |         ---------
 98 |         ---------
 99 |         @result: None
100 |         """
101 | 
102 |         pass
103 | 
104 |     @property
105 |     def name(self):
106 |         return self.__class__.__name__
107 | 
108 |     def close(self):
109 |         pass
110 | ```
111 | 
112 | ## Usage
113 | 
114 | Taking the program start/end callbacks as an example:
115 | 
116 | ```python
117 | import feapder
118 | 
119 | 
120 | class TestSpider(feapder.Spider):
121 |     def start_callback(self):
122 |         print("spider started")
123 | 
124 |     def end_callback(self):
125 |         print("spider finished")
126 | ```
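As a further illustration of the interface above, here is a minimal sketch that also overrides download_midware and validate; the header value and the status-code check are placeholder choices, not required settings.

```python
import feapder


class ValidatedSpider(feapder.Spider):
    def start_requests(self):
        yield feapder.Request("https://www.baidu.com")

    def download_midware(self, request):
        # tweak the request before it is downloaded
        request.headers = {"User-Agent": "Mozilla/5.0"}
        return request

    def validate(self, request, response):
        # raising retries the request; returning False would discard it
        if response.status_code != 200:
            raise Exception("response not ok, retry")

    def parse(self, request, response):
        print(response.xpath("//title/text()").extract_first())
```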


--------------------------------------------------------------------------------
/docs/source_code/BatchParser.md:
--------------------------------------------------------------------------------
  1 | # BatchParser
  2 | 
  3 | BatchParser is the base class of BatchSpider. It defines task dispatch and data parsing and is the interface exposed to the user
  4 | 
  5 | Besides all the interfaces of [BaseParser](source_code/BaseParser), it also provides the following methods
  6 | 
  7 | ## Methods in Detail
  8 | 
  9 | ### 1. Adding tasks: add_task
 10 | 
 11 | add_task is called on every start_monitor run, before init_task, and is used to add tasks to the database before the batch spider starts
 12 | 
 13 | ```
 14 | class TestSpider(feapder.BatchSpider):
 15 |     def add_task(self):
 16 |         pass
 17 | ```
 18 | 
 19 | ### 2. Updating tasks
 20 | 
 21 | #### Method 1:
 22 | 
 23 | Update one at a time
 24 | 
 25 | ```python
 26 | def update_task_state(self, task_id, state=1, **kwargs):
 27 |     """
 28 |     @summary: update the task state in the task table; the code must call this actively when each task is finished
 29 |     Call it as yield lambda : self.update_task_state(task_id, state)
 30 |     ---------
 31 |     @param task_id: task id
 32 |     @param state: task state
 33 |     ---------
 34 |     @result:
 35 |     """
 36 | ```
 37 | 
 38 | An example
 39 | 
 40 | ```
 41 | def parse(self, request, response):
 42 |     yield item  # return the item; items are automatically stored in batches
 43 |     yield lambda : self.update_task_state(request.task_id, 1)
 44 | ```
 45 | 
 46 |  After `yield item`, calling `self.update_task_state` updates the task state.
 47 |  
 48 |  Why use `yield lambda` here? After `yield item`, the item is not stored immediately; it sits in a buffer and is written in batches. If we called `self.update_task_state` directly, the item might not be stored yet; if the program exited unexpectedly at that moment, the buffered items would be lost while the task state had already been updated, so the task would not be redone and the data for that task would be lost
 49 |  
 50 |  `yield lambda` returns a callback that is not executed immediately; the framework guarantees it runs only after the item has been stored, so the point of writing it this way is to update the task state only after the item is in the database
 51 |  
 52 | #### Method 2:
 53 | 
 54 | Batch update
 55 | 
 56 | ```python
 57 | def update_task_batch(self, task_id, state=1, **kwargs):
 58 |     """
 59 |     批量更新任务 多处调用,更新的字段必须一致
 60 |     注意:需要 写成 yield update_task_batch(...) 否则不会更新
 61 |     @param task_id:
 62 |     @param state:
 63 |     @param kwargs:
 64 |     @return:
 65 |     """
 66 | ```
 67 | 
 68 | An example
 69 | 
 70 | ```python
 71 | def parse(self, request, response):
 72 |     yield item  # return the item; items are automatically stored in batches
 73 |     yield self.update_task_batch(request.task_id, 1) # update the task state to 1
 74 | ```
 75 | 
 76 | Calling `self.update_task_batch` after `yield item` performs the batch update
 77 | 
 78 | Note that a batch update must use `yield`, because `update_task_batch` does not perform the update itself; it only returns an `UpdateItem`. `UpdateItem` is similar to `Item` but carries update capability; the framework calls the `UpdateItem` after the items are stored to perform the batch update. For details on `UpdateItem`, see [UpdateItem]()
 79 | 
 80 | #### Choosing between the two
 81 | 
 82 | For the same table, if the updated fields are the same, the batch-update method is recommended as it is more efficient; if the fields differ, update one at a time, because within a batch update the updated fields must be identical
 83 | 
 84 | For example, when a request fails, update the task to -1 and record the failure reason; when it succeeds, update the task to 1. This is written as follows:
 85 | 
 86 | ```python
 87 | def parse(self, request, response):
 88 |     yield self.update_task_batch(request.task_id, 1) # update the task state to 1
 89 | 
 90 | def failed_request(self, request, response):
 91 |     """
 92 |     @summary: requests that exceeded the maximum retry count
 93 |     ---------
 94 |     @param request:
 95 |     ---------
 96 |     @result: request / item / callback / None (the return value must be iterable)
 97 |     """
 98 | 
 99 |     yield request
100 |     yield lambda : self.update_task_state(request.task_id, -1, remark="failure reason") # update the task state to -1
101 | ```
102 | 
103 | Because the failure case additionally updates the remark field, unlike the success case which only updates the state field, this update has to be split out and done with `update_task_state`
104 | 
105 | ### 3. Getting the batch date
106 | 
107 | Example:
108 | 
109 |     def parse(self, request, response):
110 |         item = SpiderDataItem()  # declare an item
111 |         item.batch_date = self.batch_date
112 |         item.title = title  # assign the item's attributes
113 |         yield item  # return the item; items are automatically stored in batches
114 |         
115 | Use `self.batch_date` to get the current batch date and attach it to the item before it is stored
116 | 
117 | Sample data
118 | 
119 | | id | title | batch_date |
120 | | --- | --- | --- |
121 | | 1 | 百度一下 | 2021-01-01 |


--------------------------------------------------------------------------------
/docs/source_code/Item.md:
--------------------------------------------------------------------------------
  1 | # Item
  2 | 
  3 | For an introduction to Item and how to create one, see the [command line tool](command/cmdline?id=_3-创建-item)
  4 | 
  5 | ## Storing Data
  6 | 
  7 | Data is stored automatically. Besides generating an item from a mysql table, you can also assign values to an Item directly, as in the following example:
  8 | 
  9 | ```
 10 | from feapder import Item
 11 | 
 12 | item = Item()
 13 | item.table_name = "spider_data" # table name
 14 | item.title = title
 15 | yield item
 16 | ```
 17 | 
 18 | This is equivalent to:
 19 |  
 20 | 1. Generating the item
 21 |     
 22 |     ```
 23 |     from feapder import Item
 24 |     
 25 |     class SpiderDataItem(Item):
 26 |         """
 27 |         This class was generated by feapder.
 28 |         command: feapder create -i spider_data.
 29 |         """
 30 |     
 31 |         def __init__(self, *args, **kwargs):
 32 |             # self.id = None
 33 |             self.title = None
 34 |     ```
 35 | 
 36 | 1. Using it
 37 | 
 38 |     ```         
 39 |     item = SpiderDataItem()
 40 |     item.title = title
 41 |     yield item
 42 |     ```
 43 |     
 44 | ## Item Fingerprint
 45 | 
 46 | The item fingerprint is used to deduplicate data before storage. By default it is the md5 of all field values, sorted; but when the data includes a crawl time, computing the fingerprint this way is clearly unreasonable. We can therefore specify the keys that take part in deduplication as follows
 47 | 
 48 | ```
 49 | from feapder import Item
 50 | 
 51 | 
 52 | class SpiderDataItem(Item):
 53 |     
 54 |     __unique_key__ = ["title", "url"] # the dedup keys are title and url; the final fingerprint is the md5 computed jointly from the title and url values
 55 | 
 56 |     def __init__(self, *args, **kwargs):
 57 |         # self.id = None
 58 |         self.title = None
 59 |         self.url = None
 60 |         self.crawl_time = None
 61 | ```
 62 | 
 63 | Alternatively, `__unique_key__` can be specified like this
 64 | 
 65 | ```
 66 | item = SpiderDataItem()
 67 | item.unique_key =  ["title", "url"] # list, tuple and string are supported
 68 | ```
 69 | 
 70 | Or override the fingerprint property
 71 | 
 72 | ```
 73 | from feapder import Item
 74 | 
 75 | 
 76 | class SpiderDataItem(Item):
 77 |     ...
 78 | 
 79 |     @property
 80 |     def fingerprint(self):
 81 |         return "I am the fingerprint"
 82 | ```
 83 | 
 84 | ## Processing an Item Before Storage
 85 | 
 86 | pre_to_db is a callback invoked for each item before it is stored; the data can be processed in this function
 87 | 
 88 | ```python
 89 | from feapder import Item
 90 | 
 91 | 
 92 | class SpiderDataItem(Item):
 93 | 
 94 |     def __init__(self, *args, **kwargs):
 95 |         # self.id = None
 96 |         self.title = None
 97 | 
 98 |     def pre_to_db(self):
 99 |         """
100 |         processing before storage
101 |         """
102 |         self.title = self.title.strip()
103 | ```
104 | 
105 | ## Updating Data
106 | 
107 | During crawling, some data is often missed or parsed incorrectly. If we want to update data that has already been stored, the Item can be converted into an UpdateItem
108 | 
109 |     item = SpiderDataItem.to_UpdateItem()
110 |     
111 | Or change the parent class directly
112 | 
113 | ```
114 | from feapder import Item, UpdateItem
115 | 
116 | class SpiderDataItem(UpdateItem):
117 |     ...
118 | ```
119 | 
120 | For details on using UpdateItem, see [UpdateItem](source_code/UpdateItem)
121 | 


--------------------------------------------------------------------------------
/docs/source_code/MongoDB.md:
--------------------------------------------------------------------------------
  1 | # MongoDB
  2 | 
  3 | ## Storing Data in Mongo Automatically
  4 | 
  5 | - To store data in `MongoDb`, the `MongoPipeline` must be used
  6 | 
  7 | Example:
  8 | 
  9 | ```python
 10 | import feapder
 11 | from feapder import Item
 12 | 
 13 | 
 14 | class TestMongo(feapder.AirSpider):
 15 |     __custom_setting__ = dict(
 16 |         ITEM_PIPELINES=["feapder.pipelines.mongo_pipeline.MongoPipeline"],
 17 |         MONGO_IP="localhost",
 18 |         MONGO_PORT=27017,
 19 |         MONGO_DB="feapder",
 20 |         MONGO_USER_NAME="",
 21 |         MONGO_USER_PASS="",
 22 |     )
 23 | 
 24 |     def start_requests(self):
 25 |         yield feapder.Request("https://www.baidu.com")
 26 | 
 27 |     def parse(self, request, response):
 28 |         title = response.xpath("//title/text()").extract_first()  # get the title
 29 |         item = Item()  # declare an item
 30 |         item.table_name = "test_mongo" # specify the collection to store into
 31 |         item.title = title  # assign the item's attributes
 32 |         yield item  # return the item; items are automatically stored in batches
 33 | 
 34 | 
 35 | if __name__ == "__main__":
 36 |     TestMongo().start()
 37 | ```
 38 | 
 39 | 
 40 | ## Direct Use
 41 | 
 42 | ### Connecting
 43 | 
 44 | ```python
 45 | from feapder.db.mongodb import MongoDB
 46 | 
 47 | 
 48 | db = MongoDB(
 49 |     ip="localhost", port=27017, db="feapder", user_name="feapder", user_pass="feapder123"
 50 | )
 51 | ```
 52 | 
 53 | If the database connection is configured in environment variables or in setting, the arguments can be omitted
 54 | 
 55 | ```python
 56 | db = MongoDB()
 57 | ```
 58 |     
 59 | 或者可以根据url连接
 60 | 
 61 | ```python
 62 | db = MongoDB.from_url("mongodb://username:password@ip:port/db")
 63 | ```
 64 |     
 65 | ### 方法
 66 | 
 67 | > MongoDB封装了增删改查等方法,方便使用
 68 | 
 69 | #### 查
 70 | 
 71 | ```python
 72 | def find(self, table, limit=0) -> List[Dict]:
 73 |     """
 74 |     @summary:
 75 |     无数据: 返回 []
 76 |     有数据: 返回 [{...}, {...}] 形式的列表
 77 |             limit 用于限制返回条数,0 表示不限制
 78 |     ---------
 79 |     @param table:
 80 |     @param limit:
 81 |     ---------
 82 |     @result:
 83 |     """
 84 | ```
 85 |     
 86 | 
 87 | #### 增
 88 | 
 89 | ```python
 90 | def add(self, table, data, **kwargs):
 91 |     """
 92 | 
 93 |     Args:
 94 |         table:
 95 |         data:
 96 |         kwargs:
 97 |             auto_update: 覆盖更新,将替换唯一索引重复的数据,默认False
 98 |             update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"]
 99 |             insert_ignore: 唯一索引冲突时是否忽略,默认为False
100 |             condition_fields: 用于条件查找的字段,默认以`_id`作为查找条件,默认:['_id']
101 |             exception_callfunc: 异常回调
102 | 
103 |     Returns: 添加行数
104 | 
105 |     """
106 | ```
107 | 
108 | ```python
109 | def add_batch(self, table: str, datas: List[Dict], **kwargs):
110 |     """
111 |     @summary: 批量添加数据
112 |     ---------
113 |     @param table: 表名
114 |     @param datas: 列表 [{...}, {...}]
115 |     @param **kwargs:
116 |         auto_update: 覆盖更新,将替换唯一索引重复的数据,默认False
117 |         update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"]
118 |         update_columns_value: 指定更新的字段对应的值
119 |         condition_fields: 用于条件查找的字段,默认以`_id`作为查找条件,默认:['_id']
120 |     ---------
121 |     @result: 添加行数
122 |     """
123 | ```
124 | 
125 | #### 更新
126 | 
127 | ```python
128 | def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
129 |     """
130 |     更新
131 |     Args:
132 |         coll_name: 集合名
133 |         data: 单条数据 {"xxx":"xxx"}
134 |         condition: 更新条件 {"_id": "xxxx"}
135 |         upsert: 数据不存在则插入,默认为 False
136 | 
137 |     Returns: True / False
138 |     """
139 | ```
140 | 
141 | #### 删除
142 | 
143 | ```python
144 | def delete(self, table, condition: Dict):
145 |     """
146 |     删除
147 |     Args:
148 |         table:
149 |         condition: 查找条件
150 |     Returns: True / False
151 |     """
152 | ```
153 | 
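
#### 使用示意

结合上面的方法,给出一个最小的使用示意(表名 `test_mongo` 与数据均为假设,连接信息取自 setting):

```python
from feapder.db.mongodb import MongoDB

db = MongoDB()  # 假设 setting 中已配置 MongoDB 连接信息

# 增:插入一条数据,唯一索引冲突时忽略
db.add("test_mongo", {"_id": "1", "title": "feapder"}, insert_ignore=True)

# 查:最多返回 10 条数据
datas = db.find("test_mongo", limit=10)

# 改:按 _id 更新 title 字段,数据不存在则插入
db.update("test_mongo", data={"title": "feapder docs"}, condition={"_id": "1"}, upsert=True)

# 删:按条件删除
db.delete("test_mongo", condition={"_id": "1"})
```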


--------------------------------------------------------------------------------
/docs/source_code/MysqlDB.md:
--------------------------------------------------------------------------------
  1 | # MysqlDB
  2 | 
  3 | MysqlDB具有断开自动重连特性,支持多线程下操作,内置连接池,最大连接数100
  4 | 
  5 | ## 连接
  6 | 
  7 | ```python
  8 | from feapder.db.mysqldb import MysqlDB
  9 | 
 10 | 
 11 | db = MysqlDB(
 12 |     ip="localhost", port=3306, db="feapder", user_name="feapder", user_pass="feapder123"
 13 | )
 14 | ```
 15 | 
 16 | 若环境变量中配置了数据库连接方式或者setting中已配置,则可不传参 
 17 | 
 18 | ```python
 19 | db = MysqlDB()
 20 | ```
 21 |     
 22 | 或者可以根据url连接
 23 | 
 24 | ```python
 25 | db = MysqlDB.from_url("mysql://username:password@ip:port/db?charset=utf8mb4")
 26 | ```
 27 |     
 28 | ## 方法
 29 | 
 30 | > MysqlDB封装了增删改查等方法,方便使用
 31 | 
 32 | ### 查
 33 | 
 34 | ```python
 35 | def find(self, sql, limit=0, to_json=False):
 36 |     """
 37 |     @summary:
 38 |     无数据: 返回()
 39 |     有数据: 若 limit == 1 则返回单条数据,如 (data1, data2)
 40 |             否则返回多条数据组成的元组,如 ((data1, data2), ...)
 41 |     ---------
 42 |     @param sql:
 43 |     @param limit:
 44 |     @param to_json 是否将查询结果转为json
 45 |     ---------
 46 |     @result:
 47 |     """
 48 | ```
 49 |     
 50 | 
 51 | ### 增
 52 | 
 53 | ```python
 54 | def add(self, sql, exception_callfunc=None):
 55 |     """
 56 |     Args:
 57 |         sql:
 58 |         exception_callfunc: 异常回调
 59 | 
 60 |     Returns:添加行数
 61 | 
 62 |     """
 63 | ```
 64 | 
 65 | ```python
 66 | def add_smart(self, table, data: Dict, **kwargs):
 67 |     """
 68 |     添加数据, 直接传递json格式的数据,不用拼sql
 69 |     Args:
 70 |         table: 表名
 71 |         data: 字典 {"xxx":"xxx"}
 72 |         **kwargs:
 73 | 
 74 |     Returns:添加行数
 75 | 
 76 |     """
 77 | ```
 78 | 
 79 | 
 80 | ```python
 81 | def add_batch(self, sql, datas: List[Dict]):
 82 |     """
 83 |     @summary: 批量添加数据
 84 |     ---------
 85 |     @param sql: 如 insert ignore into <table> (xxx, xxx) values (%s, %s)
 86 |     @param datas: 列表 [{}, {}, {}]
 87 |     ---------
 88 |     @result:添加行数
 89 |     """
 90 | ```
 91 | 
 92 | ```python
 93 | def add_batch_smart(self, table, datas: List[Dict], **kwargs):
 94 |     """
 95 |     批量添加数据, 直接传递list格式的数据,不用拼sql
 96 |     Args:
 97 |         table: 表名
 98 |         datas: 列表 [{}, {}, {}]
 99 |         **kwargs:
100 | 
101 |     Returns: 添加行数
102 | 
103 |     """
104 | ```
105 | 
106 | ### 更新
107 | 
108 | ```python
109 | def update(self, sql):
110 |     pass
111 | ```
112 | 
113 | ```python
114 | def update_smart(self, table, data: Dict, condition):
115 |     """
116 |     更新, 不用拼sql
117 |     Args:
118 |         table: 表名 
119 |         data: 数据 {"xxx":"xxx"}
120 |         condition: 更新条件 where后面的条件,如 condition='status=1'
121 | 
122 |     Returns: True / False
123 |     
124 |     """
125 | ```
126 | 
127 | ### 删除
128 | 
129 | ```python
130 | def delete(self, sql):
131 |     """
132 |     删除
133 |     Args:
134 |         sql: 
135 | 
136 |     Returns: True / False
137 | 
138 |     """
139 | ```
140 | 
141 | ### 执行其他sql
142 | 
143 | ```python
144 | def execute(self, sql):
145 |     pass
146 | ```
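
### 使用示意

结合上面的方法,给出一个最小的使用示意(表 `spider_data` 为假设,需提前建表,连接信息取自 setting):

```python
from feapder.db.mysqldb import MysqlDB

db = MysqlDB()  # 假设 setting 中已配置 MySQL 连接信息

# 增:直接传字典,不用拼 sql
db.add_smart("spider_data", {"title": "feapder"})

# 查:limit=1 时返回单条数据
data = db.find("select id, title from spider_data limit 1", limit=1)

# 改:不用拼 sql
db.update_smart("spider_data", data={"title": "feapder docs"}, condition="id=1")

# 删
db.delete("delete from spider_data where id=1")
```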


--------------------------------------------------------------------------------
/docs/source_code/RedisDB.md:
--------------------------------------------------------------------------------
 1 | # RedisDB
 2 | 
 3 | RedisDB支持**哨兵模式**、**集群模式**与单节点的**普通模式**,封装了操作redis的常用的方法
 4 | 
 5 | ## 连接
 6 | 
 7 | > 若环境变量中配置了数据库连接方式或者setting中已配置,则可不传参 
 8 | 
 9 | ### 普通模式
10 | 
11 | ```python
12 | from feapder.db.redisdb import RedisDB
13 | 
14 | db = RedisDB(ip_ports="localhost:6379", db=0, user_pass=None)
15 | ```
16 | 
17 | 使用地址连接
18 | 
19 | ```python
20 | from feapder.db.redisdb import RedisDB
21 | 
22 | db = RedisDB.from_url("redis://[[username]:[password]]@[host]:[port]/[db]")
23 | ```
24 | 
25 | ### 哨兵模式
26 | 
27 | ```python
28 | from feapder.db.redisdb import RedisDB
29 | 
30 | db = RedisDB(ip_ports="172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379", db=0, user_pass=None, service_name="my_master")
31 | ```
32 | 
33 | 注意:多个地址用逗号分隔,需传递`service_name`
34 | 
35 | 对应setting配置文件,配置方式为:
36 | 
37 | ```python
38 | REDISDB_IP_PORTS = "172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379"
39 | REDISDB_USER_PASS = ""
40 | REDISDB_DB = 0
41 | REDISDB_SERVICE_NAME = "my_master"
42 | ```
43 | 
44 | ### 集群模式
45 | 
46 | ```python
47 | from feapder.db.redisdb import RedisDB
48 | 
49 | db = RedisDB(ip_ports="172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379", db=0, user_pass=None)
50 | ```
51 | 
52 | 注意:多个地址用逗号分隔,不用传递`service_name`
53 | 
54 | 对应setting配置文件,配置方式为:
55 | 
56 | ```python
57 | REDISDB_IP_PORTS = "172.25.21.4:26379,172.25.21.5:26379,172.25.21.6:26379"
58 | REDISDB_USER_PASS = ""
59 | REDISDB_DB = 0
60 | ```
61 | 
62 | ## 方法
63 | 
64 | 详见源码,此处不一一列举。源码位置:`feapder.db.redisdb`
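
下面给出一个简单的使用示意(键名为假设,`zadd`、`zexists` 等方法的用法也可参考框架源码,如 `feapder/dedup/expirefilter.py`):

```python
from feapder.db.redisdb import RedisDB

db = RedisDB()  # 假设 setting 中已配置 redis 连接信息

# 有序集合:添加成员(score 为 1),再检查成员是否存在
db.zadd("test:zset", "task_1", 1)
print(db.zexists("test:zset", "task_1"))  # 返回真值表示已存在
```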


--------------------------------------------------------------------------------
/docs/source_code/UpdateItem.md:
--------------------------------------------------------------------------------
 1 | # UpdateItem
 2 | 
 3 | UpdateItem用于更新数据,继承自Item,使用方式与Item基本一致,下面只说明不同之处
 4 | 
 5 | ## 更新逻辑
 6 | 
 7 | 更新逻辑借助了数据库的唯一索引,即插入数据时发现数据已存在,则更新。因此要求数据表必须存在唯一索引,才能使用UpdateItem
 8 | 
 9 | 比如将title设置唯一,要求每条数据的title都不能重复
10 | 
11 | ![-w781](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/16158245077159.jpg)
12 | 
13 | 或联合索引,要求title与url不能同时重复
14 | 
15 | ![-w761](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/16/16158245648750.jpg)
16 | 
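唯一索引也可以用代码创建,例如借助 `MysqlDB` 的 `execute` 方法(表名与字段均为假设):

```python
from feapder.db.mysqldb import MysqlDB

db = MysqlDB()  # 假设 setting 中已配置 MySQL 连接信息

# 为 title 字段添加唯一索引(假设表 spider_data 已存在)
db.execute("alter table spider_data add unique index uk_title (title)")
```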
17 | 
18 | ## 指定更新的字段
19 | 
20 | 方式1:指定`__update_key__`
21 | 
22 | ```python
23 | from feapder import UpdateItem
24 | 
25 | 
26 | class SpiderDataItem(UpdateItem):
27 |     
28 |     __update_key__ = ["title"] # 更新title字段
29 | 
30 |     def __init__(self, *args, **kwargs):
31 |         # self.id = None
32 |         self.title = None
33 |         self.url = None
34 | ```
35 | 
36 | 方式2:赋值`update_key`
37 | 
38 | ```python
39 | from feapder import UpdateItem
40 | 
41 | 
42 | class SpiderDataItem(UpdateItem):
43 | 
44 | 
45 |     def __init__(self, *args, **kwargs):
46 |         # self.id = None
47 |         self.title = None
48 |         self.url = None
49 | 
50 | item = SpiderDataItem()
51 | item.update_key = "title" # 支持列表、元组、字符串
52 | ```
53 | 
54 | 方式3:将普通的item转为UpdateItem,然后再指定更新的key
55 | 
56 | ```python
57 | from feapder import Item
58 | 
59 | 
60 | class SpiderDataItem(Item):
61 | 
62 | 
63 |     def __init__(self, *args, **kwargs):
64 |         # self.id = None
65 |         self.title = None
66 |         self.url = None
67 | 
68 | item = SpiderDataItem()
69 | item = item.to_UpdateItem()
70 | item.update_key = "title"
71 | ```
72 | 
73 | **推荐方式1,直接改Item类,不用修改爬虫代码**


--------------------------------------------------------------------------------
/docs/source_code/dedup.md:
--------------------------------------------------------------------------------
  1 | # Dedup
  2 | 
  3 | Dedup是feapder的大数据去重模块。不同于BloomFilter受槽位数量限制,Dedup使用了弹性的去重机制,可容纳海量数据的去重。
  4 | 
  5 | 
  6 | ## 去重方式
  7 | 
  8 | ### 临时去重
  9 | 
 10 | > 基于redis,支持批量,去重有时效性。去重一万条数据约0.26秒,一亿条数据占用内存约1.43G
 11 | 
 12 | ```python
 13 | from feapder.dedup import Dedup
 14 | 
 15 | data = {"xxx": 123, "xxxx": "xxxx"}
 16 | datas = ["xxx", "bbb"]
 17 | 
 18 | def test_ExpireFilter():
 19 |     dedup = Dedup(
 20 |         Dedup.ExpireFilter, expire_time=10, redis_url="redis://@localhost:6379/0"
 21 |     )
 22 | 
 23 |     # 逐条去重
 24 |     assert dedup.add(data) == 1
 25 |     assert dedup.get(data) == 1
 26 | 
 27 |     # 批量去重
 28 |     assert dedup.add(datas) == [1, 1]
 29 |     assert dedup.get(datas) == [1, 1]
 30 | ```
 31 | 
 32 | 
 33 | ### 内存去重
 34 | 
 35 | > 基于内存,支持批量。去重一万条数据约0.5秒,一亿条数据占用内存约285MB
 36 | 
 37 | ```python
 38 | from feapder.dedup import Dedup
 39 | 
 40 | data = {"xxx": 123, "xxxx": "xxxx"}
 41 | datas = ["xxx", "bbb"]
 42 | 
 43 | def test_MemoryFilter():
 44 |     dedup = Dedup(Dedup.MemoryFilter)  # 基于内存去重,进程退出后失效
 45 | 
 46 |     # 逐条去重
 47 |     assert dedup.add(data) == 1
 48 |     assert dedup.get(data) == 1
 49 | 
 50 |     # 批量去重
 51 |     assert dedup.add(datas) == [1, 1]
 52 |     assert dedup.get(datas) == [1, 1]
 53 | ```
 54 | 
 55 | ### 永久去重
 56 | 
 57 | > 基于redis,支持批量,永久去重。 去重一万条数据约3.5秒,一亿条数据占用内存约285MB
 58 | 
 59 | ```python
 60 | from feapder.dedup import Dedup
 61 | 
data = {"xxx": 123, "xxxx": "xxxx"}
datas = ["xxx", "bbb"]

 62 | def test_BloomFilter():
 63 |     dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0")
 64 | 
 65 |     # 逐条去重
 66 |     assert dedup.add(data) == 1
 67 |     assert dedup.get(data) == 1
 68 | 
 69 |     # 批量去重
 70 |     assert dedup.add(datas) == [1, 1]
 71 |     assert dedup.get(datas) == [1, 1]
 72 | ```
 73 | 
 74 | ## 过滤数据
 75 | 
 76 | Dedup可以通过如下方法,过滤掉已存在的数据
 77 | 
 78 | 
 79 | ```python
 80 | from feapder.dedup import Dedup
 81 | 
 82 | def test_filter():
 83 |     dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0")
 84 | 
 85 |     # 制造已存在数据
 86 |     datas = ["xxx", "bbb"]
 87 |     dedup.add(datas)
 88 | 
 89 |     # 过滤掉已存在数据 "xxx", "bbb"
 90 |     datas = ["xxx", "bbb", "ccc"]
 91 |     dedup.filter_exist_data(datas)
 92 |     assert datas == ["ccc"]
 93 | ```
 94 | 
 95 | ## Dedup参数
 96 | 
 97 | - **filter_type**:去重类型,支持BloomFilter、MemoryFilter、ExpireFilter三种
 98 | - **redis_url**:不是必须传递的,若项目中存在setting.py文件,且已配置redis连接方式,则可以不传递redis_url
 99 | 
100 |     ![-w294](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/07/16151133801599.jpg)
101 | 
102 |     ```python
103 |     import feapder
104 |     from feapder.dedup import Dedup
105 | 
106 |     class TestSpider(feapder.Spider):
107 |         def __init__(self, *args, **kwargs):
108 |             self.dedup = Dedup() # 默认是永久去重
109 |     ```
110 | 
111 | - **name**:过滤器名称,默认会以dedup作为前缀,如 `dedup:expire_set:[name]` 或 `dedup:bloomfilter:[name]`。默认情况下,ExpireFilter 的 name 为过期时间,BloomFilter 的 name 为 bloomfilter,即 `dedup:bloomfilter:bloomfilter`
112 | 
113 |  ![-w499](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/07/16151136442498.jpg)
114 | 
115 |  若对不同数据源去重,可通过name参数来指定不同去重库
116 | 
117 | - **absolute_name**:过滤器绝对名称 不会加dedup前缀
118 | - **expire_time**:ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
119 | - **error_rate**:BloomFilter/MemoryFilter的误判率 默认为0.00001
120 | - **to_md5**:去重前是否将数据转为MD5,默认是
121 | 
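下面是一个按参数构造过滤器的示意(name 与过期时间均为假设值):

```python
from feapder.dedup import Dedup

dedup = Dedup(
    Dedup.ExpireFilter,
    name="news",  # 实际的 redis key 为 dedup:expire_set:news
    expire_time=60 * 60 * 24 * 7,  # 7 天有效期,单位为秒
    redis_url="redis://@localhost:6379/0",
)
```
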
122 | ## 爬虫中使用
123 | 
124 | 框架支持对请求和入库的数据进行去重,仅需要在[配置文件](source_code/配置文件)中进行配置即可
125 | 
126 | ```python
127 | ITEM_FILTER_ENABLE = False  # item 去重,开启则设为 True
128 | REQUEST_FILTER_ENABLE = False  # request 去重,开启则设为 True
129 | ```
130 | 
131 | 或者可以直接导入此去重模块使用
132 | 
133 | ```python
134 | from feapder.dedup import Dedup
135 | ```
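
例如在爬虫中手动判重(仅为示意,默认为永久去重,redis 连接取自 setting):

```python
import feapder
from feapder.dedup import Dedup


class TestSpider(feapder.AirSpider):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dedup = Dedup()  # 默认为 BloomFilter 永久去重

    def start_requests(self):
        yield feapder.Request("https://www.baidu.com")

    def parse(self, request, response):
        title = response.xpath("//title/text()").extract_first()
        if not self.dedup.add(title):  # 返回 0 说明 title 已存在,跳过
            return
        print("新数据:", title)


if __name__ == "__main__":
    TestSpider().start()
```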
136 | 
137 | 


--------------------------------------------------------------------------------
/docs/source_code/logger.md:
--------------------------------------------------------------------------------
 1 | # 日志配置及使用
 2 | 
 3 | ## 日志配置
 4 | 
 5 | 见配置文件,相关配置如下:
 6 | 
 7 | ```python
 8 | LOG_NAME = os.path.basename(os.getcwd())
 9 | LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
10 | LOG_LEVEL = "DEBUG"
11 | LOG_COLOR = True  # 是否带有颜色
12 | LOG_IS_WRITE_TO_CONSOLE = True  # 是否打印到控制台
13 | LOG_IS_WRITE_TO_FILE = False  # 是否写文件
14 | LOG_MODE = "w"  # 写文件的模式
15 | LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
16 | LOG_BACKUP_COUNT = 20  # 日志文件保留数量
17 | LOG_ENCODING = "utf8"  # 日志文件编码
18 | OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
19 | ```
20 | 
21 | 框架屏蔽了requests、selenium等一些第三方库的日志,OTHERS_LOG_LEVAL是用来控制这些第三方库日志等级的。
22 | 
23 | ## 使用日志工具
24 | 
25 | 
26 | ```python
27 | from feapder.utils.log import log
28 | 
29 | log.debug("xxx")
30 | log.info("xxx")
31 | log.warning("xxx")
32 | log.error("xxx")
33 | log.critical("xxx")
34 | ```
35 | 
36 | 默认是带有颜色的日志:
37 | 
38 | ![-w583](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/08/06/16282311862710.jpg)
39 | 
40 | 日志等级:CRITICAL > ERROR > WARNING > INFO > DEBUG
41 | 


--------------------------------------------------------------------------------
/docs/source_code/pipeline.md:
--------------------------------------------------------------------------------
 1 | # Pipeline
 2 | 
 3 | Pipeline是数据入库时流经的管道,用户可自定义,以便对接其他数据库。
 4 | 
 5 | 框架已内置mysql及mongo管道,其他管道作为扩展方式提供,可从[feapder_pipelines](https://github.com/Boris-code/feapder_pipelines)项目中按需安装
 6 | 
 7 | 项目地址:https://github.com/Boris-code/feapder_pipelines
 8 | 
 9 | ## 使用方式
10 | 
11 | 注:item会被聚合成多条一起流经pipeline,方便批量入库
12 | 
13 | ### 1. 编写pipeline
14 | 
15 | ```python
16 | from feapder.pipelines import BasePipeline
17 | from typing import Dict, List, Tuple
18 | 
19 | 
20 | class Pipeline(BasePipeline):
21 |     """
22 |     pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等
23 |     """
24 | 
25 |     def save_items(self, table, items: List[Dict]) -> bool:
26 |         """
27 |         保存数据
28 |         Args:
29 |             table: 表名
30 |             items: 数据,[{},{},...]
31 | 
32 |         Returns: 是否保存成功 True / False
33 |                  若False,不会将本批数据入到去重库,以便再次入库
34 | 
35 |         """
36 | 
37 |         print("自定义pipeline, 保存数据 >>>>", table, items)
38 | 
39 |         return True
40 | 
41 |     def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
42 |         """
43 |         更新数据, 与UpdateItem配合使用,若爬虫中没使用UpdateItem,则可不实现此接口
44 |         Args:
45 |             table: 表名
46 |             items: 数据,[{},{},...]
47 |             update_keys: 更新的字段, 如 ("title", "publish_time")
48 | 
49 |         Returns: 是否更新成功 True / False
50 |                  若False,不会将本批数据入到去重库,以便再次入库
51 | 
52 |         """
53 | 
54 |         print("自定义pipeline, 更新数据 >>>>", table, items, update_keys)
55 | 
56 |         return True
57 | ```
58 | 
59 | `Pipeline`需继承`BasePipeline`,类名和存放位置随意,需要实现`save_items`接口。一定要有返回值,返回`False`表示数据没保存成功,会触发重试逻辑
60 | 
61 | `update_items`接口与`UpdateItem`配合使用,更新数据时使用,若爬虫中没使用UpdateItem,则可不实现此接口
62 | 
63 | ### 2. 编写配置文件
64 | 
65 | ```python
66 | # 数据入库的pipeline,支持多个
67 | ITEM_PIPELINES = [
68 |     "pipeline.Pipeline"
69 | ]
70 | ``` 
71 | 
72 | 将编写好的pipeline配置进来,值为类的模块路径,需要指定到具体的类名
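
pipeline 支持配置多个,数据会流经每个配置的管道。例如同时使用内置的 mongo 管道(路径见 MongoDB 文档)与上面的自定义管道:

```python
ITEM_PIPELINES = [
    "feapder.pipelines.mongo_pipeline.MongoPipeline",
    "pipeline.Pipeline",
]
```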
73 | 
74 | ## 示例
75 | 
76 | 地址:https://github.com/Boris-code/feapder/tree/master/tests/test-pipeline
77 | 


--------------------------------------------------------------------------------
/docs/source_code/proxy.md:
--------------------------------------------------------------------------------
  1 | # 代理使用说明
  2 | 
  3 | 代理使用有三种方式
  4 | 1. 使用框架内置代理池
  5 | 2. 自定义代理池
  6 | 3. 请求中直接指定
  7 | 
  8 | ## 方式1. 使用框架内置代理池
  9 | 
 10 | ### 配置代理
 11 | 
 12 | 在配置文件中配置代理提取接口
 13 | 
 14 | ```python
 15 | # 设置代理
 16 | PROXY_EXTRACT_API = None  # 代理提取API,返回的代理分隔符为\r\n
 17 | PROXY_ENABLE = True
 18 | PROXY_MAX_FAILED_TIMES = 5  # 代理最大失败次数,超过则不使用,自动删除
 19 | ```
 20 | 
 21 | 要求API返回的代理使用 \r\n 分隔,格式如下:
 22 | 
 23 | ```
 24 | ip:port
 25 | ip:port
 26 | ip:port
 27 | ```
 28 | 
 29 | 这样feapder在发起请求时便会自动随机使用上面的代理
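
一个填好的配置示意(代理提取接口地址为假设值):

```python
PROXY_EXTRACT_API = "http://proxy.example.com/extract?num=10"  # 假设的代理提取接口
PROXY_ENABLE = True
PROXY_MAX_FAILED_TIMES = 5
```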
 30 | 
 31 | ### 管理代理
 32 | 
 33 | 1. 删除代理(默认是请求异常连续5次,再删除代理)
 34 | 
 35 |     例如在发生异常时删除代理
 36 |     
 37 |     ```python
 38 |     import feapder
 39 |     class TestProxy(feapder.AirSpider):
 40 |         def start_requests(self):
 41 |             yield feapder.Request("https://www.baidu.com")
 42 |         
 43 |         def parse(self, request, response):
 44 |             print(response)
 45 |         
 46 |         def exception_request(self, request, response):
 47 |             request.del_proxy()
 48 |             
 49 |     ```
 50 |     
 51 | ## 方式2. 自定义代理池
 52 | 
 53 | 1. 编写代理池:例如在你的项目下创建一个my_proxypool.py,实现下面的函数(完整的实现示意见本节末尾)
 54 |     
 55 |     ```python
 56 |     from feapder.network.proxy_pool import BaseProxyPool 
 57 |         
 58 |     class MyProxyPool(BaseProxyPool):
 59 |         def get_proxy(self):
 60 |             """
 61 |             获取代理
 62 |             Returns:
 63 |                 {"http": "xxx", "https": "xxx"}
 64 |             """
 65 |             pass
 66 |         
 67 |         def del_proxy(self, proxy):
 68 |             """
 69 |             @summary: 删除代理
 70 |             ---------
 71 |             @param proxy: xxx
 72 |             """
 73 |             pass
 74 |     ```
 75 | 
 76 | 2. 修改setting的代理配置
 77 | 
 78 |     ```python
 79 |     PROXY_POOL = "my_proxypool.MyProxyPool"  # 代理池
 80 |     ```
 81 |     
 82 |     将编写好的代理池配置进来,值为类的模块路径,需要指定到具体的类名
 83 |  
 84 | 
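下面是一个最简单的实现示意(提取接口地址与返回格式均为假设,且假设环境中已安装 requests):

```python
import requests

from feapder.network.proxy_pool import BaseProxyPool


class MyProxyPool(BaseProxyPool):
    def get_proxy(self):
        # 假设接口每次返回一个 ip:port 文本
        proxy = requests.get("http://proxy.example.com/get").text.strip()
        return {"http": f"http://{proxy}", "https": f"http://{proxy}"}

    def del_proxy(self, proxy):
        # 本示意中的代理即取即用,无需删除逻辑
        pass
```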
 85 | 
 86 | ## 方式3. 不使用代理池,直接给请求指定代理
 87 | 
 88 | 直接给request.proxies赋值即可,例如在下载中间件里使用
 89 | 
 90 | ```python
 91 | import feapder
 92 | 
 93 | class TestProxy(feapder.AirSpider):
 94 |     def start_requests(self):
 95 |         yield feapder.Request("https://www.baidu.com")
 96 |         
 97 |     def download_midware(self, request):
 98 |         # 在这里给请求指定代理即可
 99 |         request.proxies = {"https": "https://ip:port", "http": "http://ip:port"} 
100 |         return request
101 | 
102 |     def parse(self, request, response):
103 |         print(response)
104 | ```


--------------------------------------------------------------------------------
/docs/source_code/tools.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # tools
 3 | 
 4 | `feapder.utils.tools`里封装了爬虫中常用的函数,目前共计**129**个,可通过阅读源码了解使用
 5 | 
 6 | ## 举例
 7 | 
 8 | ### 时间格式化
 9 | 
10 | ```python
11 | from feapder.utils import tools
12 | 
13 | time = "昨天"
14 | 
15 | date = tools.format_time(time)
16 | assert date == "2021-03-15 00:00:00"  # 假设今天是 2021-03-16
17 | ```
18 | 
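再如,获取当前日期与格式化输出 json(这两个函数在框架源码中有使用,输出仅为示意):

```python
from feapder.utils import tools

print(tools.get_current_date())  # 形如 2021-03-16 10:00:00
print(tools.dumps_json({"name": "feapder"}))  # 格式化输出 json 字符串
```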


--------------------------------------------------------------------------------
/docs/source_code/报警及监控.md:
--------------------------------------------------------------------------------
  1 | # 报警及监控
  2 | 
  3 | 支持钉钉、飞书、企业微信、邮件报警
  4 | 
  5 | ## 钉钉报警
  6 | 
  7 | 条件:需要有钉钉群,需要获取钉钉机器人的Webhook地址
  8 | 
  9 | 获取方式参考官方文档:https://developers.dingtalk.com/document/app/custom-robot-access
 10 | 
 11 | 安全设置选择自定义关键词,填入**feapder**
 12 | 
 13 | ![-w547](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167753030324.jpg)
 14 | 
 15 | 或使用加签方式,然后在setting中设置密钥
 16 | 
 17 | 相关配置:
 18 | 
 19 | ```python
 20 | # 钉钉报警
 21 | DINGDING_WARNING_URL = ""  # 钉钉机器人api
 22 | DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
 23 | DINGDING_WARNING_ALL = False  # 是否提示所有人, 默认为False
 24 | DINGDING_WARNING_SECRET = None  # 加签密钥
 25 | ```
 26 | 
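一个填好的配置示意(webhook 与手机号均为占位值):

```python
DINGDING_WARNING_URL = "https://oapi.dingtalk.com/robot/send?access_token=xxx"
DINGDING_WARNING_PHONE = ["13800000000"]  # 支持列表,可指定多个
DINGDING_WARNING_ALL = False
DINGDING_WARNING_SECRET = None  # 若使用加签方式,在此填入密钥
```
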
 27 | ## 企业微信报警
 28 | 
 29 | 条件:需要企业微信群,并获取企业微信机器人的Webhook地址
 30 | 
 31 | 获取方式:https://weibanzhushou.com/blog/330
 32 | 
 33 | 报警简介:
 34 | 
 35 | - 仅支持文本模式
 36 | - 当用户手机号码为空字符串或`WECHAT_WARNING_ALL`为`True`时将会`@全体成员`
 37 | 
 38 | 
 39 | 相关设置:
 40 | 
 41 | ```python
 42 | # 企业微信报警
 43 | WECHAT_WARNING_URL = ""  # 企业微信机器人api
 44 | WECHAT_WARNING_PHONE = ""  # 报警人 将会在群内@此人, 支持列表,可指定多人
 45 | WECHAT_WARNING_ALL = False  # 是否提示所有人, 默认为False
 46 | ```
 47 | 
 48 | ## 飞书报警
 49 | 
 50 | 可参考文档设置机器人:https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN#e1cdee9f
 51 | 
 52 | 然后在feapder的setting文件中修改如下配置
 53 | 
 54 | ```python
 55 | FEISHU_WARNING_URL = ""  # 飞书机器人api
 56 | FEISHU_WARNING_USER = None  # 报警人 {"open_id":"ou_xxxxx", "name":"xxxx"} 或 [{"open_id":"ou_xxxxx", "name":"xxxx"}]
 57 | FEISHU_WARNING_ALL = False  # 是否提示所有人, 默认为False
 58 | ```
 59 | 
 60 | ## 邮件报警
 61 | 
 62 | 相关配置:
 63 | 
 64 | ```python
 65 | # 邮件报警
 66 | EMAIL_SENDER = ""  # 发件人
 67 | EMAIL_PASSWORD = ""  # 授权码
 68 | EMAIL_RECEIVER = "" # 收件人 支持列表,可指定多个
 69 | ```
 70 | 
 71 | 邮件报警目前支持163邮箱作为发送者,`EMAIL_SENDER`为邮箱账号,如`feapder@163.com`, `EMAIL_PASSWORD`为授权码,不是登录密码,获取授权码的流程如下:
 72 | 
 73 | 1. 设置 -> POP3/SMTP/IMAP
 74 | 
 75 |     ![-w258](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167719328720.jpg)
 76 | 
 77 | 2. 开启SMTP服务
 78 | 
 79 |     ![-w444](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167719490656.jpg)
 80 |     
 81 |     开启后,会弹出授权码,该授权码即为EMAIL_PASSWORD
 82 |     
 83 | 3. 设置反垃圾规则为高级
 84 |     
 85 |     ![-w1112](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/03/27/16167719655644.jpg)
 86 | 
 87 | 4. 将本邮箱账号添加到白名单中
 88 | 
 89 | ## 报警间隔及报警级别
 90 | 
 91 | 框架会对相同的报警进行过滤,防止刷屏,默认的报警时间间隔为1小时,可通过以下配置修改:
 92 | 
 93 | ```python
 94 | WARNING_INTERVAL = 3600  # 相同报警的报警时间间隔,防止刷屏
 95 | WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / ERROR
 96 | ```
 97 | 
 98 | DEBUG级别的报警包含一些运行信息,ERROR级别的报警都是有问题的报警,需要及时处理
 99 | 
100 | 
101 | ## 可视化监控
102 | 
103 | 支持对爬虫运行情况进行监控,除了数据监控和请求监控外,用户还可自定义监控内容,详情参考[自定义监控](source_code/监控打点?id=自定义监控)
104 | 
105 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/14/16316112326191.jpg)
106 | 
107 | 需 feapder>=1.6.6, 需配合feaplat爬虫管理平台


--------------------------------------------------------------------------------
/docs/source_code/监控打点.md:
--------------------------------------------------------------------------------
 1 | # 监控打点
 2 | 
 3 | 需配合爬虫管理系统 **feaplat**
 4 | 
 5 | 监控数据默认保留180天,滚动删除
 6 | 
 7 | ## 爬虫中使用
 8 | 
 9 | > 需feapder>=1.6.6
10 | 
11 | feapder内置了监控打点,只需要部署到feaplat爬虫管理系统即可实现对请求和数据监控
12 | 
13 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/14/16316112326191.jpg)
14 | 
 15 | - 注意:只有使用 `yield item` 方式入库的数据,才能看到数据监控的指标。图表的title是表名,折线图展示了每个字段是否有值的情况以及数据总量(total count)
16 | 
17 | - document为下载情况
18 | 
19 | 若想监控一些其他的指标,可参考下面的自定义监控:
20 | 
21 | 
22 | ## 自定义监控
23 | 
24 | 举例:编写`test_metrics.py`代码如下:
25 | 
26 | ```python
27 | from feapder.utils import metrics
28 | 
29 | # 初始化打点系统
30 | metrics.init()
31 | 
32 | metrics.emit_counter("key", count=1, classify="test")
33 | 
34 | metrics.close()
35 | ```
36 | 
37 | 部署到feaplat:
38 | 
39 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315065474223.jpg)
40 | 
41 | 查看监控:
42 | 
43 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315067391666.jpg)
44 | 
45 | 再来解释一下上面示例中的这行代码:
46 | ```python
47 | metrics.emit_counter("key", count=1, classify="test")
48 | ```
49 | - key 对应上图中的折线
50 | - count 对应上图中的点数
51 | - classify 对应上图中的图表标题
52 | 
53 | 若代码如下:
54 | ```python
55 | from feapder.utils import metrics
56 | 
57 | # 初始化打点系统
58 | metrics.init()
59 | 
60 | metrics.emit_counter("key", count=1, classify="test")
61 | metrics.emit_counter("key2", count=1, classify="test")
62 | metrics.emit_counter("key3", count=1, classify="test")
63 | 
64 | metrics.emit_counter("哈哈", count=1, classify="test2")
65 | 
66 | metrics.close()
67 | ```
68 | 
69 | 按预期应生成两张图表,其中第一张图表有3条折线,实际生成如下:
70 | 
71 | ![](http://markdown-media.oss-cn-beijing.aliyuncs.com/2021/09/13/16315071385604.jpg)
72 | 
73 | 
74 | 如在feapder爬虫中使用,示例如下:
75 | 
76 | ```python
77 | import feapder
78 | from feapder.utils import metrics
79 | 
80 | 
81 | class TestSpider(feapder.AirSpider):
82 |     def start_requests(self):
83 |         yield feapder.Request("https://www.baidu.com")
84 | 
85 |     def parse(self, request, response):
86 |         # 自定义监控
87 |         metrics.emit_counter("success", count=1, classify="自定义的监控指标")
88 | 
89 | 
90 | if __name__ == "__main__":
91 |     TestSpider().start()
92 | ```
93 | 
94 | 在feapder爬虫中,我们只需要导包,然后调用`metrics.emit_counter`即可,不需要关心`metrics.init`和`metrics.close`;若在scrapy或其他python脚本中使用,则必须调用`metrics.init`和`metrics.close`
95 | 


--------------------------------------------------------------------------------
/docs/usage/使用前必读.md:
--------------------------------------------------------------------------------
 1 | # 使用前必读
 2 | 
 3 | ## 爬虫种类简介
 4 | 
 5 | feapder爬虫框架内置三种爬虫
 6 | 
 7 | 1. AirSpider - 轻量级爬虫
 8 | 2. Spider - 分布式爬虫
 9 | 3. BatchSpider - 分布式批次爬虫
10 | 
11 | **一、AirSpider :**
12 | 
13 | 轻量爬虫,学习成本低。面对一些数据量较少,无需断点续爬,无需分布式采集的需求,可采用此爬虫。
14 |  
15 | **二、Spider :**
16 | 
17 | 分布式爬虫,适用于海量数据采集,支持断点续爬、爬虫报警、数据自动入库等功能
18 | 
19 | 
20 | **三、BatchSpider**
21 | 
22 | 分布式批次爬虫,对于需要周期性采集的数据,优先考虑使用本爬虫。
23 | 
24 | 本爬虫会自动维护一个批次信息表,详细地记录了每个批次的时间、任务完成情况、批次周期等信息,示例数据如下
25 | ![-w899](http://markdown-media.oss-cn-beijing.aliyuncs.com/2020/12/20/16084680404224.jpg)
26 | 
27 | 另外本爬虫与其他爬虫最大的区别是,会维护批次时间信息,本批次未完成时,下一批次不会开始。
28 | 
29 | 举个例子
30 | 
31 | > 需求:每7天全量抓取一次商品价格信息。表结构需要包含每个批次信息
32 | 
33 | 表设计如下:
34 | 
35 | | 字段 | 说明 |
36 | | --- | --- |
37 | | id | 主键 |
38 | | item_id | 商品id |
39 | | price | 价格 |
40 | | crawl_time | 采集时间 |
41 | | batch_date | 批次时间 |
42 | 
43 | 数据示例
44 | 
45 | | id | item_id | price | crawl_time | batch_date |
46 | | --- | --- | --- | --- | --- |
47 | | 1 | 3213 | 99 | 2021-01-01 | 2021-01-01 |
48 | | 2 | 3214 | 90 | 2021-01-05 | 2021-01-01 |
49 | | 3 | 3213 | 95 | 2021-01-08 | 2021-01-08 |
50 | | 4 | 3214 | 92 | 2021-01-20| 2021-01-08 |
51 | 
52 | 从数据示例中可以看到
53 | - id(1,2) 两条数据虽然是不同天采集的,但都归属于2021-01-01这个批次。
54 | - id(3,4) 为7天后抓取的新一批数据,归属于2021-01-08这个批次。
55 | - id为4的数据,采集时间为20号,虽然已经超出了7天这个周期,但这是由采集超时等原因导致的,为了保证每个批次数据的完整性,仍会归属于2021-01-08这个批次。
56 | 
57 | BatchSpider爬虫会自动维护这个batch_date, 有了这个batch_date,方便业务做时序数据展示
58 | 
59 | 并且在采集过程中,可随时重启爬虫,若本批次还有剩余任务,会继续抓取,若本批次结束了,下一批次未到时,爬虫会自动退出
60 | 
61 | ## 学习路线
62 | 
63 | feapder虽然内置三种爬虫,但对于开发者暴露的接口一致。只需要继承不同的类即可,使用方式雷同。
64 | 
65 | 建议学习路线为 AirSpider->Spider->BatchSpider。因为后一个爬虫是基于前一个爬虫丰富而来的,与我们读书 小学->初中->高中这个路线类似
66 | 


--------------------------------------------------------------------------------
/feapder/VERSION:
--------------------------------------------------------------------------------
1 | 1.9.2


--------------------------------------------------------------------------------
/feapder/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/4/21 10:41 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | import os
11 | import re
12 | import sys
13 | 
14 | sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd()))
15 | 
16 | __all__ = [
17 |     "AirSpider",
18 |     "Spider",
19 |     "TaskSpider",
20 |     "BatchSpider",
21 |     "BaseParser",
22 |     "TaskParser",
23 |     "BatchParser",
24 |     "Request",
25 |     "Response",
26 |     "Item",
27 |     "UpdateItem",
28 |     "ArgumentParser",
29 | ]
30 | 
31 | from feapder.core.spiders import AirSpider, Spider, TaskSpider, BatchSpider
32 | from feapder.core.base_parser import BaseParser, TaskParser, BatchParser
33 | from feapder.network.request import Request
34 | from feapder.network.response import Response
35 | from feapder.network.item import Item, UpdateItem
36 | from feapder.utils.custom_argparse import ArgumentParser
37 | 


--------------------------------------------------------------------------------
/feapder/buffer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Created on 2020/4/23 12:09 AM
4 | ---------
5 | @summary:
6 | ---------
7 | @author: Boris
8 | @email: boris_liu@foxmail.com
9 | '''


--------------------------------------------------------------------------------
/feapder/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/commands/__init__.py


--------------------------------------------------------------------------------
/feapder/commands/create/__init__.py:
--------------------------------------------------------------------------------
 1 | __all__ = [
 2 |     "CreateProject",
 3 |     "CreateSpider",
 4 |     "CreateItem",
 5 |     "CreateInit",
 6 |     "CreateJson",
 7 |     "CreateTable",
 8 |     "CreateCookies",
 9 |     "CreateSetting",
10 |     "CreateParams",
11 | ]
12 | 
13 | from .create_table import CreateTable
14 | from .create_json import CreateJson
15 | from .create_spider import CreateSpider
16 | from .create_init import CreateInit
17 | from .create_item import CreateItem
18 | from .create_project import CreateProject
19 | from .create_cookies import CreateCookies
20 | from .create_setting import CreateSetting
21 | from .create_params import CreateParams
22 | 


--------------------------------------------------------------------------------
/feapder/commands/create/create_cookies.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/4/25 10:22 上午
 4 | ---------
 5 | @summary: 将浏览器的cookie转为request的cookie
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import json
12 | 
13 | import pyperclip
14 | 
15 | from feapder.utils.tools import get_cookies_from_str, print_pretty
16 | 
17 | 
18 | class CreateCookies:
19 |     def get_data(self):
20 |         """
21 |         @summary: 从剪切板中读取内容
22 |         ---------
23 |         ---------
24 |         @result:
25 |         """
26 |         input("请复制浏览器cookie (列表或字符串格式), 复制后按任意键读取剪切板内容\n")
27 | 
28 |         text = pyperclip.paste()
29 |         print(text + "\n")
30 | 
31 |         return text
32 | 
33 |     def create(self):
34 |         data = self.get_data()
35 |         cookies = {}
36 |         try:
37 |             data_json = json.loads(data)
38 | 
39 |             for data in data_json:
40 |                 cookies[data.get("name")] = data.get("value")
41 | 
42 |         except:
43 |             cookies = get_cookies_from_str(data)
44 | 
45 |         print_pretty(cookies)
46 | 


--------------------------------------------------------------------------------
/feapder/commands/create/create_init.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2018-08-28 17:38:43
 4 | ---------
 5 | @summary: 创建__init__.py
 6 | ---------
 7 | @author: Boris
 8 | @email:  boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder.utils.tools import dumps_json
12 | 
13 | 
14 | class CreateInit:
15 |     def create(self):
16 |         __all__ = []
17 | 
18 |         import os
19 | 
20 |         path = os.getcwd()
21 |         for file in os.listdir(path):
22 |             if file.endswith(".py") and not file.startswith("__init__"):
23 |                 model = file.split(".")[0]
24 |                 __all__.append(model)
25 | 
26 |         del os
27 | 
28 |         with open("__init__.py", "w", encoding="utf-8") as file:
29 |             text = "__all__ = %s" % dumps_json(__all__)
30 |             file.write(text)
31 | 


--------------------------------------------------------------------------------
/feapder/commands/create/create_json.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2018-08-28 17:38:43
 4 | ---------
 5 | @summary: 字符串转json
 6 | ---------
 7 | @author: Boris
 8 | @email:  boris_liu@foxmail.com
 9 | """
10 | 
11 | import pyperclip
12 | 
13 | import feapder.utils.tools as tools
14 | 
15 | 
16 | class CreateJson:
17 |     def get_data(self):
18 |         """
19 |         @summary: 从控制台读取多行
20 |         ---------
21 |         ---------
22 |         @result:
23 |         """
24 |         input("请复制需要转换的内容(xxx:xxx格式,支持多行),复制后按任意键读取剪切板内容\n")
25 | 
26 |         text = pyperclip.paste()
27 |         print(text + "\n")
28 | 
29 |         data = []
30 |         for line in text.split("\n"):
31 |             line = line.strip().replace("\t", " " * 4)
32 |             if not line:
33 |                 break
34 | 
35 |             data.append(line)
36 | 
37 |         return data
38 | 
39 |     def create(self, sort_keys=False):
40 |         contents = self.get_data()
41 | 
42 |         json = {}
43 |         for content in contents:
44 |             content = content.strip()
45 |             if not content or content.startswith(":"):
46 |                 continue
47 | 
 48 |             regex = r"([^:\s]*)[:|\s]*(.*)"
49 | 
50 |             result = tools.get_info(content, regex, fetch_one=True)
51 |             if result[0] in json:
52 |                 json[result[0]] = json[result[0]] + "&" + result[1]
53 |             else:
54 |                 json[result[0]] = result[1].strip()
55 | 
56 |         print(tools.dumps_json(json, sort_keys=sort_keys))
57 | 


--------------------------------------------------------------------------------
/feapder/commands/create/create_params.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/4/25 10:22 上午
 4 | ---------
  5 | @summary: 将请求地址解析为 url 与 params 字典
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import sys
12 | 
13 | from feapder.utils.tools import dumps_json
14 | 
15 | 
16 | class CreateParams:
17 |     def get_data(self):
18 |         """
19 |         @summary: 从控制台读取多行
20 |         ---------
21 |         ---------
22 |         @result:
23 |         """
24 |         print("请输入请求地址")
25 |         data = []
26 |         while True:
27 |             line = sys.stdin.readline().strip()
28 |             if not line:
29 |                 break
30 | 
31 |             data.append(line)
32 | 
33 |         return "".join(data)
34 | 
35 |     def get_params(self, url):
36 |         params_json = {}
37 |         params = url.split("?")[-1].split("&")
38 |         for param in params:
39 |             key_value = param.split("=", 1)
40 |             params_json[key_value[0]] = key_value[1]
41 | 
42 |         return params_json
43 | 
44 |     def create(self):
45 |         data = self.get_data()
46 | 
47 |         params = self.get_params(data)
48 |         url = data.split("?")[0]
49 | 
50 |         print(f'url = "{url}"')
51 |         print(f"params = {dumps_json(params)}")
52 | 


--------------------------------------------------------------------------------
/feapder/commands/create/create_project.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2018-08-28 17:38:43
 4 | ---------
 5 | @summary: 创建项目
 6 | ---------
 7 | @author: Boris
 8 | @email:  boris_liu@foxmail.com
 9 | """
10 | 
11 | import getpass
12 | import os
13 | import shutil
14 | 
15 | import feapder.utils.tools as tools
16 | 
17 | 
18 | def deal_file_info(file):
19 |     file = file.replace("{DATE}", tools.get_current_date())
20 |     file = file.replace("{USER}", os.getenv("FEAPDER_USER") or getpass.getuser())
21 | 
22 |     return file
23 | 
24 | 
25 | class CreateProject:
26 |     def copy_callback(self, src, dst, *, follow_symlinks=True):
27 |         if src.endswith(".py"):
28 |             with open(src, "r", encoding="utf-8") as src_file, open(
29 |                 dst, "w", encoding="utf8"
30 |             ) as dst_file:
31 |                 content = src_file.read()
32 |                 content = deal_file_info(content)
33 |                 dst_file.write(content)
34 | 
35 |         else:
36 |             shutil.copy2(src, dst, follow_symlinks=follow_symlinks)
37 | 
38 |     def create(self, project_name):
39 |         if os.path.exists(project_name):
40 |             print("%s 项目已经存在" % project_name)
41 |         else:
42 |             template_path = os.path.abspath(
43 |                 os.path.join(__file__, "../../../templates/project_template")
44 |             )
45 |             shutil.copytree(
46 |                 template_path, project_name, copy_function=self.copy_callback
47 |             )
48 | 
49 |             print("\n%s 项目生成成功" % project_name)
50 | 
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/feapder/commands/create/create_setting.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/4/23 13:20
 4 | ---------
 5 | @summary: 生成配置文件
 6 | ---------
 7 | @author: mkdir700
 8 | @email:  mkdir700@gmail.com
 9 | """
10 | 
11 | import os
12 | import shutil
13 | 
14 | 
15 | class CreateSetting:
16 |     def create(self):
17 |         if os.path.exists("setting.py"):
18 |             confirm = input("配置文件已存在 是否覆盖 (y/n).  ")
19 |             if confirm != "y":
20 |                 print("取消覆盖  退出")
21 |                 return
22 | 
23 |         template_file_path = os.path.abspath(
24 |             os.path.join(__file__, "../../../templates/project_template/setting.py")
25 |         )
26 |         shutil.copy(template_file_path, "./", follow_symlinks=False)
27 |         print("配置文件生成成功")
28 | 


--------------------------------------------------------------------------------
/feapder/commands/retry.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/11/18 12:33 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | import argparse
11 | 
12 | from feapder.core.handle_failed_items import HandleFailedItems
13 | from feapder.core.handle_failed_requests import HandleFailedRequests
14 | 
15 | 
16 | def retry_failed_requests(redis_key):
17 |     handle_failed_requests = HandleFailedRequests(redis_key)
18 |     handle_failed_requests.reput_failed_requests_to_requests()
19 | 
20 | 
21 | def retry_failed_items(redis_key):
22 |     handle_failed_items = HandleFailedItems(redis_key)
23 |     handle_failed_items.reput_failed_items_to_db()
24 |     handle_failed_items.close()
25 | 
26 | 
27 | def parse_args():
28 |     parser = argparse.ArgumentParser(
29 |         description="重试失败的请求或入库失败的item",
30 |         usage="usage: feapder retry [options] [args]",
31 |     )
32 |     parser.add_argument(
33 |         "-r",
34 |         "--request",
35 |         help="重试失败的request 如 feapder retry --request <redis_key>",
36 |         metavar="",
37 |     )
38 |     parser.add_argument(
39 |         "-i", "--item", help="重试失败的item 如 feapder retry --item <redis_key>", metavar=""
40 |     )
41 |     args = parser.parse_args()
42 |     return args
43 | 
44 | 
45 | def main():
46 |     args = parse_args()
47 |     if args.request:
48 |         retry_failed_requests(args.request)
49 |     if args.item:
50 |         retry_failed_items(args.item)
51 | 
52 | 
53 | if __name__ == "__main__":
54 |     main()
55 | 


--------------------------------------------------------------------------------
/feapder/commands/zip.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/2/13 12:59 上午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import argparse
12 | import os
13 | import re
14 | import zipfile
15 | 
16 | 
17 | def is_ignore_file(ignore_files: list, filename):
18 |     for ignore_file in ignore_files:
19 |         if re.search(ignore_file, filename):
20 |             return True
21 |     return False
22 | 
23 | 
24 | def zip(dir_path, zip_name, ignore_dirs: list = None, ignore_files: list = None):
25 |     print(f"正在压缩 {dir_path} >> {zip_name}")
 26 |     ignore_files = (ignore_files or []) + [os.path.basename(zip_name)]  # 兼容 ignore_files 为 None 的情况
27 |     with zipfile.ZipFile(zip_name, "w") as file:
28 |         dir_name = os.path.basename(dir_path)
29 |         parent_dir = os.path.dirname(dir_path)
30 |         if parent_dir:
31 |             os.chdir(parent_dir)
32 |         for path, dirs, filenames in os.walk(dir_name):
 33 |             # 就地修改dirs,防止遍历忽略文件夹里的文件
34 |             if ignore_dirs:
35 |                 dirs[:] = [d for d in dirs if d not in ignore_dirs]
36 |             for filename in filenames:
37 |                 if ignore_files and is_ignore_file(ignore_files, filename):
38 |                     continue
39 | 
40 |                 filepath = os.path.join(path, filename)
41 |                 print(f"  adding {filepath}")
42 |                 file.write(filepath)
43 | 
44 |     print(f"压缩成功 {dir_path} >> {zip_name}")
45 | 
46 | 
47 | def parse_args():
48 |     parser = argparse.ArgumentParser(
49 |         description="压缩文件夹, 默认排除以下文件夹及文件 .git,__pycache__,.idea,venv,.DS_Store",
50 |         usage="feapder zip dir_path [zip_name]",
51 |     )
52 |     parser.add_argument("dir_path", type=str, help="文件夹路径")
53 |     parser.add_argument("zip_name", type=str, nargs="?", help="压缩后的文件名,默认为文件夹名.zip")
54 |     parser.add_argument("-i", help="忽略文件,逗号分隔,支持正则", metavar="")
55 |     parser.add_argument("-I", help="忽略文件夹,逗号分隔,支持正则 ", metavar="")
56 |     parser.add_argument("-o", help="输出路径,默认为当前目录", metavar="")
57 | 
58 |     args = parser.parse_args()
59 |     return args
60 | 
61 | 
62 | def main():
63 |     ignore_dirs = [".git", "__pycache__", ".idea", "venv", "env"]
64 |     ignore_files = [".DS_Store"]
65 |     args = parse_args()
66 |     if args.i:
67 |         ignore_files.extend(args.i.split(","))
68 |     if args.I:
69 |         ignore_dirs.extend(args.I.split(","))
70 |     dir_path = args.dir_path
71 |     zip_name = args.zip_name or os.path.basename(dir_path) + ".zip"
72 |     if args.o:
73 |         zip_name = os.path.join(args.o, os.path.basename(zip_name))
74 | 
75 |     zip(dir_path, zip_name, ignore_dirs=ignore_dirs, ignore_files=ignore_files)
76 | 


--------------------------------------------------------------------------------
/feapder/core/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Created on 2020/4/23 12:09 AM
4 | ---------
5 | @summary:
6 | ---------
7 | @author: Boris
8 | @email: boris_liu@foxmail.com
9 | '''


--------------------------------------------------------------------------------
/feapder/core/handle_failed_items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/11/18 11:33 AM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | import feapder.setting as setting
11 | from feapder.buffer.item_buffer import ItemBuffer
12 | from feapder.db.redisdb import RedisDB
13 | from feapder.network.item import Item, UpdateItem
14 | from feapder.utils.log import log
15 | 
16 | 
17 | class HandleFailedItems:
18 |     def __init__(self, redis_key, task_table=None, item_buffer=None):
19 |         if redis_key.endswith(":s_failed_items"):
20 |             redis_key = redis_key.replace(":s_failed_items", "")
21 | 
22 |         self._redisdb = RedisDB()
23 |         self._item_buffer = item_buffer or ItemBuffer(redis_key, task_table=task_table)
24 | 
25 |         self._table_failed_items = setting.TAB_FAILED_ITEMS.format(redis_key=redis_key)
26 | 
27 |     def get_failed_items(self, count=1):
28 |         failed_items = self._redisdb.sget(
29 |             self._table_failed_items, count=count, is_pop=False
30 |         )
31 |         return failed_items
32 | 
33 |     def reput_failed_items_to_db(self):
34 |         log.debug("正在重新写入失败的items...")
35 |         total_count = 0
36 |         while True:
37 |             try:
38 |                 failed_items = self.get_failed_items()
39 |                 if not failed_items:
40 |                     break
41 | 
42 |                 for data_str in failed_items:
43 |                     data = eval(data_str)
44 | 
45 |                     for add in data.get("add"):
46 |                         table = add.get("table")
47 |                         datas = add.get("datas")
48 |                         for _data in datas:
49 |                             item = Item(**_data)
50 |                             item.table_name = table
51 |                             self._item_buffer.put_item(item)
52 |                             total_count += 1
53 | 
54 |                     for update in data.get("update"):
55 |                         table = update.get("table")
56 |                         datas = update.get("datas")
57 |                         update_keys = update.get("update_keys")
58 |                         for _data in datas:
59 |                             item = UpdateItem(**_data)
60 |                             item.table_name = table
61 |                             item.update_key = update_keys
62 |                             self._item_buffer.put_item(item)
63 |                             total_count += 1
64 | 
65 |                     # 入库成功后删除
66 |                     def delete_item():
67 |                         self._redisdb.srem(self._table_failed_items, data_str)
68 | 
69 |                     self._item_buffer.put_item(delete_item)
70 |                     self._item_buffer.flush()
71 | 
72 |             except Exception as e:
73 |                 log.exception(e)
74 | 
75 |         if total_count:
76 |             log.debug("导入%s条失败item到数库" % total_count)
77 |         else:
78 |             log.debug("没有失败的item")
79 | 
80 |     def close(self):
81 |         self._item_buffer.close()
82 | 


--------------------------------------------------------------------------------
/feapder/core/handle_failed_requests.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2018-08-13 11:43:01
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email:  boris_liu@foxmail.com
 9 | """
10 | import feapder.setting as setting
11 | from feapder.buffer.request_buffer import RequestBuffer
12 | from feapder.db.redisdb import RedisDB
13 | from feapder.network.request import Request
14 | from feapder.utils.log import log
15 | 
16 | 
17 | class HandleFailedRequests:
18 |     def __init__(self, redis_key):
19 |         if redis_key.endswith(":z_failed_requests"):
20 |             redis_key = redis_key.replace(":z_failed_requests", "")
21 | 
22 |         self._redisdb = RedisDB()
23 |         self._request_buffer = RequestBuffer(redis_key)
24 | 
25 |         self._table_failed_request = setting.TAB_FAILED_REQUESTS.format(
26 |             redis_key=redis_key
27 |         )
28 | 
29 |     def get_failed_requests(self, count=10000):
30 |         failed_requests = self._redisdb.zget(self._table_failed_request, count=count)
31 |         failed_requests = [eval(failed_request) for failed_request in failed_requests]
32 |         return failed_requests
33 | 
34 |     def reput_failed_requests_to_requests(self):
35 |         log.debug("正在重置失败的requests...")
36 |         total_count = 0
37 |         while True:
38 |             try:
39 |                 failed_requests = self.get_failed_requests()
40 |                 if not failed_requests:
41 |                     break
42 | 
43 |                 for request in failed_requests:
44 |                     request["retry_times"] = 0
45 |                     request_obj = Request.from_dict(request)
46 |                     self._request_buffer.put_request(request_obj)
47 | 
48 |                     total_count += 1
49 |             except Exception as e:
50 |                 log.exception(e)
51 | 
52 |         self._request_buffer.flush()
53 | 
54 |         log.debug("重置%s条失败requests为待抓取requests" % total_count)
55 | 


--------------------------------------------------------------------------------
/feapder/core/spiders/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/4/22 12:08 AM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | __all__ = ["AirSpider", "TaskSpider", "Spider", "BatchSpider"]
12 | 
13 | from feapder.core.spiders.air_spider import AirSpider
14 | from feapder.core.spiders.spider import Spider
15 | from feapder.core.spiders.task_spider import TaskSpider
16 | from feapder.core.spiders.batch_spider import BatchSpider
17 | 


--------------------------------------------------------------------------------
/feapder/db/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2020/4/23 12:09 AM
4 | ---------
5 | @summary:
6 | ---------
7 | @author: Boris
8 | @email: boris_liu@foxmail.com
9 | """


--------------------------------------------------------------------------------
/feapder/db/memorydb.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/4/21 11:42 PM
 4 | ---------
 5 | @summary: 基于内存的队列,代替redis
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | from queue import PriorityQueue
11 | 
12 | from feapder import setting
13 | 
14 | 
15 | class MemoryDB:
16 |     def __init__(self):
17 |         self.priority_queue = PriorityQueue(maxsize=setting.TASK_MAX_CACHED_SIZE)
18 | 
19 |     def add(self, item, ignore_max_size=False):
20 |         """
21 |         添加任务
22 |         :param item: 数据: 支持小于号比较的类 或者 (priority, item)
23 |         :param ignore_max_size: queue满时是否等待,为True时无视队列的maxsize,直接往里塞
24 |         :return:
25 |         """
26 |         if ignore_max_size:
27 |             self.priority_queue._put(item)
28 |             self.priority_queue.unfinished_tasks += 1
29 |         else:
30 |             self.priority_queue.put(item)
31 | 
32 |     def get(self):
33 |         """
34 |         获取任务
35 |         :return:
36 |         """
37 |         try:
38 |             item = self.priority_queue.get(timeout=1)
39 |             return item
40 |         except:
41 |             return
42 | 
43 |     def empty(self):
44 |         return self.priority_queue.empty()
45 | 


--------------------------------------------------------------------------------
/feapder/dedup/README.md:
--------------------------------------------------------------------------------
 1 | # Dedup
 2 | 
 3 | Dedup是feapder的大数据去重模块,内置3种去重机制,使用方式一致,可容纳的去重数据量与内存有关。不同于BloomFilter受槽位数量限制,Dedup使用了弹性的去重机制,可容纳海量数据的去重。
 4 | 
 5 | 
 6 | ## 去重方式
 7 | 
 8 | ### 临时去重
 9 | 
10 | > 基于redis,支持批量,去重有时效性。去重一万条数据约0.26秒,一亿条数据占用内存约1.43G
11 | 
12 | ```python
13 | from feapder.dedup import Dedup
14 | 
15 | data = {"xxx": 123, "xxxx": "xxxx"}
16 | datas = ["xxx", "bbb"]
17 | 
18 | def test_ExpireFilter():
19 |     dedup = Dedup(
20 |         Dedup.ExpireFilter, expire_time=10, redis_url="redis://@localhost:6379/0"
21 |     )
22 | 
23 |     # 逐条去重
24 |     assert dedup.add(data) == 1
25 |     assert dedup.get(data) == 1
26 | 
27 |     # 批量去重
28 |     assert dedup.add(datas) == [1, 1]
29 |     assert dedup.get(datas) == [1, 1]
30 | ```
31 | 
32 | 
33 | ### 内存去重
34 | 
35 | > 基于内存,支持批量。去重一万条数据约0.5秒,一亿条数据占用内存约285MB
36 | 
37 | ```python
38 | from feapder.dedup import Dedup
39 | 
40 | data = {"xxx": 123, "xxxx": "xxxx"}
41 | datas = ["xxx", "bbb"]
42 | 
43 | def test_MemoryFilter():
44 |     dedup = Dedup(Dedup.MemoryFilter)  # 基于内存去重,进程退出后失效
45 | 
46 |     # 逐条去重
47 |     assert dedup.add(data) == 1
48 |     assert dedup.get(data) == 1
49 | 
50 |     # 批量去重
51 |     assert dedup.add(datas) == [1, 1]
52 |     assert dedup.get(datas) == [1, 1]
53 | ```
54 | 
55 | ### 永久去重
56 | 
57 | > 基于redis,支持批量,永久去重。 去重一万条数据约3.5秒,一亿条数据占用内存约285MB
58 | 
59 |     from feapder.dedup import Dedup
60 | 
61 |     datas = {
62 |         "xxx": 123,
63 |         "xxxx": "xxxx",
64 |     }
65 | 
66 |     dedup = Dedup()
67 | 
68 |     print(dedup) # <ScalableBloomFilter: RedisBitArray: dedup:bloomfilter:bloomfilter>
69 |     print(dedup.add(datas)) # 1 表示之前不存在,添加成功
70 |     print(dedup.get(datas)) # 1 存在
71 |     
72 | ## 过滤数据
73 | 
74 | Dedup可以通过如下方法,过滤掉已存在的数据
75 | 
76 | 
77 | ```python
78 | from feapder.dedup import Dedup
79 | 
80 | def test_filter():
81 |     dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0")
82 | 
83 |     # 制造已存在数据
84 |     datas = ["xxx", "bbb"]
85 |     dedup.add(datas)
86 | 
87 |     # 过滤掉已存在数据 "xxx", "bbb"
88 |     datas = ["xxx", "bbb", "ccc"]
89 |     dedup.filter_exist_data(datas)
90 |     assert datas == ["ccc"]
91 | ```
92 | 
93 | 
94 | 


--------------------------------------------------------------------------------
/feapder/dedup/basefilter.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/9/21 11:17 AM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | import abc
11 | from typing import List, Union
12 | 
13 | 
14 | class BaseFilter:
15 |     @abc.abstractmethod
16 |     def add(
17 |         self, keys: Union[List[str], str], *args, **kwargs
18 |     ) -> Union[List[bool], bool]:
19 |         """
20 | 
21 |         Args:
22 |             keys: list / 单个值
23 |             *args:
24 |             **kwargs:
25 | 
26 |         Returns:
27 |             list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
28 |         """
29 |         pass
30 | 
31 |     @abc.abstractmethod
32 |     def get(self, keys: Union[List[str], str]) -> Union[List[bool], bool]:
33 |         """
34 |         检查数据是否存在
35 |         Args:
36 |             keys: list / 单个值
37 | 
38 |         Returns:
39 |             list / 单个值 (如果数据已存在 返回 1 否则返回 0)
40 |         """
41 |         pass
42 | 


--------------------------------------------------------------------------------
/feapder/dedup/expirefilter.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2018/12/13 9:44 PM
 4 | ---------
 5 | @summary: 带有有效期的去重集合
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import time
12 | 
13 | from feapder.db.redisdb import RedisDB
14 | from feapder.dedup.basefilter import BaseFilter
15 | 
16 | 
17 | class ExpireFilter(BaseFilter):
18 |     redis_db = None
19 | 
20 |     def __init__(
21 |         self, name: str, expire_time: int, expire_time_record_key=None, redis_url=None
22 |     ):
23 |         if not name:
24 |             raise ValueError("name cant't be None")
25 |         if not expire_time:
26 |             raise ValueError("please set expire time, units is seconds")
27 | 
28 |         if not self.__class__.redis_db:
29 |             self.__class__.redis_db = RedisDB(url=redis_url)
30 | 
31 |         self.name = name
32 |         self.expire_time = expire_time
33 |         self.expire_time_record_key = expire_time_record_key
34 |         self.del_expire_key_time = None
35 | 
36 |         self.record_expire_time()
37 | 
38 |         self.del_expire_key()
39 | 
40 |     def __repr__(self):
41 |         return "<ExpireSet: {}>".format(self.name)
42 | 
43 |     @property
44 |     def current_timestamp(self):
45 |         return int(time.time())
46 | 
47 |     def add(self, keys, *args, **kwargs):
48 |         """
49 |         @param keys: 检查关键词在zset中是否存在,支持列表批量
50 |         @return: list / 单个值
51 |         """
52 |         if self.current_timestamp - self.del_expire_key_time > self.expire_time:
53 |             self.del_expire_key()
54 | 
55 |         is_added = self.redis_db.zadd(self.name, keys, self.current_timestamp)
56 |         return is_added
57 | 
58 |     def get(self, keys):
59 |         is_exist = self.redis_db.zexists(self.name, keys)
60 |         if isinstance(keys, list):
61 |             # check for duplicates within the input itself
62 |             temp_set = set()
63 |             for i, key in enumerate(keys):
64 |                 if key in temp_set:
65 |                     is_exist[i] = 1
66 |                 else:
67 |                     temp_set.add(key)
68 | 
69 |         return is_exist
70 | 
71 |     def del_expire_key(self):
72 |         self.redis_db.zremrangebyscore(
73 |             self.name, "-inf", self.current_timestamp - self.expire_time
74 |         )
75 |         self.del_expire_key_time = self.current_timestamp
76 | 
77 |     def record_expire_time(self):
78 |         if self.expire_time_record_key:
79 |             self.redis_db.hset(
80 |                 self.expire_time_record_key, key=self.name, value=self.expire_time
81 |             )
82 | 
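For clarity, a minimal usage sketch of `ExpireFilter` on its own (it is normally constructed through `Dedup`, where it corresponds to `filter_type=3`); the Redis URL and key name are placeholders:

```python
from feapder.dedup.expirefilter import ExpireFilter

dedup = ExpireFilter(
    name="test:dedup",                  # zset key holding the fingerprints (placeholder)
    expire_time=60 * 60 * 24,           # entries expire after one day
    redis_url="redis://@localhost:6379/0",
)

print(dedup.add(["xxx", "bbb"]))        # add; entries are scored with the current timestamp
print(dedup.get(["xxx", "ccc"]))        # "xxx" already exists, "ccc" does not
```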


--------------------------------------------------------------------------------
/feapder/dedup/litefilter.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/9/21 11:28 AM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | from typing import List, Union, Set
11 | 
12 | from feapder.dedup.basefilter import BaseFilter
13 | 
14 | 
15 | class LiteFilter(BaseFilter):
16 |     def __init__(self):
17 |         self.datas: Set[str] = set()
18 | 
19 |     def add(
20 |         self, keys: Union[List[str], str], *args, **kwargs
21 |     ) -> Union[List[int], int]:
22 |         """
23 | 
24 |         Args:
25 |             keys: list / single value
26 |             *args:
27 |             **kwargs:
28 | 
29 |         Returns:
30 |             list / single value (0 if the data already exists, otherwise 1; i.e. whether the add succeeded)
31 |         """
32 |         if isinstance(keys, list):
33 |             is_add = []
34 |             for key in keys:
35 |                 if key not in self.datas:
36 |                     self.datas.add(key)
37 |                     is_add.append(1)
38 |                 else:
39 |                     is_add.append(0)
40 |         else:
41 |             if keys not in self.datas:
42 |                 is_add = 1
43 |                 self.datas.add(keys)
44 |             else:
45 |                 is_add = 0
46 |         return is_add
47 | 
48 |     def get(self, keys: Union[List[str], str]) -> Union[List[int], int]:
49 |         """
50 |         Check whether the data already exists
51 |         Args:
52 |             keys: list / single value
53 | 
54 |         Returns:
55 |             list / single value (1 if the data already exists, otherwise 0)
56 |         """
57 |         if isinstance(keys, list):
58 |             temp_set = set()
59 |             is_exist = []
60 |             for key in keys:
61 |                 # duplicate within the input itself, or already in the dedup store
62 |                 if key in temp_set or key in self.datas:
63 |                     is_exist.append(1)
64 |                 else:
65 |                     is_exist.append(0)
66 |                     temp_set.add(key)
67 | 
68 |             return is_exist
69 |         else:
70 |             return int(keys in self.datas)
71 | 
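`LiteFilter` keeps its fingerprints in an in-process set, so it needs no Redis; it corresponds to `filter_type=4` in the filter settings shown in the tests below. A small sketch of the add/get contract defined in `BaseFilter`:

```python
from feapder.dedup.litefilter import LiteFilter

f = LiteFilter()
print(f.add(["a", "b", "a"]))  # [1, 1, 0]  – the second "a" is already present
print(f.get("a"))              # 1          – already in the dedup store
print(f.get(["a", "c"]))       # [1, 0]
```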


--------------------------------------------------------------------------------
/feapder/network/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/network/__init__.py


--------------------------------------------------------------------------------
/feapder/network/downloader/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._requests import RequestsDownloader
 2 | from ._requests import RequestsSessionDownloader
 3 | 
 4 | # Optional dependencies below
 5 | try:
 6 |     from ._selenium import SeleniumDownloader
 7 | except ModuleNotFoundError:
 8 |     pass
 9 | try:
10 |     from ._playwright import PlaywrightDownloader
11 | except ModuleNotFoundError:
12 |     pass
13 | 


--------------------------------------------------------------------------------
/feapder/network/downloader/_requests.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/4/10 5:57 下午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import requests
12 | from requests.adapters import HTTPAdapter
13 | 
14 | from feapder.network.downloader.base import Downloader
15 | from feapder.network.response import Response
16 | 
17 | 
18 | class RequestsDownloader(Downloader):
19 |     def download(self, request) -> Response:
20 |         response = requests.request(
21 |             request.method, request.url, **request.requests_kwargs
22 |         )
23 |         response = Response(response)
24 |         return response
25 | 
26 | 
27 | class RequestsSessionDownloader(Downloader):
28 |     session = None
29 | 
30 |     @property
31 |     def _session(self):
32 |         if not self.__class__.session:
33 |             self.__class__.session = requests.Session()
34 |             # pool_connections – number of cached urllib3 connection pools; pool_maxsize – max connections kept per pool
35 |             http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
36 |             # the adapter is used for every request on this session whose URL starts with the given prefix
37 |             self.__class__.session.mount("http", http_adapter)
38 | 
39 |         return self.__class__.session
40 | 
41 |     def download(self, request) -> Response:
42 |         response = self._session.request(
43 |             request.method, request.url, **request.requests_kwargs
44 |         )
45 |         response = Response(response)
46 |         return response
47 | 


--------------------------------------------------------------------------------
/feapder/network/downloader/_selenium.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on 2022/7/26 4:28 下午
  4 | ---------
  5 | @summary:
  6 | ---------
  7 | @author: Boris
  8 | @email: boris_liu@foxmail.com
  9 | """
 10 | 
 11 | import feapder.setting as setting
 12 | import feapder.utils.tools as tools
 13 | from feapder.network.downloader.base import RenderDownloader
 14 | from feapder.network.response import Response
 15 | from feapder.utils.webdriver import WebDriverPool, SeleniumDriver
 16 | 
 17 | 
 18 | class SeleniumDownloader(RenderDownloader):
 19 |     webdriver_pool: WebDriverPool = None
 20 | 
 21 |     @property
 22 |     def _webdriver_pool(self):
 23 |         if not self.__class__.webdriver_pool:
 24 |             self.__class__.webdriver_pool = WebDriverPool(
 25 |                 **setting.WEBDRIVER, driver=SeleniumDriver
 26 |             )
 27 | 
 28 |         return self.__class__.webdriver_pool
 29 | 
 30 |     def download(self, request) -> Response:
 31 |         # proxy priority: request-level > config file > random
 32 |         if request.custom_proxies:
 33 |             proxy = request.get_proxy()
 34 |         elif setting.WEBDRIVER.get("proxy"):
 35 |             proxy = setting.WEBDRIVER.get("proxy")
 36 |         else:
 37 |             proxy = request.get_proxy()
 38 | 
 39 |         # user-agent priority: request-level > config file > random
 40 |         if request.custom_ua:
 41 |             user_agent = request.get_user_agent()
 42 |         elif setting.WEBDRIVER.get("user_agent"):
 43 |             user_agent = setting.WEBDRIVER.get("user_agent")
 44 |         else:
 45 |             user_agent = request.get_user_agent()
 46 | 
 47 |         cookies = request.get_cookies()
 48 |         url = request.url
 49 |         render_time = request.render_time or setting.WEBDRIVER.get("render_time")
 50 |         if request.get_params():
 51 |             url = tools.joint_url(url, request.get_params())
 52 | 
 53 |         browser: SeleniumDriver = self._webdriver_pool.get(
 54 |             user_agent=user_agent, proxy=proxy
 55 |         )
 56 |         try:
 57 |             browser.get(url)
 58 |             if cookies:
 59 |                 browser.cookies = cookies
 60 |                 # reload so the cookies take effect
 61 |                 browser.get(url)
 62 | 
 63 |             if render_time:
 64 |                 tools.delay_time(render_time)
 65 | 
 66 |             html = browser.page_source
 67 |             response = Response.from_dict(
 68 |                 {
 69 |                     "url": browser.current_url,
 70 |                     "cookies": browser.cookies,
 71 |                     "_content": html.encode(),
 72 |                     "status_code": 200,
 73 |                     "elapsed": 666,
 74 |                     "headers": {
 75 |                         "User-Agent": browser.user_agent,
 76 |                         "Cookie": tools.cookies2str(browser.cookies),
 77 |                     },
 78 |                 }
 79 |             )
 80 | 
 81 |             response.driver = browser
 82 |             response.browser = browser
 83 |             return response
 84 |         except Exception as e:
 85 |             self._webdriver_pool.remove(browser)
 86 |             raise e
 87 | 
 88 |     def close(self, driver):
 89 |         if driver:
 90 |             self._webdriver_pool.remove(driver)
 91 | 
 92 |     def put_back(self, driver):
 93 |         """
 94 |         Release the browser back to the pool
 95 |         """
 96 |         self._webdriver_pool.put(driver)
 97 | 
 98 |     def close_all(self):
 99 |         """
100 |         Close all browsers
101 |         """
102 |         self._webdriver_pool.close()
103 | 


--------------------------------------------------------------------------------
/feapder/network/downloader/base.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | from abc import ABC
 3 | 
 4 | from feapder.network.response import Response
 5 | 
 6 | 
 7 | class Downloader:
 8 |     @abc.abstractmethod
 9 |     def download(self, request) -> Response:
10 |         """
11 | 
12 |         Args:
13 |             request: feapder.Request
14 | 
15 |         Returns: feapder.Response
16 | 
17 |         """
18 |         raise NotImplementedError
19 | 
20 |     def close(self, response: Response):
21 |         pass
22 | 
23 | 
24 | class RenderDownloader(Downloader, ABC):
25 |     def put_back(self, driver):
26 |         """
27 |         释放浏览器对象
28 |         """
29 |         pass
30 | 
31 |     def close(self, driver):
32 |         """
33 |         关闭浏览器
34 |         """
35 |         pass
36 | 
37 |     def close_all(self):
38 |         """
39 |         关闭所有浏览器
40 |         """
41 |         pass
42 | 
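The `Downloader` contract above is all a custom downloader has to satisfy: take a `feapder.Request`, return a `feapder.Response`. A minimal sketch that mirrors `RequestsDownloader` but enforces a default timeout; the attribute names (`method`, `url`, `requests_kwargs`) follow the usage shown in `_requests.py`, and the 10-second timeout is an illustrative choice, not a feapder default:

```python
import requests

from feapder.network.downloader.base import Downloader
from feapder.network.response import Response


class TimeoutDownloader(Downloader):
    """Delegates to requests, adding a default timeout."""

    def download(self, request) -> Response:
        kwargs = dict(request.requests_kwargs)
        kwargs.setdefault("timeout", 10)  # illustrative default
        resp = requests.request(request.method, request.url, **kwargs)
        return Response(resp)
```

How such a class is plugged in is covered by docs/source_code/custom_downloader.md; the setting that selects the downloader is not shown in this dump, so it is omitted here.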


--------------------------------------------------------------------------------
/feapder/network/proxy_pool/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2023/7/25 10:16
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | from .base import BaseProxyPool
11 | from .proxy_pool import ProxyPool
12 | 


--------------------------------------------------------------------------------
/feapder/network/proxy_pool/base.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2023/7/25 10:03
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import abc
12 | 
13 | from feapder.utils.log import log
14 | 
15 | 
16 | class BaseProxyPool:
17 |     @abc.abstractmethod
18 |     def get_proxy(self):
19 |         """
20 |         Get a proxy
21 |         Returns:
22 |             {"http": "xxx", "https": "xxx"}
23 |         """
24 |         raise NotImplementedError
25 | 
26 |     @abc.abstractmethod
27 |     def del_proxy(self, proxy):
28 |         """
29 |         @summary: delete a proxy
30 |         ---------
31 |         @param proxy: ip:port
32 |         """
33 |         raise NotImplementedError
34 | 
35 |     def tag_proxy(self, **kwargs):
36 |         """
37 |         @summary: tag a proxy
38 |         ---------
39 |         @param kwargs:
40 |         @return:
41 |         """
42 |         log.warning("暂不支持标记代理")
43 |         pass
44 | 
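A custom proxy pool only has to honor the `get_proxy` / `del_proxy` contract above. A minimal illustrative sketch that round-robins over a fixed list of `ip:port` strings (the addresses are placeholders):

```python
from feapder.network.proxy_pool.base import BaseProxyPool


class StaticProxyPool(BaseProxyPool):
    """Serves a fixed, user-supplied list of ip:port proxies in rotation."""

    def __init__(self, proxies):
        self.proxies = list(proxies)  # e.g. ["1.2.3.4:8888", "5.6.7.8:8888"]
        self.index = 0

    def get_proxy(self):
        proxy = self.proxies[self.index % len(self.proxies)]
        self.index += 1
        # same shape as documented above and as built by ProxyPool.format_proxy below
        return {"http": "http://" + proxy, "https": "http://" + proxy}

    def del_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
```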


--------------------------------------------------------------------------------
/feapder/network/proxy_pool/proxy_pool.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/10/19 10:40 AM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | from queue import Queue
11 | 
12 | import requests
13 | 
14 | import feapder.setting as setting
15 | from feapder.network.proxy_pool.base import BaseProxyPool
16 | from feapder.utils import metrics
17 | from feapder.utils import tools
18 | 
19 | 
20 | class ProxyPool(BaseProxyPool):
21 |     """
22 |     Pulls proxies from an API and keeps them in memory; pulls again automatically when none are left
23 |     The API is expected to return proxies separated by \r\n
24 |     """
25 | 
26 |     def __init__(self, proxy_api=None, **kwargs):
27 |         self.proxy_api = proxy_api or setting.PROXY_EXTRACT_API
28 |         self.proxy_queue = Queue()
29 | 
30 |     def format_proxy(self, proxy):
31 |         return {"http": "http://" + proxy, "https": "http://" + proxy}
32 | 
33 |     @tools.retry(3, interval=5)
34 |     def pull_proxies(self):
35 |         resp = requests.get(self.proxy_api)
36 |         proxies = resp.text.strip()
37 |         resp.close()
38 |         if "{" in proxies or not proxies:
39 |             raise Exception("获取代理失败", proxies)
40 |         # proxies are separated by \r\n
41 |         return proxies.split("\r\n")
42 | 
43 |     def get_proxy(self):
44 |         try:
45 |             if self.proxy_queue.empty():
46 |                 proxies = self.pull_proxies()
47 |                 for proxy in proxies:
48 |                     self.proxy_queue.put_nowait(proxy)
49 |                     metrics.emit_counter("total", 1, classify="proxy")
50 | 
51 |             proxy = self.proxy_queue.get_nowait()
52 |             self.proxy_queue.put_nowait(proxy)
53 | 
54 |             metrics.emit_counter("used_times", 1, classify="proxy")
55 | 
56 |             return self.format_proxy(proxy)
57 |         except Exception as e:
58 |             tools.send_msg("获取代理失败", level="error")
59 |             raise Exception("获取代理失败", e)
60 | 
61 |     def del_proxy(self, proxy):
62 |         """
63 |         @summary: delete a proxy
64 |         ---------
65 |         @param proxy: ip:port
66 |         """
67 |         if proxy in self.proxy_queue.queue:
68 |             self.proxy_queue.queue.remove(proxy)
69 |             metrics.emit_counter("invalid", 1, classify="proxy")
70 | 
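Used directly, `ProxyPool` only needs the extraction API URL; `get_proxy` rotates through the cached proxies and pulls a fresh batch whenever the queue is empty. A sketch with a placeholder API URL:

```python
from feapder.network.proxy_pool import ProxyPool

# hypothetical extraction API returning "ip:port" lines separated by \r\n
pool = ProxyPool(proxy_api="http://proxy-vendor.example.com/extract")

proxies = pool.get_proxy()      # {"http": "http://ip:port", "https": "http://ip:port"}
pool.del_proxy("1.2.3.4:8888")  # drop a proxy that turned out to be dead
```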


--------------------------------------------------------------------------------
/feapder/network/user_pool/__init__.py:
--------------------------------------------------------------------------------
 1 | __all__ = [
 2 |     "GuestUserPool",
 3 |     "GuestUser",
 4 |     "NormalUserPool",
 5 |     "NormalUser",
 6 |     "GoldUserPool",
 7 |     "GoldUser",
 8 |     "GoldUserStatus",
 9 | ]
10 | 
11 | from .gold_user_pool import GoldUserPool, GoldUser, GoldUserStatus
12 | from .guest_user_pool import GuestUserPool, GuestUser
13 | from .normal_user_pool import NormalUserPool, NormalUser
14 | 


--------------------------------------------------------------------------------
/feapder/pipelines/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/3/17 10:57 下午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import abc
12 | from typing import Dict, List, Tuple
13 | 
14 | 
15 | class BasePipeline(metaclass=abc.ABCMeta):
16 |     """
17 |     Pipelines run in a single thread and save data in batches; avoid network requests here (e.g. downloading images)
18 |     """
19 | 
20 |     @abc.abstractmethod
21 |     def save_items(self, table, items: List[Dict]) -> bool:
22 |         """
23 |         Save data
24 |         Args:
25 |             table: table name
26 |             items: data, [{},{},...]
27 | 
28 |         Returns: whether the save succeeded, True / False
29 |                  If False, the batch is not recorded in the dedup store, so it can be saved again
30 | 
31 |         """
32 | 
33 |         return True
34 | 
35 |     def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
36 |         """
37 |         Update data; used together with UpdateItem. If the spider does not use UpdateItem, this method need not be implemented
38 |         Args:
39 |             table: table name
40 |             items: data, [{},{},...]
41 |             update_keys: fields to update, e.g. ("title", "publish_time")
42 | 
43 |         Returns: whether the update succeeded, True / False
44 |                  If False, the batch is not recorded in the dedup store, so it can be saved again
45 | 
46 |         """
47 | 
48 |         return True
49 | 
50 |     def close(self):
51 |         """
52 |         Close; called when the spider finishes
53 |         Returns:
54 | 
55 |         """
56 |         pass
57 | 
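The console, MongoDB and MySQL pipelines that follow all implement this contract; a custom pipeline only needs `save_items` (plus `update_items` when `UpdateItem` is used). A minimal illustrative sketch that appends each batch to a JSON-lines file; how it is registered (the `ITEM_PIPELINES` setting described in docs/source_code/pipeline.md) is not shown in this dump, so treat that name as recalled from the docs rather than confirmed here:

```python
import json
from typing import Dict, List

from feapder.pipelines import BasePipeline


class JsonLinesPipeline(BasePipeline):
    """Illustrative pipeline: appends each batch to <table>.jsonl."""

    def save_items(self, table, items: List[Dict]) -> bool:
        try:
            with open(f"{table}.jsonl", "a", encoding="utf-8") as f:
                for item in items:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
            return True
        except Exception:
            # returning False keeps the batch out of the dedup store so it can be retried
            return False
```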


--------------------------------------------------------------------------------
/feapder/pipelines/console_pipeline.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/3/18 12:39 上午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder.pipelines import BasePipeline
12 | from typing import Dict, List, Tuple
13 | from feapder.utils.log import log
14 | 
15 | 
16 | class ConsolePipeline(BasePipeline):
17 |     """
18 |     Pipelines run in a single thread and save data in batches; avoid network requests here (e.g. downloading images)
19 |     """
20 | 
21 |     def save_items(self, table, items: List[Dict]) -> bool:
22 |         """
23 |         保存数据
24 |         Args:
25 |             table: 表名
26 |             items: 数据,[{},{},...]
27 | 
28 |         Returns: 是否保存成功 True / False
29 |                  若False,不会将本批数据入到去重库,以便再次入库
30 | 
31 |         """
32 |         log.info("【调试输出】共导出 %s 条数据 到 %s" % (len(items), table))
33 |         return True
34 | 
35 |     def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
36 |         """
37 |         更新数据
38 |         Args:
39 |             table: 表名
40 |             items: 数据,[{},{},...]
41 |             update_keys: 更新的字段, 如 ("title", "publish_time")
42 | 
43 |         Returns: 是否更新成功 True / False
44 |                  若False,不会将本批数据入到去重库,以便再次入库
45 | 
46 |         """
47 |         log.info("【调试输出】共导出 %s 条数据 到 %s" % (len(items), table))
48 |         return True
49 | 


--------------------------------------------------------------------------------
/feapder/pipelines/mongo_pipeline.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-04-18 14:12:21
 4 | ---------
 5 | @summary: 导出数据
 6 | ---------
 7 | @author: Mkdir700
 8 | @email:  mkdir700@gmail.com
 9 | """
10 | from typing import Dict, List, Tuple
11 | 
12 | from feapder.db.mongodb import MongoDB
13 | from feapder.pipelines import BasePipeline
14 | from feapder.utils.log import log
15 | 
16 | 
17 | class MongoPipeline(BasePipeline):
18 |     def __init__(self):
19 |         self._to_db = None
20 | 
21 |     @property
22 |     def to_db(self):
23 |         if not self._to_db:
24 |             self._to_db = MongoDB()
25 | 
26 |         return self._to_db
27 | 
28 |     def save_items(self, table, items: List[Dict]) -> bool:
29 |         """
30 |         保存数据
31 |         Args:
32 |             table: 表名
33 |             items: 数据,[{},{},...]
34 | 
35 |         Returns: 是否保存成功 True / False
36 |                  若False,不会将本批数据入到去重库,以便再次入库
37 | 
38 |         """
39 |         try:
40 |             add_count = self.to_db.add_batch(coll_name=table, datas=items)
41 |             datas_size = len(items)
42 |             log.info(
43 |                 "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
44 |                 % (datas_size, table, add_count, datas_size - add_count)
45 |             )
46 |             return True
47 |         except Exception as e:
48 |             log.exception(e)
49 |             return False
50 | 
51 |     def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
52 |         """
53 |         更新数据
54 |         Args:
55 |             table: 表名
56 |             items: 数据,[{},{},...]
57 |             update_keys: 更新的字段, 如 ("title", "publish_time")
58 | 
59 |         Returns: 是否更新成功 True / False
60 |                  若False,不会将本批数据入到去重库,以便再次入库
61 | 
62 |         """
63 |         try:
64 |             add_count = self.to_db.add_batch(
65 |                 coll_name=table,
66 |                 datas=items,
67 |                 update_columns=update_keys or list(items[0].keys()),
68 |             )
69 |             datas_size = len(items)
70 |             update_count = datas_size - add_count
71 |             msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
72 |                 datas_size,
73 |                 table,
74 |                 add_count,
75 |                 update_count,
76 |             )
77 |             if update_keys:
78 |                 msg += " 更新字段为 {}".format(update_keys)
79 |             log.info(msg)
80 | 
81 |             return True
82 |         except Exception as e:
83 |             log.exception(e)
84 |             return False
85 | 


--------------------------------------------------------------------------------
/feapder/pipelines/mysql_pipeline.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2018-07-29 22:48:30
 4 | ---------
 5 | @summary: 导出数据
 6 | ---------
 7 | @author: Boris
 8 | @email:  boris_liu@foxmail.com
 9 | """
10 | from typing import Dict, List, Tuple
11 | 
12 | import feapder.utils.tools as tools
13 | from feapder.db.mysqldb import MysqlDB
14 | from feapder.pipelines import BasePipeline
15 | from feapder.utils.log import log
16 | 
17 | 
18 | class MysqlPipeline(BasePipeline):
19 |     def __init__(self):
20 |         self._to_db = None
21 | 
22 |     @property
23 |     def to_db(self):
24 |         if not self._to_db:
25 |             self._to_db = MysqlDB()
26 | 
27 |         return self._to_db
28 | 
29 |     def save_items(self, table, items: List[Dict]) -> bool:
30 |         """
31 |         保存数据
32 |         Args:
33 |             table: 表名
34 |             items: 数据,[{},{},...]
35 | 
36 |         Returns: 是否保存成功 True / False
37 |                  若False,不会将本批数据入到去重库,以便再次入库
38 | 
39 |         """
40 | 
41 |         sql, datas = tools.make_batch_sql(table, items)
42 |         add_count = self.to_db.add_batch(sql, datas)
43 |         datas_size = len(datas)
44 |         if add_count:
45 |             log.info(
46 |                 "共导出 %s 条数据 到 %s, 重复 %s 条" % (datas_size, table, datas_size - add_count)
47 |             )
48 | 
49 |         return add_count != None
50 | 
51 |     def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
52 |         """
53 |         更新数据
54 |         Args:
55 |             table: 表名
56 |             items: 数据,[{},{},...]
57 |             update_keys: 更新的字段, 如 ("title", "publish_time")
58 | 
59 |         Returns: 是否更新成功 True / False
60 |                  若False,不会将本批数据入到去重库,以便再次入库
61 | 
62 |         """
63 | 
64 |         sql, datas = tools.make_batch_sql(
65 |             table, items, update_columns=update_keys or list(items[0].keys())
66 |         )
67 |         update_count = self.to_db.add_batch(sql, datas)
68 |         if update_count:
69 |             msg = "共更新 %s 条数据 到 %s" % (update_count // 2, table)
70 |             if update_keys:
71 |                 msg += " 更新字段为 {}".format(update_keys)
72 |             log.info(msg)
73 | 
74 |         return update_count != None
75 | 


--------------------------------------------------------------------------------
/feapder/requirements.txt:
--------------------------------------------------------------------------------
 1 | better-exceptions>=0.2.2
 2 | DBUtils>=2.0
 3 | parsel>=1.5.2
 4 | PyExecJS>=1.5.1
 5 | pymongo>=3.10.1
 6 | PyMySQL>=0.9.3
 7 | redis>=2.10.6,<4.0.0
 8 | requests>=2.22.0
 9 | selenium>=3.141.0
10 | bs4>=0.0.1
11 | ipython>=7.14.0
12 | bitarray>=1.5.3
13 | redis-py-cluster>=2.1.0
14 | cryptography>=3.3.2
15 | urllib3>=1.25.8
16 | loguru>=0.5.3
17 | influxdb>=5.3.1
18 | pyperclip>=1.8.2
19 | webdriver-manager>=4.0.0
20 | terminal-layout>=2.1.3
21 | playwright


--------------------------------------------------------------------------------
/feapder/templates/air_spider_template.tmpl:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on {DATE}
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: {USER}
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class ${spider_name}(feapder.AirSpider):
14 |     def start_requests(self):
15 |         yield feapder.Request("https://spidertools.cn")
16 | 
17 |     def parse(self, request, response):
18 |         # extract the page title
19 |         print(response.xpath("//title/text()").extract_first())
20 |         # extract the page description
21 |         print(response.xpath("//meta[@name='description']/@content").extract_first())
22 |         print("网站地址: ", response.url)
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     ${spider_name}().start()


--------------------------------------------------------------------------------
/feapder/templates/batch_spider_template.tmpl:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on {DATE}
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: {USER}
 8 | """
 9 | 
10 | import feapder
11 | from feapder import ArgumentParser
12 | 
13 | 
14 | class ${spider_name}(feapder.BatchSpider):
15 |     # Custom database settings; if the project has a setting.py, this block can be removed
16 |     __custom_setting__ = dict(
17 |         REDISDB_IP_PORTS="localhost:6379",
18 |         REDISDB_USER_PASS="",
19 |         REDISDB_DB=0,
20 |         MYSQL_IP="localhost",
21 |         MYSQL_PORT=3306,
22 |         MYSQL_DB="",
23 |         MYSQL_USER_NAME="",
24 |         MYSQL_USER_PASS="",
25 |     )
26 | 
27 |     def start_requests(self, task):
28 |         yield feapder.Request("https://spidertools.cn")
29 | 
30 |     def parse(self, request, response):
31 |         # extract the page title
32 |         print(response.xpath("//title/text()").extract_first())
33 |         # extract the page description
34 |         print(response.xpath("//meta[@name='description']/@content").extract_first())
35 |         print("网站地址: ", response.url)
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     spider = ${spider_name}(
40 |         redis_key="xxx:xxxx",  # where the distributed scheduling info is stored in redis
41 |         task_table="",  # task table in MySQL
42 |         task_keys=["id", "xxx"],  # fields to fetch from the task table; more can be added
43 |         task_state="state",  # task state field in MySQL
44 |         batch_record_table="xxx_batch_record",  # batch record table in MySQL
45 |         batch_name="xxx(周全)",  # batch name
46 |         batch_interval=7,  # batch interval, in days; for hours write e.g. 1 / 24
47 |     )
48 | 
49 |     parser = ArgumentParser(description="${spider_name}爬虫")
50 | 
51 |     parser.add_argument(
52 |         "--start_master",
53 |         action="store_true",
54 |         help="添加任务",
55 |         function=spider.start_monitor_task,
56 |     )
57 |     parser.add_argument(
58 |         "--start_worker", action="store_true", help="启动爬虫", function=spider.start
59 |     )
60 | 
61 |     parser.start()
62 | 
63 |     # start directly
64 |     # spider.start()  # start the spider
65 |     # spider.start_monitor_task() # create tasks
66 | 
67 |     # start from the command line
68 |     # python ${file_name} --start_master  # create tasks
69 |     # python ${file_name} --start_worker  # start the spider
70 | 


--------------------------------------------------------------------------------
/feapder/templates/item_template.tmpl:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on {DATE}
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: {USER}
 8 | """
 9 | 
10 | from feapder import Item
11 | 
12 | 
13 | class ${item_name}Item(Item):
14 |     """
15 |     This class was generated by feapder
16 |     command: feapder create -i ${command}
17 |     """
18 | 
19 |     __table_name__ = "${table_name}"
20 | 
21 |     def __init__(self, *args, **kwargs):
22 |         ${propertys}
23 | 


--------------------------------------------------------------------------------
/feapder/templates/project_template/CHECK_DATA.md:
--------------------------------------------------------------------------------
 1 | # Data Review
 2 | ## Table descriptions:
 3 | 
 4 | > table name, meaning (update strategy)
 5 | 
 6 | ## 1. Accuracy
 7 | 
 8 | **Does the field design meet the requirements? Do the join fields between tables meet the requirements? (manual check needed)**
 9 | 
10 | > Note: is there an auto-increment id, and is its type set to bigint?
11 | > Note: is a unique index needed?
12 | > Note: are join fields between the tables needed?
13 | 
14 | * [ ] yes
15 | * [ ] no
16 | 
17 | **Do the collected content and storage format of each field meet the requirements? Do they match the web page? Is any information missing?**
18 | 
19 | > Tip: try sorting each field ascending/descending, then spot-check samples;
20 | 
21 | **Have inconsistent data formats for the same kind of data on the site been considered?**
22 | 
23 | > Suggestion: do not add per-field compatibility handling in code; raise and log an exception when the data is inconsistent
24 | 
25 | * [ ] yes
26 | * [ ] no
27 | 
28 | ## 2. Completeness
29 | 
30 | **For incremental collection: were both the earliest and the latest records collected, and is the total count correct?**
31 | **For batch collection: does every batch exist?**
32 | 
33 | > Tip: estimate the total size of a single batch on the website first;
34 | > Reference SQL: SELECT count(1), batch_date from [table_name] GROUP BY batch_date;
35 | 
36 | **If the table is related to another table, are the relations complete?**
37 | 
38 | ## 3. Stability
39 | 
40 | * [ ] Can the data be collected stably over the long term?
41 | * [ ] Is an IP proxy used?
42 | * [ ] Is resuming from a breakpoint supported?
43 | * [ ] Can it be started on schedule and collected periodically?
44 | * [ ] Are alarms enabled?
45 | 
46 | ## 4. Collection frequency, type, and storage
47 | 
48 | * [ ] Does the collection frequency meet the requirements?
49 | * [ ] Does the collection type meet the requirements: incremental or batch?
50 | 


--------------------------------------------------------------------------------
/feapder/templates/project_template/README.md:
--------------------------------------------------------------------------------
1 | # xxx Spider Documentation
2 | ## Research
3 | 
4 | ## Database Design
5 | 
6 | ## Spider Logic
7 | 
8 | ## Project Architecture


--------------------------------------------------------------------------------
/feapder/templates/project_template/items/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/templates/project_template/items/__init__.py


--------------------------------------------------------------------------------
/feapder/templates/project_template/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on {DATE}
 4 | ---------
 5 | @summary: Spider entry point
 6 | ---------
 7 | @author: {USER}
 8 | """
 9 | 
10 | from feapder import ArgumentParser
11 | 
12 | from spiders import *
13 | 
14 | def crawl_xxx():
15 |     """
16 |     AirSpider爬虫
17 |     """
18 |     spider = xxx.XXXSpider()
19 |     spider.start()
20 | 
21 | def crawl_xxx():
22 |     """
23 |     Spider爬虫
24 |     """
25 |     spider = xxx.XXXSpider(redis_key="xxx:xxx")
26 |     spider.start()
27 | 
28 | 
29 | def crawl_xxx(args):
30 |     """
31 |     BatchSpider爬虫
32 |     """
33 |     spider = xxx_spider.XXXSpider(
34 |         task_table="",  # task table in MySQL
35 |         batch_record_table="",  # batch record table in MySQL
36 |         batch_name="xxx(周全)",  # batch name
37 |         batch_interval=7,  # batch interval, in days; for hours write e.g. 1 / 24
38 |         task_keys=["id", "xxx"],  # fields to fetch from the task table; more can be added
39 |         redis_key="xxx:xxxx",  # root redis key where requests etc. are stored
40 |         task_state="state",  # task state field in MySQL
41 |     )
42 | 
43 |     if args == 1:
44 |         spider.start_monitor_task()
45 |     elif args == 2:
46 |         spider.start()
47 |     elif args == 3:
48 |         spider.init_task()
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     parser = ArgumentParser(description="xxx爬虫")
53 | 
54 |     parser.add_argument(
55 |         "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
56 |     )
57 |     parser.add_argument(
58 |         "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
59 |     )
60 |     parser.add_argument(
61 |         "--crawl_xxx",
62 |         type=int,
63 |         nargs=1,
64 |         help="xxx爬虫",
65 |         choices=[1, 2, 3],
66 |         function=crawl_xxx,
67 |     )
68 | 
69 |     parser.start()
70 | 
71 |     # main.py is the single entry point for starting spiders from the command line; if there is only one spider, main.py can be omitted
72 |     # replace the xxx above with your actual spider names
73 |     # show the run commands: python main.py --help
74 |     # run an AirSpider or Spider: python main.py --crawl_xxx
75 |     # run a BatchSpider:
76 |     # 1. create tasks:  python main.py --crawl_xxx 1
77 |     # 2. collect:       python main.py --crawl_xxx 2
78 |     # 3. reset tasks:   python main.py --crawl_xxx 3
78 |     # 3. 重置任务:python main.py --crawl_xxx 3
79 | 
80 | 


--------------------------------------------------------------------------------
/feapder/templates/project_template/spiders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/feapder/templates/project_template/spiders/__init__.py


--------------------------------------------------------------------------------
/feapder/templates/spider_template.tmpl:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on {DATE}
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: {USER}
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class ${spider_name}(feapder.Spider):
14 |     # Custom database settings; if the project has a setting.py, this block can be removed
15 |     __custom_setting__ = dict(
16 |         REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0
17 |     )
18 | 
19 |     def start_requests(self):
20 |         yield feapder.Request("https://spidertools.cn")
21 | 
22 |     def parse(self, request, response):
23 |         # extract the page title
24 |         print(response.xpath("//title/text()").extract_first())
25 |         # extract the page description
26 |         print(response.xpath("//meta[@name='description']/@content").extract_first())
27 |         print("网站地址: ", response.url)
28 | 
29 | 
30 | if __name__ == "__main__":
31 |     ${spider_name}(redis_key="xxx:xxx").start()
32 | 


--------------------------------------------------------------------------------
/feapder/templates/task_spider_template.tmpl:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on {DATE}
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: {USER}
 8 | """
 9 | 
10 | import feapder
11 | from feapder import ArgumentParser
12 | 
13 | 
14 | class ${spider_name}(feapder.TaskSpider):
15 |     # Custom database settings; if the project has a setting.py, this block can be removed
16 |     __custom_setting__ = dict(
17 |         REDISDB_IP_PORTS="localhost:6379",
18 |         REDISDB_USER_PASS="",
19 |         REDISDB_DB=0,
20 |         MYSQL_IP="localhost",
21 |         MYSQL_PORT=3306,
22 |         MYSQL_DB="",
23 |         MYSQL_USER_NAME="",
24 |         MYSQL_USER_PASS="",
25 |     )
26 | 
27 |     def start_requests(self, task):
28 |         task_id = task.id
29 |         url = task.url
30 |         yield feapder.Request(url, task_id=task_id)
31 | 
32 |     def parse(self, request, response):
33 |         # extract the page title
34 |         print(response.xpath("//title/text()").extract_first())
35 |         # extract the page description
36 |         print(response.xpath("//meta[@name='description']/@content").extract_first())
37 |         print("网站地址: ", response.url)
38 | 
39 |         # mark the MySQL task as done, i.e. state=1
40 |         yield self.update_task_batch(request.task_id)
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     # Use MySQL as the task table; the task table must be created beforehand
45 |     spider = ${spider_name}(
46 |         redis_key="xxx:xxx",  # where the distributed scheduling info is stored in redis
47 |         task_table="",  # task table in MySQL
48 |         task_keys=["id", "url"],  # fields to fetch from the task table; more can be added
49 |         task_state="state",  # task state field in MySQL
50 |     )
51 | 
52 |     # Use redis as the task table
53 |     # spider = ${spider_name}(
54 |     #     redis_key="xxx:xxxx",  # where the distributed scheduling info is stored in redis
55 |     #     task_table="", # task table name
56 |     #     task_table_type="redis", # task table type is redis
57 |     # )
58 | 
59 |     parser = ArgumentParser(description="${spider_name}爬虫")
60 | 
61 |     parser.add_argument(
62 |         "--start_master",
63 |         action="store_true",
64 |         help="添加任务",
65 |         function=spider.start_monitor_task,
66 |     )
67 |     parser.add_argument(
68 |         "--start_worker", action="store_true", help="启动爬虫", function=spider.start
69 |     )
70 | 
71 |     parser.start()
72 | 
73 |     # start directly
74 |     # spider.start()  # start the spider
75 |     # spider.start_monitor_task() # create tasks
76 | 
77 |     # start from the command line
78 |     # python ${file_name} --start_master  # create tasks
79 |     # python ${file_name} --start_worker  # start the spider


--------------------------------------------------------------------------------
/feapder/templates/update_item_template.tmpl:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on {DATE}
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: {USER}
 8 | """
 9 | 
10 | from feapder import UpdateItem
11 | 
12 | 
13 | class ${item_name}Item(UpdateItem):
14 |     """
15 |     This class was generated by feapder
16 |     command: feapder create -i ${command}
17 |     """
18 | 
19 |     __table_name__ = "${table_name}"
20 | 
21 |     def __init__(self, *args, **kwargs):
22 |         ${propertys}
23 | 


--------------------------------------------------------------------------------
/feapder/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | Created on 2019/11/5 4:41 PM
4 | ---------
5 | @summary:
6 | ---------
7 | @author: Boris
8 | @email: boris_liu@foxmail.com
9 | '''


--------------------------------------------------------------------------------
/feapder/utils/custom_argparse.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2018-10-15 14:32:12
 4 | ---------
 5 | @summary: Wraps ArgumentParser so arguments can carry a function; calling start() runs the matched functions automatically
 6 | ---------
 7 | @author: Boris
 8 | @email:  boris_liu@foxmail.com
 9 | """
10 | 
11 | import argparse
12 | 
13 | 
14 | class ArgumentParser(argparse.ArgumentParser):
15 |     def __init__(self, *args, **kwargs):
16 |         self.functions = {}
17 | 
18 |         super(ArgumentParser, self).__init__(*args, **kwargs)
19 | 
20 |     def add_argument(self, *args, **kwargs):
21 |         function = kwargs.pop("function") if "function" in kwargs else None
22 |         key = self._get_optional_kwargs(*args, **kwargs).get("dest")
23 |         self.functions[key] = function
24 | 
25 |         return super(ArgumentParser, self).add_argument(*args, **kwargs)
26 | 
27 |     def start(self, args=None, namespace=None):
28 |         args = self.parse_args(args=args, namespace=namespace)
29 |         for key, value in vars(args).items():  # vars() returns the object's attributes as a dict
30 |             if value not in (None, False):
31 |                 if callable(self.functions[key]):
32 |                     if value != True:
33 |                         if isinstance(value, list) and len(value) == 1:
34 |                             value = value[0]
35 |                         self.functions[key](value)
36 |                     else:
37 |                         self.functions[key]()
38 | 
39 |     def run(self, args, values=None):
40 |         if args in self.functions:
41 |             if values:
42 |                 self.functions[args](values)
43 |             else:
44 |                 self.functions[args]()
45 | 
46 |         else:
47 |             raise Exception(f"无此方法: {args}")
48 | 
49 | 
50 | if __name__ == "__main__":
51 | 
52 |     def test():
53 |         print("test not args func")
54 | 
55 |     def test2(args):
56 |         print("test args func", args)
57 | 
58 |     parser = ArgumentParser(description="测试")
59 | 
60 |     parser.add_argument("--test2", type=int, nargs=1, help="(1|2)", function=test2)
61 |     parser.add_argument("--test", action="store_true", help="", function=test)
62 | 
63 |     parser.start()
64 | 


--------------------------------------------------------------------------------
/feapder/utils/email_sender.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/2/19 12:57 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import os
12 | import smtplib
13 | from email.header import Header
14 | from email.mime.multipart import MIMEMultipart
15 | from email.mime.text import MIMEText
16 | from email.utils import formataddr
17 | 
18 | from feapder.utils.log import log
19 | 
20 | 
21 | class EmailSender(object):
22 |     SENDER = "feapder报警系统"
23 | 
24 |     def __init__(self, username, password, smtpserver="smtp.163.com"):
25 |         self.username = username
26 |         self.password = password
27 |         self.smtpserver = smtpserver
28 |         self.smtp_client = smtplib.SMTP_SSL(smtpserver)
29 |         self.sender = EmailSender.SENDER
30 | 
31 |     def __enter__(self):
32 |         self.login()
33 |         return self
34 | 
35 |     def __exit__(self, exc_type, exc_val, exc_tb):
36 |         self.quit()
37 | 
38 |     def quit(self):
39 |         self.smtp_client.quit()
40 | 
41 |     def login(self):
42 |         self.smtp_client.connect(self.smtpserver)
43 |         self.smtp_client.login(self.username, self.password)
44 | 
45 |     def send(
46 |         self,
47 |         receivers: list,
48 |         title: str,
49 |         content: str,
50 |         content_type: str = "plain",
51 |         filepath: str = None,
52 |     ):
53 |         """
54 | 
55 |         Args:
56 |             receivers:
57 |             title:
58 |             content:
59 |             content_type: html / plain
60 |             filepath:
61 | 
62 |         Returns:
63 | 
64 |         """
65 |         # create a multipart message so attachments can be added
66 |         message = MIMEMultipart()
67 |         message["From"] = formataddr(
68 |             (self.sender, self.username)
69 |         )  # (sender display name, sender email account)
70 |         message["To"] = ",".join(
71 |             [formataddr((receiver, receiver)) for receiver in receivers]
72 |         )
73 | 
74 |         message["Subject"] = Header(title, "utf-8")
75 | 
76 |         content = MIMEText(content, content_type, "utf-8")
77 |         message.attach(content)
78 | 
79 |         # build the attachment
80 |         if filepath:
81 |             attach = MIMEText(open(filepath, "rb").read(), "base64", "utf-8")
82 |             attach.add_header(
83 |                 "content-disposition",
84 |                 "attachment",
85 |                 filename=("utf-8", "", os.path.basename(filepath)),
86 |             )
87 |             message.attach(attach)
88 | 
89 |         msg = message.as_string()
90 |         # sending to several recipients at once is unreliable here, so send one by one
91 |         for receiver in receivers:
92 |             log.debug("发送邮件到 {}".format(receiver))
93 |             self.smtp_client.sendmail(self.username, receiver, msg)
94 |         log.debug("邮件发送成功!!!")
95 |         return True
96 | 
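`EmailSender` implements the context-manager protocol (`__enter__` logs in, `__exit__` quits), so the typical usage is a `with` block. The account, authorization code and recipients below are placeholders:

```python
from feapder.utils.email_sender import EmailSender

with EmailSender(username="alarm@163.com", password="auth-code") as sender:
    sender.send(
        receivers=["ops@example.com"],
        title="spider alarm",
        content="<b>task failed</b>",
        content_type="html",           # or "plain"
        # filepath="/tmp/report.csv",  # optional attachment
    )
```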


--------------------------------------------------------------------------------
/feapder/utils/perfect_dict.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/4/8 11:32 上午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | 
12 | def ensure_value(value):
13 |     if isinstance(value, (list, tuple)):
14 |         _value = []
15 |         for v in value:
16 |             _value.append(ensure_value(v))
17 | 
18 |         if isinstance(value, tuple):
19 |             value = tuple(_value)
20 |         else:
21 |             value = _value
22 | 
23 |     if isinstance(value, dict):
24 |         return PerfectDict(value)
25 |     else:
26 |         return value
27 | 
28 | 
29 | class PerfectDict(dict):
30 |     """
31 |     >>> data = PerfectDict({"id":1, "url":"xxx"})
32 |     >>> data
33 |     {'id': 1, 'url': 'xxx'}
34 |     >>> data = PerfectDict(id=1, url="xxx")
35 |     >>> data
36 |     {'id': 1, 'url': 'xxx'}
37 |     >>> data.id
38 |     1
39 |     >>> data.get("id")
40 |     1
41 |     >>> data["id"]
42 |     1
43 |     >>> id, url = data
44 |     >>> id
45 |     1
46 |     >>> url
47 |     'xxx'
48 |     >>> data[0]
49 |     1
50 |     >>> data[1]
51 |     'xxx'
52 |     >>> data = PerfectDict({"a": 1, "b": {"b1": 2}, "c": [{"c1": [{"d": 1}]}]})
53 |     >>> data.b.b1
54 |     2
55 |     >>> data[1].b1
56 |     2
57 |     >>> data.get("b").b1
58 |     2
59 |     >>> data.c[0].c1
60 |     [{'d': 1}]
61 |     >>> data.c[0].c1[0]
62 |     {'d': 1}
63 |     """
64 | 
65 |     def __init__(self, _dict: dict = None, _values: list = None, **kwargs):
66 |         self.__dict__ = _dict or kwargs or {}
67 |         self.__dict__.pop("__values__", None)
68 |         super().__init__(self.__dict__, **kwargs)
69 |         self.__values__ = _values or list(self.__dict__.values())
70 | 
71 |     def __getitem__(self, key):
72 |         if isinstance(key, int):
73 |             value = self.__values__[key]
74 |         else:
75 |             value = self.__dict__[key]
76 | 
77 |         return ensure_value(value)
78 | 
79 |     def __iter__(self, *args, **kwargs):
80 |         for value in self.__values__:
81 |             yield ensure_value(value)
82 | 
83 |     def __getattribute__(self, item):
84 |         value = object.__getattribute__(self, item)
85 |         if item == "__dict__" or item == "__values__":
86 |             return value
87 |         return ensure_value(value)
88 | 
89 |     def get(self, key, default=None):
90 |         if key in self.__dict__:
91 |             value = self.__dict__[key]
92 |             return ensure_value(value)
93 | 
94 |         return default
95 | 


--------------------------------------------------------------------------------
/feapder/utils/tail_thread.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2024/3/19 20:00
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | import sys
11 | import threading
12 | 
13 | 
14 | class TailThread(threading.Thread):
15 |     """
16 |     The main thread exits only after all child threads have finished
17 |     """
18 | 
19 |     def start(self) -> None:
20 |         """
21 |         Works around the Python 3.12 "RuntimeError: cannot join thread before it is started" error
22 |         """
23 |         super().start()
24 | 
25 |         if sys.version_info >= (3, 12):
26 |             for thread in threading.enumerate():
27 |                 if (
28 |                     thread.daemon
29 |                     or thread is threading.current_thread()
30 |                     or not thread.is_alive()
31 |                 ):
32 |                     continue
33 |                 thread.join()
34 | 
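`TailThread` is used like a plain `threading.Thread`; the only difference is the Python 3.12+ handling in `start()` above. A minimal sketch:

```python
import time

from feapder.utils.tail_thread import TailThread


def worker():
    time.sleep(1)
    print("worker finished")


t = TailThread(target=worker)
t.start()
# On Python >= 3.12, start() additionally joins the running non-daemon threads
# (see the version check above), so it returns only once they have finished;
# on older versions it behaves exactly like threading.Thread.start().
```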


--------------------------------------------------------------------------------
/feapder/utils/webdriver/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/9/7 4:39 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | from .playwright_driver import PlaywrightDriver
11 | from .selenium_driver import SeleniumDriver
12 | from .webdirver import InterceptRequest, InterceptResponse
13 | from .webdriver_pool import WebDriverPool
14 | 
15 | # kept for backwards compatibility with older code
16 | WebDriver = SeleniumDriver
17 | 


--------------------------------------------------------------------------------
/feapder/utils/webdriver/webdirver.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/9/7 4:27 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | import abc
11 | 
12 | from feapder import setting
13 | 
14 | 
15 | class InterceptRequest:
16 |     def __init__(self, url, data, headers):
17 |         self.url = url
18 |         self.data = data
19 |         self.headers = headers
20 | 
21 | 
22 | class InterceptResponse:
23 |     def __init__(self, request: InterceptRequest, url, headers, content, status_code):
24 |         self.request = request
25 |         self.url = url
26 |         self.headers = headers
27 |         self.content = content
28 |         self.status_code = status_code
29 | 
30 | 
31 | class WebDriver:
32 |     def __init__(
33 |         self,
34 |         load_images=True,
35 |         user_agent=None,
36 |         proxy=None,
37 |         headless=False,
38 |         driver_type=None,
39 |         timeout=16,
40 |         window_size=(1024, 800),
41 |         executable_path=None,
42 |         custom_argument=None,
43 |         download_path=None,
44 |         auto_install_driver=True,
45 |         use_stealth_js=True,
46 |         **kwargs,
47 |     ):
48 |         """
49 |         WebDriver wrapper; supports Chrome, PhantomJS and Firefox
50 |         Args:
51 |             load_images: whether to load images
52 |             user_agent: a string, or a no-arg function returning the user agent
53 |             proxy: xxx.xxx.xxx.xxx:xxxx, or a no-arg function returning the proxy address
54 |             headless: whether to run headless
55 |             driver_type: CHROME, EDGE, PHANTOMJS or FIREFOX
56 |             timeout: request timeout
57 |             window_size: window size
58 |             executable_path: browser binary path; defaults to the system default
59 |             custom_argument: custom arguments passed to webdriver.Chrome(options=chrome_options, **kwargs)
60 |             download_path: download directory; if set, the "keep"/"discard" prompt no longer appears (Chrome only)
61 |             auto_install_driver: automatically download the browser driver (Chrome and Firefox)
62 |             use_stealth_js: use stealth.min.js to hide browser fingerprints
63 |             **kwargs:
64 |         """
65 |         self._load_images = load_images
66 |         self._user_agent = user_agent or setting.DEFAULT_USERAGENT
67 |         self._proxy = proxy
68 |         self._headless = headless
69 |         self._timeout = timeout
70 |         self._window_size = window_size
71 |         self._executable_path = executable_path
72 |         self._custom_argument = custom_argument
73 |         self._download_path = download_path
74 |         self._auto_install_driver = auto_install_driver
75 |         self._use_stealth_js = use_stealth_js
76 |         self._driver_type = driver_type
77 |         self._kwargs = kwargs
78 | 
79 |     @abc.abstractmethod
80 |     def quit(self):
81 |         pass
82 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/4/22 10:45 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from os.path import dirname, join
12 | from sys import version_info
13 | 
14 | import setuptools
15 | 
16 | if version_info < (3, 6, 0):
17 |     raise SystemExit("Sorry! feapder requires python 3.6.0 or later.")
18 | 
19 | with open(join(dirname(__file__), "feapder/VERSION"), "rb") as fh:
20 |     version = fh.read().decode("ascii").strip()
21 | 
22 | with open("README.md", "r", encoding="utf8") as fh:
23 |     long_description = fh.read()
24 | 
25 | packages = setuptools.find_packages()
26 | packages.extend(
27 |     [
28 |         "feapder",
29 |         "feapder.templates",
30 |         "feapder.templates.project_template",
31 |         "feapder.templates.project_template.spiders",
32 |         "feapder.templates.project_template.items",
33 |     ]
34 | )
35 | 
36 | requires = [
37 |     "better-exceptions>=0.2.2",
38 |     "DBUtils>=2.0",
39 |     "parsel>=1.5.2",
40 |     "PyMySQL>=0.9.3",
41 |     "redis>=2.10.6,<4.0.0",
42 |     "requests>=2.22.0",
43 |     "bs4>=0.0.1",
44 |     "ipython>=7.14.0",
45 |     "cryptography>=3.3.2",
46 |     "urllib3>=1.25.8",
47 |     "loguru>=0.5.3",
48 |     "influxdb>=5.3.1",
49 |     "pyperclip>=1.8.2",
50 |     "terminal-layout>=2.1.3",
51 | ]
52 | 
53 | render_requires = [
54 |     "webdriver-manager>=4.0.0",
55 |     "playwright",
56 |     "selenium>=3.141.0",
57 | ]
58 | 
59 | all_requires = [
60 |     "bitarray>=1.5.3",
61 |     "PyExecJS>=1.5.1",
62 |     "pymongo>=3.10.1",
63 |     "redis-py-cluster>=2.1.0",
64 | ] + render_requires
65 | 
66 | setuptools.setup(
67 |     name="feapder",
68 |     version=version,
69 |     author="Boris",
70 |     license="MIT",
71 |     author_email="feapder@qq.com",
72 |     python_requires=">=3.6",
73 |     description="feapder是一款支持分布式、批次采集、数据防丢、报警丰富的python爬虫框架",
74 |     long_description=long_description,
75 |     long_description_content_type="text/markdown",
76 |     install_requires=requires,
77 |     extras_require={"all": all_requires, "render": render_requires},
78 |     entry_points={"console_scripts": ["feapder = feapder.commands.cmdline:execute"]},
79 |     url="https://github.com/Boris-code/feapder.git",
80 |     packages=packages,
81 |     include_package_data=True,
82 |     classifiers=["Programming Language :: Python :: 3"],
83 | )
84 | 


--------------------------------------------------------------------------------
/tests/air-spider/test_air_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/4/22 10:41 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import feapder
12 | 
13 | 
14 | class TestAirSpider(feapder.AirSpider):
15 |     __custom_setting__ = dict(
16 |         USE_SESSION=True,
17 |         TASK_MAX_CACHED_SIZE=10,
18 |     )
19 | 
20 |     def start_callback(self):
21 |         print("爬虫开始")
22 | 
23 |     def end_callback(self):
24 |         print("爬虫结束")
25 | 
26 |     def start_requests(self, *args, **kws):
27 |         for i in range(1):
28 |             print(i)
29 |             yield feapder.Request("https://www.baidu.com")
30 | 
31 |     def download_midware(self, request):
32 |         # request.headers = {'User-Agent': ""}
33 |         # request.proxies = {"https":"https://12.12.12.12:6666"}
34 |         # request.cookies = {}
35 |         return request
36 | 
37 |     def validate(self, request, response):
38 |         if response.status_code != 200:
39 |             raise Exception("response code not 200")  # retry
40 | 
41 |         # if "哈哈" not in response.text:
42 |         #     return False  # discard this request
43 | 
44 |     def parse(self, request, response):
45 |         print(response.bs4().title)
46 |         print(response.xpath("//title").extract_first())
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     TestAirSpider(thread_count=1).start()
51 | 


--------------------------------------------------------------------------------
/tests/air-spider/test_air_spider_filter.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/4/22 10:41 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import feapder
12 | 
13 | 
14 | class TestAirSpider(feapder.AirSpider):
15 |     __custom_setting__ = dict(
16 |         REQUEST_FILTER_ENABLE=True,  # request dedup
17 |         # REQUEST_FILTER_SETTING=dict(
18 |         #     filter_type=3,  # permanent dedup (BloomFilter) = 1, in-memory dedup (MemoryFilter) = 2, temporary dedup (ExpireFilter) = 3, lightweight dedup (LiteFilter) = 4
19 |         #     expire_time=2592000,  # expires after one month
20 |         # ),
21 |         REQUEST_FILTER_SETTING=dict(
22 |             filter_type=4,  # permanent dedup (BloomFilter) = 1, in-memory dedup (MemoryFilter) = 2, temporary dedup (ExpireFilter) = 3, lightweight dedup (LiteFilter) = 4
23 |         ),
24 |     )
25 | 
26 |     def start_requests(self, *args, **kws):
27 |         for i in range(200):
28 |             yield feapder.Request("https://www.baidu.com")
29 | 
30 |     def parse(self, request, response):
31 |         print(response.bs4().title)
32 | 
33 | 
34 | if __name__ == "__main__":
35 |     TestAirSpider(thread_count=1).start()
36 | 


--------------------------------------------------------------------------------
/tests/air-spider/test_air_spider_item.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-30 10:27:21
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from feapder import Item
12 | 
13 | 
14 | class TestAirSpiderItem(feapder.AirSpider):
15 |     __custom_setting__ = dict(
16 |         MYSQL_IP="localhost",
17 |         MYSQL_PORT=3306,
18 |         MYSQL_DB="feapder",
19 |         MYSQL_USER_NAME="feapder",
20 |         MYSQL_USER_PASS="feapder123",
21 |         ITEM_FILTER_ENABLE=True,  # item 去重
22 |         ITEM_FILTER_SETTING=dict(
23 |             filter_type=4  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、轻量去重(LiteFilter)= 4
24 |         ),
25 |     )
26 | 
27 |     def start_requests(self):
28 |         yield feapder.Request("https://www.baidu.com")
29 | 
30 |     def parse(self, request, response):
31 |         title = response.xpath("string(//title)").extract_first()
32 |         for i in range(3):
33 |             item = Item()
34 |             item.table_name = "spider_data"
35 |             item.url = request.url
36 |             item.title = title
37 |             yield item
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     TestAirSpiderItem().start()
42 | 


--------------------------------------------------------------------------------
/tests/air-spider/test_render_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/4/22 10:41 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import feapder
12 | 
13 | 
14 | class TestAirSpider(feapder.AirSpider):
15 |     def start_requests(self, *args, **kws):
16 |         yield feapder.Request("https://www.baidu.com", render=True)
17 | 
18 |     # def download_midware(self, request):
19 |     #     request.proxies = {
20 |     #         "http": "http://xxx.xxx.xxx.xxx:8888",
21 |     #         "https": "http://xxx.xxx.xxx.xxx:8888",
22 |     #     }
23 | 
24 |     def parse(self, request, response):
25 |         print(response.bs4().title)
26 | 
27 | 
28 | if __name__ == "__main__":
29 |     TestAirSpider(thread_count=1).start()
30 | 


--------------------------------------------------------------------------------
/tests/batch-spider-integration/batch_spider_integration_task.sql:
--------------------------------------------------------------------------------
 1 | -- ----------------------------
 2 | -- Table structure for batch_spider_integration_task
 3 | -- ----------------------------
 4 | CREATE TABLE `batch_spider_integration_task` (
 5 |   `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
 6 |   `url` varchar(255) DEFAULT NULL,
 7 |   `parser_name` varchar(255) DEFAULT NULL,
 8 |   `state` int(11) DEFAULT '0',
 9 |   PRIMARY KEY (`id`)
10 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
11 | 
12 | -- ----------------------------
13 | -- Records of batch_spider_integration_task
14 | -- ----------------------------
15 | INSERT INTO `batch_spider_integration_task` VALUES (1, 'https://news.sina.com.cn/', 'SinaNewsParser', 0);
16 | INSERT INTO `batch_spider_integration_task` VALUES (2, 'https://news.qq.com/', 'TencentNewsParser', 0);


--------------------------------------------------------------------------------
/tests/batch-spider-integration/items/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/tests/batch-spider-integration/items/__init__.py


--------------------------------------------------------------------------------
/tests/batch-spider-integration/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-02 23:38:24
 4 | ---------
 5 | @summary: 爬虫入口
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from feapder import ArgumentParser
11 | from feapder import BatchSpider
12 | 
13 | from spiders import *
14 | 
15 | 
16 | def batch_spider_integration_test(args):
17 |     """
18 |     BatchSpider集成测试
19 |     """
20 | 
21 |     spider = BatchSpider(
22 |         task_table="batch_spider_integration_task",  # mysql中的任务表
23 |         batch_record_table="batch_spider_integration_batch_record",  # mysql中的批次记录表
24 |         batch_name="批次爬虫集成测试",  # 批次名字
25 |         batch_interval=7,  # 批次时间 天为单位 若为小时 可写 1 / 24
26 |         task_keys=["id", "url", "parser_name"],  # 集成批次爬虫,需要将批次爬虫的名字取出来,任务分发时才知道分发到哪个模板上
27 |         redis_key="feapder:test_batch_spider_integration",  # redis中存放request等信息的根key
28 |         task_state="state",  # mysql中任务状态字段
29 |     )
30 | 
31 |     # 集成
32 |     spider.add_parser(sina_news_parser.SinaNewsParser)
33 |     spider.add_parser(tencent_news_parser.TencentNewsParser)
34 | 
35 |     if args == 1:
36 |         spider.start_monitor_task()
37 |     elif args == 2:
38 |         spider.start()
39 | 
40 | 
41 | if __name__ == "__main__":
42 |     parser = ArgumentParser(description="批次爬虫集成测试")
43 | 
44 |     parser.add_argument(
45 |         "--batch_spider_integration_test",
46 |         type=int,
47 |         nargs=1,
48 |         help="批次爬虫集成测试(1|2)",
49 |         function=batch_spider_integration_test,
50 |     )
51 | 
52 |     parser.start()
53 | 
54 |     # 运行
55 |     # 下发任务及监控进度 python3 main.py --batch_spider_integration_test 1
56 |     # 采集 python3 main.py --batch_spider_integration_test 2
57 | 


--------------------------------------------------------------------------------
/tests/batch-spider-integration/setting.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """爬虫配置文件"""
 3 | import os
 4 | 
 5 | # MYSQL
 6 | MYSQL_IP = "localhost"
 7 | MYSQL_PORT = 3306
 8 | MYSQL_DB = "feapder"
 9 | MYSQL_USER_NAME = "feapder"
10 | MYSQL_USER_PASS = "feapder123"
11 | 
12 | # REDIS
13 | # IP:PORT
14 | REDISDB_IP_PORTS = "localhost:6379"
15 | REDISDB_USER_PASS = ""
16 | # 默认 0 到 15 共16个数据库
17 | REDISDB_DB = 0
18 | 
19 | # # 爬虫相关
20 | # # COLLECTOR
21 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔
22 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量
23 | #
24 | # # SPIDER
25 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数
26 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间)
27 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数
28 | 
29 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败
30 | # RETRY_FAILED_REQUESTS = False
31 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒
32 | # REQUEST_LOST_TIMEOUT = 600  # 10分钟
33 | # # 保存失败的request
34 | # SAVE_FAILED_REQUEST = True
35 | #
36 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用
37 | # RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
38 | # RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
39 | # RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
40 | #
41 | # WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
42 | #
43 | # # 爬虫初始化工作
44 | # # 爬虫是否常驻
45 | # KEEP_ALIVE = False
46 | #
47 | #
48 | # # 设置代理
49 | # PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
50 | # PROXY_ENABLE = True
51 | #
52 | # # 随机headers
53 | # RANDOM_HEADERS = True
54 | # # requests 使用session
55 | # USE_SESSION = False
56 | #
57 | # # 去重
58 | # ITEM_FILTER_ENABLE = False # item 去重
59 | # REQUEST_FILTER_ENABLE = False # request 去重
60 | #
61 | # # 报警
62 | # DINGDING_WARNING_URL = "" # 钉钉机器人api
63 | # DINGDING_WARNING_PHONE = "" # 报警人
64 | # LINGXI_TOKEN = "" # 灵犀报警token
65 | #
66 | # LOG_NAME = os.path.basename(os.getcwd())
67 | # LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
68 | # LOG_LEVEL = "DEBUG"
69 | # LOG_IS_WRITE_TO_FILE = False
70 | # OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
71 | 


--------------------------------------------------------------------------------
/tests/batch-spider-integration/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "sina_news_parser",
3 |     "tencent_news_parser"
4 | ]


--------------------------------------------------------------------------------
/tests/batch-spider-integration/spiders/sina_news_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-02 23:40:37
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class SinaNewsParser(feapder.BatchParser):
14 |     """
15 |     注意 这里继承的是BatchParser,而不是BatchSpider
16 |     """
17 | 
18 |     def start_requests(self, task):
19 |         task_id = task[0]
20 |         url = task[1]
21 |         yield feapder.Request(url, task_id=task_id)
22 | 
23 |     def parse(self, request, response):
24 |         title = response.xpath("//title/text()").extract_first()
25 |         print(self.name, title)
26 |         yield self.update_task_batch(request.task_id, 1)
27 | 


--------------------------------------------------------------------------------
/tests/batch-spider-integration/spiders/tencent_news_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-02 23:42:40
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class TencentNewsParser(feapder.BatchParser):
14 |     """
15 |     注意 这里继承的是BatchParser,而不是BatchSpider
16 |     """
17 | 
18 |     def start_requests(self, task):
19 |         task_id = task[0]
20 |         url = task[1]
21 |         yield feapder.Request(url, task_id=task_id)
22 | 
23 |     def parse(self, request, response):
24 |         title = response.xpath("//title/text()").extract_first()
25 |         print(self.name, title)
26 |         yield self.update_task_batch(request.task_id, 1)
27 | 


--------------------------------------------------------------------------------
/tests/batch-spider/items/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "spider_data_item"
3 | ]


--------------------------------------------------------------------------------
/tests/batch-spider/items/spider_data_item.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:39:27
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from feapder import Item
11 | 
12 | 
13 | class SpiderDataItem(Item):
14 |     """
15 |     This class was generated by feapder.
16 |     command: feapder create -i spider_data.
17 |     """
18 | 
19 |     def __init__(self, *args, **kwargs):
20 |         # self.id = None  # type : int(10) unsigned | allow_null : NO | key : PRI | default_value : None | extra : auto_increment | column_comment : 
21 |         self.title = None  # type : varchar(255) | allow_null : YES | key :  | default_value : None | extra : | column_comment :
22 | 


--------------------------------------------------------------------------------
/tests/batch-spider/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:02:02
 4 | ---------
 5 | @summary: 爬虫入口
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from spiders import *
11 | from feapder import ArgumentParser
12 | 
13 | 
14 | def crawl_test(args):
15 |     spider = test_spider.TestSpider(
16 |         redis_key="feapder:test_batch_spider",  # 分布式爬虫调度信息存储位置
17 |         task_table="batch_spider_task",  # mysql中的任务表
18 |         task_keys=["id", "url"],  # 需要获取任务表里的字段名,可添加多个
19 |         task_state="state",  # mysql中任务状态字段
20 |         batch_record_table="batch_spider_batch_record",  # mysql中的批次记录表
21 |         batch_name="批次爬虫测试(周全)",  # 批次名字
22 |         batch_interval=7,  # 批次周期 天为单位 若为小时 可写 1 / 24
23 |     )
24 | 
25 |     if args == 1:
26 |         spider.start_monitor_task()  # 下发及监控任务
27 |     else:
28 |         spider.start()  # 采集
29 | 
30 | def test_debug():
31 |     spider = test_spider.TestSpider.to_DebugBatchSpider(
32 |         task_id=1,
33 |         redis_key="feapder:test_batch_spider",  # 分布式爬虫调度信息存储位置
34 |         task_table="batch_spider_task",  # mysql中的任务表
35 |         task_keys=["id", "url"],  # 需要获取任务表里的字段名,可添加多个
36 |         task_state="state",  # mysql中任务状态字段
37 |         batch_record_table="batch_spider_batch_record",  # mysql中的批次记录表
38 |         batch_name="批次爬虫测试(周全)",  # 批次名字
39 |         batch_interval=7,  # 批次周期 天为单位 若为小时 可写 1 / 24
40 |     )
41 | 
42 | 
43 |     spider.start()  # 采集
44 | 
45 | 
46 | if __name__ == "__main__":
47 | 
48 |     parser = ArgumentParser(description="批次爬虫测试")
49 | 
50 |     parser.add_argument(
51 |         "--crawl_test", type=int, nargs=1, help="(1|2)", function=crawl_test
52 |     )
53 |     parser.add_argument("--test_debug", action="store_true", help="测试debug", function=test_debug)
54 | 
55 |     parser.start()
56 | 
57 |     # 运行
58 |     # 下发任务及监控进度 python3 main.py --crawl_test 1
59 |     # 采集 python3 main.py --crawl_test 2


--------------------------------------------------------------------------------
/tests/batch-spider/setting.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """爬虫配置文件"""
 3 | import os
 4 | 
 5 | 
 6 | # MYSQL
 7 | MYSQL_IP = "localhost"
 8 | MYSQL_PORT = 3306
 9 | MYSQL_DB = "feapder"
10 | MYSQL_USER_NAME = "feapder"
11 | MYSQL_USER_PASS = "feapder123"
12 | 
13 | # REDIS
14 | # IP:PORT
15 | REDISDB_IP_PORTS = "localhost:6379"
16 | REDISDB_USER_PASS = ""
17 | # 默认 0 到 15 共16个数据库
18 | REDISDB_DB = 0
19 | 
20 | # # 爬虫相关
21 | # # COLLECTOR
22 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔
23 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量
24 | #
25 | # # SPIDER
26 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数
27 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间)
28 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数
29 | 
30 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败
31 | # RETRY_FAILED_REQUESTS = False
32 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒
33 | # REQUEST_LOST_TIMEOUT = 600  # 10分钟
34 | # # 保存失败的request
35 | # SAVE_FAILED_REQUEST = True
36 | #
37 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用
38 | # RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
39 | # RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
40 | # RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
41 | #
42 | # WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
43 | #
44 | # # 爬虫初始化工作
45 | # # 爬虫是否常驻
46 | # KEEP_ALIVE = False
47 | #
48 | #
49 | # # 设置代理
50 | # PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
51 | # PROXY_ENABLE = True
52 | #
53 | # # 随机headers
54 | # RANDOM_HEADERS = True
55 | # # requests 使用session
56 | # USE_SESSION = False
57 | #
58 | # # 去重
59 | # ITEM_FILTER_ENABLE = False # item 去重
60 | # REQUEST_FILTER_ENABLE = False # request 去重
61 | #
62 | # # 报警
63 | # DINGDING_WARNING_URL = "" # 钉钉机器人api
64 | # DINGDING_WARNING_PHONE = "" # 报警人
65 | # LINGXI_TOKEN = "" # 灵犀报警token
66 | #
67 | # LOG_NAME = os.path.basename(os.getcwd())
68 | # LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
69 | # LOG_LEVEL = "DEBUG"
70 | # LOG_IS_WRITE_TO_FILE = False
71 | # OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
72 | 


--------------------------------------------------------------------------------
/tests/batch-spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "test_spider"
3 | ]


--------------------------------------------------------------------------------
/tests/batch-spider/spiders/test_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:09:47
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from items import *
12 | 
13 | 
14 | class TestSpider(feapder.BatchSpider):
15 |     # def init_task(self):
16 |     #     pass
17 | 
18 |     def start_requests(self, task):
19 |         # task 为在任务表中取出的每一条任务
20 |         id, url = task  # id, url为所取的字段,main函数中指定的
21 |         yield feapder.Request(url, task_id=id, render=True)  # task_id为任务id,用于更新任务状态
22 | 
23 |     def parse(self, request, response):
24 |         title = response.xpath('//title/text()').extract_first()  # 取标题
25 |         item = spider_data_item.SpiderDataItem()  # 声明一个item
26 |         item.title = title  # 给item属性赋值
27 |         yield item  # 返回item, item会自动批量入库
28 |         yield self.update_task_batch(request.task_id, 1) # 更新任务状态为1
29 | 
30 |     def exception_request(self, request, response):
31 |         """
32 |         @summary: 请求或者parser里解析出异常的request
33 |         ---------
34 |         @param request:
35 |         @param response:
36 |         ---------
37 |         @result: request / callback / None (返回值必须可迭代)
38 |         """
39 | 
40 |         pass
41 | 
42 |     def failed_request(self, request, response):
43 |         """
44 |         @summary: 超过最大重试次数的request
45 |         ---------
46 |         @param request:
47 |         ---------
48 |         @result: request / item / callback / None (返回值必须可迭代)
49 |         """
50 | 
51 |         yield request
52 |         yield self.update_task_batch(request.task_id, -1)  # 更新任务状态为-1
53 | 


--------------------------------------------------------------------------------
/tests/batch-spider/table.sql:
--------------------------------------------------------------------------------
 1 | -- ----------------------------
 2 | -- Table structure for spider_data
 3 | -- ----------------------------
 4 | CREATE TABLE `spider_data` (
 5 |   `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
 6 |   `title` varchar(255) DEFAULT NULL,
 7 |   PRIMARY KEY (`id`)
 8 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
 9 | 
10 | -- ----------------------------
11 | -- Table structure for batch_spider_task
12 | -- ----------------------------
13 | CREATE TABLE `batch_spider_task` (
14 |   `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
15 |   `url` varchar(255) DEFAULT NULL,
16 |   `state` int(11) DEFAULT '0',
17 |   PRIMARY KEY (`id`)
18 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
19 | 
20 | -- ----------------------------
21 | -- Records of batch_spider_task
22 | -- ----------------------------
23 | INSERT INTO `batch_spider_task` VALUES (1, 'https://www.baidu.com', 0);
24 | 


--------------------------------------------------------------------------------
/tests/db/test_redis.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/3/4 11:01 下午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder.db.redisdb import RedisDB
12 | 
13 | redis = RedisDB(ip_ports="localhost:6379", db=0)
14 | 
15 | redis.lpush("l_test", 2)
16 | redis.lpush("l_test", 3)
17 | 
18 | print(redis.lrange("l_test"))
19 | print(redis.lrem("l_test", 2))
20 | print(redis.lrange("l_test"))
21 | 


--------------------------------------------------------------------------------
/tests/jd_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-09 20:45:36
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from feapder import Item
12 | from feapder.utils import tools
13 | 
14 | 
15 | class JdSpider(feapder.BatchSpider):
16 |     # 自定义数据库,若项目中有setting.py文件,此自定义可删除
17 |     __custom_setting__ = dict(
18 |         REDISDB_IP_PORTS="localhost:6379",
19 |         REDISDB_DB=0,
20 |         MYSQL_IP="localhost",
21 |         MYSQL_PORT=3306,
22 |         MYSQL_DB="feapder",
23 |         MYSQL_USER_NAME="feapder",
24 |         MYSQL_USER_PASS="feapder123",
25 |     )
26 | 
27 |     def start_requests(self, task):
28 |         task_id, item_id = task
29 |         url = "https://item.jd.com/{}.html".format(item_id)
30 |         yield feapder.Request(url, task_id=task_id)  # 携带task_id字段
31 | 
32 |     def parse(self, request, response):
33 |         title = response.xpath("string(//div[@class='sku-name'])").extract_first(default="").strip()
34 | 
35 |         item = Item()
36 |         item.table_name = "jd_item"  # 指定入库的表名
37 |         item.title = title
38 |         item.batch_date = self.batch_date  # 获取批次信息,批次信息框架自己维护
39 |         item.crawl_time = tools.get_current_date()  # 获取当前时间
40 |         yield item  # 自动批量入库
41 |         yield self.update_task_batch(request.task_id, 1)  # 更新任务状态
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     spider = JdSpider(
46 |         redis_key="feapder:jd_item",  # redis中存放任务等信息key前缀
47 |         task_table="jd_item_task",  # mysql中的任务表
48 |         task_keys=["id", "item_id"],  # 需要获取任务表里的字段名,可添加多个
49 |         task_state="state",  # mysql中任务状态字段
50 |         batch_record_table="jd_item_batch_record",  # mysql中的批次记录表,自动生成
51 |         batch_name="京东商品爬虫(周度全量)",  # 批次名字
52 |         batch_interval=7,  # 批次周期 天为单位 若为小时 可写 1 / 24
53 |     )
54 | 
55 |     # 下面两个启动函数 相当于 master、worker。需要分开运行
56 |     # spider.start_monitor_task() # master: 下发及监控任务
57 |     spider.start()  # worker: 采集
58 | 
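
Before start_monitor_task() has anything to distribute, the jd_item_task seed table must already exist and contain rows. A seeding sketch (an assumption, not part of the repo): it presumes a schema analogous to the batch task tables elsewhere in these tests (id auto-increment, item_id varchar, state int default 0) and uses MysqlDB's dict-based insert helper; the item ids below are placeholders.

from feapder.db.mysqldb import MysqlDB

db = MysqlDB(
    ip="localhost", port=3306, db="feapder", user_name="feapder", user_pass="feapder123"
)
for item_id in ("123456", "654321"):  # placeholder JD item ids
    # add_smart builds the INSERT statement from the dict's keys
    db.add_smart("jd_item_task", {"item_id": item_id, "state": 0})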


--------------------------------------------------------------------------------
/tests/mongo_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:06:12
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from feapder import Item, UpdateItem
12 | 
13 | 
14 | class TestMongo(feapder.AirSpider):
15 |     __custom_setting__ = dict(
16 |         ITEM_PIPELINES=["feapder.pipelines.mongo_pipeline.MongoPipeline"],
17 |         MONGO_IP="localhost",
18 |         MONGO_PORT=27017,
19 |         MONGO_DB="feapder",
20 |         MONGO_USER_NAME="",
21 |         MONGO_USER_PASS="",
22 |     )
23 | 
24 |     def start_requests(self):
25 |         yield feapder.Request("https://www.baidu.com")
26 | 
27 |     def parse(self, request, response):
28 |         title = response.xpath("//title/text()").extract_first()  # 取标题
29 |         for i in range(10):
30 |             item = Item()  # 声明一个item
31 |             item.table_name = "test_mongo"
32 |             item.title = title + str(666)  # 给item属性赋值
33 |             item.i = i + 5
34 |             item.c = "777"
35 |             yield item  # 返回item, item会自动批量入库
36 | 
37 | 
38 | if __name__ == "__main__":
39 |     TestMongo().start()
40 | 
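
UpdateItem is imported above but never used by the spider. For completeness, a rough construction sketch, assuming the update_key attribute described in docs/source_code/UpdateItem.md; the field values are illustrative only.

from feapder import UpdateItem

item = UpdateItem()
item.table_name = "test_mongo"  # same dynamic table-name mechanism as Item
item.title = "some title"
item.update_key = ["title"]     # columns to overwrite when the record already exists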


--------------------------------------------------------------------------------
/tests/spider-integration/items/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/tests/spider-integration/items/__init__.py


--------------------------------------------------------------------------------
/tests/spider-integration/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-02 23:38:24
 4 | ---------
 5 | @summary: 爬虫入口
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from feapder import Spider
11 | 
12 | from spiders import *
13 | 
14 | 
15 | def spider_integration_test():
16 |     """
17 |     Spider集成测试
18 |     """
19 |     spider = Spider(redis_key="feapder:test_spider_integration")
20 |     # 集成
21 |     spider.add_parser(sina_news_parser.SinaNewsParser)
22 |     spider.add_parser(tencent_news_parser.TencentNewsParser)
23 | 
24 |     spider.start()
25 | 
26 | 
27 | if __name__ == "__main__":
28 |     spider_integration_test()
29 | 


--------------------------------------------------------------------------------
/tests/spider-integration/setting.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """爬虫配置文件"""
 3 | import os
 4 | 
 5 | # MYSQL
 6 | MYSQL_IP = "localhost"
 7 | MYSQL_PORT = 3306
 8 | MYSQL_DB = "feapder"
 9 | MYSQL_USER_NAME = "feapder"
10 | MYSQL_USER_PASS = "feapder123"
11 | 
12 | # REDIS
13 | # IP:PORT
14 | REDISDB_IP_PORTS = "localhost:6379"
15 | REDISDB_USER_PASS = ""
16 | # 默认 0 到 15 共16个数据库
17 | REDISDB_DB = 0
18 | 
19 | # # 爬虫相关
20 | # # COLLECTOR
21 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔
22 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量
23 | #
24 | # # SPIDER
25 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数
26 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间)
27 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数
28 | 
29 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败
30 | # RETRY_FAILED_REQUESTS = False
31 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒
32 | # REQUEST_LOST_TIMEOUT = 600  # 10分钟
33 | # # 保存失败的request
34 | # SAVE_FAILED_REQUEST = True
35 | #
36 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用
37 | # RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
38 | # RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
39 | # RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
40 | #
41 | # WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
42 | #
43 | # # 爬虫初始化工作
44 | # # 爬虫是否常驻
45 | # KEEP_ALIVE = False
46 | #
47 | #
48 | # # 设置代理
49 | # PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
50 | # PROXY_ENABLE = True
51 | #
52 | # # 随机headers
53 | # RANDOM_HEADERS = True
54 | # # requests 使用session
55 | # USE_SESSION = False
56 | #
57 | # # 去重
58 | # ITEM_FILTER_ENABLE = False # item 去重
59 | # REQUEST_FILTER_ENABLE = False # request 去重
60 | #
61 | # # 报警
62 | # DINGDING_WARNING_URL = "" # 钉钉机器人api
63 | # DINGDING_WARNING_PHONE = "" # 报警人
64 | # LINGXI_TOKEN = "" # 灵犀报警token
65 | #
66 | # LOG_NAME = os.path.basename(os.getcwd())
67 | # LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
68 | # LOG_LEVEL = "DEBUG"
69 | # LOG_IS_WRITE_TO_FILE = False
70 | # OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
71 | 


--------------------------------------------------------------------------------
/tests/spider-integration/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "sina_news_parser",
3 |     "tencent_news_parser"
4 | ]


--------------------------------------------------------------------------------
/tests/spider-integration/spiders/sina_news_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-02 23:40:37
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class SinaNewsParser(feapder.BaseParser):
14 |     def start_requests(self):
15 |         """
16 |         注意 这里继承的是BaseParser,而不是Spider
17 |         """
18 |         yield feapder.Request("https://news.sina.com.cn/")
19 | 
20 |     def parse(self, request, response):
21 |         title = response.xpath("//title/text()").extract_first()
22 |         print(title)
23 | 


--------------------------------------------------------------------------------
/tests/spider-integration/spiders/tencent_news_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-02 23:42:40
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class TencentNewsParser(feapder.BaseParser):
14 |     """
15 |     注意 这里继承的是BaseParser,而不是Spider
16 |     """
17 |     def start_requests(self):
18 |         yield feapder.Request("https://news.qq.com/")
19 | 
20 |     def parse(self, request, response):
21 |         title = response.xpath("//title/text()").extract_first()
22 |         print(title)
23 | 


--------------------------------------------------------------------------------
/tests/spider/items/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "spider_data_item"
3 | ]


--------------------------------------------------------------------------------
/tests/spider/items/spider_data_item.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-10 17:28:36
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from feapder import Item
11 | 
12 | 
13 | class SpiderDataItem(Item):
14 |     """
15 |     This class was generated by feapder.
16 |     command: feapder create -i spider_data.
17 |     """
18 | 
19 |     def __init__(self, *args, **kwargs):
20 |         # self.id = None
21 |         self.title = None
22 | 


--------------------------------------------------------------------------------
/tests/spider/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:01:50
 4 | ---------
 5 | @summary: 爬虫入口
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from spiders import *
11 | 
12 | if __name__ == "__main__":
13 |     spider = test_spider.TestSpider(redis_key="feapder3:test_spider", thread_count=100, keep_alive=False)
14 |     spider.start()


--------------------------------------------------------------------------------
/tests/spider/setting.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """爬虫配置文件"""
 3 | import os
 4 | 
 5 | 
 6 | # MYSQL
 7 | MYSQL_IP = "localhost"
 8 | MYSQL_PORT = 3306
 9 | MYSQL_DB = "feapder"
10 | MYSQL_USER_NAME = "feapder"
11 | MYSQL_USER_PASS = "feapder123"
12 | 
13 | # REDIS
14 | # IP:PORT
15 | REDISDB_IP_PORTS = "localhost:6379"
16 | REDISDB_USER_PASS = ""
17 | REDISDB_DB = 0
18 | 
19 | # # 爬虫相关
20 | # # COLLECTOR
21 | COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔
22 | COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量
23 | #
24 | # # SPIDER
25 | SPIDER_THREAD_COUNT = 100 # 爬虫并发数
26 | SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间)
27 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数
28 | 
29 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败
30 | # RETRY_FAILED_REQUESTS = False
31 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒
32 | # REQUEST_LOST_TIMEOUT = 600  # 10分钟
33 | # # 保存失败的request
34 | # SAVE_FAILED_REQUEST = True
35 | #
36 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用
37 | # RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
38 | # RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
39 | # RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
40 | #
41 | # WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
42 | #
43 | # # 爬虫初始化工作
44 | # # 爬虫是否常驻
45 | # KEEP_ALIVE = True
46 | #
47 | # # 设置代理
48 | # PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
49 | # PROXY_ENABLE = True
50 | #
51 | # # 随机headers
52 | # RANDOM_HEADERS = True
53 | # # requests 使用session
54 | # USE_SESSION = False
55 | #
56 | # # 去重
57 | # ITEM_FILTER_ENABLE = False # item 去重
58 | # REQUEST_FILTER_ENABLE = False # request 去重
59 | #
60 | # # 报警
61 | # DINGDING_WARNING_URL = "" # 钉钉机器人api
62 | # DINGDING_WARNING_PHONE = "" # 报警人
63 | # LINGXI_TOKEN = "" # 灵犀报警token
64 | #
65 | # LOG_NAME = os.path.basename(os.getcwd())
66 | # LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
67 | # LOG_LEVEL = "DEBUG"
68 | # LOG_IS_WRITE_TO_FILE = False
69 | # OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
70 | REQUEST_FILTER_ENABLE = True  # request 去重
71 | # REQUEST_FILTER_SETTING=dict(
72 | #     filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4
73 | #     expire_time=2592000,  # 过期时间1个月
74 | # ),
75 | REQUEST_FILTER_SETTING = dict(
76 |     filter_type=4,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3、 轻量去重(LiteFilter)= 4
77 | )


--------------------------------------------------------------------------------
/tests/spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["test_spider", "test_spider2"]
2 | 


--------------------------------------------------------------------------------
/tests/spider/spiders/test_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:06:12
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from items import *
12 | 
13 | 
14 | class TestSpider(feapder.Spider):
15 |     def start_requests(self):
16 |         for i in range(1):
17 |             yield feapder.Request(f"https://www.baidu.com#{i}", callback=self.parse)
18 | 
19 |     def validate(self, request, response):
20 |         if response.status_code != 200:
21 |             raise Exception("response code not 200")  # 重试
22 | 
23 |         # if "哈哈" not in response.text:
24 |         #     return False # 抛弃当前请求
25 | 
26 |     def parse(self, request, response):
27 |         title = response.xpath("//title/text()").extract_first()  # 取标题
28 |         item = spider_data_item.SpiderDataItem()  # 声明一个item
29 |         item.title = title  # 给item属性赋值
30 |         yield item  # 返回item, item会自动批量入库
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     spider = TestSpider(redis_key="feapder3:test_spider", thread_count=100)
35 |     spider.start()


--------------------------------------------------------------------------------
/tests/spider/spiders/test_spider2.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:06:12
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from items import *
12 | 
13 | 
14 | class TestSpider2(feapder.Spider):
15 |     def start_requests(self):
16 |         for i in range(100):
17 |             yield feapder.Request("https://www.baidu.com#{}".format(i))
18 | 
19 |     def parse(self, request, response):
20 |         title = response.xpath("//title/text()").extract_first()  # 取标题
21 |         item = spider_data_item.SpiderDataItem()  # 声明一个item
22 |         item.title = title  # 给item属性赋值
23 |         yield item  # 返回item, item会自动批量入库
24 | 


--------------------------------------------------------------------------------
/tests/spider/table.sql:
--------------------------------------------------------------------------------
1 | -- ----------------------------
2 | -- Table structure for spider_data
3 | -- ----------------------------
4 | CREATE TABLE `spider_data` (
5 |   `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
6 |   `title` varchar(255) DEFAULT NULL,
7 |   PRIMARY KEY (`id`)
8 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
9 | 


--------------------------------------------------------------------------------
/tests/task-spider/test_task_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022-06-10 14:30:54
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from feapder import ArgumentParser
12 | 
13 | 
14 | class TestTaskSpider(feapder.TaskSpider):
15 |     def add_task(self):
16 |         # 加种子任务 框架会调用这个函数,方便往redis里塞任务,但不能写成死循环。实际业务中可以自己写个脚本往redis里塞任务
17 |         self._redisdb.zadd(self._task_table, {"id": 1, "url": "https://www.baidu.com"})
18 | 
19 |     def start_requests(self, task):
20 |         task_id, url = task
21 |         yield feapder.Request(url, task_id=task_id)
22 | 
23 |     def parse(self, request, response):
24 |         # 提取网站title
25 |         print(response.xpath("//title/text()").extract_first())
26 |         # 提取网站描述
27 |         print(response.xpath("//meta[@name='description']/@content").extract_first())
28 |         print("网站地址: ", response.url)
29 | 
30 |         # mysql 需要更新任务状态为做完 即 state=1
31 |         # yield self.update_task_batch(request.task_id)
32 | 
33 | 
34 | def start(args):
35 |     """
36 |     用mysql做种子表
37 |     """
38 |     spider = TestTaskSpider(
39 |         task_table="spider_task",
40 |         task_keys=["id", "url"],
41 |         redis_key="test:task_spider",
42 |         keep_alive=True,
43 |     )
44 |     if args == 1:
45 |         spider.start_monitor_task()
46 |     else:
47 |         spider.start()
48 | 
49 | 
50 | def start2(args):
51 |     """
52 |     用redis做种子表
53 |     """
54 |     spider = TestTaskSpider(
55 |         task_table="spider_task2",
56 |         task_table_type="redis",
57 |         redis_key="test:task_spider",
58 |         keep_alive=True,
59 |         use_mysql=False,
60 |     )
61 |     if args == 1:
62 |         spider.start_monitor_task()
63 |     else:
64 |         spider.start()
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     parser = ArgumentParser(description="测试TaskSpider")
69 | 
70 |     parser.add_argument(
71 |         "--start", type=int, nargs=1, help="用mysql做种子表 (1|2)", function=start
72 |     )
73 |     parser.add_argument(
74 |         "--start2", type=int, nargs=1, help="用redis做种子表 (1|2)", function=start2
75 |     )
76 | 
77 |     parser.start()
78 | 
79 |     # 下发任务  python3 test_task_spider.py --start 1
80 |     # 采集  python3 test_task_spider.py --start 2
81 | 


--------------------------------------------------------------------------------
/tests/test-debugger/README.md:
--------------------------------------------------------------------------------
1 | # xxx爬虫文档
2 | ## 调研
3 | 
4 | ## 数据库设计
5 | 
6 | ## 爬虫逻辑
7 | 
8 | ## 项目架构


--------------------------------------------------------------------------------
/tests/test-debugger/items/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Boris-code/feapder/100cde40eb3c9d03a3fa0af23f22c39c5a523bb8/tests/test-debugger/items/__init__.py


--------------------------------------------------------------------------------
/tests/test-debugger/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2023-06-09 20:26:29
 4 | ---------
 5 | @summary: 爬虫入口
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | 
12 | from spiders import *
13 | 
14 | 
15 | if __name__ == "__main__":
16 |     test_debugger.TestDebugger.to_DebugSpider(
17 |         request=feapder.Request("https://spidertools.cn", render=True),
18 |         redis_key="test:xxx",
19 |     ).start()
20 | 


--------------------------------------------------------------------------------
/tests/test-debugger/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "test_debugger"
3 | ]


--------------------------------------------------------------------------------
/tests/test-debugger/spiders/test_debugger.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2023-06-09 20:26:47
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class TestDebugger(feapder.Spider):
14 |     def start_requests(self):
15 |         yield feapder.Request("https://spidertools.cn", render=True)
16 | 
17 |     def parse(self, request, response):
18 |         # 提取网站title
19 |         print(response.xpath("//title/text()").extract_first())
20 |         # 提取网站描述
21 |         print(response.xpath("//meta[@name='description']/@content").extract_first())
22 |         print("网站地址: ", response.url)
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     TestDebugger.to_DebugSpider(
27 |         request=feapder.Request("https://spidertools.cn", render=True), redis_key="test:xxx"
28 |     ).start()
29 | 


--------------------------------------------------------------------------------
/tests/test-pipeline/items/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "spider_data_item"
3 | ]


--------------------------------------------------------------------------------
/tests/test-pipeline/items/spider_data_item.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:39:27
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from feapder import Item
11 | 
12 | 
13 | class SpiderDataItem(Item):
14 |     """
15 |     This class was generated by feapder.
16 |     command: feapder create -i spider_data.
17 |     """
18 | 
19 |     def __init__(self, *args, **kwargs):
20 |         # self.id = None  # type : int(10) unsigned | allow_null : NO | key : PRI | default_value : None | extra : auto_increment | column_comment : 
21 |         self.title = None  # type : varchar(255) | allow_null : YES | key :  | default_value : None | extra : | column_comment :
22 | 


--------------------------------------------------------------------------------
/tests/test-pipeline/main.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:02:02
 4 | ---------
 5 | @summary: 爬虫入口
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | from spiders import *
11 | from feapder import ArgumentParser
12 | 
13 | 
14 | def crawl_test(args):
15 |     spider = test_spider.TestSpider(
16 |         redis_key="feapder:test_batch_spider",  # 分布式爬虫调度信息存储位置
17 |         task_table="batch_spider_task",  # mysql中的任务表
18 |         task_keys=["id", "url"],  # 需要获取任务表里的字段名,可添加多个
19 |         task_state="state",  # mysql中任务状态字段
20 |         batch_record_table="batch_spider_batch_record",  # mysql中的批次记录表
21 |         batch_name="批次爬虫测试(周全)",  # 批次名字
22 |         batch_interval=7,  # 批次周期 天为单位 若为小时 可写 1 / 24
23 |     )
24 | 
25 |     if args == 1:
26 |         spider.start_monitor_task()  # 下发及监控任务
27 |     else:
28 |         spider.start()  # 采集
29 | 
30 | 
31 | if __name__ == "__main__":
32 | 
33 |     parser = ArgumentParser(description="批次爬虫测试")
34 | 
35 |     parser.add_argument(
36 |         "--crawl_test", type=int, nargs=1, help="(1|2)", function=crawl_test
37 |     )
38 | 
39 |     parser.start()
40 | 
41 |     # 运行
42 |     # 下发任务及监控进度 python3 main.py --crawl_test 1
43 |     # 采集 python3 main.py --crawl_test 2


--------------------------------------------------------------------------------
/tests/test-pipeline/pipeline.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/3/18 12:39 上午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder.pipelines import BasePipeline
12 | from typing import Dict, List, Tuple
13 | 
14 | 
15 | class Pipeline(BasePipeline):
16 |     """
17 |     pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等
18 |     """
19 | 
20 |     def save_items(self, table, items: List[Dict]) -> bool:
21 |         """
22 |         保存数据
23 |         Args:
24 |             table: 表名
25 |             items: 数据,[{},{},...]
26 | 
27 |         Returns: 是否保存成功 True / False
28 |                  若False,不会将本批数据入到去重库,以便再次入库
29 | 
30 |         """
31 | 
32 |         print("自定义pipeline, 保存数据 >>>>", table, items)
33 | 
34 |         return True
35 | 
36 |     def update_items(self, table, items: List[Dict], update_keys: Tuple = ()) -> bool:
37 |         """
38 |         更新数据, 与UpdateItem配合使用,若爬虫中没使用UpdateItem,则可不实现此接口
39 |         Args:
40 |             table: 表名
41 |             items: 数据,[{},{},...]
42 |             update_keys: 更新的字段, 如 ("title", "publish_time")
43 | 
44 |         Returns: 是否更新成功 True / False
45 |                  若False,不会将本批数据入到去重库,以便再次入库
46 | 
47 |         """
48 | 
49 |         print("自定义pipeline, 更新数据 >>>>", table, items, update_keys)
50 | 
51 |         return True
52 | 


--------------------------------------------------------------------------------
/tests/test-pipeline/setting.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """爬虫配置文件"""
 3 | import os
 4 | 
 5 | 
 6 | # MYSQL
 7 | MYSQL_IP = "localhost"
 8 | MYSQL_PORT = 3306
 9 | MYSQL_DB = "feapder"
10 | MYSQL_USER_NAME = "feapder"
11 | MYSQL_USER_PASS = "feapder123"
12 | 
13 | # REDIS
14 | # IP:PORT
15 | REDISDB_IP_PORTS = "localhost:6379"
16 | REDISDB_USER_PASS = ""
17 | # 默认 0 到 15 共16个数据库
18 | REDISDB_DB = 0
19 | 
20 | # 数据入库的pipeline,可自定义,默认MysqlPipeline
21 | ITEM_PIPELINES = [
22 |     "pipeline.Pipeline"
23 | ]
24 | 
25 | # # 爬虫相关
26 | # # COLLECTOR
27 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔
28 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量
29 | #
30 | # # SPIDER
31 | # SPIDER_THREAD_COUNT = 10 # 爬虫并发数
32 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间)
33 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数
34 | 
35 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败
36 | # RETRY_FAILED_REQUESTS = False
37 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒
38 | # REQUEST_LOST_TIMEOUT = 600  # 10分钟
39 | # # 保存失败的request
40 | # SAVE_FAILED_REQUEST = True
41 | #
42 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用
43 | # RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
44 | # RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
45 | # RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
46 | #
47 | # WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
48 | #
49 | # # 爬虫初始化工作
50 | # # 爬虫是否常驻
51 | # KEEP_ALIVE = False
52 | #
53 | #
54 | # # 设置代理
55 | # PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
56 | # PROXY_ENABLE = True
57 | #
58 | # # 随机headers
59 | # RANDOM_HEADERS = True
60 | # # requests 使用session
61 | # USE_SESSION = False
62 | #
63 | # # 去重
64 | # ITEM_FILTER_ENABLE = False # item 去重
65 | # REQUEST_FILTER_ENABLE = False # request 去重
66 | #
67 | # # 报警
68 | # DINGDING_WARNING_URL = "" # 钉钉机器人api
69 | # DINGDING_WARNING_PHONE = "" # 报警人
70 | # LINGXI_TOKEN = "" # 灵犀报警token
71 | #
72 | # LOG_NAME = os.path.basename(os.getcwd())
73 | # LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
74 | # LOG_LEVEL = "DEBUG"
75 | # LOG_IS_WRITE_TO_FILE = False
76 | # OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
77 | 


--------------------------------------------------------------------------------
/tests/test-pipeline/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 |     "test_spider"
3 | ]


--------------------------------------------------------------------------------
/tests/test-pipeline/spiders/test_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-02-08 16:09:47
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from items import *
12 | 
13 | 
14 | class TestSpider(feapder.BatchSpider):
15 |     # def init_task(self):
16 |     #     pass
17 | 
18 |     def start_requests(self, task):
19 |         # task 为在任务表中取出的每一条任务
20 |         id, url = task  # id, url为所取的字段,main函数中指定的
21 |         yield feapder.Request(url, task_id=id)
22 | 
23 |     def parse(self, request, response):
24 |         title = response.xpath('//title/text()').extract_first()  # 取标题
25 |         item = spider_data_item.SpiderDataItem()  # 声明一个item
26 |         item.title = title  # 给item属性赋值
27 |         yield item  # 返回item, item会自动批量入库
28 |         yield self.update_task_batch(request.task_id, 1) # 更新任务状态为1
29 | 
30 |     def exception_request(self, request, response):
31 |         """
32 |         @summary: 请求或者parser里解析出异常的request
33 |         ---------
34 |         @param request:
35 |         @param response:
36 |         ---------
37 |         @result: request / callback / None (返回值必须可迭代)
38 |         """
39 | 
40 |         pass
41 | 
42 |     def failed_request(self, request, response):
43 |         """
44 |         @summary: 超过最大重试次数的request
45 |         ---------
46 |         @param request:
47 |         ---------
48 |         @result: request / item / callback / None (返回值必须可迭代)
49 |         """
50 | 
51 |         yield request
52 |         yield self.update_task_batch(request.task_id, -1) # 更新任务状态为-1
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/tests/test-pipeline/table.sql:
--------------------------------------------------------------------------------
 1 | -- ----------------------------
 2 | -- Table structure for spider_data
 3 | -- ----------------------------
 4 | CREATE TABLE `spider_data` (
 5 |   `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
 6 |   `title` varchar(255) DEFAULT NULL,
 7 |   PRIMARY KEY (`id`)
 8 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
 9 | 
10 | -- ----------------------------
11 | -- Table structure for batch_spider_task
12 | -- ----------------------------
13 | CREATE TABLE `batch_spider_task` (
14 |   `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
15 |   `url` varchar(255) DEFAULT NULL,
16 |   `state` int(11) DEFAULT '0',
17 |   PRIMARY KEY (`id`)
18 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
19 | 
20 | -- ----------------------------
21 | -- Records of batch_spider_task
22 | -- ----------------------------
23 | INSERT INTO `batch_spider_task` VALUES (1, 'https://www.baidu.com', 0);
24 | 


--------------------------------------------------------------------------------
/tests/test_dedup.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | from redis import Redis
  4 | 
  5 | from feapder.dedup import Dedup
  6 | 
  7 | 
  8 | class TestDedup(unittest.TestCase):
  9 |     def clear(self):
 10 |         self.absolute_name = "test_dedup"
 11 |         redis = Redis.from_url("redis://@localhost:6379/0", decode_responses=True)
 12 |         keys = redis.keys(self.absolute_name + "*")
 13 |         if keys:
 14 |             redis.delete(*keys)
 15 | 
 16 |     def setUp(self) -> None:
 17 |         self.clear()
 18 |         self.mock_data()
 19 | 
 20 |     def tearDown(self) -> None:
 21 |         self.clear()
 22 | 
 23 |     def mock_data(self):
 24 |         self.data = {"xxx": 123, "xxxx": "xxxx"}
 25 |         self.datas = ["xxx", "bbb", "xxx"]
 26 | 
 27 |     def test_MemoryFilter(self):
 28 |         dedup = Dedup(
 29 |             Dedup.MemoryFilter, absolute_name=self.absolute_name
 30 |         )  # in-memory filter, per-process only; no redis or expire time
 31 | 
 32 |         # 逐条去重
 33 |         self.assertEqual(dedup.add(self.data), 1)
 34 |         self.assertEqual(dedup.get(self.data), 1)
 35 | 
 36 |         # 批量去重
 37 |         self.assertEqual(dedup.get(self.datas), [0, 0, 1])
 38 |         self.assertEqual(dedup.add(self.datas), [1, 1, 0])
 39 |         self.assertEqual(dedup.get(self.datas), [1, 1, 1])
 40 | 
 41 |     def test_ExpireFilter(self):
 42 |         dedup = Dedup(
 43 |             Dedup.ExpireFilter,
 44 |             expire_time=10,
 45 |             redis_url="redis://@localhost:6379/0",
 46 |             absolute_name=self.absolute_name,
 47 |         )
 48 | 
 49 |         # 逐条去重
 50 |         self.assertEqual(dedup.add(self.data), 1)
 51 |         self.assertEqual(dedup.get(self.data), 1)
 52 | 
 53 |         # 批量去重
 54 |         self.assertEqual(dedup.get(self.datas), [0, 0, 1])
 55 |         self.assertEqual(dedup.add(self.datas), [1, 1, 0])
 56 |         self.assertEqual(dedup.get(self.datas), [1, 1, 1])
 57 | 
 58 |     def test_BloomFilter(self):
 59 |         dedup = Dedup(
 60 |             Dedup.BloomFilter,
 61 |             redis_url="redis://@localhost:6379/0",
 62 |             absolute_name=self.absolute_name,
 63 |         )
 64 | 
 65 |         # 逐条去重
 66 |         self.assertEqual(dedup.add(self.data), 1)
 67 |         self.assertEqual(dedup.get(self.data), 1)
 68 | 
 69 |         # 批量去重
 70 |         self.assertEqual(dedup.get(self.datas), [0, 0, 1])
 71 |         self.assertEqual(dedup.add(self.datas), [1, 1, 0])
 72 |         self.assertEqual(dedup.get(self.datas), [1, 1, 1])
 73 | 
 74 |     def test_LiteFilter(self):
 75 |         dedup = Dedup(
 76 |             Dedup.LiteFilter,
 77 |         )
 78 | 
 79 |         # 逐条去重
 80 |         self.assertEqual(dedup.add(self.data), 1)
 81 |         self.assertEqual(dedup.get(self.data), 1)
 82 | 
 83 |         # 批量去重
 84 |         self.assertEqual(dedup.get(self.datas), [0, 0, 1])
 85 |         self.assertEqual(dedup.add(self.datas), [1, 1, 0])
 86 |         self.assertEqual(dedup.get(self.datas), [1, 1, 1])
 87 | 
 88 |     def test_filter(self):
 89 |         dedup = Dedup(
 90 |             Dedup.BloomFilter,
 91 |             redis_url="redis://@localhost:6379/0",
 92 |             to_md5=True,
 93 |             absolute_name=self.absolute_name,
 94 |         )
 95 | 
 96 |         # 制造已存在数据
 97 |         self.datas = ["xxx", "bbb"]
 98 |         result = dedup.add(self.datas)
 99 |         self.assertEqual(result, [1, 1])
100 | 
101 |         # 过滤掉已存在数据 "xxx", "bbb"
102 |         self.datas = ["xxx", "bbb", "ccc"]
103 |         dedup.filter_exist_data(self.datas)
104 |         self.assertEqual(self.datas, ["ccc"])
105 | 


--------------------------------------------------------------------------------
/tests/test_download_midware.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2023/9/21 13:59
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import feapder
12 | 
13 | 
14 | def download_midware(request):
15 |     print("outter download_midware")
16 |     return request
17 | 
18 | 
19 | class TestAirSpider(feapder.AirSpider):
20 |     def start_requests(self):
21 |         yield feapder.Request(
22 |             "https://www.baidu.com", download_midware=download_midware
23 |         )
24 | 
25 |     def parse(self, request, response):
26 |         print(request, response)
27 | 
28 | 
29 | class TestSpiderSpider(feapder.Spider):
30 |     def start_requests(self):
31 |         yield feapder.Request(
32 |             "https://www.baidu.com", download_midware=[download_midware, self.download_midware]
33 |         )
34 | 
35 |     def download_midware(self, request):
36 |         print("class download_midware")
37 |         return request
38 | 
39 |     def parse(self, request, response):
40 |         print(request, response)
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     # TestAirSpider().start()
45 |     TestSpiderSpider(redis_key="test").start()
46 | 


--------------------------------------------------------------------------------
/tests/test_lock.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/7/15 5:00 下午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder.utils.redis_lock import RedisLock
12 | from feapder.db.redisdb import RedisDB
13 | import time
14 | 
15 | def test_lock():
16 |     with RedisLock(key="test", redis_cli=RedisDB().get_redis_obj(), wait_timeout=10) as _lock:
17 |         if _lock.locked:
18 |             print(1)
19 |             time.sleep(100)
20 | 
21 | if __name__ == '__main__':
22 |     test_lock()
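
For context, a sketch of what a second, concurrent caller of the same block would look like. It assumes (this is not stated in the file above) that locked stays False when the lock cannot be acquired within wait_timeout seconds, so the critical section is simply skipped.

from feapder.utils.redis_lock import RedisLock
from feapder.db.redisdb import RedisDB


def test_lock_contender():
    # same key as test_lock(); while the first holder sleeps, this call waits
    # up to wait_timeout=10 seconds and then gives up
    with RedisLock(key="test", redis_cli=RedisDB().get_redis_obj(), wait_timeout=10) as _lock:
        if _lock.locked:
            print("acquired the lock")
        else:
            print("lock still held elsewhere, skipping")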


--------------------------------------------------------------------------------
/tests/test_log.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/6/18 10:36 上午
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder.utils.log import log
12 | 
13 | log.debug("debug")
14 | log.info("info")
15 | log.success("success")
16 | log.warning("warning")
17 | log.error("error")
18 | log.critical("critical")
19 | log.exception("exception")


--------------------------------------------------------------------------------
/tests/test_metrics.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from feapder.utils import metrics
 4 | 
 5 | # 初始化打点系统
 6 | metrics.init(
 7 |     influxdb_host="localhost",
 8 |     influxdb_port="8086",
 9 |     influxdb_udp_port="8089",
10 |     influxdb_database="feapder",
11 |     influxdb_user="***",
12 |     influxdb_password="***",
13 |     influxdb_measurement="test_metrics",
14 |     debug=True,
15 | )
16 | 
17 | 
18 | async def test_counter_async():
19 |     for i in range(100):
20 |         await metrics.aemit_counter("total count", count=100, classify="test5")
21 |         for j in range(100):
22 |             await metrics.aemit_counter("key", count=1, classify="test5")
23 | 
24 | 
25 | def test_counter():
26 |     for i in range(100):
27 |         metrics.emit_counter("total count", count=100, classify="test5")
28 |         for j in range(100):
29 |             metrics.emit_counter("key", count=1, classify="test5")
30 | 
31 | 
32 | def test_store():
33 |     metrics.emit_store("total", 100, classify="cookie_count")
34 | 
35 | 
36 | def test_time():
37 |     metrics.emit_timer("total", 100, classify="time")
38 | 
39 | 
40 | def test_any():
41 |     metrics.emit_any(
42 |         tags={"_key": "total", "_type": "any"}, fields={"_value": 100}, classify="time"
43 |     )
44 | 
45 | 
46 | if __name__ == "__main__":
47 |     asyncio.run(test_counter_async())
48 |     test_counter()
49 |     test_store()
50 |     test_time()
51 |     test_any()
52 |     metrics.close()
53 | 
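
The same counter and timer helpers fit naturally into crawl code; a minimal sketch that counts processed items and reports how long the batch took (the measurement and classify names are placeholders, and the init arguments mirror the ones above):

import time

from feapder.utils import metrics

metrics.init(
    influxdb_host="localhost",
    influxdb_port="8086",
    influxdb_udp_port="8089",
    influxdb_database="feapder",
    influxdb_user="***",
    influxdb_password="***",
    influxdb_measurement="demo_metrics",
    debug=True,
)

start = time.time()
items = ["a", "b", "c"]  # stand-in for parsed items
for _ in items:
    metrics.emit_counter("parsed_item", count=1, classify="demo")
metrics.emit_timer("batch_seconds", time.time() - start, classify="demo")
metrics.close()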


--------------------------------------------------------------------------------
/tests/test_mysqldb.py:
--------------------------------------------------------------------------------
1 | from feapder.db.mysqldb import MysqlDB
2 | 
3 | 
4 | db = MysqlDB(
5 |     ip="localhost", port=3306, db="feapder", user_name="feapder", user_pass="feapder123"
6 | )
7 | 
8 | db_from_url = MysqlDB.from_url("mysql://feapder:feapder123@localhost:3306/feapder?charset=utf8mb4")
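
A minimal sketch that exercises the connection beyond construction; db.find is assumed to execute a SELECT and return the rows (an assumption, not something shown in this file):

from feapder.db.mysqldb import MysqlDB

db = MysqlDB.from_url("mysql://feapder:feapder123@localhost:3306/feapder?charset=utf8mb4")
rows = db.find("select 1")  # assumed API: run the query and return the result rows
print(rows)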


--------------------------------------------------------------------------------
/tests/test_playwright.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/9/15 8:47 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import time
12 | 
13 | from playwright.sync_api import Page
14 | 
15 | import feapder
16 | from feapder.utils.webdriver import PlaywrightDriver
17 | 
18 | 
19 | class TestPlaywright(feapder.AirSpider):
20 |     __custom_setting__ = dict(
21 |         RENDER_DOWNLOADER="feapder.network.downloader.PlaywrightDownloader",
22 |     )
23 | 
24 |     def start_requests(self):
25 |         yield feapder.Request("https://www.baidu.com", render=True)
26 | 
27 |     def parse(self, request, response):
28 |         driver: PlaywrightDriver = response.driver
29 |         page: Page = driver.page
30 | 
31 |         page.type("#kw", "feapder")
32 |         page.click("#su")
33 |         page.wait_for_load_state("networkidle")
34 |         time.sleep(1)
35 | 
36 |         html = page.content()
37 |         response.text = html  # make the response reflect the latest page source
38 |         for data_container in response.xpath("//div[@class='c-container']"):
39 |             print(data_container.xpath("string(.//h3)").extract_first())
40 | 
41 | 
42 | if __name__ == "__main__":
43 |     TestPlaywright(thread_count=1).run()
44 | 


--------------------------------------------------------------------------------
/tests/test_playwright2.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022/9/15 8:47 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from playwright.sync_api import Response
12 | from feapder.utils.webdriver import (
13 |     PlaywrightDriver,
14 |     InterceptResponse,
15 |     InterceptRequest,
16 | )
17 | 
18 | import feapder
19 | 
20 | 
21 | def on_response(response: Response):
22 |     print(response.url)
23 | 
24 | 
25 | class TestPlaywright(feapder.AirSpider):
26 |     __custom_setting__ = dict(
27 |         RENDER_DOWNLOADER="feapder.network.downloader.PlaywrightDownloader",
28 |         PLAYWRIGHT=dict(
29 |             user_agent=None,  # a string, or a zero-argument function returning the user agent
30 |             proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning the proxy address
31 |             headless=False,  # whether to run the browser headless
32 |             driver_type="chromium",  # chromium, firefox or webkit
33 |             timeout=30,  # request timeout in seconds
34 |             window_size=(1024, 800),  # window size
35 |             executable_path=None,  # browser executable path; None uses the default
36 |             download_path=None,  # directory for downloaded files
37 |             render_time=0,  # render time: seconds to wait after the page opens before grabbing the source
38 |             wait_until="networkidle",  # page-load event to wait for: "commit", "domcontentloaded", "load" or "networkidle"
39 |             use_stealth_js=False,  # use stealth.min.js to hide browser fingerprints
40 |             # page_on_event_callback=dict(response=on_response),  # listen for the response event
41 |             # callbacks for page.on() events, e.g. page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
42 |             storage_state_path=None,  # path for saving browser storage state
43 |             url_regexes=["wallpaper/list"],  # APIs to intercept; regex supported, list type
44 |             save_all=True,  # whether to keep all intercepted responses, not just the latest
45 |         ),
46 |     )
47 | 
48 |     def start_requests(self):
49 |         yield feapder.Request(
50 |             "http://www.soutushenqi.com/image/search/?searchWord=%E6%A0%91%E5%8F%B6",
51 |             render=True,
52 |         )
53 | 
54 |     def parse(self, request, response):
55 |         driver: PlaywrightDriver = response.driver
56 | 
57 |         intercept_response: InterceptResponse = driver.get_response("wallpaper/list")
58 |         intercept_request: InterceptRequest = intercept_response.request
59 | 
60 |         req_url = intercept_request.url
61 |         req_header = intercept_request.headers
62 |         req_data = intercept_request.data
63 |         print("request url", req_url)
64 |         print("request headers", req_header)
65 |         print("request data", req_data)
66 | 
67 |         data = driver.get_json("wallpaper/list")
68 |         print("data returned by the API", data)
69 | 
70 |         print("------ testing save_all=True ------- ")
71 | 
72 |         # test save_all=True
73 |         all_intercept_response: list = driver.get_all_response("wallpaper/list")
74 |         for intercept_response in all_intercept_response:
75 |             intercept_request: InterceptRequest = intercept_response.request
76 |             req_url = intercept_request.url
77 |             req_header = intercept_request.headers
78 |             req_data = intercept_request.data
79 |             print("request url", req_url)
80 |             print("request headers", req_header)
81 |             print("request data", req_data)
82 | 
83 |         all_intercept_json = driver.get_all_json("wallpaper/list")
84 |         for intercept_json in all_intercept_json:
85 |             print("data returned by the API", intercept_json)
86 | 
87 |         # don't forget this: clear the intercept cache
88 |         driver.clear_cache()
89 | 
90 | 
91 | if __name__ == "__main__":
92 |     TestPlaywright(thread_count=1).run()
93 | 
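
The on_response function defined at the top of this test is wired in through the commented-out page_on_event_callback setting. A minimal sketch of a spider that enables it (URL and class name are placeholders):

from playwright.sync_api import Response

import feapder


def on_response(response: Response):
    print("intercepted:", response.url)


class ResponseListenerSpider(feapder.AirSpider):
    __custom_setting__ = dict(
        RENDER_DOWNLOADER="feapder.network.downloader.PlaywrightDownloader",
        PLAYWRIGHT=dict(
            driver_type="chromium",
            # hook on_response into Playwright's page.on("response") event
            page_on_event_callback=dict(response=on_response),
        ),
    )

    def start_requests(self):
        yield feapder.Request("https://www.baidu.com", render=True)

    def parse(self, request, response):
        print(response.url)


if __name__ == "__main__":
    ResponseListenerSpider(thread_count=1).run()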


--------------------------------------------------------------------------------
/tests/test_rander.py:
--------------------------------------------------------------------------------
 1 | import feapder
 2 | 
 3 | 
 4 | class XueQiuSpider(feapder.AirSpider):
 5 |     def start_requests(self):
 6 |         for i in range(10):
 7 |             yield feapder.Request("https://baidu.com/#{}".format(i), render=True)
 8 | 
 9 |     def parse(self, request, response):
10 |         print(response.cookies.get_dict())
11 |         print(response.headers)
12 |         print(response.browser)
13 |         print("response.url ", response.url)
14 | 
15 |         # article_list = response.xpath('//div[@class="detail"]')
16 |         # for article in article_list:
17 |         #     title = article.xpath("string(.//a)").extract_first()
18 |         #     print(title)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     XueQiuSpider(thread_count=1).start()
23 | 


--------------------------------------------------------------------------------
/tests/test_rander2.py:
--------------------------------------------------------------------------------
 1 | import feapder
 2 | 
 3 | 
 4 | class XueQiuSpider(feapder.Spider):
 5 |     __custom_setting__ = dict(
 6 |         REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0
 7 |     )
 8 | 
 9 |     def start_requests(self):
10 |         for i in range(10):
11 |             yield feapder.Request("https://news.qq.com/#{}".format(i), render=True)
12 | 
13 |     def parse(self, request, response):
14 |         print(response.cookies.get_dict())
15 |         print("response.url ", response.url)
16 | 
17 |         article_list = response.xpath('//div[@class="detail"]')
18 |         for article in article_list:
19 |             title = article.xpath("string(.//a)").extract_first()
20 |             print(title)
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     XueQiuSpider(
25 |         thread_count=10, redis_key="feapter:test_rander", delete_keys=True
26 |     ).start()
27 | 


--------------------------------------------------------------------------------
/tests/test_rander3.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import feapder
 4 | from feapder.utils.webdriver import WebDriver
 5 | 
 6 | 
 7 | class TestRender(feapder.AirSpider):
 8 |     def start_requests(self):
 9 |         yield feapder.Request("http://www.baidu.com", render=True)
10 | 
11 |     def parse(self, request, response):
12 |         browser: WebDriver = response.browser
13 |         browser.find_element("id", "kw").send_keys("feapder")  # find_element_by_id was removed in Selenium 4.3+
14 |         browser.find_element("id", "su").click()
15 |         time.sleep(5)
16 |         print(browser.page_source)
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     TestRender().start()
21 | 


--------------------------------------------------------------------------------
/tests/test_rander_xhr.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import feapder
 4 | from feapder.utils.webdriver import WebDriver
 5 | 
 6 | 
 7 | class TestRender(feapder.AirSpider):
 8 |     __custom_setting__ = dict(
 9 |         WEBDRIVER=dict(
10 |             pool_size=1,  # number of browsers
11 |             load_images=True,  # whether to load images
12 |             user_agent=None,  # a string, or a zero-argument function returning the user agent
13 |             proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a zero-argument function returning the proxy address
14 |             headless=False,  # whether to run the browser headless
15 |             driver_type="CHROME",  # CHROME, EDGE, PHANTOMJS or FIREFOX
16 |             timeout=30,  # request timeout in seconds
17 |             window_size=(1024, 800),  # window size
18 |             executable_path=None,  # browser executable path; None uses the default
19 |             render_time=0,  # render time: seconds to wait after the page opens before grabbing the source
20 |             custom_argument=["--ignore-certificate-errors"],  # extra browser launch arguments
21 |             xhr_url_regexes=[
22 |                 "/ad",
23 |             ],  # intercept the http://www.spidertools.cn/spidertools/ad API
24 |         )
25 |     )
26 | 
27 |     def start_requests(self):
28 |         yield feapder.Request("http://www.spidertools.cn", render=True)
29 | 
30 |     def parse(self, request, response):
31 |         browser: WebDriver = response.browser
32 |         time.sleep(3)
33 | 
34 |         # fetch the intercepted API data as text
35 |         ad = browser.xhr_text("/ad")
36 |         print(ad)
37 | 
38 |         # fetch the intercepted API data as json; skipped here because this API returns plain text
39 |         # browser.xhr_json("/ad")
40 | 
41 |         xhr_response = browser.xhr_response("/ad")
42 |         print("request url", xhr_response.request.url)
43 |         # the request headers captured here are currently incomplete
44 |         print("request headers", xhr_response.request.headers)
45 |         print("request body", xhr_response.request.data)
46 |         print("response headers", xhr_response.headers)
47 |         print("response url", xhr_response.url)
48 |         print("response content", xhr_response.content)
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     TestRender().start()
53 | 


--------------------------------------------------------------------------------
/tests/test_redisdb.py:
--------------------------------------------------------------------------------
1 | from feapder.db.redisdb import RedisDB
2 | import time
3 | db = RedisDB.from_url("redis://localhost:6379")
4 | 
5 | # db.clear("test")
6 | db.zincrby("test", 1.0, "a")
7 | 
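
A minimal sketch of reading the incremented score back through the raw redis-py client, obtained the same way as in test_lock.py:

from feapder.db.redisdb import RedisDB

db = RedisDB.from_url("redis://localhost:6379")
db.zincrby("test", 1.0, "a")
print(db.get_redis_obj().zscore("test", "a"))  # raw redis-py client: read the member's current score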


--------------------------------------------------------------------------------
/tests/test_request.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/3/4 11:26 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder import Request, Response
12 | 
13 | 
14 | def test_selector():
15 |     request = Request("https://www.baidu.com?a=1&b=2", data={}, params=None)
16 |     response = request.get_response()
17 |     print(response)
18 | 
19 |     print(response.xpath("//a/@href"))
20 |     print(response.css("a::attr(href)"))
21 |     print(response.css("a::attr(href)").extract_first())
22 | 
23 |     content = response.re("<a.*?href='(.*?)'")
24 |     print(content)
25 | 
26 | 
27 | def test_from_text():
28 |     text = """    <script src="./lib/docsify/lib/docsify.min.js"></script>
29 |         <script src="./lib/docsify/lib/plugins/ga.js"></script>
30 |         <script src="./lib/docsify/lib/plugins/search.js"></script>
31 |         <script src="./lib/docsify-copy-code/dist/docsify-copy-code.min.js"></script>
32 |         <script src="./lib/prismjs/components/prism-bash.js"></script>
33 |         <script src="./lib/prismjs/components/prism-java.js"></script>
34 |         <script src="./lib/prismjs/components/prism-sql.js"></script>
35 |         <script src="./lib/prismjs/components/prism-yaml.js"></script>
36 |         <script src="./lib/prismjs/components/prism-python.js"></script>
37 |         <script src="//cdn.jsdelivr.net/npm/docsify/lib/plugins/zoom-image.min.js"></script>"""
38 | 
39 |     resp = Response.from_text(text=text, url="http://feapder.com/#/README")
40 |     print(resp.text)
41 |     print(resp)
42 |     print(resp.xpath("//script"))
43 | 
44 | def test_to_dict():
45 |     request = Request("https://www.baidu.com?a=1&b=2", data={"a":1}, params="k=1", callback="test", task_id=1, cookies={"a":1})
46 |     print(request.to_dict)
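
Response.from_text also works for offline parsing of saved pages; a minimal sketch (page.html is a hypothetical file saved earlier):

from feapder import Response

with open("page.html", encoding="utf-8") as f:
    resp = Response.from_text(text=f.read(), url="https://example.com/")

print(resp.xpath("//title/text()").extract_first())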


--------------------------------------------------------------------------------
/tests/test_spider_params.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021-03-07 21:27:00
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | 
12 | 
13 | class TestSpiderParams(feapder.Spider):
14 |     # custom database settings; can be removed if the project has a setting.py
15 |     __custom_setting__ = dict(
16 |         REDISDB_IP_PORTS="localhost:6379", REDISDB_USER_PASS="", REDISDB_DB=0
17 |     )
18 | 
19 |     def start_requests(self):
20 |         yield feapder.Request("https://www.baidu.com")
21 | 
22 |     def parse(self, request, response):
23 |         print(request.url)
24 | 
25 | 
26 | if __name__ == "__main__":
27 |     spider = TestSpiderParams(redis_key="feapder:test_spider_params")
28 |     spider.start()
29 | 


--------------------------------------------------------------------------------
/tests/test_task.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/4/8 1:06 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | from feapder.utils.perfect_dict import PerfectDict
12 | 
13 | 
14 | task_key = ["id", "url"]
15 | task = [1, "http://www.badu.com"]
16 | task = PerfectDict(_dict=dict(zip(task_key, task)), _values=task)
17 | 
18 | task = PerfectDict(id=1, url="http://www.badu.com")
19 | task = PerfectDict({"id":"1", "url":"http://www.badu.com"})
20 | 
21 | print(task)
22 | id, url = task
23 | print(id, url)
24 | print(task[0], task[1])
25 | print(task.id, task.url)
26 | print(task["id"], task["url"])
27 | print(task.get("id"), task.get("url"))
28 | 


--------------------------------------------------------------------------------
/tests/test_template/test_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2022-08-04 17:58:45
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | """
 9 | 
10 | import feapder
11 | from feapder import ArgumentParser
12 | 
13 | 
14 | class TestSpider(feapder.TaskSpider):
15 |     # custom database settings; can be removed if the project has a setting.py
16 |     __custom_setting__ = dict(
17 |         REDISDB_IP_PORTS="localhost:6379",
18 |         REDISDB_USER_PASS="",
19 |         REDISDB_DB=0,
20 |         MYSQL_IP="localhost",
21 |         MYSQL_PORT=3306,
22 |         MYSQL_DB="",
23 |         MYSQL_USER_NAME="",
24 |         MYSQL_USER_PASS="",
25 |     )
26 | 
27 |     def start_requests(self, task):
28 |         task_id = task.id
29 |         url = task.url
30 |         yield feapder.Request(url, task_id=task_id)
31 | 
32 |     def parse(self, request, response):
33 |         # extract the site title
34 |         print(response.xpath("//title/text()").extract_first())
35 |         # extract the site description
36 |         print(response.xpath("//meta[@name='description']/@content").extract_first())
37 |         print("site url: ", response.url)
38 | 
39 |         # with a mysql task table, mark the task as done, i.e. state=1
40 |         yield self.update_task_batch(request.task_id)
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     # use mysql as the task table; the task table must be created beforehand
45 |     spider = TestSpider(
46 |         redis_key="xxx:xxx",  # where the distributed scheduling info is stored
47 |         task_table="",  # task table in mysql
48 |         task_keys=["id", "url"],  # columns to fetch from the task table; more can be added
49 |         task_state="state",  # task state column in mysql
50 |     )
51 | 
52 |     # use redis as the task table
53 |     # spider = TestSpider(
54 |     #     redis_key="xxx:xxxx",  # where the distributed scheduling info is stored
55 |     #     task_table="", # task table name
56 |     #     task_table_type="redis", # the task table lives in redis
57 |     # )
58 | 
59 |     parser = ArgumentParser(description="TestSpider crawler")
60 | 
61 |     parser.add_argument(
62 |         "--start_master",
63 |         action="store_true",
64 |         help="add tasks",
65 |         function=spider.start_monitor_task,
66 |     )
67 |     parser.add_argument(
68 |         "--start_worker", action="store_true", help="start the spider", function=spider.start
69 |     )
70 | 
71 |     parser.start()
72 | 
73 |     # start directly
74 |     # spider.start()  # start the spider
75 |     # spider.start_monitor_task() # add tasks
76 | 
77 |     # start from the command line
78 |     # python test_spider.py --start_master  # add tasks
79 |     # python test_spider.py --start_worker  # start the spider
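
The template above expects a MySQL task table containing the task_keys columns plus the state column, and its parse method marks finished tasks with state=1. A minimal sketch of creating such a table through MysqlDB; the table name, credentials, and the default of 0 for new tasks are assumptions, and db.execute is assumed to run arbitrary SQL:

from feapder.db.mysqldb import MysqlDB

db = MysqlDB(
    ip="localhost", port=3306, db="feapder", user_name="feapder", user_pass="feapder123"
)
db.execute(
    """
    CREATE TABLE IF NOT EXISTS spider_task (
        id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(255) NOT NULL,
        state TINYINT NOT NULL DEFAULT 0  -- assumed convention: 0 = pending, 1 = done
    )
    """
)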


--------------------------------------------------------------------------------
/tests/test_tools.py:
--------------------------------------------------------------------------------
 1 | from feapder.utils import tools
 2 | from datetime import datetime
 3 | 
 4 | 
 5 | date = tools.format_time("昨天3:10")
 6 | print(date)
 7 | 
 8 | print(tools.format_date("2017年4月17日 3时27分12秒"))
 9 | 
10 | date = tools.format_time("昨天")
11 | print(date)
12 | 
13 | date = tools.format_time("2021-11-05 14:18:10")
14 | print(date)
15 | 
16 | date = tools.format_time("1 年前")
17 | print(date)
18 | 
19 | 
20 | class C:
21 |     pass
22 | 
23 | 
24 | data = {"date": datetime.now(), "c": C()}
25 | print(tools.dumps_json(data))
26 | 


--------------------------------------------------------------------------------
/tests/test_webdriver.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/3/18 7:05 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | from feapder.utils.webdriver import WebDriverPool, WebDriver
11 | import threading
12 | 
13 | 
14 | def test_webdriver_pool():
15 | 
16 |     webdriver_pool = WebDriverPool(
17 |         pool_size=2, load_images=False, driver_type=WebDriver.FIREFOX, timeout=30
18 |     )
19 | 
20 |     def request():
21 |         try:
22 |             browser = webdriver_pool.get()
23 |             browser.get("https://baidu.com")
24 |             print(browser.title)
25 |             webdriver_pool.put(browser)
26 |         except Exception:
27 |             print("request failed")
28 | 
29 |     for i in range(5):
30 |         threading.Thread(target=request).start()
31 | 
32 | 
33 | def test_webdriver():
34 |     with WebDriver(
35 |         load_images=True, driver_type=WebDriver.CHROME, timeout=30
36 |     ) as browser:
37 |         browser.get("https://httpbin.org/get")
38 |         html = browser.page_source
39 |         print(html)
40 |         print(browser.user_agent)
41 | 
42 |         import time
43 |         time.sleep(1000)
44 | 
45 | test_webdriver()


--------------------------------------------------------------------------------
/tests/user_pool/test_gold_user_pool.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/9/13 2:33 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import unittest
12 | 
13 | from feapder.network.user_pool import GoldUser
14 | from feapder.network.user_pool import GoldUserPool
15 | 
16 | 
17 | class TestUserPool(unittest.TestCase):
18 |     def setUp(self) -> None:
19 |         users = [
20 |             GoldUser(
21 |                 username="zhangsan",
22 |                 password="1234",
23 |                 max_use_times=10,
24 |                 use_interval=5,
25 |             ),
26 |             GoldUser(
27 |                 username="lisi",
28 |                 password="1234",
29 |                 max_use_times=10,
30 |                 use_interval=5,
31 |                 login_interval=50,
32 |             ),
33 |         ]
34 | 
35 |         class CustomGoldUserPool(GoldUserPool):
36 |             def login(self, user: GoldUser) -> GoldUser:
37 |                 # fake data here; normally the cookie comes from actually logging in to the site
38 |                 username = user.username
39 |                 password = user.password
40 | 
41 |                 # log in and get the cookie
42 |                 cookie = "zzzz"
43 |                 user.cookies = cookie
44 | 
45 |                 return user
46 | 
47 |         self.user_pool = CustomGoldUserPool(
48 |             "test:user_pool",
49 |             users=users,
50 |             keep_alive=True,
51 |         )
52 | 
53 |     def test_run(self):
54 |         self.user_pool.run()
55 | 
56 |     def test_get_user(self):
57 |         user = self.user_pool.get_user()
58 |         print(user)
59 | 
60 |         user = self.user_pool.get_user(username="zhangsan")
61 |         print(user)
62 | 
63 |     def test_del_user(self):
64 |         self.user_pool.del_user("lisi")
65 | 
66 |     def test_delay_user(self):
67 |         user = self.user_pool.get_user(username="lisi")
68 |         print(user)
69 |         self.user_pool.delay_use("lisi", 60)
70 |         user = self.user_pool.get_user(username="lisi")
71 |         print(user)
72 | 
73 |     def test_exclusive(self):
74 |         """
75 |         Test exclusive use of a user
76 |         """
77 |         # the user lisi is held exclusively by the test_spider crawler
78 |         user = self.user_pool.get_user(
79 |             username="lisi", used_for_spider_name="test_spider"
80 |         )
81 |         print(user)
82 | 
83 |         # the test_spider crawler can keep using the user
84 |         user = self.user_pool.get_user(
85 |             username="lisi", used_for_spider_name="test_spider"
86 |         )
87 |         print(user)
88 | 
89 |         # other crawlers have to wait until the exclusive interval has passed
90 |         user = self.user_pool.get_user(username="lisi")
91 |         print(user)
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     unittest.main()
96 | 


--------------------------------------------------------------------------------
/tests/user_pool/test_guest_user_pool.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/9/13 2:33 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import time
12 | import unittest
13 | from typing import Optional
14 | 
15 | from feapder.network.user_pool import GuestUser
16 | from feapder.network.user_pool import GuestUserPool
17 | 
18 | 
19 | class TestUserPool(unittest.TestCase):
20 |     def setUp(self) -> None:
21 |         # default user pool: uses webdriver to visit page_url and produce cookies
22 |         self.user_pool = GuestUserPool(
23 |             "test:user_pool", page_url="https://www.baidu.com"
24 |         )
25 | 
26 |         # a custom way of producing cookies
27 |         class CustomGuestUserPool(GuestUserPool):
28 |             def login(self) -> Optional[GuestUser]:
29 |                 # fake data here; normally the cookie comes from the site itself
30 |                 user = GuestUser(
31 |                     user_agent="xxx",
32 |                     proxies="yyy",
33 |                     cookies={"some_key": "some_value{}".format(time.time())},
34 |                 )
35 |                 return user
36 | 
37 |         self.custom_user_pool = CustomGuestUserPool(
38 |             "test:custom_user_pool", min_users=10, keep_alive=True
39 |         )
40 | 
41 |     def test_get_user(self):
42 |         """
43 |         Test fetching a guest user directly
44 |         Returns:
45 | 
46 |         """
47 |         user = self.custom_user_pool.get_user(block=True)
48 |         print("got user:", user)
49 |         print("cookie:", user.cookies)
50 |         print("user_agent:", user.user_agent)
51 |         print("proxies:", user.proxies)
52 | 
53 |     def test_del_user(self):
54 |         user = GuestUser(
55 |             **{
56 |                 "user_id": "9f1654ba654e12adfea548eae89a8f6f",
57 |                 "user_agent": "xxx",
58 |                 "proxies": "yyy",
59 |                 "cookies": {"some_key": "some_value1640006728.908013"},
60 |             }
61 |         )
62 |         print(user.user_id)
63 |         self.custom_user_pool.del_user(user.user_id)
64 | 
65 |     def test_keep_alive(self):
66 |         """
67 |         Test producing guest users: for scenarios that need lots of cookies, run a separate process to maintain them
68 |         Returns:
69 | 
70 |         """
71 | 
72 |         self.custom_user_pool.run()
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     unittest.main()
77 | 
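
A sketch of how a spider might consume such a pool: fetch a user in the download middleware and attach its cookies to the outgoing request. The attribute-style request.cookies assignment is an assumption; the pool arguments mirror the setUp above:

import feapder
from feapder.network.user_pool import GuestUserPool

user_pool = GuestUserPool("test:user_pool", page_url="https://www.baidu.com")


class UserPoolSpider(feapder.AirSpider):
    def start_requests(self):
        yield feapder.Request("https://www.baidu.com")

    def download_midware(self, request):
        user = user_pool.get_user(block=True)
        request.cookies = user.cookies  # assumed to behave like passing cookies= to feapder.Request
        return request

    def parse(self, request, response):
        print(response.url)


if __name__ == "__main__":
    UserPoolSpider().start()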


--------------------------------------------------------------------------------
/tests/user_pool/test_normal_user_pool.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/9/13 2:33 PM
 4 | ---------
 5 | @summary:
 6 | ---------
 7 | @author: Boris
 8 | @email: boris_liu@foxmail.com
 9 | """
10 | 
11 | import unittest
12 | 
13 | from feapder.network.user_pool import NormalUser
14 | from feapder.network.user_pool import NormalUserPool
15 | 
16 | 
17 | class TestUserPool(unittest.TestCase):
18 |     def setUp(self) -> None:
19 |         class CustomNormalUserPool(NormalUserPool):
20 |             def login(self, user: NormalUser) -> NormalUser:
21 |                 # fake data here; normally the cookie comes from actually logging in to the site
22 |                 username = user.username
23 |                 password = user.password
24 | 
25 |                 # log in and get the cookie
26 |                 cookie = "xxx"
27 |                 user.cookies = cookie
28 | 
29 |                 return user
30 | 
31 |         self.user_pool = CustomNormalUserPool(
32 |             "test:user_pool",
33 |             table_userbase="test_userbase",
34 |             login_retry_times=0,
35 |             keep_alive=True,
36 |         )
37 | 
38 |     def test_get_user(self):
39 |         user = self.user_pool.get_user()
40 |         print("got user:", user)
41 |         print("cookie:", user.cookies)
42 |         print("user_agent:", user.user_agent)
43 |         print("proxies:", user.proxies)
44 | 
45 |     def test_del_user(self):
46 |         self.user_pool.del_user(1)
47 | 
48 |     def test_tag_user_locked(self):
49 |         self.user_pool.tag_user_locked(2)
50 | 
51 |     def test_keep_alive(self):
52 |         self.user_pool.run()
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     unittest.main()
57 | 


--------------------------------------------------------------------------------