├── .gitignore ├── README.md ├── master ├── README.md ├── action │ ├── help.py │ └── task_action.py ├── config.conf ├── config.py ├── db │ ├── elastic_search.py │ ├── mongodb.py │ ├── mysqldb.py │ ├── oracledb.py │ └── redisdb.py ├── init.py ├── manager │ ├── statistic_article_count.py │ └── task_manager.py ├── pid.py ├── service │ └── task_service.py ├── start_service.py ├── static │ └── css │ │ └── markdown.css ├── sync_data │ └── sync_article.py ├── templates │ └── README.html └── utils │ ├── export_data.py │ ├── ffmpeg_manager.py │ ├── kill_python_pro.py │ ├── log.py │ ├── network.py │ ├── prpcrypt.py │ ├── ring_buff.py │ └── tools.py ├── worker.rar ├── worker ├── base │ ├── article_manager.py │ ├── base_parser.py │ ├── collector.py │ ├── constance.py │ ├── parser_control.py │ ├── spider.py │ ├── url_count.txt │ └── url_manager.py ├── config.conf ├── db │ ├── elastic_search.py │ ├── mongodb.py │ ├── mysqldb.py │ ├── oracledb.py │ └── redisdb.py ├── extractor │ ├── article_extractor.py │ └── config.py ├── init.py ├── mapping.py ├── news │ ├── export_data.py │ ├── main-bak.py │ ├── main.py │ ├── parsers │ │ ├── __init__.py │ │ ├── base_parser.py │ │ └── news_parser.py │ ├── start.bat │ └── task_status.py ├── pid.py ├── url_count.txt └── utils │ ├── bloomfilter.py │ ├── encrypt.py │ ├── export_data.py │ ├── ffmpeg_manager.py │ ├── kill_python_pro.py │ ├── log.py │ ├── network.py │ ├── prpcrypt.py │ └── tools.py └── 优化.txt /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/README.md -------------------------------------------------------------------------------- /master/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/README.md -------------------------------------------------------------------------------- /master/action/help.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/action/help.py -------------------------------------------------------------------------------- /master/action/task_action.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/action/task_action.py -------------------------------------------------------------------------------- /master/config.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/config.conf -------------------------------------------------------------------------------- /master/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/config.py -------------------------------------------------------------------------------- /master/db/elastic_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/db/elastic_search.py -------------------------------------------------------------------------------- /master/db/mongodb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/db/mongodb.py -------------------------------------------------------------------------------- /master/db/mysqldb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/db/mysqldb.py -------------------------------------------------------------------------------- /master/db/oracledb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/db/oracledb.py -------------------------------------------------------------------------------- /master/db/redisdb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/db/redisdb.py -------------------------------------------------------------------------------- /master/init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/init.py -------------------------------------------------------------------------------- /master/manager/statistic_article_count.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/manager/statistic_article_count.py -------------------------------------------------------------------------------- /master/manager/task_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/manager/task_manager.py -------------------------------------------------------------------------------- /master/pid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/pid.py -------------------------------------------------------------------------------- /master/service/task_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/service/task_service.py -------------------------------------------------------------------------------- /master/start_service.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/start_service.py -------------------------------------------------------------------------------- /master/static/css/markdown.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/static/css/markdown.css -------------------------------------------------------------------------------- /master/sync_data/sync_article.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/sync_data/sync_article.py -------------------------------------------------------------------------------- /master/templates/README.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/templates/README.html -------------------------------------------------------------------------------- /master/utils/export_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/utils/export_data.py -------------------------------------------------------------------------------- /master/utils/ffmpeg_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/utils/ffmpeg_manager.py -------------------------------------------------------------------------------- /master/utils/kill_python_pro.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.system('taskkill /im "python.exe" /F') -------------------------------------------------------------------------------- /master/utils/log.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/utils/log.py -------------------------------------------------------------------------------- /master/utils/network.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/utils/network.py -------------------------------------------------------------------------------- /master/utils/prpcrypt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/utils/prpcrypt.py -------------------------------------------------------------------------------- /master/utils/ring_buff.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/utils/ring_buff.py -------------------------------------------------------------------------------- /master/utils/tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/master/utils/tools.py -------------------------------------------------------------------------------- /worker.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker.rar -------------------------------------------------------------------------------- /worker/base/article_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/base/article_manager.py -------------------------------------------------------------------------------- /worker/base/base_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/base/base_parser.py -------------------------------------------------------------------------------- /worker/base/collector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/base/collector.py -------------------------------------------------------------------------------- /worker/base/constance.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/base/constance.py -------------------------------------------------------------------------------- /worker/base/parser_control.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/base/parser_control.py -------------------------------------------------------------------------------- /worker/base/spider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/base/spider.py -------------------------------------------------------------------------------- /worker/base/url_count.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /worker/base/url_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/base/url_manager.py -------------------------------------------------------------------------------- /worker/config.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/config.conf -------------------------------------------------------------------------------- /worker/db/elastic_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/db/elastic_search.py -------------------------------------------------------------------------------- /worker/db/mongodb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/db/mongodb.py -------------------------------------------------------------------------------- /worker/db/mysqldb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/db/mysqldb.py -------------------------------------------------------------------------------- /worker/db/oracledb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/db/oracledb.py -------------------------------------------------------------------------------- /worker/db/redisdb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/db/redisdb.py -------------------------------------------------------------------------------- /worker/extractor/article_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/extractor/article_extractor.py -------------------------------------------------------------------------------- /worker/extractor/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/extractor/config.py -------------------------------------------------------------------------------- /worker/init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/init.py -------------------------------------------------------------------------------- /worker/mapping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/mapping.py -------------------------------------------------------------------------------- /worker/news/export_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/news/export_data.py -------------------------------------------------------------------------------- /worker/news/main-bak.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/news/main-bak.py -------------------------------------------------------------------------------- /worker/news/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/news/main.py -------------------------------------------------------------------------------- /worker/news/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/news/parsers/__init__.py -------------------------------------------------------------------------------- /worker/news/parsers/base_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/news/parsers/base_parser.py -------------------------------------------------------------------------------- /worker/news/parsers/news_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/news/parsers/news_parser.py -------------------------------------------------------------------------------- /worker/news/start.bat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/news/start.bat -------------------------------------------------------------------------------- /worker/news/task_status.py: -------------------------------------------------------------------------------- 1 | is_doing = False # 多线程的共享变量需要放到单独的文件中, 通过task_status.is_doing的方式修改值 -------------------------------------------------------------------------------- /worker/pid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/pid.py -------------------------------------------------------------------------------- /worker/url_count.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /worker/utils/bloomfilter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/bloomfilter.py -------------------------------------------------------------------------------- /worker/utils/encrypt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/encrypt.py -------------------------------------------------------------------------------- /worker/utils/export_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/export_data.py -------------------------------------------------------------------------------- /worker/utils/ffmpeg_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/ffmpeg_manager.py -------------------------------------------------------------------------------- /worker/utils/kill_python_pro.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.system('taskkill /im "python.exe" /F') -------------------------------------------------------------------------------- /worker/utils/log.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/log.py -------------------------------------------------------------------------------- /worker/utils/network.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/network.py -------------------------------------------------------------------------------- /worker/utils/prpcrypt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/prpcrypt.py -------------------------------------------------------------------------------- /worker/utils/tools.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/striver-ing/distributed-spider/HEAD/worker/utils/tools.py -------------------------------------------------------------------------------- /优化.txt: -------------------------------------------------------------------------------- 1 | 1、创建数据缓存队列, 另开线程存数据。 不占用parser解析时间 2 | 2、最后一层url不删除 --------------------------------------------------------------------------------