├── kolesa ├── __init__.py ├── README.md ├── handle_mongo.py └── crawl_kolesa.py ├── boss_zhipin ├── __init__.py ├── README.md └── crawl_boss_zhipin.py ├── dasouche ├── __init__.py ├── README.md └── handle_dasouche.py ├── synchronous ├── __init__.py ├── sample │ ├── __init__.py │ ├── multiprocess_pool.py │ ├── thread_test1.py │ ├── multiprocess_test3.py │ ├── multiprocess_test2.py │ ├── process_not_share.py │ ├── multiprocess_test1.py │ ├── multiprocess_class.py │ └── multiprocess_share.py ├── test1.py ├── handle_queue.py ├── handle_redis.py ├── handle_request.py ├── handle_spider.py └── spider_multiprocess.py ├── login_github ├── __init__.py ├── README.md └── handle_login.py ├── dongqiudi ├── dongqiudi │ ├── __init__.py │ ├── main.py │ ├── dongqiudi_pic │ │ ├── 西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑 │ │ │ ├── 7 │ │ │ ├── ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg │ │ │ ├── ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg │ │ │ ├── ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif │ │ │ ├── ChNLklztqZOAFKI4AANLRhtfxnE659.jpg │ │ │ ├── ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg │ │ │ ├── ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg │ │ │ └── ChNLklztvpmAGjgfAAFd4X3svKc014.jpg │ │ └── C罗与法拉利车手勒克莱尔同场较劲! │ │ │ ├── ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg │ │ │ ├── ChO2w1zuISOAF511AAFaT82ScyE114.jpg │ │ │ └── ChONolzuIOiASesEAAEgiz_2cMw359.jpg │ ├── spiders │ │ ├── __init__.py │ │ └── crawl_dongqiudi.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ └── settings.py ├── scrapy.cfg └── README.md ├── douban_movie_top250 ├── __init__.py ├── README.md ├── handle_mongo.py └── crawl_douban_movie_info_top250.py ├── mafengwo ├── mafengwo │ ├── __init__.py │ ├── main.py │ ├── spiders │ │ ├── __init__.py │ │ └── crawl_mafengwo.py │ ├── mafengwo_images │ │ └── full │ │ │ └── 0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg │ ├── items.py │ ├── pipelines.py │ ├── middlewares.py │ ├── settings.py │ └── url_list.txt └── scrapy.cfg ├── douban_movie_top250_scrapy ├── douban │ ├── __init__.py │ ├── main.py │ ├── douban.json │ ├── spiders │ │ ├── __init__.py │ │ └── douban_spider.py │ ├── items.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py ├── README.md ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── douban.iml │ ├── deployment.xml │ └── workspace.xml └── scrapy.cfg ├── mafengwo_article_spider ├── mafengwo │ ├── __init__.py │ ├── main.py │ ├── spiders │ │ ├── __init__.py │ │ └── crawl_mafengwo.py │ ├── js │ │ ├── README.md │ │ ├── handle_sn.py │ │ └── tool_decode_index.js │ ├── middlewares.py │ ├── handle_mongo.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ ├── handle_task.py │ └── url_list.txt ├── README.md ├── .idea │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ └── mafengwo.iml └── scrapy.cfg ├── video ├── README.md └── lishipin │ └── crawl_lishipin.py ├── baidu_m_keyword_ranks ├── README.md ├── setting.py ├── handle_mysql.py └── baidu_m_keyword.py ├── lagou ├── README.md ├── handle_mongo.py ├── crawl_lagou_job_old.py ├── handle_mysql.py └── crawl_lagou_job_new.py ├── .idea ├── vcs.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── small-spider-project.iml ├── deployment.xml └── workspace.xml ├── README.md └── .gitignore /kolesa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /boss_zhipin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/dasouche/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /synchronous/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /login_github/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /douban_movie_top250/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /synchronous/sample/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video/README.md: -------------------------------------------------------------------------------- 1 | #### 1、lishipin 梨视频数据抓取 2 | -------------------------------------------------------------------------------- /dasouche/README.md: -------------------------------------------------------------------------------- 1 | # 大搜车爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/README.md: -------------------------------------------------------------------------------- 1 | # 百度M站搜索关键字去除广告后的排名抓取 2 | ## python3.6 多线程 3 | -------------------------------------------------------------------------------- /kolesa/README.md: -------------------------------------------------------------------------------- 1 | # kolesa爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /login_github/README.md: -------------------------------------------------------------------------------- 1 | # 登录github 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /boss_zhipin/README.md: -------------------------------------------------------------------------------- 1 | # boos直聘python岗位全国爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /douban_movie_top250/README.md: -------------------------------------------------------------------------------- 1 | # douban电影top250爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/main.py: 
-------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl crawl_dongqiudi".split()) -------------------------------------------------------------------------------- /mafengwo/mafengwo/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl crawl_mafengwo".split()) -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/README.md: -------------------------------------------------------------------------------- 1 | # douban电影top250爬虫-通过scrapy框架抓取 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl douban_spider".split()) -------------------------------------------------------------------------------- /lagou/README.md: -------------------------------------------------------------------------------- 1 | # 拉钩python岗位全国爬虫 2 | 3 | ##### 不能在__init__方法中写mongo信息,否则多进程无法启动 4 | 5 | #### bug:dazhuang_python@sina.com 6 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl crawl_mafengwo".split()) -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/douban.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/douban_movie_top250_scrapy/douban/douban.json -------------------------------------------------------------------------------- /mafengwo_article_spider/README.md: -------------------------------------------------------------------------------- 1 | # small-spider-project 2 | ## 日常爬虫 3 | 4 | #### mafengwo_article_spider 马蜂窝最新,最热游记抓取 5 | 6 | 7 | 8 | #### bug:dazhuang_python@sina.com 9 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/setting.py: -------------------------------------------------------------------------------- 1 | mysql_ip = '127.0.0.1' 2 | mysql_port = 3306 3 | mysql_database = '库名' 4 | mysql_table = '' 5 | mysql_username = '用户名' 6 | mysql_password = '密码' 7 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/7 -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/mafengwo_images/full/0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/mafengwo/mafengwo/mafengwo_images/full/0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuISOAF511AAFaT82ScyE114.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuISOAF511AAFaT82ScyE114.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChONolzuIOiASesEAAEgiz_2cMw359.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChONolzuIOiASesEAAEgiz_2cMw359.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqZOAFKI4AANLRhtfxnE659.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqZOAFKI4AANLRhtfxnE659.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztvpmAGjgfAAFd4X3svKc014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztvpmAGjgfAAFd4X3svKc014.jpg -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /dongqiudi/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dongqiudi.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dongqiudi 12 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /mafengwo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mafengwo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mafengwo 12 | -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # 
https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = douban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douban 12 | -------------------------------------------------------------------------------- /mafengwo_article_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mafengwo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mafengwo 12 | -------------------------------------------------------------------------------- /douban_movie_top250/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | class Handle_Mongo(object): 6 | def __init__(self): 7 | mongo_client = pymongo.MongoClient(host="127.0.0.1",port=27017) 8 | self.db_data = mongo_client['douban'] 9 | 10 | def handle_save_data(self,item): 11 | task_collection = Collection(self.db_data,'douban_data') 12 | task_collection.insert(item) 13 | 14 | douban_mongo = Handle_Mongo() 15 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/js/README.md: -------------------------------------------------------------------------------- 1 | # 马蜂窝生成sn的js解析 2 | 3 | #### index.js是马蜂窝网站上原有JS文件 4 | #### tool_decode_index.js是通过 http://jsnice.org/格式化和半解密 5 | #### handle_sn.py对SN进行破解,请求时发现无需传递SN,晕菜... 6 | 7 | 8 | ##### 619行:salt值:c9d6618dbc657b41a66eb0af952906f1 9 | ##### 632行: 获取时间戳p3["_ts"] = (new Date)[__Ox2133f[65]](); 10 | ##### 635行: 调用VIEW函数获取sn值var vroot = view(obj["extend"](true, {}, p3)); 11 | ##### 63行: 返回sn值,md5并切片 12 | 13 | 14 | ### 交流:dazhuang_python@sina.com 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # small-spider-project 2 | ## 日常爬虫 3 | 4 | #### 1、baidu_m_keyword_ranks 百度M站关键词搜索去除广告后的抓取 5 | #### 2、video 视频抓取 6 | #### 3、mafengwo 马蜂窝游记和图片抓取 7 | #### 4、kolesa kolesa数据抓取 8 | #### 5、douban_movie_top250 豆瓣电影top250数据抓取 9 | #### 6、douban_movie_top250_scrapy 豆瓣电影top250数据抓取-通过scrapy框架抓取 10 | #### 7、mafengwo_article_spider 马蜂窝所有游记抓取 11 | #### 8、dasouche 大搜车数据抓取 12 | #### 9、dongqiudi 懂球帝新闻数据抓取 13 | #### 10、github 登录github 14 | #### 11、synchronous 同步爬虫 15 | 16 | 17 | 18 | #### bug:dazhuang_python@sina.com 19 | -------------------------------------------------------------------------------- /dongqiudi/README.md: -------------------------------------------------------------------------------- 1 | # 懂球帝新闻爬虫 2 | ### 需求 3 | 抓取懂球帝新闻https://dongqiudi.com/news 4 | ### 项目结构 5 | ```text 6 | dongqiudi 7 | dongqiudi_pic 图片目录 8 | spiders 爬虫解析文件 9 | items.py 项目字段定义文件 10 | middlewares.py 中间件,包含下载代理中间件 11 | pipelines.py 数据管道,包含mongo数据存储和图片下载 12 | settings.py 配置文件 13 | main.py 启动文件 14 | ``` 15 | ### 说明 16 | ```text 17 | 在pipelines.py中定义mongodb的ip地址和端口号 18 | 在settings.py中定义是否开启中间件,下载延迟等选项 19 | ``` 20 | 21 | 22 | #### bug:dazhuang_python@sina.com 23 | -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/mafengwo.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 
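The dongqiudi README earlier on this line notes that the MongoDB host/port live in pipelines.py and that settings.py controls whether the proxy middleware is enabled and what download delay is used, but settings.py itself is not reproduced in this dump. The lines below are only a hedged sketch of what those entries would look like, built from the DongqiudiProxyMiddleware, DongqiudiPipeline and DongqiudiImagePipeline classes shown later in this dump; the numeric priorities, the DOWNLOAD_DELAY value and the IMAGES_STORE path are assumptions.

# Hedged sketch -- the real dongqiudi/settings.py is not included in this dump.
BOT_NAME = 'dongqiudi'
SPIDER_MODULES = ['dongqiudi.spiders']
NEWSPIDER_MODULE = 'dongqiudi.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1  # assumed value; the README only says the delay is configured here

DOWNLOADER_MIDDLEWARES = {
    # toggle the paid-proxy middleware defined in middlewares.py
    'dongqiudi.middlewares.DongqiudiProxyMiddleware': 543,
}

ITEM_PIPELINES = {
    'dongqiudi.pipelines.DongqiudiPipeline': 300,       # MongoDB storage
    'dongqiudi.pipelines.DongqiudiImagePipeline': 200,  # image download
}
IMAGES_STORE = 'dongqiudi_pic'  # assumed to match the image directory in the tree above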
-------------------------------------------------------------------------------- /lagou/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | 6 | class Handle_lagou_mongo(object): 7 | def __init__(self): 8 | lagou_client = pymongo.MongoClient(host="127.0.0.1",port=27017) 9 | self.lagou_db = lagou_client['lagou'] 10 | 11 | def handle_save_data(self,item): 12 | print(item) 13 | lagou_collection = Collection(self.lagou_db,"lagou_data") 14 | lagou_collection.update({"positionId":item['positionId']},item,True) 15 | 16 | 17 | lagou_mongo = Handle_lagou_mongo() -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/douban.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #序号 15 | serial_number = scrapy.Field() 16 | #电影名称 17 | movie_name = scrapy.Field() 18 | #电影介绍 19 | introduce = scrapy.Field() 20 | #星级 21 | star = scrapy.Field() 22 | #评价 23 | evaluate = scrapy.Field() 24 | #电影描述 25 | describe = scrapy.Field() 26 | -------------------------------------------------------------------------------- /synchronous/test1.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | 4 | headers = { 5 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 6 | "Chrome/86.0.4240.75 Safari/537.36 " 7 | } 8 | 9 | 10 | async def sample_get(): 11 | # 发送一个简单的get请求 12 | async with aiohttp.ClientSession() as session: 13 | async with session.get(url="https://www.baidu.com", headers=headers) as response: 14 | print(response.status) 15 | print(await response.text()) 16 | 17 | 18 | if __name__ == '__main__': 19 | loop = asyncio.get_event_loop() 20 | loop.run_until_complete(sample_get()) 21 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_pool.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | 5 | def work(item): 6 | time.sleep(0.05) 7 | return "进程ID:{id},进程名称{name},执行任务item:{item}".format(id=multiprocessing.current_process().pid, 8 | name=multiprocessing.current_process().name, item=item) 9 | 10 | 11 | def main(): 12 | # 进程池大小为4 13 | pool = multiprocessing.Pool(processes=4) 14 | for item in range(100): 15 | result = pool.apply_async(func=work, args=(item,)) 16 | print(result.get()) 17 | pool.close() 18 | pool.join() 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /.idea/small-spider-project.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | 14 | 16 | 
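synchronous/sample/multiprocess_pool.py earlier on this line calls result.get() inside the submission loop, which blocks on each task in turn, so the four-worker pool effectively runs serially. A minimal sketch (same work function and timing) that submits all tasks first and only then collects the results, keeping the workers busy in parallel:

import multiprocessing
import time


def work(item):
    time.sleep(0.05)
    return "pid:{id}, item:{item}".format(
        id=multiprocessing.current_process().pid, item=item)


def main():
    pool = multiprocessing.Pool(processes=4)
    # Submit all 100 tasks before blocking on any result, so the pool
    # actually runs four tasks at a time instead of one after another.
    results = [pool.apply_async(func=work, args=(item,)) for item in range(100)]
    for result in results:
        print(result.get())
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()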
-------------------------------------------------------------------------------- /synchronous/handle_queue.py: -------------------------------------------------------------------------------- 1 | import queue 2 | from handle_request import DangdangRequest 3 | 4 | 5 | class DangdangQueue(object): 6 | def __init__(self): 7 | self.queue = queue.Queue() 8 | 9 | def insert_data(self, data): 10 | print("添加抓取任务: ", data) 11 | if isinstance(data, DangdangRequest): 12 | self.queue.put(data) 13 | return False 14 | 15 | def get_data(self): 16 | if not self.queue.empty(): 17 | data = self.queue.get() 18 | print("取出任务:", data) 19 | return data 20 | else: 21 | return False 22 | 23 | def database_empty(self): 24 | return self.queue.qsize() == 0 25 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DongqiudiItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #抓取URL 15 | from_url = scrapy.Field() 16 | #新闻标题 17 | title = scrapy.Field() 18 | #发表时间 19 | release_time = scrapy.Field() 20 | #作者 21 | author = scrapy.Field() 22 | #新闻内容 23 | content = scrapy.Field() 24 | # 抓取时间 25 | crawl_time = scrapy.Field() 26 | images = scrapy.Field() 27 | image_urls = scrapy.Field() 28 | image_paths = scrapy.Field() 29 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | import base64 9 | 10 | 11 | class DongqiudiProxyMiddleware(object): 12 | # 设置代理策略 13 | def process_request(self, request, spider): 14 | # proxy,主机头和端口号 15 | request.meta['proxy'] = 'http://http-dyn.abuyun.com:9020' 16 | # 用户名:密码,当前代理必须要有费用 17 | # 你自己买的代理,用户名和密码肯定和我的不一样 18 | proxy_name_pass = 'HTK32673HL02BK2D:50125D2D38937C94'.encode('utf-8') 19 | encode_pass_name = base64.b64encode(proxy_name_pass) 20 | # 将代理信息设置到头部去 21 | # 注意!!!!!Basic后面有一个空格 22 | request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode() -------------------------------------------------------------------------------- /kolesa/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | class Handle_Mongo(object): 6 | def __init__(self): 7 | mongo_client = pymongo.MongoClient(host="127.0.0.1",port=27017) 8 | self.db_data = mongo_client['kolesa'] 9 | 10 | def handle_save_task(self,item): 11 | task_collection = Collection(self.db_data,'kolesa_task') 12 | task_collection.update({'id':item['id']},item,True) 13 | 14 | def handle_get_task(self): 15 | task_collection = Collection(self.db_data,'kolesa_task') 16 | return task_collection.find_one_and_delete({}) 17 | 18 | def handle_save_data(self,item): 19 | task_collection = Collection(self.db_data,'kolesa_data') 20 | task_collection.update({'id':item['id']},item,True) 21 | 22 | kolesa_mongo = Handle_Mongo() 
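kolesa/handle_mongo.py just above, like the other Mongo handlers and pipelines in this repo, upserts with the legacy Collection.update(filter, doc, True) call, which PyMongo deprecated in 3.x and removed in 4.0. A minimal sketch of the equivalent upsert with update_one, assuming the same local MongoDB instance; the task document shown is hypothetical:

import pymongo

# Assumes the local MongoDB instance used by Handle_Mongo above.
client = pymongo.MongoClient(host="127.0.0.1", port=27017)
task_collection = client["kolesa"]["kolesa_task"]

item = {"id": "12345", "url": "https://kolesa.kz/a/show/12345"}  # hypothetical task
# update_one(..., upsert=True) replaces the deprecated update(filter, doc, True)
task_collection.update_one({"id": item["id"]}, {"$set": item}, upsert=True)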
-------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import requests 10 | import json 11 | import random 12 | 13 | 14 | class MafengwoProxyMiddleware(object): 15 | 16 | def process_response(self, request, response, spider): 17 | if 'mafengwo.net' in request.url: 18 | return response 19 | elif response is None: 20 | return request 21 | elif response.status == 302: 22 | return request 23 | elif response.status == 403: 24 | return request 25 | elif 'flashcookie.sw' in response.text: 26 | return request 27 | else: 28 | return response 29 | -------------------------------------------------------------------------------- /synchronous/sample/thread_test1.py: -------------------------------------------------------------------------------- 1 | import _thread 2 | import threading 3 | import time 4 | 5 | 6 | def _thread_handle(thread_name, delay): 7 | for num in range(10): 8 | time.sleep(delay) 9 | print("{}的num:{}".format(thread_name, num)) 10 | 11 | 12 | def threading_handle(delay=1): 13 | for num in range(10): 14 | time.sleep(delay) 15 | print("{}-num-{}".format(threading.current_thread().name, num)) 16 | 17 | 18 | def main(): 19 | # for item in range(10): 20 | # _thread.start_new_thread(_thread_handle, ("Thread - {}".format(item), 1)) 21 | # # 和进程不同,如果进程死亡,则线程也会死亡 22 | # time.sleep(200) 23 | for item in range(10): 24 | # thread = threading.Thread(target=threading_handle, args=(1,), name="执行线程-{}".format(item)) 25 | thread = threading.Thread(target=threading_handle, args=(1,)) 26 | thread.start() 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_test3.py: -------------------------------------------------------------------------------- 1 | import time 2 | import multiprocessing 3 | 4 | 5 | def status(): 6 | """守护进程方法""" 7 | while True: 8 | print("守护进程ID:{id},守护进程名称:{name}".format(id=multiprocessing.current_process().pid, 9 | name=multiprocessing.current_process().name)) 10 | time.sleep(1) 11 | 12 | 13 | def worker(): 14 | """具体执行工作的方法""" 15 | # 创建守护进程,daemon为TRUE 16 | daemon_process = multiprocessing.Process(target=status, name="守护进程", daemon=True) 17 | daemon_process.start() 18 | for item in range(10): 19 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, name=multiprocessing.current_process().name)) 20 | time.sleep(2) 21 | 22 | 23 | def main(): 24 | process = multiprocessing.Process(target=worker, name="工作进程") 25 | process.start() 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_test2.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | 5 | def send(msg): 6 | time.sleep(5) 7 | print("进程ID:{id},进程名称:{name},发送消息:{msg}".format(id=multiprocessing.current_process().pid, 8 | name=multiprocessing.current_process().name, msg=msg)) 9 | 10 | 11 | def main(): 12 | process = multiprocessing.Process(target=send, name="TEST", args=("发送消息测试",)) 13 | process.start() 14 | # 
阻塞主进程执行,将等待子进程执行完毕后再执行主进程 15 | # process.join() 16 | time.sleep(2) 17 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, 18 | name=multiprocessing.current_process().name)) 19 | # 中断进程前判断进程是否存活 20 | if process.is_alive(): 21 | # 中断进程 22 | process.terminate() 23 | print("进程被中断:{name}".format(name=process.name)) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /synchronous/sample/process_not_share.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import threading 3 | 4 | # 多进程修改值 5 | value = 0 6 | lock = multiprocessing.Lock() 7 | 8 | 9 | def test1(lock=None): 10 | global value 11 | for i in range(1000000): 12 | # 使用锁解决多线程共享变量时的不安全问题 13 | lock.acquire() 14 | value = value + 1 15 | lock.release() 16 | 17 | 18 | def multiprocess_value(): 19 | p1 = multiprocessing.Process(target=test1) 20 | p2 = multiprocessing.Process(target=test1) 21 | p1.start() 22 | p2.start() 23 | p1.join() 24 | p2.join() 25 | 26 | 27 | def thread_value(): 28 | t1 = threading.Thread(target=test1, args=(lock, )) 29 | t2 = threading.Thread(target=test1, args=(lock, )) 30 | t1.start() 31 | t2.start() 32 | t1.join() 33 | t2.join() 34 | 35 | 36 | if __name__ == '__main__': 37 | # 进程与进程之间不共享数据 38 | # multiprocess_value() 39 | # print(value) 40 | # 多线程间共享数据 41 | thread_value() 42 | print(value) 43 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_test1.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import cpu_count 2 | 3 | print("cpu内核数量为:{count}".format(count=cpu_count())) 4 | import multiprocessing 5 | import sys 6 | import time 7 | 8 | 9 | def worker(delay, count): 10 | for num in range(count): 11 | print("{process}进程ID:{id},进程名称:{name}".format(process=num, id=multiprocessing.current_process().pid, 12 | name=multiprocessing.current_process().name)) 13 | time.sleep(delay) 14 | 15 | 16 | def main(): 17 | # 创建三个进程 18 | for item in range(3): 19 | # 传入参数和进程名称 20 | process = multiprocessing.Process(target=worker, args=(1, 10,), name="item-{item}".format(item=item)) 21 | process.start() 22 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, name=multiprocessing.current_process().name)) 23 | # 未设置进程阻塞,主进程即使退出也不会影响子进程执行 24 | print("主进程退出") 25 | sys.exit(0) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MafengwoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #文章数量 15 | article_sum = scrapy.Field() 16 | #文章标题 17 | title = scrapy.Field() 18 | #作者名称 19 | name = scrapy.Field() 20 | #id 21 | id = scrapy.Field() 22 | #文章发表时间 23 | release_time = scrapy.Field() 24 | #评论数 25 | comment_sum = scrapy.Field() 26 | #收藏数 27 | star_sum = scrapy.Field() 28 | #顶 29 | support_sum = scrapy.Field() 30 | #阅读数 31 | read_sum = scrapy.Field() 32 | #文章内容 33 | content = scrapy.Field() 34 | #抓取URL 35 | from_url = scrapy.Field() 36 | #抓取时间 37 | crawl_time = 
scrapy.Field() 38 | images = scrapy.Field() 39 | image_urls = scrapy.Field() 40 | image_paths = scrapy.Field() 41 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | 6 | 7 | class Mafengwo_mongo(object): 8 | def __init__(self): 9 | # mongo_client = pymongo.MongoClient(host='127.0.0.1', port=39070) 10 | mongo_client = pymongo.MongoClient(host='10.70.120.156', port=27017) 11 | self.db_data = mongo_client['oreo'] 12 | 13 | def get_from_url(self, item): 14 | db_collections = Collection(self.db_data, 'mafengwo_article') 15 | result = db_collections.find_one({'from_url':item}) 16 | if result: 17 | return True 18 | else: 19 | return False 20 | #return False 21 | 22 | def insert_task(self,item): 23 | db_collections = Collection(self.db_data, 'mafengwo_article_task') 24 | db_collections.insert_one(item) 25 | 26 | def get_task(self): 27 | db_collections = Collection(self.db_data, 'mafengwo_article_task') 28 | return db_collections.find_one_and_delete({}) 29 | 30 | 31 | mongo = Mafengwo_mongo() 32 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/js/handle_sn.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import time 3 | import requests 4 | import json 5 | 6 | 7 | 8 | 9 | for i in range(1,301): 10 | input_value = { 11 | "cost":"0", 12 | "days":"0", 13 | "mddid":"10065", 14 | "month":"0", 15 | "page":i, 16 | "pageid":"mdd_index", 17 | "sort":"1", 18 | "tagid":"0", 19 | "_ts":"1558433973256" 20 | } 21 | salt = "c9d6618dbc657b41a66eb0af952906f1" 22 | str = json.dumps(input_value)+salt 23 | 24 | # 创建md5对象 25 | hl = hashlib.md5() 26 | hl.update(str.encode(encoding='utf-8')) 27 | md5_result = hl.hexdigest()[2:12] 28 | # input_value['_sn'] = md5_result 29 | 30 | 31 | 32 | url = 'http://www.mafengwo.cn/gonglve/ajax.php?act=get_travellist' 33 | header = { 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 35 | } 36 | response = requests.post(url=url,headers=header,data=input_value) 37 | print(response.text) 38 | time.sleep(1) 39 | -------------------------------------------------------------------------------- /synchronous/handle_redis.py: -------------------------------------------------------------------------------- 1 | import redis 2 | from pickle import dumps, loads 3 | from handle_request import DangdangRequest 4 | 5 | 6 | class RedisQueue(object): 7 | def __init__(self): 8 | pool = redis.ConnectionPool(host="192.168.149.129", port=6379) 9 | self.r = redis.Redis(connection_pool=pool) 10 | 11 | def insert_data(self, data): 12 | print("添加抓取任务: ", data) 13 | if isinstance(data, DangdangRequest): 14 | self.r.rpush("TEST", dumps(data)) 15 | return False 16 | 17 | def get_data(self): 18 | if self.r.llen("TEST"): 19 | data = loads(self.r.lpop("TEST")) 20 | print("取出任务:", data) 21 | return data 22 | else: 23 | return False 24 | 25 | def database_empty(self): 26 | return self.r.llen("TEST") == 0 27 | 28 | 29 | if __name__ == '__main__': 30 | db = RedisQueue() 31 | start_url = "https://www.baidu.com" 32 | baidu_request = DangdangRequest(url=start_url, callback="hello", need_proxy=True) 33 | db.insert_data(data=baidu_request) 34 | request = db.get_data() 35 | 
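The __main__ demo at the end of handle_redis.py just above builds DangdangRequest(url=..., callback="hello", need_proxy=True) without a headers argument, but handle_request.py later in this dump defines __init__(self, url, headers, callback, ...) with no default for headers, so that demo raises a TypeError. A corrected usage sketch:

from handle_redis import RedisQueue
from handle_request import DangdangRequest

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
}
db = RedisQueue()
baidu_request = DangdangRequest(url="https://www.baidu.com", headers=headers,
                                callback="hello", need_proxy=True)
db.insert_data(data=baidu_request)
request = db.get_data()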
-------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MafengwoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #文章数量 15 | article_sum = scrapy.Field() 16 | #文章标题 17 | title = scrapy.Field() 18 | #作者名称 19 | name = scrapy.Field() 20 | #id 21 | id = scrapy.Field() 22 | #文章发表时间 23 | release_time = scrapy.Field() 24 | #评论数 25 | comment_sum = scrapy.Field() 26 | #收藏数 27 | star_sum = scrapy.Field() 28 | #顶 29 | support_sum = scrapy.Field() 30 | #阅读数 31 | read_sum = scrapy.Field() 32 | #文章内容 33 | content = scrapy.Field() 34 | #抓取URL 35 | from_url = scrapy.Field() 36 | upload_status = scrapy.Field() 37 | #抓取时间 38 | crawl_time = scrapy.Field() 39 | images = scrapy.Field() 40 | image_urls = scrapy.Field() 41 | image_paths = scrapy.Field() 42 | video_urls = scrapy.Field() 43 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo,json 9 | from pymongo.collection import Collection 10 | 11 | class DoubanPipeline(object): 12 | def __init__(self): 13 | mongo_client = pymongo.MongoClient(host='127.0.0.1', port=27017) 14 | self.db_data = mongo_client['douban_scrapy'] 15 | 16 | def process_item(self, item, spider): 17 | #指定数据库和表 18 | douban_collection = Collection(self.db_data,'douban') 19 | douban_collection.insert(dict(item)) 20 | return item 21 | 22 | class DoubanJsonPipeline(object): 23 | def __init__(self): 24 | self.file = open('douban.json','w') 25 | 26 | def process_item(self, item, spider): 27 | # json数据中添加逗号和换行符 28 | content = json.dumps(dict(item),ensure_ascii = False) + ",\n" 29 | self.file.write(content) 30 | return item 31 | 32 | def close_spider(self,spider): 33 | self.file.close() 34 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_class.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import sys 3 | import time 4 | 5 | 6 | # 继承multiprocessing.Process类 7 | class MyProcess(multiprocessing.Process): 8 | def __init__(self, name, delay, count): 9 | # 调用父类方法传入名称 10 | super().__init__(name=name) 11 | self.delay = delay 12 | self.count = count 13 | 14 | # 多进程类具体执行方法 15 | def run(self) -> None: 16 | for num in range(self.count): 17 | print("{process}进程ID:{id},进程名称:{name}".format(process=num, id=multiprocessing.current_process().pid, 18 | name=multiprocessing.current_process().name)) 19 | time.sleep(self.delay) 20 | 21 | 22 | def main(): 23 | for item in range(3): 24 | process = MyProcess(name="item-{id}".format(id=item), delay=1, count=10) 25 | # 多进程类start方法会调用run方法 26 | process.start() 27 | 28 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, 29 | name=multiprocessing.current_process().name)) 30 | print("主进程退出") 31 | sys.exit(0) 32 | 
33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from pymongo.collection import Collection 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy import Request 11 | 12 | class MafengwoPipeline(object): 13 | def __init__(self): 14 | mongo_client = pymongo.MongoClient(host='127.0.0.1', port=27017) 15 | self.db_data = mongo_client['mafengwo'] 16 | 17 | def process_item(self, item, spider): 18 | db_collections = Collection(self.db_data, 'mafengwo_article') 19 | db_collections.update({'from_url':item['from_url']},item,True) 20 | return item 21 | 22 | 23 | class MafengwoImagePipeline(ImagesPipeline): 24 | def get_media_requests(self, item, info): 25 | for image_url in item['image_urls']: 26 | yield Request(url=image_url) 27 | 28 | def item_completed(self, results, item, info): 29 | image_paths = [x['path'] for ok, x in results if ok] 30 | if not image_paths: 31 | pass 32 | item['image_paths'] = image_paths 33 | return item 34 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import base64 8 | import random 9 | 10 | class ProxyMiddleware(object): 11 | def __init__(self): 12 | self.proxy_info = [ 13 | {'proxy_url': 'ip4.hahado.cn:35410', 'proxy_user_pass': b'duoipbpvzyymn:tRf6NnfsBi7k0'}, 14 | {'proxy_url': 'ip4.hahado.cn:35164', 'proxy_user_pass': b'duoipcnezxjlvkv:xXuXTPES9XPwp'}, 15 | {'proxy_url': 'ip4.hahado.cn:35401', 'proxy_user_pass': b'duoipwpdlrfwc:888888'}, 16 | {'proxy_url': 'ip4.hahado.cn:35404', 'proxy_user_pass': b'duoipcnxgfzfsyp:TjgLhDqqEj0Pe'}, 17 | {'proxy_url': 'ip4.hahado.cn:35413', 'proxy_user_pass': b'duoipvriezfde:bq4RYrQiWuQzv'}, 18 | ] 19 | 20 | def process_request(self, request, spider): 21 | proxy = random.choice(self.proxy_info) 22 | request.meta['proxy'] = proxy['proxy_url'] 23 | proxy_user_pass = proxy['proxy_user_pass'] 24 | encoded_user_pass = base64.b64encode(proxy_user_pass) 25 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass.decode() 26 | # return None 27 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import base64 8 | 
9 | 10 | class MafengwoProxyMiddleware(object): 11 | #设置代理策略 12 | def process_request(self, request, spider): 13 | # proxy,主机头和端口号 14 | request.meta['proxy'] = 'http://http-dyn.abuyun.com:9020' 15 | # 用户名:密码,当前代理必须要有费用 16 | # 你自己买的代理,用户名和密码肯定和我的不一样 17 | proxy_name_pass = 'HTK32673HL02BK2D:50125D2D38937C94'.encode('utf-8') 18 | encode_pass_name = base64.b64encode(proxy_name_pass) 19 | # 将代理信息设置到头部去 20 | # 注意!!!!!Basic后面有一个空格 21 | request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode() 22 | 23 | #通过response判断下载是否成功 24 | def process_response(self, request, response, spider): 25 | if 'mafengwo.net' in request.url: 26 | return response 27 | elif response is None: 28 | return request 29 | elif response.status == 302: 30 | return request 31 | elif response.status == 403: 32 | return request 33 | elif 'flashcookie.sw' in response.text: 34 | return request 35 | else: 36 | return response 37 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | import pymongo 10 | from pymongo.collection import Collection 11 | from scrapy.pipelines.images import ImagesPipeline 12 | from scrapy import Request 13 | 14 | #存储数据 15 | class DongqiudiPipeline(object): 16 | def __init__(self): 17 | mongo_client = pymongo.MongoClient(host='192.168.7.142',port=27017) 18 | self.dongqiudi_db = mongo_client['dongqiudi_data'] 19 | def process_item(self, item, spider): 20 | dongqiudi_collection = Collection(self.dongqiudi_db,"dongqiudi") 21 | dongqiudi_collection.update({'from_url':item['from_url']},item,True) 22 | return item 23 | 24 | #下载图片 25 | class DongqiudiImagePipeline(ImagesPipeline): 26 | def get_media_requests(self, item, info): 27 | for image_url in item['image_urls']: 28 | yield Request(url=image_url,meta={'img_name':image_url,'photo_id':item['title']}) 29 | 30 | def item_completed(self, results, item, info): 31 | image_paths = [x['path'] for ok, x in results if ok] 32 | if not image_paths: 33 | pass 34 | return item 35 | 36 | def file_path(self, request, response=None, info=None): 37 | filename = './' + str(request.meta['photo_id'])+'/'+request.meta['img_name'].split("/")[-1] 38 | return filename 39 | -------------------------------------------------------------------------------- /synchronous/handle_request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | 4 | 5 | class DangdangRequest(object): 6 | def __init__(self, url, headers, callback, method="GET", need_proxy=False, fail_time=0, timeout=(5, 5)): 7 | self.callback = callback 8 | self.need_proxy = need_proxy 9 | self.fail_time = fail_time 10 | self.timeout = timeout 11 | self.headers = headers 12 | self.url = url 13 | self.method = method 14 | 15 | def __str__(self): 16 | return self.url 17 | 18 | def send_request(self): 19 | print("请求{url}".format(url=self.url)) 20 | proxy_info = {} 21 | if self.method == "GET": 22 | try: 23 | if not self.need_proxy: 24 | response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout) 25 | else: 26 | response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout, 27 | proxies=proxy_info) 28 | except Exception as e: 29 | 
print(traceback.format_exc()) 30 | return self 31 | else: 32 | return response 33 | 34 | 35 | if __name__ == '__main__': 36 | headers = { 37 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36 " 38 | } 39 | q = DangdangRequest(url="https://www.baidu.com", headers=headers, callback="hello") 40 | response = q.send_request() 41 | print(response.text) 42 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from pymongo.collection import Collection 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy import Request 11 | 12 | class MafengwoPipeline(object): 13 | def __init__(self): 14 | # mongo_client = pymongo.MongoClient(host='127.0.0.1', port=39070) 15 | mongo_client = pymongo.MongoClient(host='10.70.120.156', port=27017) 16 | self.db_data = mongo_client['oreo'] 17 | 18 | def process_item(self, item, spider): 19 | db_collections = Collection(self.db_data, 'mafengwo_article') 20 | db_collections.update({'from_url':item['from_url']},item,True) 21 | return item 22 | 23 | 24 | class MafengwoImagePipeline(ImagesPipeline): 25 | def get_media_requests(self, item, info): 26 | for image_url in item['image_urls']: 27 | yield Request(url=image_url,meta={'img_name':image_url,'photo_id':item['id']}) 28 | 29 | def item_completed(self, results, item, info): 30 | image_paths = [x['path'] for ok, x in results if ok] 31 | if not image_paths: 32 | pass 33 | #item['image_paths'] = image_paths 34 | return item 35 | 36 | def file_path(self, request, response=None, info=None): 37 | filename = './' + str(request.meta['photo_id'])+'/'+request.meta['img_name'].split("/")[-1] 38 | return filename 39 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 50 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_share.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | value = 1 5 | 6 | 7 | def send_data(conn): 8 | global value 9 | value = value + 1 10 | conn.send(value) 11 | 12 | 13 | def receive_data(conn): 14 | print("接收到的数据为:{data}".format(data=conn.recv())) 15 | 16 | 17 | def pipe_main(): 18 | # 进程通信管道 19 | conn_recv, conn_send = multiprocessing.Pipe() 20 | process_send = multiprocessing.Process(target=send_data, args=(conn_send,)) 21 | process_send.start() 22 | process_send.join() 23 | process_recv = multiprocessing.Process(target=receive_data, args=(conn_recv,)) 24 | process_recv.start() 25 | process_recv.join() 26 | 27 | 28 | def worker(dict, lock): 29 | while True: 30 | # lock.acquire() 31 | with lock: 32 | number = dict.get("ticket") 33 | if number > 0: 34 | time.sleep(1) 35 | number = number - 1 36 
| print("{}-ticket={}".format(multiprocessing.current_process().name, number)) 37 | dict.update({"ticket": number}) 38 | else: 39 | print("无票") 40 | break 41 | # lock.release() 42 | 43 | 44 | def main(): 45 | # 使用manager操作字典共享 46 | manager = multiprocessing.Manager() 47 | mgr_dict = manager.dict(ticket=5) 48 | lock = multiprocessing.Lock() 49 | print(mgr_dict) 50 | job_process = [multiprocessing.Process(target=worker, args=(mgr_dict, lock,), name="售票员-{item}".format(item=item)) 51 | for item in range(3)] 52 | for job in job_process: 53 | job.start() 54 | 55 | for end in job_process: 56 | end.join() 57 | 58 | 59 | if __name__ == '__main__': 60 | # pipe_main() 61 | main() 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/handle_mysql.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | import time 3 | import setting 4 | import csv 5 | 6 | 7 | 8 | class Handle_mysql(object): 9 | def __init__(self): 10 | self.db = pymysql.connect(host=setting.mysql_ip,port=setting.mysql_port,database=setting.mysql_database,user=setting.mysql_username,password=setting.mysql_password) 11 | self.cursor = self.db.cursor() 12 | 13 | def __del__(self): 14 | self.cursor.close() 15 | self.db.close() 16 | 17 | def handle_task(self): 18 | #获取任务关键字 19 | sql = "SELECT search_word FROM seo_fast_rankings WHERE state=1;" 20 | self.cursor.execute(sql) 21 | result = self.cursor.fetchall() 22 | return result 23 | 24 | #插入和更新数据 25 | def handle_insert_db(self,item=None): 26 | sql_insert = """ INSERT INTO seo_baidu_m_keyword_ziran (keyword,rank,crawl_date) VALUES 
("%s",'%s',"%s");""" % (item['keyword'],item['rank'],item['crawl_date']) 27 | try: 28 | self.cursor.execute(sql_insert) 29 | self.db.commit() 30 | except: 31 | pass 32 | # print(sql_insert) 33 | 34 | mysql = Handle_mysql() 35 | if __name__ == '__main__': 36 | #插入数据前先删除当日数据 37 | date = time.strftime("%Y-%m-%d", time.localtime()) 38 | sql_delete = """ DELETE FROM seo_baidu_m_keyword_ziran where crawl_date='%s'"""%date 39 | mysql.cursor.execute(sql_delete) 40 | mysql.db.commit() 41 | #导入当日数据 42 | with open('baidu_m_keyword_ziran.csv','r',encoding='utf-8') as f: 43 | csv_reader = csv.reader(f) 44 | data = next(csv_reader) 45 | for i in csv_reader: 46 | info = {} 47 | info['keyword'] = i[0] 48 | info['rank'] = i[1] 49 | info['crawl_date'] = i[2] 50 | mysql.handle_insert_db(info) 51 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/spiders/douban_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from douban.items import DoubanItem 4 | 5 | 6 | class DoubanSpiderSpider(scrapy.Spider): 7 | # scrapy项目名称 8 | name = 'douban_spider' 9 | allowed_domains = ['douban.com'] 10 | # 起始URL 11 | start_urls = ['https://movie.douban.com/top250?start=0&filter='] 12 | custom_settings = { 13 | 'USER_AGENT':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0' 14 | } 15 | 16 | # 解析方法 17 | def parse(self, response): 18 | movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li") 19 | for i_item in movie_list: 20 | douban_item = DoubanItem() 21 | douban_item['serial_number'] = i_item.xpath(".//div[@class='item']//em/text()").extract_first() 22 | douban_item['movie_name'] = i_item.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first() 23 | content = i_item.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract() 24 | for i_content in content: 25 | content_s = "".join(i_content.split()) 26 | douban_item['introduce'] = content_s 27 | douban_item['star'] = i_item.xpath(".//span[@class='rating_num']/text()").extract_first() 28 | douban_item['evaluate'] = i_item.xpath(".//div[@class='star']//span[4]/text()").extract_first() 29 | douban_item['describe'] = i_item.xpath(".//p[@class='quote']/span/text()").extract_first() 30 | # yield到pipeline,settings中需要启用,否则无法存储数据 31 | yield douban_item 32 | 33 | nextLink = response.xpath('//span[@class="next"]/link/@href').extract() 34 | # 第10页是最后一页,没有下一页的链接 35 | if nextLink: 36 | nextLink = nextLink[0] 37 | print (nextLink) 38 | yield scrapy.Request('https://movie.douban.com/top250'+nextLink, callback=self.parse) 39 | # # 递归将下一页的地址传给这个函数自己,在进行爬取 40 | -------------------------------------------------------------------------------- /login_github/handle_login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | 5 | class Login(object): 6 | def __init__(self): 7 | self.login_session = requests.session() 8 | self.header = { 9 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" 10 | } 11 | self.city_token = "" 12 | 13 | 14 | def handle_city_token(self): 15 | """ 16 | 获取city_token,为登录做准备 17 | :return:self.city_token 18 | """ 19 | login_url = "https://github.com/login" 20 | response = self.login_session.get(url=login_url,headers=self.header) 21 | city_token_search = 
re.compile(r'name="authenticity_token"\svalue="(.*?)"\s\/>') 22 | self.city_token = city_token_search.search(response.text).group(1) 23 | 24 | def handle_login_github(self): 25 | """ 26 | 执行登录 27 | :return: 登录后匹配的字符串 28 | """ 29 | login_name = input("请输入用户名:") 30 | login_password = input("请输入密码:") 31 | self.handle_city_token() 32 | #获取登录cookie 33 | self.login_session.get(url="https://github.com/manifest.json",headers=self.header) 34 | data = { 35 | "commit": "Sign in", 36 | "utf8": "✓", 37 | "authenticity_token":self.city_token, 38 | "login": login_name, 39 | "password": login_password, 40 | "webauthn-support": "supported", 41 | } 42 | session_url = "https://github.com/session" 43 | self.header['Referer'] = "https://github.com/login" 44 | # 登录 45 | self.login_session.post(url=session_url,headers=self.header,data=data) 46 | self.header.pop('Referer') 47 | #请求设置页 48 | response = self.login_session.get(url="https://github.com/settings/profile",headers=self.header) 49 | search_email = re.compile(login_name) 50 | # 登陆成功后可以获取到自己的登录名称 51 | print(search_email.search(response.text).group()) 52 | if __name__ == '__main__': 53 | github = Login() 54 | github.handle_login_github() 55 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/spiders/crawl_dongqiudi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from ..items import DongqiudiItem 4 | import time 5 | import json 6 | 7 | 8 | class CrawlDongqiudiSpider(scrapy.Spider): 9 | name = 'crawl_dongqiudi' 10 | allowed_domains = ['dongqiudi.com'] 11 | start_urls = ['http://dongqiudi.com/'] 12 | 13 | #分析懂球帝页面,通过浏览器开发者工具xhr,可以看到异步json请求 14 | def start_requests(self,time_value=None): 15 | #初始时间使用time.time()构造 16 | if time_value == None: 17 | time_value = int(time.time()) 18 | #分析页面新闻结构 19 | for item_value in [56,232,57,3,4,5,6]: 20 | #如该请求https://dongqiudi.com/api/app/tabs/web/56.json?after=1572577395&page=1 21 | #其中56为栏目编号,after为时间戳,page为页码 22 | page_url = "https://dongqiudi.com/api/app/tabs/web/%s.json?after=%s&page=1"%(item_value,time_value) 23 | yield scrapy.Request(url=page_url,callback=self.handle_page_response,dont_filter=True) 24 | 25 | #处理页码请求的返回 26 | def handle_page_response(self,response): 27 | response_dict = json.loads(response.text) 28 | #从返回中获取下一页链接 29 | next_url = response_dict.get('next') 30 | if next_url: 31 | #请求下一页 32 | yield scrapy.Request(url=next_url,callback=self.handle_page_response,dont_filter=True) 33 | 34 | #解析新闻列表 35 | news_list = response_dict.get('articles') 36 | if news_list: 37 | for item in news_list: 38 | info = {} 39 | #新闻URL 40 | info['from_url'] = item.get('url') 41 | #新闻标题 42 | info['title'] = item.get('title') 43 | #新闻发表时间 44 | info['release_time'] = item.get('published_at') 45 | yield scrapy.Request(url=info['from_url'],callback=self.handle_detail,dont_filter=True,meta=info) 46 | 47 | #处理新闻详情页 48 | def handle_detail(self,response): 49 | dongqiudi = DongqiudiItem() 50 | #作者 51 | dongqiudi['author'] = response.xpath("//header/h2/a/text()").extract_first() 52 | #内容 53 | dongqiudi['content'] = ''.join(response.xpath("//div[@class='con']/p/text()").extract()) 54 | #新闻图片 55 | dongqiudi['image_urls'] = response.xpath("//div[@class='con']/p/img/@data-src").extract() 56 | #抓取时间 57 | dongqiudi['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 58 | #新闻标题 59 | dongqiudi['title'] = response.request.meta['title'] 60 | #抓取url 61 | dongqiudi['from_url'] = 
response.request.meta['from_url'] 62 | #发表时间 63 | dongqiudi['release_time'] = response.request.meta['release_time'] 64 | #yield到pipeline中 65 | yield dongqiudi 66 | -------------------------------------------------------------------------------- /synchronous/handle_spider.py: -------------------------------------------------------------------------------- 1 | # from handle_redis import RedisQueue 2 | from handle_queue import DangdangQueue 3 | from handle_request import DangdangRequest 4 | from lxml import etree 5 | import time 6 | 7 | 8 | class Spider(object): 9 | headers = { 10 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 11 | "Chrome/86.0.4240.75 Safari/537.36 " 12 | } 13 | 14 | # queue = RedisQueue() 15 | queue = DangdangQueue() 16 | 17 | def start(self): 18 | """爬虫起始方法""" 19 | for page in range(1, 26): 20 | start_url = "http://bang.dangdang.com/books/fivestars/2-{page}".format(page=page) 21 | dangdang_request = DangdangRequest(url=start_url, callback=self.parse_item, headers=Spider.headers) 22 | Spider.queue.insert_data(data=dangdang_request) 23 | 24 | def do_request(self, request): 25 | """发送请求""" 26 | response = request.send_request() 27 | return response 28 | 29 | def parse_item(self, response): 30 | """解析数据""" 31 | data = [] 32 | html = etree.HTML(response.text) 33 | items = html.xpath("//ul[@class='bang_list']/li") 34 | for item in items: 35 | title = item.xpath(".//div[@class='name']/a/text()") 36 | if title: 37 | data.extend(title) 38 | yield data 39 | 40 | def error(self, request): 41 | """请求错误后返回队列""" 42 | request.fail_time = request.fail_time + 1 43 | if request.fail_time < 20: 44 | print("该请求异常{url}, 将该请求放回队列".format(url=request)) 45 | Spider.queue.insert_data(data=request) 46 | 47 | def schedule(self): 48 | """任务调度""" 49 | start_time = time.time() 50 | while not Spider.queue.database_empty(): 51 | dangdang_request = self.queue.get_data() 52 | if dangdang_request: 53 | print("当前调度:", dangdang_request) 54 | callback = dangdang_request.callback 55 | response = self.do_request(dangdang_request) 56 | if not isinstance(response, DangdangRequest): 57 | # 通过回调方法解析 58 | result = callback(response) 59 | for item in result: 60 | print(item) 61 | else: 62 | dangdang_request = DangdangRequest(url=response.url, headers=Spider.headers, callback=self.parse_item) 63 | # 错误处理 64 | self.error(dangdang_request) 65 | print("共耗时:", time.time()-start_time) 66 | 67 | def run(self): 68 | self.start() 69 | self.schedule() 70 | 71 | 72 | if __name__ == '__main__': 73 | s = Spider() 74 | s.run() 75 | -------------------------------------------------------------------------------- /douban_movie_top250/crawl_douban_movie_info_top250.py: -------------------------------------------------------------------------------- 1 | import re 2 | from concurrent.futures import ThreadPoolExecutor 3 | import requests 4 | from lxml import etree 5 | from handle_mongo import douban_mongo 6 | 7 | 8 | class HandleDoubanMovieTop250(object): 9 | def __init__(self): 10 | self.header = { 11 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 12 | "Accept-Encoding":"gzip, deflate, br", 13 | "Accept-Language":"zh-CN,zh;q=0.9", 14 | "Connection":"keep-alive", 15 | "Host":"movie.douban.com", 16 | "Upgrade-Insecure-Requests":"1", 17 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 18 | } 19 | self.page_url = [] 20 | 21 | 
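# A minimal, self-contained sketch of the queue + callback scheduling idea used by
# synchronous/handle_spider.py above: requests wait in a queue, the scheduler sends each one,
# and every successful response is handed to the request's callback. SimpleRequest, parse_titles
# and the two-page range below are illustrative stand-ins, not the project's
# DangdangRequest/DangdangQueue classes.
from collections import deque

import requests
from lxml import etree


class SimpleRequest(object):
    def __init__(self, url, callback, fail_time=0):
        self.url = url
        self.callback = callback
        self.fail_time = fail_time


def parse_titles(response):
    # Same extraction idea as Spider.parse_item: book titles on the dangdang five-star list
    html = etree.HTML(response.text)
    return html.xpath("//ul[@class='bang_list']/li//div[@class='name']/a/text()")


def schedule(queue, max_fail=3):
    while queue:
        request = queue.popleft()
        try:
            response = requests.get(request.url, timeout=10)
        except requests.RequestException:
            request.fail_time += 1
            if request.fail_time < max_fail:
                # put the failed request back, the same recovery step Spider.error takes
                queue.append(request)
            continue
        for title in request.callback(response):
            print(title)


if __name__ == '__main__':
    q = deque(SimpleRequest("http://bang.dangdang.com/books/fivestars/2-%d" % page, parse_titles)
              for page in range(1, 3))
    schedule(q)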
def handle_page_url(self): 22 | #通过分析页面URL可以得知 23 | #通过range构造页码变量,从0开始,到249结束,步长为25 24 | for i in range(0,250,25): 25 | url = "https://movie.douban.com/top250?start=%s"%i 26 | self.page_url.append(url) 27 | 28 | #处理请求方法 29 | def handle_request(self,url): 30 | response = requests.get(url=url,headers=self.header) 31 | return response.text 32 | 33 | 34 | #处理页码页 35 | def handle_page_detail(self,url): 36 | print(url) 37 | #处理特殊字符 38 | sub_search = re.compile(r"[\s\r\t]") 39 | response = self.handle_request(url=url) 40 | html = etree.HTML(response) 41 | #解析当前页面有多少个电影信息 42 | item_list = html.xpath("//ol[@class='grid_view']/li") 43 | for item in item_list: 44 | info = {} 45 | #电影名称,将特殊字符替换为空 46 | info['movie_name'] = sub_search.sub('',''.join(item.xpath(".//div[@class='hd']/a//span/text()"))) 47 | info['actors_information'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/p/text()"))) 48 | info['score'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/div[@class='star']/span[2]/text()"))) 49 | info['evaluate'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/div[@class='star']/span[4]/text()"))) 50 | info['describe'] = sub_search.sub('',''.join(item.xpath(".//p[@class='quote']/span/text()"))) 51 | info['from_url'] = url 52 | #数据入库 53 | douban_mongo.handle_save_data(info) 54 | 55 | #启动方法 56 | def run(self): 57 | self.handle_page_url() 58 | #创建线程池 59 | t = ThreadPoolExecutor() 60 | for i in self.page_url: 61 | t.submit(self.handle_page_detail,i) 62 | t.shutdown() 63 | 64 | #入口函数 65 | def main(): 66 | douban = HandleDoubanMovieTop250() 67 | douban.run() 68 | 69 | if __name__ == '__main__': 70 | #入口函数调用 71 | main() 72 | -------------------------------------------------------------------------------- /synchronous/spider_multiprocess.py: -------------------------------------------------------------------------------- 1 | # from handle_redis import RedisQueue 2 | import multiprocessing 3 | from handle_queue import DangdangQueue 4 | from handle_request import DangdangRequest 5 | from lxml import etree 6 | import time 7 | 8 | 9 | class Spider(object): 10 | headers = { 11 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 12 | "Chrome/86.0.4240.75 Safari/537.36 " 13 | } 14 | 15 | # queue = RedisQueue() 16 | queue = DangdangQueue() 17 | 18 | def start(self): 19 | """爬虫起始方法""" 20 | for page in range(1, 26): 21 | start_url = "http://bang.dangdang.com/books/fivestars/2-{page}".format(page=page) 22 | dangdang_request = DangdangRequest(url=start_url, callback=self.parse_item, headers=Spider.headers) 23 | Spider.queue.insert_data(data=dangdang_request) 24 | 25 | def do_request(self, request): 26 | """发送请求""" 27 | response = request.send_request() 28 | return response 29 | 30 | def parse_item(self, response): 31 | """解析数据""" 32 | data = [] 33 | html = etree.HTML(response.text) 34 | items = html.xpath("//ul[@class='bang_list']/li") 35 | for item in items: 36 | title = item.xpath(".//div[@class='name']/a/text()") 37 | if title: 38 | data.extend(title) 39 | yield data 40 | 41 | def error(self, request): 42 | """请求错误后返回队列""" 43 | request.fail_time = request.fail_time + 1 44 | if request.fail_time < 20: 45 | print("该请求异常{url}, 将该请求放回队列".format(url=request)) 46 | Spider.queue.insert_data(data=request) 47 | 48 | def handle_worker(self, request): 49 | print("{name}调度{url}".format(name=multiprocessing.current_process().name, url=request.url)) 50 | callback = request.callback 51 | response = self.do_request(request) 52 | if not 
isinstance(response, DangdangRequest): 53 | # 通过回调方法解析 54 | result = callback(response) 55 | for item in result: 56 | print(item) 57 | else: 58 | dangdang_request = DangdangRequest(url=response.url, headers=Spider.headers, callback=self.parse_item) 59 | # 错误处理 60 | self.error(dangdang_request) 61 | 62 | def schedule(self): 63 | """任务调度""" 64 | start_time = time.time() 65 | pool = multiprocessing.Pool(multiprocessing.cpu_count()) 66 | while not Spider.queue.database_empty(): 67 | dangdang_request = self.queue.get_data() 68 | if dangdang_request: 69 | pool.apply_async(func=self.handle_worker, args=(dangdang_request,)) 70 | pool.close() 71 | pool.join() 72 | print("共耗时:", time.time()-start_time) 73 | 74 | def run(self): 75 | self.start() 76 | self.schedule() 77 | 78 | 79 | if __name__ == '__main__': 80 | s = Spider() 81 | s.run() 82 | -------------------------------------------------------------------------------- /video/lishipin/crawl_lishipin.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | from lxml import etree 4 | import re 5 | 6 | class HandleLishipin(object): 7 | def __init__(self): 8 | self.header = { 9 | "Connection":"keep-alive", 10 | "Upgrade-Insecure-Requests":"1", 11 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 12 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 13 | "Accept-Encoding":"gzip, deflate, br", 14 | "Accept-Language":"zh-CN,zh;q=0.9", 15 | } 16 | 17 | def handle_html(self,url): 18 | response = requests.get(url=url,headers=self.header) 19 | return response.text 20 | 21 | if __name__ == '__main__': 22 | l = HandleLishipin() 23 | list_url = [ 24 | {"name":"新知","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=10&start=%d&sort=%d"}, 25 | {"name":"社会","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=1&start=%d&sort=%d"}, 26 | {"name":"世界","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=2&start=%d&sort=%d"}, 27 | {"name":"生活","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=5&start=%d&sort=%d"}, 28 | {"name":"娱乐","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=4&start=%d&sort=%d"}, 29 | {"name":"财富","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=3&start=%d&sort=%d"}, 30 | {"name":"美食","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=6&start=%d&sort=%d"}, 31 | {"name":"音乐","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=59&start=%d&sort=%d"}, 32 | ] 33 | for item in list_url: 34 | for i in range(0,110,10): 35 | item_url =item['item_url']%(i,i) 36 | detail_text = l.handle_html(item_url) 37 | detail_html = etree.HTML(detail_text) 38 | detail_url = detail_html.xpath("//li[@class='popularem clearfix']//a[@class='actplay']/@href") 39 | video_url_search = re.compile(r'srcUrl="(.*?)"') 40 | video_name_search = re.compile(r'(.*?)') 41 | for url in detail_url: 42 | url = "https://www.pearvideo.com/"+url 43 | video_text = l.handle_html(url) 44 | video_url = video_url_search.search(video_text).group(1) 45 | video_name = video_name_search.search(video_text).group(1) 46 | info = {} 47 | info['video_url'] = video_url 48 | info['name'] = video_name 49 | info['type'] = item['name'] 50 | info['from_url'] = url 51 | 
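# The surrounding loop only assembles metadata for each pearvideo clip (the direct MP4 address
# ends up in info['video_url']); nothing is actually written to disk. If the files themselves
# are wanted, a streamed download along the following lines would work — the save directory and
# chunk size here are illustrative assumptions, not part of the original crawler.
import os

import requests


def download_video(video_url, name, save_dir="./lishipin_videos"):
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, "%s.mp4" % name)
    # stream=True keeps the whole video from being read into memory at once
    with requests.get(video_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(path, "wb") as f:
            for chunk in response.iter_content(chunk_size=256 * 1024):
                if chunk:
                    f.write(chunk)
    return path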
info['crawl_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 52 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for douban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'douban' 13 | 14 | SPIDER_MODULES = ['douban.spiders'] 15 | NEWSPIDER_MODULE = 'douban.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'douban.middlewares.DoubanSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'douban.middlewares.ProxyMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'douban.pipelines.DoubanPipeline': 300, 69 | 'douban.pipelines.DoubanJsonPipeline': 301, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See 
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | REDIRECT_ENABLED = False 94 | HTTPERROR_ALLOWED_CODES = [302] 95 | 96 | RETRY_ENABLED = True 97 | RETRY_HTTP_CODES = [503] 98 | RETRY_TIMES = 5 99 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dongqiudi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dongqiudi' 13 | 14 | SPIDER_MODULES = ['dongqiudi.spiders'] 15 | NEWSPIDER_MODULE = 'dongqiudi.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 2 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'dongqiudi.middlewares.DongqiudiSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | # 'dongqiudi.middlewares.DongqiudiDownloaderMiddleware': 543, 57 | 'dongqiudi.middlewares.DongqiudiProxyMiddleware': 543, 58 | } 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | #设置图片保存路径 67 | IMAGES_STORE = './dongqiudi_pic' 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | 72 | ITEM_PIPELINES = { 73 | 'dongqiudi.pipelines.DongqiudiPipeline': 300, 74 | #必须设置IMAGES_STORE,否则这条中间件不起作用 75 | 'dongqiudi.pipelines.DongqiudiImagePipeline': 209 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See
https://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mafengwo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mafengwo' 13 | 14 | SPIDER_MODULES = ['mafengwo.spiders'] 15 | NEWSPIDER_MODULE = 'mafengwo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'mafengwo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 2#根据代理隧道数确定请求数 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 0.1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | # "Host":"www.mafengwo.cn", 44 | # "Connection":"keep-alive", 45 | # "Upgrade-Insecure-Requests":"1", 46 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 47 | # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 48 | # "Accept-Encoding":"gzip, deflate", 49 | # "Accept-Language":"zh-CN,zh;q=0.9", 50 | } 51 | 52 | # Enable or disable spider middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'mafengwo.middlewares.MafengwoSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 60 | DOWNLOADER_MIDDLEWARES = { 61 | # 'mafengwo.middlewares.MafengwoDownloaderMiddleware': 
543, 62 | 'mafengwo.middlewares.MafengwoProxyMiddleware': 543, 63 | } 64 | 65 | # Enable or disable extensions 66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | 'mafengwo.pipelines.MafengwoPipeline': 300, 75 | 'mafengwo.pipelines.MafengwoImagePipeline': 301, 76 | } 77 | 78 | IMAGES_STORE="./mafengwo_images" 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | DOWNLOAD_TIMEOUT = 10 101 | -------------------------------------------------------------------------------- /boss_zhipin/crawl_boss_zhipin.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urljoin 2 | import requests 3 | import pymongo 4 | from pymongo.collection import Collection 5 | import time 6 | import json 7 | from lxml import etree 8 | from concurrent.futures.thread import ThreadPoolExecutor 9 | 10 | 11 | 12 | class HandleBossZhiPin(object): 13 | def __init__(self): 14 | self.header = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 16 | } 17 | self.city_list = "" 18 | boss_client = pymongo.MongoClient(host="127.0.0.1", port=27017) 19 | self.boss_db = boss_client['boss'] 20 | self.city_list = [] 21 | 22 | def handle_city(self): 23 | city_api_url = "https://www.zhipin.com/wapi/zpCommon/data/city.json" 24 | city_response = self.handle_request(method='GET',url=city_api_url) 25 | for province in json.loads(city_response)['zpData']['cityList']: 26 | for city in province['subLevelModelList']: 27 | self.city_list.append(city) 28 | 29 | def handle_job_request(self,job,city): 30 | print(city['name']) 31 | for page in range(1,11): 32 | job_url = "https://www.zhipin.com/c%s/?query=%s&page=%s"%(city['code'],job,page) 33 | print(job_url) 34 | response = self.handle_request(method='GET',url=job_url) 35 | html = etree.HTML(response) 36 | job_list = html.xpath("//div[@class='job-list']/ul/li") 37 | for item in job_list: 38 | info = {} 39 | info['job_title'] = item.xpath(".//div[@class='job-title']/text()")[0] 40 | if '实习' in info['job_title']: 41 | continue 42 | info['price'] = item.xpath(".//span[@class='red']/text()")[0] 43 | describe_1 = item.xpath(".//div[@class='info-primary']/p/text()") 44 | if len(describe_1) == 3: 45 | info['location'] = describe_1[0] 46 | info['working_life'] = describe_1[1] 47 | info['education'] = 
describe_1[2] 48 | info['company_name'] = item.xpath(".//div[@class='info-company']//h3[@class='name']/a/text()")[0] 49 | describe_2 = item.xpath(".//div[@class='info-company']//p/text()") 50 | info['company_type'] = describe_2[0] 51 | info['job_id'] = urljoin("https://www.zhipin.com",item.xpath(".//h3/a/@href")[0]) 52 | info['city'] = city['name'] 53 | self.handle_save_data(item=info) 54 | if not html.xpath("//div[@class='page']/a[@class='next']"): 55 | break 56 | 57 | 58 | def handle_job_detail(self,response): 59 | pass 60 | 61 | def handle_save_data(self,item): 62 | boss_collection = Collection(self.boss_db, "boss_data") 63 | boss_collection.update({"job_id": item['job_id']}, item, True) 64 | 65 | def handle_request(self,method,url,data=None): 66 | while True: 67 | proxy="http://HTK32673HL02BK2D:50125D2D38937C94@http-dyn.abuyun.com:9020" 68 | proxies = { 69 | "http":proxy, 70 | "https":proxy 71 | } 72 | try: 73 | if method == "GET": 74 | response = requests.get(url=url,headers=self.header,proxies=proxies) 75 | elif method == "POST": 76 | response = requests.post(url=url,headers=self.header,data=data,proxies=proxies,timeout=3) 77 | except Exception as e: 78 | print(e) 79 | time.sleep(2) 80 | continue 81 | else: 82 | return response.text 83 | 84 | def run(self): 85 | self.handle_city() 86 | t = ThreadPoolExecutor(max_workers=3) 87 | for city in self.city_list: 88 | t.submit(self.handle_job_request(job='python',city=city)) 89 | t.shutdown() 90 | 91 | def main(): 92 | boss = HandleBossZhiPin() 93 | boss.run() 94 | 95 | if __name__ == '__main__': 96 | main() 97 | -------------------------------------------------------------------------------- /dasouche/handle_dasouche.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import re 4 | import pymongo 5 | from pymongo.collection import Collection 6 | from concurrent.futures.thread import ThreadPoolExecutor 7 | 8 | 9 | class HandleDaSouChe(object): 10 | def __init__(self): 11 | #页码请求URL 12 | self.page_url = "https://aolai.souche.com/v1/searchApi/searchCar.json?_security_token=undefined" 13 | self.header = { 14 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 15 | } 16 | self.item_url_list = [] 17 | mongo_client = pymongo.MongoClient(host="10.70.120.156", port=27017) 18 | self.db_data = mongo_client['oreo'] 19 | 20 | def handle_save_data(self,item): 21 | db_collection = Collection(self.db_data, 'dasouche_data') 22 | db_collection.update({'carId':item['carId']},item,True) 23 | 24 | def handle_page(self): 25 | for page in range(1,5): 26 | #构造请求数据POST,每页可现实500条数据,共4页 27 | data = { 28 | "keyword":"", 29 | "brandCode":"", 30 | "seriesCode":"", 31 | "price":"", 32 | "carModel":"", 33 | "carAge":"", 34 | "mileage":"", 35 | "gearboxType":"", 36 | "displacement":"", 37 | "emissionStandard":"", 38 | "bodyColor":"", 39 | "fuelType":"", 40 | "seatingCapacity":"", 41 | "drivingMode":"", 42 | "country":"", 43 | "pageNo":page, 44 | "pageSize":"500", 45 | "from":"pc", 46 | "cityCode":"", 47 | "shopCode":"", 48 | "sort":"newsOnShelf", 49 | } 50 | page_result = self.handle_request(method='POST',url=self.page_url,data=data) 51 | for item in json.loads(page_result)['data']['items']: 52 | self.item_url_list.append(item['detailUrl']) 53 | 54 | #处理详情页 55 | def handle_detail(self,url): 56 | id_search = re.compile(r"carId=(.*?)&shopCode=(\d+)") 57 | car_id = id_search.search(url).group(1) 58 | shop_id = 
id_search.search(url).group(2) 59 | #车辆详情信息 60 | car_detail_url = "https://aolai.souche.com//v1/carDetailsApi/carDetailInfo.json?carId=%s"%car_id 61 | car_detail = self.handle_request(method='GET',url=car_detail_url) 62 | car_detail_result = json.loads(car_detail)['data'] 63 | #售卖商店信息 64 | shop_detail_url = "https://aolai.souche.com//v1/shopApi/queryTangecheShopInfo.json?carId=%s&citycode=%s&shopCode=%s"%(car_id,car_detail_result['baseCarInfoView']['cityCode'],shop_id) 65 | shop_detail_result = self.handle_request(method='GET',url=shop_detail_url) 66 | car_detail_result.update(json.loads(shop_detail_result)['data']) 67 | #车辆厂商配置信息 68 | car_config_url = "https://aolai.souche.com/v1/carDetailsApi/carConfigDetailInfo.json?_security_token=undefined&carId=%s"%car_id 69 | car_config_result = self.handle_request(method='GET',url=car_config_url) 70 | car_detail_result.update(json.loads(car_config_result)['data']) 71 | car_detail_result['from_url'] = url 72 | self.handle_save_data(car_detail_result) 73 | 74 | 75 | 76 | def handle_request(self,method,url,data=None): 77 | if method == 'POST': 78 | response = requests.post(url=url,headers=self.header,data=data) 79 | return response.text 80 | elif method == 'GET': 81 | response = requests.get(url=url,headers=self.header) 82 | return response.text 83 | 84 | 85 | def run(self): 86 | self.handle_page() 87 | t = ThreadPoolExecutor() 88 | for url in self.item_url_list: 89 | t.submit(self.handle_detail,url) 90 | t.shutdown() 91 | 92 | 93 | def main(): 94 | dasouche = HandleDaSouChe() 95 | dasouche.run() 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /lagou/crawl_lagou_job_old.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | import time 5 | import multiprocessing 6 | from handle_mysql import lagou_mysql 7 | import random 8 | 9 | 10 | 11 | class HandleLaGou(object): 12 | def __init__(self): 13 | self.lagou_session = requests.session() 14 | self.header = { 15 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36', 16 | } 17 | self.city_list = "" 18 | 19 | def handle_city(self): 20 | city_search = re.compile(r'zhaopin/">(.*?)') 21 | city_url = "https://www.lagou.com/jobs/allCity.html" 22 | city_result = self.handle_request(method='GET',url=city_url) 23 | self.city_list = city_search.findall(city_result) 24 | #清除cookie 25 | self.lagou_session.cookies.clear() 26 | 27 | def handle_city_job(self,city): 28 | for page in range(1,31): 29 | data = { 30 | "pn":str(page), 31 | "kd":"python", 32 | } 33 | job_index_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=" 34 | self.handle_request(method='GET',url=job_index_url) 35 | page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"%city 36 | self.header['Referer'] = job_index_url.encode() 37 | job_result = self.handle_request(method='POST',url=page_url,data=data) 38 | try: 39 | lagou_data = json.loads(job_result) 40 | except: 41 | continue 42 | else: 43 | job_list = lagou_data['content']['positionResult']['result'] 44 | if job_list: 45 | for job in job_list: 46 | job['crawl_date'] = time.strftime("%Y-%m-%d", time.localtime()) 47 | lagou_mysql.insert_item(job) 48 | else: 49 | break 50 | 51 | def handle_request(self,method,url,data=None): 52 | while True: 53 | proxyinfo = "http://%s:%s@%s:%s" 
%('H1V32R6470A7G90D','CD217C660A9143C3','http-dyn.abuyun.com','9020') 54 | proxy = { 55 | "http": proxyinfo, 56 | "https": proxyinfo, 57 | } 58 | 59 | try: 60 | if method == "GET": 61 | response = self.lagou_session.get(url=url,headers=self.header,proxies=proxy,timeout=6) 62 | elif method == "POST": 63 | response = self.lagou_session.post(url=url,headers=self.header,data=data,proxies=proxy,timeout=6) 64 | except Exception as e: 65 | print(e) 66 | else: 67 | if '您操作太频繁,请稍后再访问' in response.text: 68 | print('频繁') 69 | self.lagou_session.cookies.clear() 70 | # job_index_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=" 71 | # self.handle_request(method='GET',url=job_index_url) 72 | # time.sleep(random.choice(range(3,11))) 73 | time.sleep(1) 74 | continue 75 | elif '爬虫行为' in response.text: 76 | print('爬虫') 77 | self.lagou_session.cookies.clear() 78 | time.sleep(1) 79 | # time.sleep(random.choice(range(3,11))) 80 | continue 81 | else: 82 | return response.text 83 | 84 | def run(self): 85 | self.handle_city() 86 | print(self.city_list) 87 | # for city in self.city_list: 88 | # self.handle_city_job(city=city) 89 | pool = multiprocessing.Pool(2) 90 | for city in self.city_list: 91 | pool.apply_async(self.handle_city_job,args=(city,)) 92 | pool.close() 93 | pool.join() 94 | 95 | 96 | def main(): 97 | lagou = HandleLaGou() 98 | lagou.run() 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mafengwo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mafengwo' 13 | 14 | SPIDER_MODULES = ['mafengwo.spiders'] 15 | NEWSPIDER_MODULE = 'mafengwo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 0.5 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | # "Host":"www.mafengwo.cn", 44 | # "Connection":"keep-alive", 45 | # "Upgrade-Insecure-Requests":"1", 46 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 47 | # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 48 | # "Accept-Encoding":"gzip, deflate", 49 | # "Accept-Language":"zh-CN,zh;q=0.9", 50 | } 51 | 52 | # Enable or disable spider middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'mafengwo.middlewares.MafengwoSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 60 | DOWNLOADER_MIDDLEWARES = { 61 | # 'mafengwo.middlewares.MafengwoDownloaderMiddleware': 543, 62 | 'mafengwo.middlewares.MafengwoProxyMiddleware': 543, 63 | } 64 | 65 | # Enable or disable extensions 66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | 'mafengwo.pipelines.MafengwoPipeline': 300, 75 | #'mafengwo.pipelines.MafengwoImagePipeline': 301, 76 | } 77 | 78 | IMAGES_STORE="./mafengwo_images" 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # 
See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | DOWNLOAD_TIMEOUT = 10 101 | IMAGES_EXPIRES = 90 #90天内抓取的都不会被重抓 102 | RETRY_TIMES = 100 103 | # LOG_LEVEL = 'INFO' 104 | proxy_url = '代理库URL' 105 | -------------------------------------------------------------------------------- /lagou/handle_mysql.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy.ext.declarative import declarative_base 3 | from sqlalchemy import Column,Integer,String,Float,Date 4 | from sqlalchemy.orm import sessionmaker 5 | import time 6 | 7 | 8 | #创建数据库连接 9 | engine = create_engine("mysql+pymysql://root:abcd1234@127.0.0.1:3306/lagou?charset=utf8") 10 | 11 | #声明一个基类 12 | Base = declarative_base() 13 | 14 | #操作数据库需要使用session 15 | Session = sessionmaker(bind=engine) 16 | 17 | class Lagoutables(Base): 18 | __tablename__ = 'lagou_data' 19 | 20 | #id 21 | id = Column(Integer,primary_key=True,autoincrement=True) 22 | #岗位ID 23 | positionId = Column(Integer,nullable=False) 24 | #经度 25 | longitude = Column(Float,nullable=False) 26 | #纬度 27 | latitude = Column(Float,nullable=False) 28 | #岗位名称 29 | positionName = Column(String(length=50),nullable=False) 30 | #工作年限 31 | workYear = Column(String(length=20),nullable=False) 32 | #学历 33 | education = Column(String(length=20),nullable=False) 34 | #岗位性质 35 | jobNature = Column(String(length=20),nullable=True) 36 | #公司类型 37 | financeStage = Column(String(length=30),nullable=True) 38 | #公司规模 39 | companySize = Column(String(length=30),nullable=True) 40 | #业务方向 41 | industryField = Column(String(length=30),nullable=True) 42 | #所在城市 43 | city = Column(String(length=10),nullable=False) 44 | #岗位标签 45 | positionAdvantage = Column(String(length=200),nullable=True) 46 | #公司简称 47 | companyShortName = Column(String(length=50),nullable=True) 48 | #公司全称 49 | companyFullName = Column(String(length=200),nullable=True) 50 | #公司所在区 51 | district = Column(String(length=20),nullable=True) 52 | #公司福利标签 53 | companyLabelList = Column(String(length=200),nullable=True) 54 | #工资 55 | salary = Column(String(length=20),nullable=False) 56 | #抓取日期 57 | crawl_date = Column(Date,nullable=False) 58 | 59 | #创建表 60 | # Lagoutables.metadata.create_all(engine) 61 | 62 | class HandleLagouData(object): 63 | def __init__(self): 64 | self.mysql_session = Session() 65 | self.item = Lagoutables() 66 | 67 | def insert_item(self,item): 68 | date = time.strftime("%Y-%m-%d", time.localtime()) 69 | data = Lagoutables( 70 | # 岗位ID 71 | positionId = item['positionId'], 72 | # 经度 73 | longitude = item['longitude'], 74 | # 纬度 75 | latitude = item['latitude'], 76 | # 岗位名称 77 | positionName = item['positionName'], 78 | # 工作年限 79 | workYear = item['workYear'], 80 | # 学历 81 | education = item['education'], 82 | # 岗位性质 83 | jobNature = item['jobNature'], 84 | # 公司类型 85 | financeStage = item['financeStage'], 86 | # 公司规模 87 | companySize = item['companySize'], 88 | # 业务方向 89 | industryField = item['industryField'], 90 | # 所在城市 91 | city = item['city'], 92 | # 岗位标签 93 | positionAdvantage = item['positionAdvantage'], 94 | # 公司简称 95 | companyShortName = item['companyShortName'], 96 | # 公司全称 97 | companyFullName = item['companyFullName'], 98 | # 公司所在区 99 | district = item['district'], 
100 | # 公司福利标签 101 | companyLabelList = ','.join(item['companyLabelList']), 102 | salary = item['salary'], 103 | # 抓取日期 104 | crawl_date = item['crawl_date'] 105 | ) 106 | query_result = self.mysql_session.query(Lagoutables).filter(Lagoutables.crawl_date==date,Lagoutables.positionId==item['positionId']).first() 107 | if query_result: 108 | print('该岗位信息已存在%s:%s:%s'%(item['positionId'],item['city'],item['positionName'])) 109 | else: 110 | self.mysql_session.add(data) 111 | self.mysql_session.commit() 112 | print('新增岗位信息%s'%item['positionId']) 113 | return self.item 114 | 115 | lagou_mysql = HandleLagouData() 116 | # item = {'positionId':6009711} 117 | # lagou_mysql.insert_item(item) 118 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/baidu_m_keyword.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import re 3 | import requests 4 | from lxml import etree 5 | from concurrent.futures import ThreadPoolExecutor 6 | from baidu_m_keyword_ziran.handle_mysql import mysql 7 | from baidu_m_keyword_ziran.handle_mongo import mongo 8 | import time 9 | 10 | 11 | class Handle_baidu_m(object): 12 | def __init__(self): 13 | self.header = { 14 | "Host":"m.baidu.com", 15 | "Connection":"keep-alive", 16 | "Upgrade-Insecure-Requests":"1", 17 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36", 18 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 19 | "Accept-Encoding":"gzip, deflate", 20 | "Accept-Language":"zh-CN,zh;q=0.9", 21 | } 22 | 23 | #处理标题中的特殊字符 24 | def handle_title(self,title): 25 | search = re.compile('"|“|”|{|}') 26 | search_list = search.findall(title) 27 | for value in search_list: 28 | return re.sub(search,urllib.parse.quote(value),title) 29 | else: 30 | return title 31 | 32 | #处理任务 33 | def handle_task(self,keyword): 34 | print(keyword) 35 | result = {} 36 | result_list = [] 37 | result['keyword'] = keyword 38 | url_list = ["http://m.baidu.com/s?pn=0&word="+keyword,"http://m.baidu.com/s?pn=10&word="+keyword,"http://m.baidu.com/s?pn=20&word="+keyword] 39 | for url in url_list: 40 | response = requests.get(url=url,headers=self.header) 41 | baidu_html = etree.HTML(response.text) 42 | item_list = baidu_html.xpath("//div[@id='results']/div") 43 | for item in item_list: 44 | info = {} 45 | #获取标题 46 | title = item.xpath(".//span[contains(@class,'title')]//text()|.//header[@class='c-row']/a/h3[@class='c-title']//text()") 47 | if title: 48 | info['title'] = self.handle_title(''.join(title)).replace("'","") 49 | if '百度百科' in info['title']: 50 | info['target_url'] = "https://wapbaike.baidu.com/item/"+keyword 51 | if '其他人还在搜' in info['title']: 52 | continue 53 | if '相关词语' in info['title']: 54 | continue 55 | if '相关平台' in info['title']: 56 | continue 57 | if '相关品牌' in info['title']: 58 | continue 59 | if '相关网站' in info['title']: 60 | continue 61 | if keyword+' - 资讯' in info['title']: 62 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=realtime&word='+keyword 63 | if keyword+' - 视频' in info['title']: 64 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=video&atn=index&tn=vsearch&word='+keyword 65 | if keyword+' - 小视频' in info['title']: 66 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=xsp&atn=index&tn=vsearch&word='+keyword 67 | else: 68 | target_url = 
eval(item.xpath("./@data-log")[0].encode('utf-8').decode())['mu'] 69 | if target_url: 70 | info['target_url'] = target_url 71 | else: 72 | if '_企业信息' in info['title']: 73 | info['target_url'] = item.xpath("//a[@class='c-blocka']/@data-url")[0] 74 | result_list.append(info) 75 | else: 76 | continue 77 | result['rank'] = result_list 78 | result['crawl_time'] = time.strftime("%Y-%m-%d", time.localtime()) 79 | print(result) 80 | # mongo.insert_item_in_db('baidu_m_keyword_ziran',result) 81 | # mysql.handle_insert_db(result) 82 | 83 | if __name__ == '__main__': 84 | baidu_m = Handle_baidu_m() 85 | # baidu_m.handle_task('盐城二手奥迪a1') 86 | #线程池 87 | t = ThreadPoolExecutor() 88 | thread_list = [] 89 | #获取任务 90 | task = mysql.handle_task() 91 | for keyword in task: 92 | thread = t.submit(baidu_m.handle_task,keyword[0]) 93 | thread_list.append(thread) 94 | t.shutdown() 95 | # print([thread.result() for thread in thread_list]) 96 | -------------------------------------------------------------------------------- /kolesa/crawl_kolesa.py: -------------------------------------------------------------------------------- 1 | import re 2 | from lxml import etree 3 | import requests 4 | import json 5 | from concurrent.futures import ThreadPoolExecutor 6 | import multiprocessing 7 | from handle_mongo import kolesa_mongo 8 | 9 | class Crawl_kolesa(object): 10 | def __init__(self): 11 | #首页URL 12 | self.index_url = "https://kolesa.kz/cars/" 13 | self.header = { 14 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 15 | "Accept-Encoding":"gzip, deflate, br", 16 | "Accept-Language":"zh-CN,zh;q=0.9", 17 | "Connection":"keep-alive", 18 | "Host":"kolesa.kz", 19 | "Upgrade-Insecure-Requests":"1", 20 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36", 21 | } 22 | self.brand_list_url = "" 23 | 24 | #处理请求方法 25 | def handle_request(self,url): 26 | response = requests.get(url=url,headers=self.header) 27 | return response.text 28 | 29 | #处理品牌方法 30 | def handle_brand(self): 31 | response = self.handle_request(url=self.index_url) 32 | html = etree.HTML(response) 33 | #解析品牌列表 34 | self.brand_list_url = html.xpath("//div[@class='cross-links'][2]/div[@class='cross-links-container']/ul[@class='col-sm-4 cross-links-list']/li/a/@href") 35 | 36 | #解析品牌筛选条件下的页码页 37 | def handle_brand_page(self,url): 38 | detail_info_search = re.compile(r"listing.items.push\((.*?)\);") 39 | #网站仅显示1000页 40 | for page in range(1,1001): 41 | #https://kolesa.kz/cars/gaz/?sort_by=add_date-asc&page=2 42 | #构造品牌页码URL 43 | brand_url = "https://kolesa.kz%s?sort_by=add_date-asc&page=%s"%(url,page) 44 | print(brand_url) 45 | #请求品牌页码页 46 | response = self.handle_request(url=brand_url) 47 | #每页的详情数据 48 | detail_list = detail_info_search.findall(response) 49 | if detail_list: 50 | for detail in detail_list: 51 | detail = json.loads(detail) 52 | detail_info = {} 53 | detail_info['car_name'] = detail.get("name",None) 54 | detail_info['id'] = detail.get("id",None) 55 | detail_info['car_model'] = detail['attributes']['model'] 56 | detail_info['car_brand'] = detail['attributes']['brand'] 57 | detail_info['price'] = detail.get("unitPrice",None) 58 | detail_info['from_url'] = detail.get("url",None) 59 | #对接mongo 60 | kolesa_mongo.handle_save_task(detail_info) 61 | 62 | #处理详情页 63 | def handle_detail(self,item): 64 | response = self.handle_request(item['from_url']) 65 | html = etree.HTML(response) 66 | item['year'] 
= html.xpath("//span[@class='year']/text()")[0].strip() 67 | item_list = html.xpath("//div[@class='offer__parameters']/dl") 68 | for i in item_list: 69 | name = i.xpath("./dt/span/text()")[0].strip() 70 | if name == "Пробег": 71 | #公里数 72 | item['mileage'] = i.xpath("./dd/text()")[0].strip() 73 | elif name == "Коробка передач": 74 | #变速箱 75 | item['gearbox'] = i.xpath("./dd/text()")[0].strip() 76 | elif name == "Руль": 77 | #方向盘方向 78 | item['steering_wheel'] = i.xpath("./dd/text()")[0].strip() 79 | if not item.get('mileage'): 80 | item['mileage'] = 'no data' 81 | if not item.get('gearbox'): 82 | item['geargox'] = 'no data' 83 | if not item.get('streering_wheel'): 84 | item['steering_wheel'] = 'no data' 85 | #保存数据 86 | kolesa_mongo.handle_save_data(item) 87 | 88 | 89 | #处理任务方法 90 | def handle_task(self): 91 | self.handle_brand() 92 | print("处理品牌") 93 | t = ThreadPoolExecutor() 94 | for url in self.brand_list_url: 95 | t.submit(self.handle_brand_page,url) 96 | t.shutdown() 97 | 98 | #处理最终数据方法 99 | def handle_data(self): 100 | t = ThreadPoolExecutor() 101 | while True: 102 | task = kolesa_mongo.handle_get_task() 103 | if task: 104 | t.submit(self.handle_detail, task) 105 | else: 106 | break 107 | t.shutdown() 108 | 109 | #爬虫启动方法 110 | def run(self): 111 | m1 = multiprocessing.Process(target=self.handle_task) 112 | m1.start() 113 | m1.join() 114 | 115 | m2 = multiprocessing.Process(target=self.handle_data) 116 | m2.start() 117 | m2.join() 118 | 119 | 120 | 121 | 122 | 123 | if __name__ == '__main__': 124 | kolesa = Crawl_kolesa() 125 | kolesa.run() 126 | -------------------------------------------------------------------------------- /lagou/crawl_lagou_job_new.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | import time 5 | import multiprocessing 6 | from handle_mysql import lagou_mysql 7 | 8 | class HandleLaGou(object): 9 | def __init__(self): 10 | #使用session保存cookie信息 11 | self.lagou_session = requests.session() 12 | self.header = { 13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36', 14 | } 15 | self.city_list = "" 16 | 17 | def handle_request(self,method,url,data=None,info=None): 18 | ''' 19 | 处理请求方法 20 | :param method: 请求方法 21 | :param url: 请求url 22 | :param data: post请求的数据 23 | :return: 数据入库 24 | ''' 25 | # 由于代理不稳定,所以使用while循环 26 | while True: 27 | # 动态版阿布云代理 28 | proxyinfo = "http://%s:%s@%s:%s" %('H1V32R6470A7G90D','CD217C660A9143C3','http-dyn.abuyun.com','9020') 29 | proxy = { 30 | "http": proxyinfo, 31 | "https": proxyinfo, 32 | } 33 | try: 34 | if method == "GET": 35 | response = self.lagou_session.get(url=url,headers=self.header,proxies=proxy,timeout=6) 36 | elif method == "POST": 37 | response = self.lagou_session.post(url=url,headers=self.header,data=data,proxies=proxy,timeout=6) 38 | except Exception as e: 39 | print(e) 40 | else: 41 | # 由于反爬虫造成的continue 42 | if '频繁' in response.text: 43 | print('频繁') 44 | # 首先清除当前存在的cookie信息 45 | self.lagou_session.cookies.clear() 46 | # 重新请求cookie信息,并休眠10秒 47 | self.lagou_session.get( 48 | url="https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info, 49 | headers=self.header) 50 | time.sleep(10) 51 | continue 52 | elif '错误网关' in response.text: 53 | print('错误网关') 54 | time.sleep(1) 55 | continue 56 | elif '页面加载中' in response.text: 57 | print('页面加载中') 58 | time.sleep(2) 59 | continue 60 | else: 61 | return response.text 62 
| 63 | def handle_city(self): 64 | ''' 65 | 获取拉勾网岗位信息城市 66 | :return: 城市列表 67 | ''' 68 | city_search = re.compile(r'zhaopin/">(.*?)') 69 | city_url = "https://www.lagou.com/jobs/allCity.html" 70 | city_result = self.handle_request(method='GET',url=city_url) 71 | self.city_list = city_search.findall(city_result) 72 | #清除cookie 73 | self.lagou_session.cookies.clear() 74 | 75 | def handle_city_job(self,city): 76 | ''' 77 | :param city: 城市信息 78 | :return: 最终岗位数据,存储到Mysql 79 | ''' 80 | #发出第一个请求,获取cookies信息和页码信息 81 | first_request_url="https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="%city 82 | first_response = self.handle_request(method='GET',url=first_request_url) 83 | total_page_search = re.compile(r'class="span\stotalNum">(\d+)') 84 | try: 85 | total_page = total_page_search.search(first_response).group(1) 86 | #由于无岗位信息而return 87 | except Exception as e: 88 | return 89 | else: 90 | #经过分析,每个地区最多显示30页 91 | for i in range(1,int(total_page)+1): 92 | data = { 93 | "pn":i, 94 | "kd":"python" 95 | } 96 | #请求岗位信息时必须带上Referer 97 | referer_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="%city 98 | self.header["Referer"]=referer_url.encode() 99 | page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"%city 100 | response = self.handle_request(method='POST',url=page_url,data=data,info=city) 101 | lagou_data = json.loads(response) 102 | job_list = lagou_data['content']['positionResult']['result'] 103 | if job_list: 104 | for job in job_list: 105 | job['crawl_date'] = time.strftime("%Y-%m-%d", time.localtime()) 106 | lagou_mysql.insert_item(job) 107 | 108 | if __name__ == '__main__': 109 | lagou = HandleLaGou() 110 | lagou.handle_city() 111 | print(lagou.city_list) 112 | pool = multiprocessing.Pool(2) 113 | for city in lagou.city_list: 114 | pool.apply_async(lagou.handle_city_job,args=(city,)) 115 | pool.close() 116 | pool.join() 117 | # for city in lagou.city_list: 118 | # lagou.handle_city_job(city) 119 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/handle_task.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import requests 4 | import re 5 | import json 6 | from handle_mongo import mongo 7 | from settings import proxy_url 8 | from concurrent.futures.thread import ThreadPoolExecutor 9 | import multiprocessing 10 | 11 | 12 | class HandleMaFengWoTask(object): 13 | def __init__(self): 14 | self.header = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 16 | } 17 | self.proxy_list = [] 18 | 19 | def handle_proxy(self): 20 | response = requests.get(url=proxy_url) 21 | data = json.loads(response.text) 22 | sum = 0 23 | #每请求一次加入200个代理 24 | for proxy in data['proxys']: 25 | sum = sum + 1 26 | if sum > 200: 27 | break 28 | proxy_dict = { 29 | "http": proxy['proxy'], 30 | "https": proxy['proxy'] 31 | } 32 | self.proxy_list.append(proxy_dict) 33 | 34 | 35 | #最新游记 36 | def handle_new_article(self,page): 37 | article_url_search = re.compile(r'a\shref="/i/(\d+)\.html"') 38 | info = {} 39 | info['flag'] = 'GET' 40 | info['url'] = 'https://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":3,"objid":0,"page":%s,"ajax":1,"retina":0}'%page 41 | print(info['url']) 42 | new_article = self.handle_request(info) 43 | try: 44 | html = 
json.loads(new_article)['data']['html'] 45 | except: 46 | return 47 | article_url_list = article_url_search.findall(html) 48 | for article_id in set(article_url_list): 49 | insert_mongo = {} 50 | insert_mongo['id'] = article_id 51 | insert_mongo['url'] = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={"iid":"%s"}'%article_id 52 | insert_mongo['item_type'] = 'head_item' 53 | print(insert_mongo) 54 | mongo.insert_task(insert_mongo) 55 | 56 | #热门游记 57 | def handle_hot_article(self,page): 58 | article_url_search = re.compile(r'a\shref="/i/(\d+)\.html"') 59 | info = {} 60 | info['flag'] = 'GET' 61 | info['url'] = 'https://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":0,"objid":0,"page":%s,"ajax":1,"retina":0}' % page 62 | print(info['url']) 63 | new_article = self.handle_request(info) 64 | try: 65 | html = json.loads(new_article)['data']['html'] 66 | except: 67 | return 68 | article_url_list = article_url_search.findall(html) 69 | for article_id in set(article_url_list): 70 | insert_mongo = {} 71 | insert_mongo['id'] = article_id 72 | insert_mongo[ 73 | 'url'] = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={"iid":"%s"}' % article_id 74 | insert_mongo['item_type'] = 'head_item' 75 | print(insert_mongo) 76 | mongo.insert_task(insert_mongo) 77 | 78 | def handle_new_column(self): 79 | column_url_search = re.compile(r'/traveller/article.php\?id=\d+') 80 | for i in range(0,2000,10): 81 | info= {} 82 | info['flag'] = 'GET' 83 | info['url'] = 'https://www.mafengwo.cn/traveller/ajax.php?action=getMoreArticles&sort=ctime&start=%s'%i 84 | new_column = self.handle_request(info) 85 | html = json.loads(new_column)['html'] 86 | column_list = column_url_search.findall(html) 87 | for column in set(column_list): 88 | url = 'https://www.mafengwo.cn'+column 89 | print(url) 90 | break 91 | 92 | def handle_hot_column(self): 93 | column_url_search = re.compile(r'/traveller/article.php\?id=\d+') 94 | for i in range(0,2000,10): 95 | info= {} 96 | info['flag'] = 'GET' 97 | info['url'] = 'https://www.mafengwo.cn/traveller/ajax.php?action=getMoreArticles&sort=hot&start=%s'%i 98 | new_column = self.handle_request(info) 99 | html = json.loads(new_column)['html'] 100 | column_list = column_url_search.findall(html) 101 | for column in set(column_list): 102 | url = 'https://www.mafengwo.cn'+column 103 | print(url) 104 | break 105 | 106 | def handle_request(self,info): 107 | #判断代理数量,如果小于10则更新代理 108 | if len(self.proxy_list)<10: 109 | self.handle_proxy() 110 | if info['flag'] == 'GET': 111 | while True: 112 | try: 113 | response = requests.get(url=info['url'],headers=self.header,proxies=self.proxy_list.pop(0),timeout=6) 114 | except Exception as e: 115 | print(e) 116 | time.sleep(2) 117 | continue 118 | else: 119 | return response.text 120 | elif info['flag'] == 'POST': 121 | response = requests.post(url=info['url'],headers=self.header,data=info['data'],proxies=self.proxy_list.pop(0),timeout=6) 122 | return response.text 123 | 124 | #最新游记处理进程 125 | def process_1(self): 126 | t1 = ThreadPoolExecutor() 127 | for page in range(1,8): 128 | print(page) 129 | t1.submit(self.handle_new_article,page) 130 | t1.shutdown() 131 | 132 | #热门游记处理进程 133 | def process_2(self): 134 | t2 = ThreadPoolExecutor() 135 | for page in range(1,8): 136 | print(page) 137 | t2.submit(self.handle_hot_article,page) 138 | t2.shutdown() 139 | # self.handle_new_column() 140 | # self.handle_hot_column() 141 | 142 | def run(self): 143 | m1 = multiprocessing.Process(target=self.process_1) 144 | m2 = 
multiprocessing.Process(target=self.process_2) 145 | m1.start() 146 | m2.start() 147 | m1.join() 148 | m2.join() 149 | 150 | def main(): 151 | mafengwo_task = HandleMaFengWoTask() 152 | mafengwo_task.run() 153 | 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/spiders/crawl_mafengwo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import re 4 | import time 5 | import scrapy 6 | from ..items import MafengwoItem 7 | from mafengwo.handle_mongo import mongo 8 | 9 | 10 | class CrawlMafengwoSpider(scrapy.Spider): 11 | name = 'crawl_mafengwo' 12 | allowed_domains = ['mafengwo.cn'] 13 | 14 | #从task库中取出任务 15 | def start_requests(self): 16 | for i in range(1): 17 | task = mongo.get_task() 18 | #如果有任务则执行 19 | if task: 20 | if '_id' in task: 21 | task.pop('_id') 22 | print(task) 23 | if task['item_type'] == 'head_item': 24 | yield scrapy.Request(url=task['url'],callback=self.handle_detail_head,dont_filter=True,meta=task) 25 | elif task['item_type'] == 'article_item': 26 | yield scrapy.Request(url=task['url'],callback=self.handle_detail,dont_filter=True,meta=task) 27 | 28 | #解析美篇游记的头部信息 29 | def handle_detail_head(self,response): 30 | read_comment_search = re.compile(r'(.*?)') 31 | name_search = re.compile(r'class="per_name"\stitle="(.*?)">') 32 | star_search = re.compile(r'(\d+)收藏') 33 | release_time_search = re.compile(r'(.*?)') 34 | html = json.loads(response.text)['data']['html'] 35 | info = {} 36 | read_comment = read_comment_search.search(html).group(1).split('/') 37 | info['read_sum'] = read_comment[0] 38 | info['comment_sum'] = read_comment[1] 39 | info['name'] = name_search.search(html).group(1) 40 | info['star_sum'] = star_search.search(html).group(1) 41 | info['release_time'] = release_time_search.search(html).group(1) 42 | info['item_type'] = 'article_item' 43 | info['url'] = 'http://www.mafengwo.cn/i/%s.html'%(response.request.meta['id']) 44 | mongo.insert_task(info) 45 | 46 | #解析游记 47 | def handle_detail(self,response): 48 | id_search = re.compile(r"window.Env\s=\s(.*);") 49 | seq_search = re.compile(r'data-seq="(\d+)"') 50 | try: 51 | id_result = json.loads(id_search.search(response.text).group(1)) 52 | except: 53 | return 54 | id = id_result['iid'] 55 | iid = id_result.get('new_iid') 56 | #存在下一页 57 | if iid: 58 | print(response.url+"存在多页") 59 | response.request.meta['id'] = id 60 | response.request.meta['iid'] = iid 61 | #文章标题 62 | response.request.meta['title'] = response.xpath("//title/text()").extract_first() 63 | #文章内容 64 | response.request.meta['content'] = response.xpath("//div[@class='_j_content_box']").extract() 65 | #请求URL 66 | response.request.meta['from_url'] = response.url 67 | #请求下一页所使用的ID 68 | next_request_seq = seq_search.findall(response.text)[-1] 69 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (id, iid, next_request_seq) 70 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 71 | # 不存在下一页 72 | else: 73 | #处理游记 74 | m3u8_search = re.compile(r'data-url="(.*\.m3u8)"') 75 | mafengwo_data = MafengwoItem() 76 | mafengwo_data['title'] = response.xpath("//title/text()").extract_first() 77 | mafengwo_data['from_url'] = response.request.meta['from_url'] 78 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 79 | 
mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 80 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 81 | # mafengwo_data['support_sum'] = response.request.meta['support_sum'] 82 | mafengwo_data['release_time'] = response.request.meta['release_time'] 83 | mafengwo_data['name'] = response.request.meta['name'] 84 | mafengwo_data['id'] = id 85 | mafengwo_data['content'] = self.handle_img_src(''.join(response.xpath("//div[@id='pnl_contentinfo']").extract_first())) 86 | photo_url_search = re.compile(r'data-src="(.*?)\?') 87 | mafengwo_data['video_urls'] = m3u8_search.findall(mafengwo_data['content']) 88 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 89 | mafengwo_data['upload_status'] = 0 90 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 91 | yield mafengwo_data 92 | 93 | def handle_detail_json(self,response): 94 | m3u8_search = re.compile(r'data-url="(.*\.m3u8)"') 95 | seq_search = re.compile(r'data-seq="(\d+)"') 96 | html_text = json.loads(response.text)['data'] 97 | if html_text['html'] == "": 98 | mafengwo_data = MafengwoItem() 99 | mafengwo_data['title'] = response.request.meta['title'] 100 | mafengwo_data['from_url'] = response.request.meta['from_url'] 101 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 102 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 103 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 104 | # mafengwo_data['support_sum'] = response.request.meta['support_sum'] 105 | mafengwo_data['release_time'] = response.request.meta['release_time'] 106 | mafengwo_data['name'] = response.request.meta['name'] 107 | mafengwo_data['id'] = response.request.meta['id'] 108 | mafengwo_data['content'] = self.handle_img_src(''.join(response.request.meta['content'])) 109 | mafengwo_data['upload_status'] = 0 110 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 111 | photo_url_search = re.compile(r'data-src="(.*?)\?') 112 | mafengwo_data['video_urls'] = m3u8_search.findall(mafengwo_data['content']) 113 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 114 | yield mafengwo_data 115 | else: 116 | html = html_text['html'] 117 | response.request.meta['content'].append(html) 118 | next_request_seq = seq_search.findall(html)[-1] 119 | if next_request_seq: 120 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], response.request.meta['iid'], next_request_seq) 121 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 122 | 123 | #处理游记中的图片URL 124 | def handle_img_src(self, text): 125 | img_search = re.compile(r"|") 126 | img_data_src_search = re.compile(r'data-src="(.*?)\?') 127 | src_search = re.compile(r'[^-]src="(.*?)"') 128 | img_list = img_search.findall(text) 129 | for img in img_list: 130 | try: 131 | img_data_src = img_data_src_search.search(img).group(1) 132 | src = src_search.search(img).group(1) 133 | img_new = img.replace(src, img_data_src) 134 | text = text.replace(img, img_new) 135 | except: 136 | pass 137 | return text 138 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/spiders/crawl_mafengwo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import re 4 | import scrapy 5 | 
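# Note: this spider fills item['image_urls'], and the mafengwo_images/full/ directory in
# the repository indicates the pictures are downloaded by Scrapy's ImagesPipeline. The
# project's settings.py is not reproduced here, so the following is only a sketch of the
# settings such a pipeline would typically need, not this project's actual configuration:
#
#     ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
#     IMAGES_STORE = "mafengwo_images"
#     IMAGES_URLS_FIELD = "image_urls"   # default field name read by ImagesPipeline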
from scrapy import Selector 6 | from ..items import MafengwoItem 7 | import time 8 | 9 | 10 | class CrawlMafengwoSpider(scrapy.Spider): 11 | name = 'crawl_mafengwo' 12 | allowed_domains = ['mafengwo.cn'] 13 | # start_urls = ['http://www.mafengwo.cn/u/wenhao/note.html'] 14 | 15 | #请求首页 16 | def start_requests(self): 17 | #直接构造请求页码URL,如请求200页,热门游记 18 | for page in range(1,200): 19 | url = 'http://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":0,"objid":0,"page":%s,"ajax":1,"retina":0}'%page 20 | yield scrapy.Request(url=url,callback=self.handle_page,dont_filter=True) 21 | 22 | #解析有多少篇游记,构造游记阅读量等信息URL并请求 23 | def handle_page(self, response): 24 | #获取页码页返回中的文章ID 25 | article_id_search = re.compile(r'(.*?)') 37 | name_search = re.compile(r'class="per_name"\stitle="(.*?)">') 38 | star_search = re.compile(r'(\d+)收藏') 39 | release_time_search = re.compile(r'(.*?)') 40 | html = json.loads(response.text)['data']['html'] 41 | info = {} 42 | read_comment = read_comment_search.search(html).group(1).split('/') 43 | info['read_sum'] = read_comment[0] 44 | info['comment_sum'] = read_comment[1] 45 | info['name'] = name_search.search(html).group(1) 46 | info['star_sum'] = star_search.search(html).group(1) 47 | info['release_time'] = release_time_search.search(html).group(1) 48 | info['id'] = response.request.meta['article_id'] 49 | info['url'] = 'http://www.mafengwo.cn/i/%s.html' % (response.request.meta['article_id']) 50 | print(info) 51 | yield scrapy.Request(url=info['url'],callback=self.handle_detail,meta=info,dont_filter=True) 52 | 53 | # 解析游记 54 | def handle_detail(self, response): 55 | id_search = re.compile(r"window.Env\s=\s(.*);") 56 | seq_search = re.compile(r'data-seq="(\d+)"') 57 | try: 58 | id_result = json.loads(id_search.search(response.text).group(1)) 59 | except: 60 | return 61 | #获取是否存在下一页标志 62 | iid = id_result.get('new_iid') 63 | # 存在下一页 64 | if iid: 65 | print(response.url + "存在多页") 66 | response.request.meta['iid'] = iid 67 | # 文章标题 68 | response.request.meta['title'] = response.xpath("//title/text()").extract_first() 69 | # 文章内容 70 | response.request.meta['content'] = response.xpath("//div[@class='_j_content_box']").extract() 71 | # 请求URL 72 | response.request.meta['from_url'] = response.url 73 | # 请求下一页所使用的ID 74 | next_request_seq = seq_search.findall(response.text)[-1] 75 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], iid, next_request_seq) 76 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 77 | # 不存在下一页 78 | else: 79 | # 处理游记 80 | mafengwo_data = MafengwoItem() 81 | mafengwo_data['title'] = response.xpath("//title/text()").extract_first() 82 | mafengwo_data['from_url'] = response.request.meta['from_url'] 83 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 84 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 85 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 86 | mafengwo_data['release_time'] = response.request.meta['release_time'] 87 | mafengwo_data['name'] = response.request.meta['name'] 88 | mafengwo_data['id'] = response.request.meta['id'] 89 | mafengwo_data['content'] = self.handle_img_src(''.join(response.xpath("//div[@id='pnl_contentinfo']").extract_first())) 90 | #获取文章中所有图片URL 91 | photo_url_search = re.compile(r'data-src="(.*?)\?') 92 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 93 | 
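# image_urls above is pulled from the lazy-load data-src attributes; because the regex stops
# at the first "?", any resize/quality query string is dropped and only the bare image URLs
# are kept on the item.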
mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 94 | yield mafengwo_data 95 | 96 | def handle_detail_json(self, response): 97 | seq_search = re.compile(r'data-seq="(\d+)"') 98 | html_text = json.loads(response.text)['data'] 99 | #请求到末页 100 | if html_text['html'] == "": 101 | mafengwo_data = MafengwoItem() 102 | mafengwo_data['title'] = response.request.meta['title'] 103 | mafengwo_data['from_url'] = response.request.meta['from_url'] 104 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 105 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 106 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 107 | mafengwo_data['release_time'] = response.request.meta['release_time'] 108 | mafengwo_data['name'] = response.request.meta['name'] 109 | mafengwo_data['id'] = response.request.meta['id'] 110 | mafengwo_data['content'] = self.handle_img_src(''.join(response.request.meta['content'])) 111 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 112 | photo_url_search = re.compile(r'data-src="(.*?)\?') 113 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 114 | yield mafengwo_data 115 | #继续请求下一页 116 | else: 117 | html = html_text['html'] 118 | response.request.meta['content'].append(html) 119 | next_request_seq = seq_search.findall(html)[-1] 120 | if next_request_seq: 121 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], response.request.meta['iid'], next_request_seq) 122 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 123 | 124 | # 处理游记中的图片URL 125 | def handle_img_src(self, text): 126 | img_search = re.compile(r"|") 127 | img_data_src_search = re.compile(r'data-src="(.*?)\?') 128 | src_search = re.compile(r'[^-]src="(.*?)"') 129 | img_list = img_search.findall(text) 130 | for img in img_list: 131 | try: 132 | img_data_src = img_data_src_search.search(img).group(1) 133 | src = src_search.search(img).group(1) 134 | img_new = img.replace(src, img_data_src) 135 | text = text.replace(img, img_new) 136 | except: 137 | pass 138 | return text 139 | 140 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/url_list.txt: -------------------------------------------------------------------------------- 1 | http://www.mafengwo.cn/u/wenhao/note.html 2 | http://www.mafengwo.cn/u/5295777/note.html 3 | http://www.mafengwo.cn/u/85713126/note.html 4 | http://www.mafengwo.cn/u/18015577/note.html 5 | http://www.mafengwo.cn/u/60798801/note.html 6 | http://www.mafengwo.cn/u/yiyinotes/note.html 7 | https://www.mafengwo.cn/u/88358953/note.html 8 | https://www.mafengwo.cn/u/daxigua/note.html 9 | https://www.mafengwo.cn/u/47448074/note.html 10 | https://www.mafengwo.cn/u/36909470/note.html 11 | https://www.mafengwo.cn/u/76823294/note.html 12 | https://www.mafengwo.cn/u/32216322/note.html 13 | https://www.mafengwo.cn/u/10704640/note.html 14 | https://www.mafengwo.cn/u/dearsummar/note.html 15 | https://www.mafengwo.cn/u/19894572/note.html 16 | https://www.mafengwo.cn/u/321294/note.html 17 | https://www.mafengwo.cn/u/5172228/note.html 18 | https://www.mafengwo.cn/u/5017124/note.html 19 | https://www.mafengwo.cn/u/hwf520/note.html 20 | https://www.mafengwo.cn/u/kido37/note.html 21 | https://www.mafengwo.cn/u/41037525/note.html 22 | https://www.mafengwo.cn/u/joyii0513/note.html 
23 | https://www.mafengwo.cn/u/69709753/note.html 24 | https://www.mafengwo.cn/u/wayzhenyan/note.html 25 | https://www.mafengwo.cn/u/78343168/note.html 26 | https://www.mafengwo.cn/u/46337998/note.html 27 | https://www.mafengwo.cn/u/sellnuan/note.html 28 | https://www.mafengwo.cn/u/846867/note.html 29 | https://www.mafengwo.cn/u/54041143/note.html 30 | https://www.mafengwo.cn/u/17074212/note.html 31 | https://www.mafengwo.cn/u/5602249/note.html 32 | https://www.mafengwo.cn/u/45793678/note.html 33 | https://www.mafengwo.cn/u/42370376/note.html 34 | https://www.mafengwo.cn/u/81676700/note.html 35 | https://www.mafengwo.cn/u/78838404/note.html 36 | https://www.mafengwo.cn/u/5663320/note.html 37 | https://www.mafengwo.cn/u/56213436/note.html 38 | https://www.mafengwo.cn/u/68691572/note.html 39 | https://www.mafengwo.cn/u/67165115/note.html 40 | https://www.mafengwo.cn/u/45907046/note.html 41 | https://www.mafengwo.cn/u/samwong/note.html 42 | https://www.mafengwo.cn/u/48737554/note.html 43 | https://www.mafengwo.cn/u/5366541/note.html 44 | https://www.mafengwo.cn/u/1047345/note.html 45 | https://www.mafengwo.cn/u/73297474/note.html 46 | https://www.mafengwo.cn/u/64898562/note.html 47 | https://www.mafengwo.cn/u/ariel690/note.html 48 | https://www.mafengwo.cn/u/5133407/note.html 49 | https://www.mafengwo.cn/u/63932781/note.html 50 | https://www.mafengwo.cn/u/49231278/note.html 51 | https://www.mafengwo.cn/u/69833564/note.html 52 | https://www.mafengwo.cn/u/52482820/note.html 53 | https://www.mafengwo.cn/u/374140/note.html 54 | https://www.mafengwo.cn/u/5363625/note.html 55 | https://www.mafengwo.cn/u/64582645/note.html 56 | https://www.mafengwo.cn/u/32228262/note.html 57 | https://www.mafengwo.cn/u/68295140/note.html 58 | https://www.mafengwo.cn/u/93296829/note.html 59 | https://www.mafengwo.cn/u/biggun/note.html 60 | https://www.mafengwo.cn/u/57892379/note.html 61 | https://www.mafengwo.cn/u/76823294.html 62 | https://www.mafengwo.cn/u/pinkyvision/note.html 63 | https://www.mafengwo.cn/u/69536526/note.html 64 | https://www.mafengwo.cn/u/37311913/note.html 65 | https://www.mafengwo.cn/u/10345585/note.html 66 | https://www.mafengwo.cn/u/37369363/note.html 67 | https://www.mafengwo.cn/u/inlaoban5/note.html 68 | https://www.mafengwo.cn/u/75471465/note.html 69 | https://www.mafengwo.cn/u/40682663/note.html 70 | https://www.mafengwo.cn/u/799727/note.html 71 | https://www.mafengwo.cn/u/19560416/note.html 72 | https://www.mafengwo.cn/u/summer7/note.html 73 | https://www.mafengwo.cn/u/zhenmeiqu/note.html 74 | https://www.mafengwo.cn/u/93808795/note.html 75 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 76 | https://www.mafengwo.cn/u/59633694/note.html 77 | https://www.mafengwo.cn/u/5172228/note.html 78 | https://www.mafengwo.cn/u/79862907/note.html 79 | https://www.mafengwo.cn/u/5119335/note.html 80 | https://www.mafengwo.cn/u/iiibiz/note.html 81 | https://www.mafengwo.cn/u/92990277/note.html 82 | https://www.mafengwo.cn/u/83736375.html 83 | https://www.mafengwo.cn/u/66016397/note.html 84 | https://www.mafengwo.cn/u/75334068/note.html 85 | https://www.mafengwo.cn/u/10606831/note.html 86 | https://www.mafengwo.cn/u/73953374/note.html 87 | https://www.mafengwo.cn/u/5328159/note.html 88 | https://www.mafengwo.cn/u/72226812/note.html 89 | https://www.mafengwo.cn/u/75867238/note.html 90 | https://www.mafengwo.cn/u/ruogu2/note.html 91 | https://www.mafengwo.cn/u/459268/note.html 92 | https://www.mafengwo.cn/u/5037685/note.html 93 | https://www.mafengwo.cn/u/32358313/note.html 94 | 
https://www.mafengwo.cn/u/ymy817/note.html 95 | https://www.mafengwo.cn/u/44131359/note.html 96 | https://www.mafengwo.cn/u/flyingwsh/note.html 97 | https://www.mafengwo.cn/u/36953718/note.html 98 | https://www.mafengwo.cn/u/830821/note.html 99 | https://www.mafengwo.cn/u/72465054/note.html 100 | https://www.mafengwo.cn/u/816643/note.html 101 | https://www.mafengwo.cn/u/5547423/note.html 102 | https://www.mafengwo.cn/u/85055587/note.html 103 | https://www.mafengwo.cn/u/77259555/note.html 104 | https://www.mafengwo.cn/u/58085128/note.html 105 | https://www.mafengwo.cn/u/85782763/note.html 106 | https://www.mafengwo.cn/u/448785/note.html 107 | https://www.mafengwo.cn/u/shanfeng/note.html 108 | https://www.mafengwo.cn/u/30730200/note.html 109 | https://www.mafengwo.cn/u/82532600/note.html 110 | https://www.mafengwo.cn/u/sellnuan/note.html 111 | https://www.mafengwo.cn/u/85205385/note.html 112 | https://www.mafengwo.cn/u/40525484/note.html 113 | https://www.mafengwo.cn/u/92931036/note.html 114 | https://www.mafengwo.cn/u/60022265/note.html 115 | https://www.mafengwo.cn/u/45066857.html 116 | https://www.mafengwo.cn/u/34957278/note.html 117 | https://www.mafengwo.cn/u/90472994/note.html 118 | https://www.mafengwo.cn/u/5295777/note.html 119 | https://www.mafengwo.cn/u/86494331/note.html 120 | https://www.mafengwo.cn/u/42395202.html 121 | https://www.mafengwo.cn/u/heididsy/note.html 122 | https://www.mafengwo.cn/u/42694746/note.html 123 | https://www.mafengwo.cn/u/yimeng/note.html 124 | https://www.mafengwo.cn/u/5172228/note.html 125 | https://www.mafengwo.cn/u/17639643.html 126 | https://www.mafengwo.cn/u/wuweixiang/note.html 127 | https://www.mafengwo.cn/u/92931036/note.html 128 | https://www.mafengwo.cn/u/49231278/note.html 129 | https://www.mafengwo.cn/u/5481686.html 130 | https://www.mafengwo.cn/u/19014378/note.html 131 | https://www.mafengwo.cn/u/seacen/note.html 132 | https://www.mafengwo.cn/u/beslan/note.html 133 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 134 | https://www.mafengwo.cn/u/187367/note.html 135 | https://www.mafengwo.cn/u/32216322/note.html 136 | https://www.mafengwo.cn/u/93157709/note.html 137 | https://www.mafengwo.cn/u/13105932/note.html 138 | https://www.mafengwo.cn/u/86494331/note.html 139 | https://www.mafengwo.cn/u/10911951.html 140 | https://www.mafengwo.cn/u/77243222/note.html 141 | https://www.mafengwo.cn/u/yolichic/note.html 142 | https://www.mafengwo.cn/u/88371807/note.html 143 | https://www.mafengwo.cn/u/jklouise/note.html 144 | https://www.mafengwo.cn/u/85558645/note.html 145 | https://www.mafengwo.cn/u/69200064/note.html 146 | https://www.mafengwo.cn/u/88358953/note.html 147 | https://www.mafengwo.cn/u/54534899/note.html 148 | https://www.mafengwo.cn/u/kido37/note.html 149 | https://www.mafengwo.cn/u/ruogu2/note.html 150 | https://www.mafengwo.cn/u/32228262/note.html 151 | https://www.mafengwo.cn/u/208077/note.html 152 | https://www.mafengwo.cn/u/xmulazio/note.html 153 | https://www.mafengwo.cn/u/74369556/note.html 154 | https://www.mafengwo.cn/u/5028192/note.html 155 | https://www.mafengwo.cn/u/ptah0622/note.html 156 | https://www.mafengwo.cn/u/5203896/note.html 157 | https://www.mafengwo.cn/u/35296229/note.html 158 | https://www.mafengwo.cn/u/69709753/note.html 159 | https://www.mafengwo.cn/u/71897854/note.html 160 | https://www.mafengwo.cn/u/73941769/note.html 161 | https://www.mafengwo.cn/u/79167497/note.html 162 | https://www.mafengwo.cn/u/5648583/note.html 163 | https://www.mafengwo.cn/u/840399/note.html 164 | 
https://www.mafengwo.cn/u/34260694/note.html 165 | https://www.mafengwo.cn/u/89214773/note.html 166 | https://www.mafengwo.cn/u/47448074/note.html 167 | https://www.mafengwo.cn/u/90344916/note.html 168 | https://www.mafengwo.cn/u/5673085/note.html 169 | https://www.mafengwo.cn/u/fantasist/note.html 170 | https://www.mafengwo.cn/u/gemmakyoto/note.html 171 | https://www.mafengwo.cn/u/kidd1110/note.html 172 | https://www.mafengwo.cn/u/459539/note.html 173 | https://www.mafengwo.cn/u/clijsters/note.html 174 | https://www.mafengwo.cn/u/53816690/note.html 175 | https://www.mafengwo.cn/u/85224198/note.html 176 | https://www.mafengwo.cn/u/1115956/note.html 177 | https://www.mafengwo.cn/u/kevlee/note.html 178 | https://www.mafengwo.cn/u/sarahontheroad.html 179 | https://www.mafengwo.cn/u/10525543/note.html 180 | https://www.mafengwo.cn/u/374140/note.html 181 | https://www.mafengwo.cn/u/19268018/note.html 182 | https://www.mafengwo.cn/u/70816697/note.html 183 | https://www.mafengwo.cn/u/102065/note.html 184 | https://www.mafengwo.cn/u/yolichic/note.html 185 | https://www.mafengwo.cn/u/49130101/note.html 186 | https://www.mafengwo.cn/u/49221414/note.html 187 | https://www.mafengwo.cn/u/sicilia/note.html 188 | https://www.mafengwo.cn/u/zhangxiaofan/note.html 189 | https://www.mafengwo.cn/u/fantastic/note.html 190 | https://www.mafengwo.cn/u/193656/note.html 191 | https://www.mafengwo.cn/u/after17/note.html 192 | https://www.mafengwo.cn/u/guaiiiii/note.html 193 | https://www.mafengwo.cn/u/tianpinan/note.html 194 | https://www.mafengwo.cn/u/52233524/note.html 195 | https://www.mafengwo.cn/u/75151343/note.html 196 | https://www.mafengwo.cn/u/88358953/note.html 197 | https://www.mafengwo.cn/u/83796483/note.html 198 | https://www.mafengwo.cn/u/79297765/note.html 199 | https://www.mafengwo.cn/u/72512443/note.html 200 | https://www.mafengwo.cn/u/niuniu/note.html -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/url_list.txt: -------------------------------------------------------------------------------- 1 | http://www.mafengwo.cn/u/wenhao/note.html 2 | http://www.mafengwo.cn/u/5295777/note.html 3 | http://www.mafengwo.cn/u/85713126/note.html 4 | http://www.mafengwo.cn/u/18015577/note.html 5 | http://www.mafengwo.cn/u/60798801/note.html 6 | http://www.mafengwo.cn/u/yiyinotes/note.html 7 | https://www.mafengwo.cn/u/88358953/note.html 8 | https://www.mafengwo.cn/u/daxigua/note.html 9 | https://www.mafengwo.cn/u/47448074/note.html 10 | https://www.mafengwo.cn/u/36909470/note.html 11 | https://www.mafengwo.cn/u/76823294/note.html 12 | https://www.mafengwo.cn/u/32216322/note.html 13 | https://www.mafengwo.cn/u/10704640/note.html 14 | https://www.mafengwo.cn/u/dearsummar/note.html 15 | https://www.mafengwo.cn/u/19894572/note.html 16 | https://www.mafengwo.cn/u/321294/note.html 17 | https://www.mafengwo.cn/u/5172228/note.html 18 | https://www.mafengwo.cn/u/5017124/note.html 19 | https://www.mafengwo.cn/u/hwf520/note.html 20 | https://www.mafengwo.cn/u/kido37/note.html 21 | https://www.mafengwo.cn/u/41037525/note.html 22 | https://www.mafengwo.cn/u/joyii0513/note.html 23 | https://www.mafengwo.cn/u/69709753/note.html 24 | https://www.mafengwo.cn/u/wayzhenyan/note.html 25 | https://www.mafengwo.cn/u/78343168/note.html 26 | https://www.mafengwo.cn/u/46337998/note.html 27 | https://www.mafengwo.cn/u/sellnuan/note.html 28 | https://www.mafengwo.cn/u/846867/note.html 29 | https://www.mafengwo.cn/u/54041143/note.html 30 | 
https://www.mafengwo.cn/u/17074212/note.html 31 | https://www.mafengwo.cn/u/5602249/note.html 32 | https://www.mafengwo.cn/u/45793678/note.html 33 | https://www.mafengwo.cn/u/42370376/note.html 34 | https://www.mafengwo.cn/u/81676700/note.html 35 | https://www.mafengwo.cn/u/78838404/note.html 36 | https://www.mafengwo.cn/u/5663320/note.html 37 | https://www.mafengwo.cn/u/56213436/note.html 38 | https://www.mafengwo.cn/u/68691572/note.html 39 | https://www.mafengwo.cn/u/67165115/note.html 40 | https://www.mafengwo.cn/u/45907046/note.html 41 | https://www.mafengwo.cn/u/samwong/note.html 42 | https://www.mafengwo.cn/u/48737554/note.html 43 | https://www.mafengwo.cn/u/5366541/note.html 44 | https://www.mafengwo.cn/u/1047345/note.html 45 | https://www.mafengwo.cn/u/73297474/note.html 46 | https://www.mafengwo.cn/u/64898562/note.html 47 | https://www.mafengwo.cn/u/ariel690/note.html 48 | https://www.mafengwo.cn/u/5133407/note.html 49 | https://www.mafengwo.cn/u/63932781/note.html 50 | https://www.mafengwo.cn/u/49231278/note.html 51 | https://www.mafengwo.cn/u/69833564/note.html 52 | https://www.mafengwo.cn/u/52482820/note.html 53 | https://www.mafengwo.cn/u/374140/note.html 54 | https://www.mafengwo.cn/u/5363625/note.html 55 | https://www.mafengwo.cn/u/64582645/note.html 56 | https://www.mafengwo.cn/u/32228262/note.html 57 | https://www.mafengwo.cn/u/68295140/note.html 58 | https://www.mafengwo.cn/u/93296829/note.html 59 | https://www.mafengwo.cn/u/biggun/note.html 60 | https://www.mafengwo.cn/u/57892379/note.html 61 | https://www.mafengwo.cn/u/76823294/note.html 62 | https://www.mafengwo.cn/u/pinkyvision/note.html 63 | https://www.mafengwo.cn/u/69536526/note.html 64 | https://www.mafengwo.cn/u/37311913/note.html 65 | https://www.mafengwo.cn/u/10345585/note.html 66 | https://www.mafengwo.cn/u/37369363/note.html 67 | https://www.mafengwo.cn/u/inlaoban5/note.html 68 | https://www.mafengwo.cn/u/75471465/note.html 69 | https://www.mafengwo.cn/u/40682663/note.html 70 | https://www.mafengwo.cn/u/799727/note.html 71 | https://www.mafengwo.cn/u/19560416/note.html 72 | https://www.mafengwo.cn/u/summer7/note.html 73 | https://www.mafengwo.cn/u/zhenmeiqu/note.html 74 | https://www.mafengwo.cn/u/93808795/note.html 75 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 76 | https://www.mafengwo.cn/u/59633694/note.html 77 | https://www.mafengwo.cn/u/5172228/note.html 78 | https://www.mafengwo.cn/u/79862907/note.html 79 | https://www.mafengwo.cn/u/5119335/note.html 80 | https://www.mafengwo.cn/u/iiibiz/note.html 81 | https://www.mafengwo.cn/u/92990277/note.html 82 | https://www.mafengwo.cn/u/83736375/note.html 83 | https://www.mafengwo.cn/u/66016397/note.html 84 | https://www.mafengwo.cn/u/75334068/note.html 85 | https://www.mafengwo.cn/u/10606831/note.html 86 | https://www.mafengwo.cn/u/73953374/note.html 87 | https://www.mafengwo.cn/u/5328159/note.html 88 | https://www.mafengwo.cn/u/72226812/note.html 89 | https://www.mafengwo.cn/u/75867238/note.html 90 | https://www.mafengwo.cn/u/ruogu2/note.html 91 | https://www.mafengwo.cn/u/459268/note.html 92 | https://www.mafengwo.cn/u/5037685/note.html 93 | https://www.mafengwo.cn/u/32358313/note.html 94 | https://www.mafengwo.cn/u/ymy817/note.html 95 | https://www.mafengwo.cn/u/44131359/note.html 96 | https://www.mafengwo.cn/u/flyingwsh/note.html 97 | https://www.mafengwo.cn/u/36953718/note.html 98 | https://www.mafengwo.cn/u/830821/note.html 99 | https://www.mafengwo.cn/u/72465054/note.html 100 | https://www.mafengwo.cn/u/816643/note.html 101 | 
https://www.mafengwo.cn/u/5547423/note.html 102 | https://www.mafengwo.cn/u/85055587/note.html 103 | https://www.mafengwo.cn/u/77259555/note.html 104 | https://www.mafengwo.cn/u/58085128/note.html 105 | https://www.mafengwo.cn/u/85782763/note.html 106 | https://www.mafengwo.cn/u/448785/note.html 107 | https://www.mafengwo.cn/u/shanfeng/note.html 108 | https://www.mafengwo.cn/u/30730200/note.html 109 | https://www.mafengwo.cn/u/82532600/note.html 110 | https://www.mafengwo.cn/u/sellnuan/note.html 111 | https://www.mafengwo.cn/u/85205385/note.html 112 | https://www.mafengwo.cn/u/40525484/note.html 113 | https://www.mafengwo.cn/u/92931036/note.html 114 | https://www.mafengwo.cn/u/60022265/note.html 115 | https://www.mafengwo.cn/u/45066857/note.html 116 | https://www.mafengwo.cn/u/34957278/note.html 117 | https://www.mafengwo.cn/u/90472994/note.html 118 | https://www.mafengwo.cn/u/5295777/note.html 119 | https://www.mafengwo.cn/u/86494331/note.html 120 | https://www.mafengwo.cn/u/42395202/note.html 121 | https://www.mafengwo.cn/u/heididsy/note.html 122 | https://www.mafengwo.cn/u/42694746/note.html 123 | https://www.mafengwo.cn/u/yimeng/note.html 124 | https://www.mafengwo.cn/u/5172228/note.html 125 | https://www.mafengwo.cn/u/17639643/note.html 126 | https://www.mafengwo.cn/u/wuweixiang/note.html 127 | https://www.mafengwo.cn/u/92931036/note.html 128 | https://www.mafengwo.cn/u/49231278/note.html 129 | https://www.mafengwo.cn/u/5481686/note.html 130 | https://www.mafengwo.cn/u/19014378/note.html 131 | https://www.mafengwo.cn/u/seacen/note.html 132 | https://www.mafengwo.cn/u/beslan/note.html 133 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 134 | https://www.mafengwo.cn/u/187367/note.html 135 | https://www.mafengwo.cn/u/32216322/note.html 136 | https://www.mafengwo.cn/u/93157709/note.html 137 | https://www.mafengwo.cn/u/13105932/note.html 138 | https://www.mafengwo.cn/u/86494331/note.html 139 | https://www.mafengwo.cn/u/10911951/note.html 140 | https://www.mafengwo.cn/u/77243222/note.html 141 | https://www.mafengwo.cn/u/yolichic/note.html 142 | https://www.mafengwo.cn/u/88371807/note.html 143 | https://www.mafengwo.cn/u/jklouise/note.html 144 | https://www.mafengwo.cn/u/85558645/note.html 145 | https://www.mafengwo.cn/u/69200064/note.html 146 | https://www.mafengwo.cn/u/88358953/note.html 147 | https://www.mafengwo.cn/u/54534899/note.html 148 | https://www.mafengwo.cn/u/kido37/note.html 149 | https://www.mafengwo.cn/u/ruogu2/note.html 150 | https://www.mafengwo.cn/u/32228262/note.html 151 | https://www.mafengwo.cn/u/208077/note.html 152 | https://www.mafengwo.cn/u/xmulazio/note.html 153 | https://www.mafengwo.cn/u/74369556/note.html 154 | https://www.mafengwo.cn/u/5028192/note.html 155 | https://www.mafengwo.cn/u/ptah0622/note.html 156 | https://www.mafengwo.cn/u/5203896/note.html 157 | https://www.mafengwo.cn/u/35296229/note.html 158 | https://www.mafengwo.cn/u/69709753/note.html 159 | https://www.mafengwo.cn/u/71897854/note.html 160 | https://www.mafengwo.cn/u/73941769/note.html 161 | https://www.mafengwo.cn/u/79167497/note.html 162 | https://www.mafengwo.cn/u/5648583/note.html 163 | https://www.mafengwo.cn/u/840399/note.html 164 | https://www.mafengwo.cn/u/34260694/note.html 165 | https://www.mafengwo.cn/u/89214773/note.html 166 | https://www.mafengwo.cn/u/47448074/note.html 167 | https://www.mafengwo.cn/u/90344916/note.html 168 | https://www.mafengwo.cn/u/5673085/note.html 169 | https://www.mafengwo.cn/u/fantasist/note.html 170 | https://www.mafengwo.cn/u/gemmakyoto/note.html 171 | 
https://www.mafengwo.cn/u/kidd1110/note.html 172 | https://www.mafengwo.cn/u/459539/note.html 173 | https://www.mafengwo.cn/u/clijsters/note.html 174 | https://www.mafengwo.cn/u/53816690/note.html 175 | https://www.mafengwo.cn/u/85224198/note.html 176 | https://www.mafengwo.cn/u/1115956/note.html 177 | https://www.mafengwo.cn/u/kevlee/note.html 178 | https://www.mafengwo.cn/u/sarahontheroad/note.html 179 | https://www.mafengwo.cn/u/10525543/note.html 180 | https://www.mafengwo.cn/u/374140/note.html 181 | https://www.mafengwo.cn/u/19268018/note.html 182 | https://www.mafengwo.cn/u/70816697/note.html 183 | https://www.mafengwo.cn/u/102065/note.html 184 | https://www.mafengwo.cn/u/yolichic/note.html 185 | https://www.mafengwo.cn/u/49130101/note.html 186 | https://www.mafengwo.cn/u/49221414/note.html 187 | https://www.mafengwo.cn/u/sicilia/note.html 188 | https://www.mafengwo.cn/u/zhangxiaofan/note.html 189 | https://www.mafengwo.cn/u/fantastic/note.html 190 | https://www.mafengwo.cn/u/193656/note.html 191 | https://www.mafengwo.cn/u/after17/note.html 192 | https://www.mafengwo.cn/u/guaiiiii/note.html 193 | https://www.mafengwo.cn/u/tianpinan/note.html 194 | https://www.mafengwo.cn/u/52233524/note.html 195 | https://www.mafengwo.cn/u/75151343/note.html 196 | https://www.mafengwo.cn/u/88358953/note.html 197 | https://www.mafengwo.cn/u/83796483/note.html 198 | https://www.mafengwo.cn/u/79297765/note.html 199 | https://www.mafengwo.cn/u/72512443/note.html 200 | https://www.mafengwo.cn/u/niuniu/note.html 201 | --------------------------------------------------------------------------------