├── kolesa
│   ├── __init__.py
│   ├── README.md
│   ├── handle_mongo.py
│   └── crawl_kolesa.py
├── boss_zhipin
│   ├── __init__.py
│   ├── README.md
│   └── crawl_boss_zhipin.py
├── dasouche
│   ├── __init__.py
│   ├── README.md
│   └── handle_dasouche.py
├── synchronous
│   ├── __init__.py
│   ├── sample
│   │   ├── __init__.py
│   │   ├── multiprocess_pool.py
│   │   ├── thread_test1.py
│   │   ├── multiprocess_test3.py
│   │   ├── multiprocess_test2.py
│   │   ├── process_not_share.py
│   │   ├── multiprocess_test1.py
│   │   ├── multiprocess_class.py
│   │   └── multiprocess_share.py
│   ├── test1.py
│   ├── handle_queue.py
│   ├── handle_redis.py
│   ├── handle_request.py
│   ├── handle_spider.py
│   └── spider_multiprocess.py
├── login_github
│   ├── __init__.py
│   ├── README.md
│   └── handle_login.py
├── dongqiudi
│   ├── dongqiudi
│   │   ├── __init__.py
│   │   ├── main.py
│   │   ├── dongqiudi_pic
│   │   │   ├── 西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑
│   │   │   │   ├── 7
│   │   │   │   ├── ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg
│   │   │   │   ├── ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg
│   │   │   │   ├── ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif
│   │   │   │   ├── ChNLklztqZOAFKI4AANLRhtfxnE659.jpg
│   │   │   │   ├── ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg
│   │   │   │   ├── ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg
│   │   │   │   └── ChNLklztvpmAGjgfAAFd4X3svKc014.jpg
│   │   │   └── C罗与法拉利车手勒克莱尔同场较劲!
│   │   │       ├── ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg
│   │   │       ├── ChO2w1zuISOAF511AAFaT82ScyE114.jpg
│   │   │       └── ChONolzuIOiASesEAAEgiz_2cMw359.jpg
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── crawl_dongqiudi.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   └── settings.py
│   ├── scrapy.cfg
│   └── README.md
├── douban_movie_top250
│   ├── __init__.py
│   ├── README.md
│   ├── handle_mongo.py
│   └── crawl_douban_movie_info_top250.py
├── mafengwo
│   ├── mafengwo
│   │   ├── __init__.py
│   │   ├── main.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── crawl_mafengwo.py
│   │   ├── mafengwo_images
│   │   │   └── full
│   │   │       └── 0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── middlewares.py
│   │   ├── settings.py
│   │   └── url_list.txt
│   └── scrapy.cfg
├── douban_movie_top250_scrapy
│   ├── douban
│   │   ├── __init__.py
│   │   ├── main.py
│   │   ├── douban.json
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── douban_spider.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── middlewares.py
│   │   └── settings.py
│   ├── README.md
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── douban.iml
│   │   ├── deployment.xml
│   │   └── workspace.xml
│   └── scrapy.cfg
├── mafengwo_article_spider
│   ├── mafengwo
│   │   ├── __init__.py
│   │   ├── main.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   └── crawl_mafengwo.py
│   │   ├── js
│   │   │   ├── README.md
│   │   │   ├── handle_sn.py
│   │   │   └── tool_decode_index.js
│   │   ├── middlewares.py
│   │   ├── handle_mongo.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   ├── handle_task.py
│   │   └── url_list.txt
│   ├── README.md
│   ├── .idea
│   │   ├── encodings.xml
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   └── mafengwo.iml
│   └── scrapy.cfg
├── video
│   ├── README.md
│   └── lishipin
│       └── crawl_lishipin.py
├── baidu_m_keyword_ranks
│   ├── README.md
│   ├── setting.py
│   ├── handle_mysql.py
│   └── baidu_m_keyword.py
├── lagou
│   ├── README.md
│   ├── handle_mongo.py
│   ├── crawl_lagou_job_old.py
│   ├── handle_mysql.py
│   └── crawl_lagou_job_new.py
├── .idea
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── small-spider-project.iml
│   ├── deployment.xml
│   └── workspace.xml
├── README.md
└── .gitignore
/kolesa/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/boss_zhipin/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dasouche/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/synchronous/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/login_github/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/douban_movie_top250/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mafengwo/mafengwo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/synchronous/sample/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/video/README.md:
--------------------------------------------------------------------------------
1 | #### 1. lishipin: Pear Video data scraping
2 |
--------------------------------------------------------------------------------
/dasouche/README.md:
--------------------------------------------------------------------------------
1 | # Dasouche crawler
2 |
3 | #### bug:dazhuang_python@sina.com
4 |
--------------------------------------------------------------------------------
/baidu_m_keyword_ranks/README.md:
--------------------------------------------------------------------------------
1 | # Baidu M-site (mobile) keyword search ranking scraper, with ads filtered out
2 | ## Python 3.6, multithreaded (a minimal thread-pool sketch follows this file)
3 |
--------------------------------------------------------------------------------
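The README above only names the approach (Python 3.6, multithreaded). A minimal sketch of that pattern, assuming a hypothetical `fetch_rank()` helper in place of the real Baidu M-site request and ad-filtering logic that lives in baidu_m_keyword.py:

```python
# Illustrative only; fetch_rank() is a hypothetical stand-in for the real
# request/parsing code in baidu_m_keyword.py.
from concurrent.futures import ThreadPoolExecutor, as_completed


def fetch_rank(keyword):
    # request the mobile SERP for `keyword`, skip ad blocks,
    # and return the first organic position (None if not found)
    return keyword, None


def run(keywords, workers=10):
    # submit one task per keyword and print results as they complete
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(fetch_rank, kw) for kw in keywords]
        for future in as_completed(futures):
            keyword, rank = future.result()
            print(keyword, rank)


if __name__ == '__main__':
    run(["python", "scrapy"])
```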
/kolesa/README.md:
--------------------------------------------------------------------------------
1 | # kolesa crawler
2 |
3 | #### bug:dazhuang_python@sina.com
4 |
--------------------------------------------------------------------------------
/login_github/README.md:
--------------------------------------------------------------------------------
1 | # GitHub login
2 |
3 | #### bug:dazhuang_python@sina.com
4 |
--------------------------------------------------------------------------------
/boss_zhipin/README.md:
--------------------------------------------------------------------------------
1 | # Boss Zhipin nationwide crawler for Python jobs
2 |
3 | #### bug:dazhuang_python@sina.com
4 |
--------------------------------------------------------------------------------
/douban_movie_top250/README.md:
--------------------------------------------------------------------------------
1 | # Douban movie Top 250 crawler
2 |
3 | #### bug:dazhuang_python@sina.com
4 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/main.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute("scrapy crawl crawl_dongqiudi".split())
--------------------------------------------------------------------------------
/mafengwo/mafengwo/main.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute("scrapy crawl crawl_mafengwo".split())
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/README.md:
--------------------------------------------------------------------------------
1 | # Douban movie Top 250 crawler, implemented with the Scrapy framework
2 |
3 | #### bug:dazhuang_python@sina.com
4 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/main.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute("scrapy crawl douban_spider".split())
--------------------------------------------------------------------------------
/lagou/README.md:
--------------------------------------------------------------------------------
1 | # Lagou nationwide crawler for Python jobs
2 |
3 | ##### Do not create the Mongo connection inside __init__, otherwise the multiprocessing workers fail to start (a minimal sketch of the workaround follows this file)
4 |
5 | #### bug:dazhuang_python@sina.com
6 |
--------------------------------------------------------------------------------
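The note in this README is the key constraint: a PyMongo client created in `__init__` ends up being inherited across the fork when the worker processes start, and the pool fails to come up cleanly. A minimal sketch of the workaround, using a hypothetical `LagouWorker` class rather than the project's actual crawl_lagou_job_*.py code:

```python
# Sketch only: connect to Mongo lazily inside the worker process instead of
# in __init__, so no connection exists before multiprocessing forks.
import multiprocessing

import pymongo


class LagouWorker(object):
    def __init__(self):
        self.client = None  # deliberately no Mongo state here

    def save(self, item):
        if self.client is None:
            # first call inside the child process: connect now
            self.client = pymongo.MongoClient(host="127.0.0.1", port=27017)
        self.client["lagou"]["lagou_data"].insert_one(item)


def worker(item):
    LagouWorker().save(item)


if __name__ == '__main__':
    with multiprocessing.Pool(processes=4) as pool:
        pool.map(worker, [{"positionId": i} for i in range(4)])
```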
/mafengwo_article_spider/mafengwo/main.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 | cmdline.execute("scrapy crawl crawl_mafengwo".split())
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/douban.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/douban_movie_top250_scrapy/douban/douban.json
--------------------------------------------------------------------------------
/mafengwo_article_spider/README.md:
--------------------------------------------------------------------------------
1 | # small-spider-project
2 | ## Everyday crawlers
3 |
4 | #### mafengwo_article_spider: scrapes the newest and hottest Mafengwo travel notes
5 |
6 |
7 |
8 | #### bug:dazhuang_python@sina.com
9 |
--------------------------------------------------------------------------------
/baidu_m_keyword_ranks/setting.py:
--------------------------------------------------------------------------------
1 | mysql_ip = '127.0.0.1'
2 | mysql_port = 3306
3 | mysql_database = ''  # database name
4 | mysql_table = ''
5 | mysql_username = ''  # username
6 | mysql_password = ''  # password
7 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/7:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/7
--------------------------------------------------------------------------------
/mafengwo_article_spider/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/mafengwo/mafengwo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/mafengwo/mafengwo/mafengwo_images/full/0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/mafengwo/mafengwo/mafengwo_images/full/0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuISOAF511AAFaT82ScyE114.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuISOAF511AAFaT82ScyE114.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChONolzuIOiASesEAAEgiz_2cMw359.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChONolzuIOiASesEAAEgiz_2cMw359.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqZOAFKI4AANLRhtfxnE659.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqZOAFKI4AANLRhtfxnE659.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztvpmAGjgfAAFd4X3svKc014.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztvpmAGjgfAAFd4X3svKc014.jpg
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/dongqiudi/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = dongqiudi.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dongqiudi
12 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/mafengwo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = mafengwo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = mafengwo
12 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = douban.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = douban
12 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = mafengwo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = mafengwo
12 |
--------------------------------------------------------------------------------
/douban_movie_top250/handle_mongo.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | from pymongo.collection import Collection
3 |
4 |
5 | class Handle_Mongo(object):
6 | def __init__(self):
7 | mongo_client = pymongo.MongoClient(host="127.0.0.1",port=27017)
8 | self.db_data = mongo_client['douban']
9 |
10 | def handle_save_data(self,item):
11 | task_collection = Collection(self.db_data,'douban_data')
12 | task_collection.insert(item)
13 |
14 | douban_mongo = Handle_Mongo()
15 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/js/README.md:
--------------------------------------------------------------------------------
1 | # Decoding the Mafengwo JS that generates the sn parameter
2 |
3 | #### index.js is the original JS file from the Mafengwo site
4 | #### tool_decode_index.js is that file after formatting and partial deobfuscation via http://jsnice.org/
5 | #### handle_sn.py cracks the sn value; it then turned out the request works without passing sn at all... (a condensed Python sketch follows this file)
6 |
7 |
8 | ##### Line 619: the salt value: c9d6618dbc657b41a66eb0af952906f1
9 | ##### Line 632: get the timestamp: p3["_ts"] = (new Date)[__Ox2133f[65]]();
10 | ##### Line 635: call the view function to obtain sn: var vroot = view(obj["extend"](true, {}, p3));
11 | ##### Line 63: return the sn value: MD5, then slice
12 |
13 |
14 | ### Contact: dazhuang_python@sina.com
15 |
16 |
--------------------------------------------------------------------------------
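A condensed sketch of the recipe these notes describe and that handle_sn.py (further down in this dump) implements: serialize the request parameters, append the salt, MD5 the result and keep characters 2 through 11. The millisecond `_ts` and the exact key ordering the site expects are assumptions here.

```python
# Sketch of the sn recipe; the salt and the [2:12] slice match handle_sn.py,
# the millisecond _ts value is an assumption.
import hashlib
import json
import time

SALT = "c9d6618dbc657b41a66eb0af952906f1"  # salt noted at line 619 of the decoded JS


def make_sn(params):
    params["_ts"] = str(int(time.time() * 1000))  # millisecond timestamp
    raw = json.dumps(params) + SALT
    return hashlib.md5(raw.encode("utf-8")).hexdigest()[2:12]


if __name__ == '__main__':
    print(make_sn({"mddid": "10065", "page": "1", "pageid": "mdd_index"}))
```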
/README.md:
--------------------------------------------------------------------------------
1 | # small-spider-project
2 | ## Everyday crawlers
3 |
4 | #### 1. baidu_m_keyword_ranks: Baidu M-site keyword search rankings with ads filtered out
5 | #### 2. video: video scraping
6 | #### 3. mafengwo: Mafengwo travel notes and image scraping
7 | #### 4. kolesa: kolesa data scraping
8 | #### 5. douban_movie_top250: Douban movie Top 250 data scraping
9 | #### 6. douban_movie_top250_scrapy: Douban movie Top 250 data scraping with the Scrapy framework
10 | #### 7. mafengwo_article_spider: all Mafengwo travel notes
11 | #### 8. dasouche: Dasouche data scraping
12 | #### 9. dongqiudi: Dongqiudi news data scraping
13 | #### 10. login_github: logging in to GitHub
14 | #### 11. synchronous: synchronous crawler examples
15 |
16 |
17 |
18 | #### bug:dazhuang_python@sina.com
19 |
--------------------------------------------------------------------------------
/dongqiudi/README.md:
--------------------------------------------------------------------------------
1 | # Dongqiudi news crawler
2 | ### Requirement
3 | Scrape news from Dongqiudi: https://dongqiudi.com/news
4 | ### Project structure
5 | ```text
6 | dongqiudi
7 |     dongqiudi_pic    image output directory
8 |     spiders          spider / parsing code
9 |     items.py         item field definitions
10 |     middlewares.py   middlewares, including the download proxy middleware
11 |     pipelines.py     item pipelines: MongoDB storage and image download
12 |     settings.py      configuration
13 |     main.py          entry point
14 | ```
15 | ### Notes
16 | ```text
17 | The MongoDB host and port are set in pipelines.py
18 | Whether the middlewares are enabled, the download delay, etc. are set in settings.py (an illustrative sketch of these settings follows this file)
19 | ```
20 |
21 |
22 | #### bug:dazhuang_python@sina.com
23 |
--------------------------------------------------------------------------------
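The settings the notes refer to would look roughly like the snippet below. These are illustrative values only (settings.py itself is not reproduced in this dump), though the class paths match the pipelines and middleware defined later in this project.

```python
# Illustrative settings.py entries, not the project's actual configuration.
ITEM_PIPELINES = {
    'dongqiudi.pipelines.DongqiudiImagePipeline': 200,  # image download
    'dongqiudi.pipelines.DongqiudiPipeline': 300,       # MongoDB storage
}
DOWNLOADER_MIDDLEWARES = {
    'dongqiudi.middlewares.DongqiudiProxyMiddleware': 543,  # paid proxy
}
IMAGES_STORE = './dongqiudi_pic'   # matches the dongqiudi_pic directory in the tree
DOWNLOAD_DELAY = 1                 # throttle requests
ROBOTSTXT_OBEY = False
```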
/mafengwo_article_spider/.idea/mafengwo.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/lagou/handle_mongo.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | from pymongo.collection import Collection
3 |
4 |
5 |
6 | class Handle_lagou_mongo(object):
7 | def __init__(self):
8 | lagou_client = pymongo.MongoClient(host="127.0.0.1",port=27017)
9 | self.lagou_db = lagou_client['lagou']
10 |
11 | def handle_save_data(self,item):
12 | print(item)
13 | lagou_collection = Collection(self.lagou_db,"lagou_data")
14 | lagou_collection.update({"positionId":item['positionId']},item,True)
15 |
16 |
17 | lagou_mongo = Handle_lagou_mongo()
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/.idea/douban.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DoubanItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | # serial number
15 | serial_number = scrapy.Field()
16 | # movie name
17 | movie_name = scrapy.Field()
18 | # movie introduction
19 | introduce = scrapy.Field()
20 | # star rating
21 | star = scrapy.Field()
22 | # number of ratings
23 | evaluate = scrapy.Field()
24 | # movie tagline / description
25 | describe = scrapy.Field()
26 |
--------------------------------------------------------------------------------
/synchronous/test1.py:
--------------------------------------------------------------------------------
1 | import aiohttp
2 | import asyncio
3 |
4 | headers = {
5 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
6 | "Chrome/86.0.4240.75 Safari/537.36 "
7 | }
8 |
9 |
10 | async def sample_get():
11 | # send a simple GET request
12 | async with aiohttp.ClientSession() as session:
13 | async with session.get(url="https://www.baidu.com", headers=headers) as response:
14 | print(response.status)
15 | print(await response.text())
16 |
17 |
18 | if __name__ == '__main__':
19 | loop = asyncio.get_event_loop()
20 | loop.run_until_complete(sample_get())
21 |
--------------------------------------------------------------------------------
/synchronous/sample/multiprocess_pool.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import time
3 |
4 |
5 | def work(item):
6 | time.sleep(0.05)
7 | return "进程ID:{id},进程名称{name},执行任务item:{item}".format(id=multiprocessing.current_process().pid,
8 | name=multiprocessing.current_process().name, item=item)
9 |
10 |
11 | def main():
12 | # process pool with 4 workers
13 | pool = multiprocessing.Pool(processes=4)
14 | for item in range(100):
15 | result = pool.apply_async(func=work, args=(item,))
16 | print(result.get())
17 | pool.close()
18 | pool.join()
19 |
20 |
21 | if __name__ == '__main__':
22 | main()
23 |
--------------------------------------------------------------------------------
/.idea/small-spider-project.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/synchronous/handle_queue.py:
--------------------------------------------------------------------------------
1 | import queue
2 | from handle_request import DangdangRequest
3 |
4 |
5 | class DangdangQueue(object):
6 | def __init__(self):
7 | self.queue = queue.Queue()
8 |
9 | def insert_data(self, data):
10 | print("添加抓取任务: ", data)
11 | if isinstance(data, DangdangRequest):
12 | self.queue.put(data)
13 | return False
14 |
15 | def get_data(self):
16 | if not self.queue.empty():
17 | data = self.queue.get()
18 | print("取出任务:", data)
19 | return data
20 | else:
21 | return False
22 |
23 | def database_empty(self):
24 | return self.queue.qsize() == 0
25 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DongqiudiItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | # source URL
15 | from_url = scrapy.Field()
16 | # news title
17 | title = scrapy.Field()
18 | # publish time
19 | release_time = scrapy.Field()
20 | # author
21 | author = scrapy.Field()
22 | # news content
23 | content = scrapy.Field()
24 | # crawl time
25 | crawl_time = scrapy.Field()
26 | images = scrapy.Field()
27 | image_urls = scrapy.Field()
28 | image_paths = scrapy.Field()
29 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | import base64
9 |
10 |
11 | class DongqiudiProxyMiddleware(object):
12 | # attach the proxy to every outgoing request
13 | def process_request(self, request, spider):
14 | # proxy host and port
15 | request.meta['proxy'] = 'http://http-dyn.abuyun.com:9020'
16 | # username:password for the paid proxy service
17 | # your own proxy account will have different credentials
18 | proxy_name_pass = 'HTK32673HL02BK2D:50125D2D38937C94'.encode('utf-8')
19 | encode_pass_name = base64.b64encode(proxy_name_pass)
20 | # put the credentials into the Proxy-Authorization header
21 | # note: there is a single space after "Basic"
22 | request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode()
--------------------------------------------------------------------------------
/kolesa/handle_mongo.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | from pymongo.collection import Collection
3 |
4 |
5 | class Handle_Mongo(object):
6 | def __init__(self):
7 | mongo_client = pymongo.MongoClient(host="127.0.0.1",port=27017)
8 | self.db_data = mongo_client['kolesa']
9 |
10 | def handle_save_task(self,item):
11 | task_collection = Collection(self.db_data,'kolesa_task')
12 | task_collection.update({'id':item['id']},item,True)
13 |
14 | def handle_get_task(self):
15 | task_collection = Collection(self.db_data,'kolesa_task')
16 | return task_collection.find_one_and_delete({})
17 |
18 | def handle_save_data(self,item):
19 | task_collection = Collection(self.db_data,'kolesa_data')
20 | task_collection.update({'id':item['id']},item,True)
21 |
22 | kolesa_mongo = Handle_Mongo()
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | import requests
10 | import json
11 | import random
12 |
13 |
14 | class MafengwoProxyMiddleware(object):
15 |
16 | def process_response(self, request, response, spider):
17 | if 'mafengwo.net' in request.url:
18 | return response
19 | elif response is None:
20 | return request
21 | elif response.status == 302:
22 | return request
23 | elif response.status == 403:
24 | return request
25 | elif 'flashcookie.sw' in response.text:
26 | return request
27 | else:
28 | return response
29 |
--------------------------------------------------------------------------------
/synchronous/sample/thread_test1.py:
--------------------------------------------------------------------------------
1 | import _thread
2 | import threading
3 | import time
4 |
5 |
6 | def _thread_handle(thread_name, delay):
7 | for num in range(10):
8 | time.sleep(delay)
9 | print("{}的num:{}".format(thread_name, num))
10 |
11 |
12 | def threading_handle(delay=1):
13 | for num in range(10):
14 | time.sleep(delay)
15 | print("{}-num-{}".format(threading.current_thread().name, num))
16 |
17 |
18 | def main():
19 | # for item in range(10):
20 | # _thread.start_new_thread(_thread_handle, ("Thread - {}".format(item), 1))
21 | # # unlike separate processes, these threads die when the parent process exits
22 | # time.sleep(200)
23 | for item in range(10):
24 | # thread = threading.Thread(target=threading_handle, args=(1,), name="执行线程-{}".format(item))
25 | thread = threading.Thread(target=threading_handle, args=(1,))
26 | thread.start()
27 |
28 |
29 | if __name__ == '__main__':
30 | main()
31 |
--------------------------------------------------------------------------------
/synchronous/sample/multiprocess_test3.py:
--------------------------------------------------------------------------------
1 | import time
2 | import multiprocessing
3 |
4 |
5 | def status():
6 | """守护进程方法"""
7 | while True:
8 | print("守护进程ID:{id},守护进程名称:{name}".format(id=multiprocessing.current_process().pid,
9 | name=multiprocessing.current_process().name))
10 | time.sleep(1)
11 |
12 |
13 | def worker():
14 | """具体执行工作的方法"""
15 | # 创建守护进程,daemon为TRUE
16 | daemon_process = multiprocessing.Process(target=status, name="守护进程", daemon=True)
17 | daemon_process.start()
18 | for item in range(10):
19 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, name=multiprocessing.current_process().name))
20 | time.sleep(2)
21 |
22 |
23 | def main():
24 | process = multiprocessing.Process(target=worker, name="工作进程")
25 | process.start()
26 |
27 |
28 | if __name__ == '__main__':
29 | main()
30 |
--------------------------------------------------------------------------------
/synchronous/sample/multiprocess_test2.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import time
3 |
4 |
5 | def send(msg):
6 | time.sleep(5)
7 | print("进程ID:{id},进程名称:{name},发送消息:{msg}".format(id=multiprocessing.current_process().pid,
8 | name=multiprocessing.current_process().name, msg=msg))
9 |
10 |
11 | def main():
12 | process = multiprocessing.Process(target=send, name="TEST", args=("发送消息测试",))
13 | process.start()
14 | # join() would block here until the child process finishes
15 | # process.join()
16 | time.sleep(2)
17 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid,
18 | name=multiprocessing.current_process().name))
19 | # check the process is still alive before terminating it
20 | if process.is_alive():
21 | # terminate the process
22 | process.terminate()
23 | print("进程被中断:{name}".format(name=process.name))
24 |
25 |
26 | if __name__ == '__main__':
27 | main()
28 |
--------------------------------------------------------------------------------
/synchronous/sample/process_not_share.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import threading
3 |
4 | # a module-level value modified from multiple processes / threads
5 | value = 0
6 | lock = multiprocessing.Lock()
7 |
8 |
9 | def test1(lock=None):
10 | global value
11 | for i in range(1000000):
12 | # use the lock to make the shared-variable update thread-safe
13 | lock.acquire()
14 | value = value + 1
15 | lock.release()
16 |
17 |
18 | def multiprocess_value():
19 | p1 = multiprocessing.Process(target=test1, args=(lock, ))
20 | p2 = multiprocessing.Process(target=test1, args=(lock, ))
21 | p1.start()
22 | p2.start()
23 | p1.join()
24 | p2.join()
25 |
26 |
27 | def thread_value():
28 | t1 = threading.Thread(target=test1, args=(lock, ))
29 | t2 = threading.Thread(target=test1, args=(lock, ))
30 | t1.start()
31 | t2.start()
32 | t1.join()
33 | t2.join()
34 |
35 |
36 | if __name__ == '__main__':
37 | # processes do not share this module-level value
38 | # multiprocess_value()
39 | # print(value)
40 | # threads do share it
41 | thread_value()
42 | print(value)
43 |
--------------------------------------------------------------------------------
/synchronous/sample/multiprocess_test1.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import cpu_count
2 |
3 | print("cpu内核数量为:{count}".format(count=cpu_count()))
4 | import multiprocessing
5 | import sys
6 | import time
7 |
8 |
9 | def worker(delay, count):
10 | for num in range(count):
11 | print("{process}进程ID:{id},进程名称:{name}".format(process=num, id=multiprocessing.current_process().pid,
12 | name=multiprocessing.current_process().name))
13 | time.sleep(delay)
14 |
15 |
16 | def main():
17 | # spawn three processes
18 | for item in range(3):
19 | # pass the worker arguments and a process name
20 | process = multiprocessing.Process(target=worker, args=(1, 10,), name="item-{item}".format(item=item))
21 | process.start()
22 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, name=multiprocessing.current_process().name))
23 | # no join() here: the children keep running even after the main process exits
24 | print("主进程退出")
25 | sys.exit(0)
26 |
27 |
28 | if __name__ == '__main__':
29 | main()
30 |
--------------------------------------------------------------------------------
/mafengwo/mafengwo/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class MafengwoItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | # article count
15 | article_sum = scrapy.Field()
16 | # article title
17 | title = scrapy.Field()
18 | # author name
19 | name = scrapy.Field()
20 | # id
21 | id = scrapy.Field()
22 | # publish time
23 | release_time = scrapy.Field()
24 | # comment count
25 | comment_sum = scrapy.Field()
26 | # favourite count
27 | star_sum = scrapy.Field()
28 | # upvote count
29 | support_sum = scrapy.Field()
30 | # read count
31 | read_sum = scrapy.Field()
32 | # article content
33 | content = scrapy.Field()
34 | # source URL
35 | from_url = scrapy.Field()
36 | # crawl time
37 | crawl_time = scrapy.Field()
38 | images = scrapy.Field()
39 | image_urls = scrapy.Field()
40 | image_paths = scrapy.Field()
41 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/handle_mongo.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | from pymongo.collection import Collection
3 |
4 |
5 |
6 |
7 | class Mafengwo_mongo(object):
8 | def __init__(self):
9 | # mongo_client = pymongo.MongoClient(host='127.0.0.1', port=39070)
10 | mongo_client = pymongo.MongoClient(host='10.70.120.156', port=27017)
11 | self.db_data = mongo_client['oreo']
12 |
13 | def get_from_url(self, item):
14 | db_collections = Collection(self.db_data, 'mafengwo_article')
15 | result = db_collections.find_one({'from_url':item})
16 | if result:
17 | return True
18 | else:
19 | return False
20 | #return False
21 |
22 | def insert_task(self,item):
23 | db_collections = Collection(self.db_data, 'mafengwo_article_task')
24 | db_collections.insert_one(item)
25 |
26 | def get_task(self):
27 | db_collections = Collection(self.db_data, 'mafengwo_article_task')
28 | return db_collections.find_one_and_delete({})
29 |
30 |
31 | mongo = Mafengwo_mongo()
32 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/js/handle_sn.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import time
3 | import requests
4 | import json
5 |
6 |
7 |
8 |
9 | for i in range(1,301):
10 | input_value = {
11 | "cost":"0",
12 | "days":"0",
13 | "mddid":"10065",
14 | "month":"0",
15 | "page":i,
16 | "pageid":"mdd_index",
17 | "sort":"1",
18 | "tagid":"0",
19 | "_ts":"1558433973256"
20 | }
21 | salt = "c9d6618dbc657b41a66eb0af952906f1"
22 | str = json.dumps(input_value)+salt
23 |
24 | # create an MD5 hasher
25 | hl = hashlib.md5()
26 | hl.update(str.encode(encoding='utf-8'))
27 | md5_result = hl.hexdigest()[2:12]
28 | # input_value['_sn'] = md5_result
29 |
30 |
31 |
32 | url = 'http://www.mafengwo.cn/gonglve/ajax.php?act=get_travellist'
33 | header = {
34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
35 | }
36 | response = requests.post(url=url,headers=header,data=input_value)
37 | print(response.text)
38 | time.sleep(1)
39 |
--------------------------------------------------------------------------------
/synchronous/handle_redis.py:
--------------------------------------------------------------------------------
1 | import redis
2 | from pickle import dumps, loads
3 | from handle_request import DangdangRequest
4 |
5 |
6 | class RedisQueue(object):
7 | def __init__(self):
8 | pool = redis.ConnectionPool(host="192.168.149.129", port=6379)
9 | self.r = redis.Redis(connection_pool=pool)
10 |
11 | def insert_data(self, data):
12 | print("添加抓取任务: ", data)
13 | if isinstance(data, DangdangRequest):
14 | self.r.rpush("TEST", dumps(data))
15 | return False
16 |
17 | def get_data(self):
18 | if self.r.llen("TEST"):
19 | data = loads(self.r.lpop("TEST"))
20 | print("取出任务:", data)
21 | return data
22 | else:
23 | return False
24 |
25 | def database_empty(self):
26 | return self.r.llen("TEST") == 0
27 |
28 |
29 | if __name__ == '__main__':
30 | db = RedisQueue()
31 | start_url = "https://www.baidu.com"
32 | baidu_request = DangdangRequest(url=start_url, callback="hello", need_proxy=True)
33 | db.insert_data(data=baidu_request)
34 | request = db.get_data()
35 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class MafengwoItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | # article count
15 | article_sum = scrapy.Field()
16 | # article title
17 | title = scrapy.Field()
18 | # author name
19 | name = scrapy.Field()
20 | # id
21 | id = scrapy.Field()
22 | # publish time
23 | release_time = scrapy.Field()
24 | # comment count
25 | comment_sum = scrapy.Field()
26 | # favourite count
27 | star_sum = scrapy.Field()
28 | # upvote count
29 | support_sum = scrapy.Field()
30 | # read count
31 | read_sum = scrapy.Field()
32 | # article content
33 | content = scrapy.Field()
34 | # source URL
35 | from_url = scrapy.Field()
36 | upload_status = scrapy.Field()
37 | # crawl time
38 | crawl_time = scrapy.Field()
39 | images = scrapy.Field()
40 | image_urls = scrapy.Field()
41 | image_paths = scrapy.Field()
42 | video_urls = scrapy.Field()
43 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import pymongo,json
9 | from pymongo.collection import Collection
10 |
11 | class DoubanPipeline(object):
12 | def __init__(self):
13 | mongo_client = pymongo.MongoClient(host='127.0.0.1', port=27017)
14 | self.db_data = mongo_client['douban_scrapy']
15 |
16 | def process_item(self, item, spider):
17 | # select the database and collection
18 | douban_collection = Collection(self.db_data,'douban')
19 | douban_collection.insert(dict(item))
20 | return item
21 |
22 | class DoubanJsonPipeline(object):
23 | def __init__(self):
24 | self.file = open('douban.json','w')
25 |
26 | def process_item(self, item, spider):
27 | # append a comma and newline after each JSON record
28 | content = json.dumps(dict(item),ensure_ascii = False) + ",\n"
29 | self.file.write(content)
30 | return item
31 |
32 | def close_spider(self,spider):
33 | self.file.close()
34 |
--------------------------------------------------------------------------------
/synchronous/sample/multiprocess_class.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import sys
3 | import time
4 |
5 |
6 | # subclass multiprocessing.Process
7 | class MyProcess(multiprocessing.Process):
8 | def __init__(self, name, delay, count):
9 | # pass the process name to the parent constructor
10 | super().__init__(name=name)
11 | self.delay = delay
12 | self.count = count
13 |
14 | # the work this process performs
15 | def run(self) -> None:
16 | for num in range(self.count):
17 | print("{process}进程ID:{id},进程名称:{name}".format(process=num, id=multiprocessing.current_process().pid,
18 | name=multiprocessing.current_process().name))
19 | time.sleep(self.delay)
20 |
21 |
22 | def main():
23 | for item in range(3):
24 | process = MyProcess(name="item-{id}".format(id=item), delay=1, count=10)
25 | # start() invokes run() in the new process
26 | process.start()
27 |
28 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid,
29 | name=multiprocessing.current_process().name))
30 | print("主进程退出")
31 | sys.exit(0)
32 |
33 |
34 | if __name__ == '__main__':
35 | main()
36 |
--------------------------------------------------------------------------------
/.idea/deployment.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/mafengwo/mafengwo/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | from pymongo.collection import Collection
9 | from scrapy.pipelines.images import ImagesPipeline
10 | from scrapy import Request
11 |
12 | class MafengwoPipeline(object):
13 | def __init__(self):
14 | mongo_client = pymongo.MongoClient(host='127.0.0.1', port=27017)
15 | self.db_data = mongo_client['mafengwo']
16 |
17 | def process_item(self, item, spider):
18 | db_collections = Collection(self.db_data, 'mafengwo_article')
19 | db_collections.update({'from_url':item['from_url']},item,True)
20 | return item
21 |
22 |
23 | class MafengwoImagePipeline(ImagesPipeline):
24 | def get_media_requests(self, item, info):
25 | for image_url in item['image_urls']:
26 | yield Request(url=image_url)
27 |
28 | def item_completed(self, results, item, info):
29 | image_paths = [x['path'] for ok, x in results if ok]
30 | if not image_paths:
31 | pass
32 | item['image_paths'] = image_paths
33 | return item
34 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | import base64
8 | import random
9 |
10 | class ProxyMiddleware(object):
11 | def __init__(self):
12 | self.proxy_info = [
13 | {'proxy_url': 'ip4.hahado.cn:35410', 'proxy_user_pass': b'duoipbpvzyymn:tRf6NnfsBi7k0'},
14 | {'proxy_url': 'ip4.hahado.cn:35164', 'proxy_user_pass': b'duoipcnezxjlvkv:xXuXTPES9XPwp'},
15 | {'proxy_url': 'ip4.hahado.cn:35401', 'proxy_user_pass': b'duoipwpdlrfwc:888888'},
16 | {'proxy_url': 'ip4.hahado.cn:35404', 'proxy_user_pass': b'duoipcnxgfzfsyp:TjgLhDqqEj0Pe'},
17 | {'proxy_url': 'ip4.hahado.cn:35413', 'proxy_user_pass': b'duoipvriezfde:bq4RYrQiWuQzv'},
18 | ]
19 |
20 | def process_request(self, request, spider):
21 | proxy = random.choice(self.proxy_info)
22 | request.meta['proxy'] = proxy['proxy_url']
23 | proxy_user_pass = proxy['proxy_user_pass']
24 | encoded_user_pass = base64.b64encode(proxy_user_pass)
25 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass.decode()
26 | # return None
27 |
--------------------------------------------------------------------------------
/mafengwo/mafengwo/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | import base64
8 |
9 |
10 | class MafengwoProxyMiddleware(object):
11 | # attach the proxy to every outgoing request
12 | def process_request(self, request, spider):
13 | # proxy host and port
14 | request.meta['proxy'] = 'http://http-dyn.abuyun.com:9020'
15 | # username:password for the paid proxy service
16 | # your own proxy account will have different credentials
17 | proxy_name_pass = 'HTK32673HL02BK2D:50125D2D38937C94'.encode('utf-8')
18 | encode_pass_name = base64.b64encode(proxy_name_pass)
19 | # put the credentials into the Proxy-Authorization header
20 | # note: there is a single space after "Basic"
21 | request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode()
22 |
23 | # inspect the response to decide whether the download succeeded
24 | def process_response(self, request, response, spider):
25 | if 'mafengwo.net' in request.url:
26 | return response
27 | elif response is None:
28 | return request
29 | elif response.status == 302:
30 | return request
31 | elif response.status == 403:
32 | return request
33 | elif 'flashcookie.sw' in response.text:
34 | return request
35 | else:
36 | return response
37 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | import pymongo
10 | from pymongo.collection import Collection
11 | from scrapy.pipelines.images import ImagesPipeline
12 | from scrapy import Request
13 |
14 | # store items in MongoDB
15 | class DongqiudiPipeline(object):
16 | def __init__(self):
17 | mongo_client = pymongo.MongoClient(host='192.168.7.142',port=27017)
18 | self.dongqiudi_db = mongo_client['dongqiudi_data']
19 | def process_item(self, item, spider):
20 | dongqiudi_collection = Collection(self.dongqiudi_db,"dongqiudi")
21 | dongqiudi_collection.update({'from_url':item['from_url']},item,True)
22 | return item
23 |
24 | # download article images
25 | class DongqiudiImagePipeline(ImagesPipeline):
26 | def get_media_requests(self, item, info):
27 | for image_url in item['image_urls']:
28 | yield Request(url=image_url,meta={'img_name':image_url,'photo_id':item['title']})
29 |
30 | def item_completed(self, results, item, info):
31 | image_paths = [x['path'] for ok, x in results if ok]
32 | if not image_paths:
33 | pass
34 | return item
35 |
36 | def file_path(self, request, response=None, info=None):
37 | filename = './' + str(request.meta['photo_id'])+'/'+request.meta['img_name'].split("/")[-1]
38 | return filename
39 |
--------------------------------------------------------------------------------
/synchronous/handle_request.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import traceback
3 |
4 |
5 | class DangdangRequest(object):
6 | def __init__(self, url, headers, callback, method="GET", need_proxy=False, fail_time=0, timeout=(5, 5)):
7 | self.callback = callback
8 | self.need_proxy = need_proxy
9 | self.fail_time = fail_time
10 | self.timeout = timeout
11 | self.headers = headers
12 | self.url = url
13 | self.method = method
14 |
15 | def __str__(self):
16 | return self.url
17 |
18 | def send_request(self):
19 | print("请求{url}".format(url=self.url))
20 | proxy_info = {}
21 | if self.method == "GET":
22 | try:
23 | if not self.need_proxy:
24 | response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout)
25 | else:
26 | response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout,
27 | proxies=proxy_info)
28 | except Exception as e:
29 | print(traceback.format_exc())
30 | return self
31 | else:
32 | return response
33 |
34 |
35 | if __name__ == '__main__':
36 | headers = {
37 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36 "
38 | }
39 | q = DangdangRequest(url="https://www.baidu.com", headers=headers, callback="hello")
40 | response = q.send_request()
41 | print(response.text)
42 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | from pymongo.collection import Collection
9 | from scrapy.pipelines.images import ImagesPipeline
10 | from scrapy import Request
11 |
12 | class MafengwoPipeline(object):
13 | def __init__(self):
14 | # mongo_client = pymongo.MongoClient(host='127.0.0.1', port=39070)
15 | mongo_client = pymongo.MongoClient(host='10.70.120.156', port=27017)
16 | self.db_data = mongo_client['oreo']
17 |
18 | def process_item(self, item, spider):
19 | db_collections = Collection(self.db_data, 'mafengwo_article')
20 | db_collections.update({'from_url':item['from_url']},item,True)
21 | return item
22 |
23 |
24 | class MafengwoImagePipeline(ImagesPipeline):
25 | def get_media_requests(self, item, info):
26 | for image_url in item['image_urls']:
27 | yield Request(url=image_url,meta={'img_name':image_url,'photo_id':item['id']})
28 |
29 | def item_completed(self, results, item, info):
30 | image_paths = [x['path'] for ok, x in results if ok]
31 | if not image_paths:
32 | pass
33 | #item['image_paths'] = image_paths
34 | return item
35 |
36 | def file_path(self, request, response=None, info=None):
37 | filename = './' + str(request.meta['photo_id'])+'/'+request.meta['img_name'].split("/")[-1]
38 | return filename
39 |
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/.idea/deployment.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/synchronous/sample/multiprocess_share.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import time
3 |
4 | value = 1
5 |
6 |
7 | def send_data(conn):
8 | global value
9 | value = value + 1
10 | conn.send(value)
11 |
12 |
13 | def receive_data(conn):
14 | print("接收到的数据为:{data}".format(data=conn.recv()))
15 |
16 |
17 | def pipe_main():
18 | # pipe for inter-process communication
19 | conn_recv, conn_send = multiprocessing.Pipe()
20 | process_send = multiprocessing.Process(target=send_data, args=(conn_send,))
21 | process_send.start()
22 | process_send.join()
23 | process_recv = multiprocessing.Process(target=receive_data, args=(conn_recv,))
24 | process_recv.start()
25 | process_recv.join()
26 |
27 |
28 | def worker(dict, lock):
29 | while True:
30 | # lock.acquire()
31 | with lock:
32 | number = dict.get("ticket")
33 | if number > 0:
34 | time.sleep(1)
35 | number = number - 1
36 | print("{}-ticket={}".format(multiprocessing.current_process().name, number))
37 | dict.update({"ticket": number})
38 | else:
39 | print("无票")
40 | break
41 | # lock.release()
42 |
43 |
44 | def main():
45 | # share a dict between processes via a Manager
46 | manager = multiprocessing.Manager()
47 | mgr_dict = manager.dict(ticket=5)
48 | lock = multiprocessing.Lock()
49 | print(mgr_dict)
50 | job_process = [multiprocessing.Process(target=worker, args=(mgr_dict, lock,), name="售票员-{item}".format(item=item))
51 | for item in range(3)]
52 | for job in job_process:
53 | job.start()
54 |
55 | for end in job_process:
56 | end.join()
57 |
58 |
59 | if __name__ == '__main__':
60 | # pipe_main()
61 | main()
62 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/baidu_m_keyword_ranks/handle_mysql.py:
--------------------------------------------------------------------------------
1 | import pymysql
2 | import time
3 | import setting
4 | import csv
5 |
6 |
7 |
8 | class Handle_mysql(object):
9 | def __init__(self):
10 | self.db = pymysql.connect(host=setting.mysql_ip,port=setting.mysql_port,database=setting.mysql_database,user=setting.mysql_username,password=setting.mysql_password)
11 | self.cursor = self.db.cursor()
12 |
13 | def __del__(self):
14 | self.cursor.close()
15 | self.db.close()
16 |
17 | def handle_task(self):
18 | # fetch the keyword task list from the rankings table
19 | sql = "SELECT search_word FROM seo_fast_rankings WHERE state=1;"
20 | self.cursor.execute(sql)
21 | result = self.cursor.fetchall()
22 | return result
23 |
24 | # insert the day's ranking rows
25 | def handle_insert_db(self,item=None):
26 | sql_insert = """ INSERT INTO seo_baidu_m_keyword_ziran (keyword,rank,crawl_date) VALUES ("%s",'%s',"%s");""" % (item['keyword'],item['rank'],item['crawl_date'])
27 | try:
28 | self.cursor.execute(sql_insert)
29 | self.db.commit()
30 | except Exception:
31 | pass  # ignore rows that fail to insert (e.g. duplicate keys)
32 | # print(sql_insert)
33 |
34 | mysql = Handle_mysql()
35 | if __name__ == '__main__':
36 | # delete today's rows first, so re-importing the same day is idempotent
37 | date = time.strftime("%Y-%m-%d", time.localtime())
38 | sql_delete = """ DELETE FROM seo_baidu_m_keyword_ziran where crawl_date='%s'"""%date
39 | mysql.cursor.execute(sql_delete)
40 | mysql.db.commit()
41 | # import today's data from the CSV export
42 | with open('baidu_m_keyword_ziran.csv','r',encoding='utf-8') as f:
43 | csv_reader = csv.reader(f)
44 | data = next(csv_reader)  # skip the CSV header row
45 | for i in csv_reader:
46 | info = {}
47 | info['keyword'] = i[0]
48 | info['rank'] = i[1]
49 | info['crawl_date'] = i[2]
50 | mysql.handle_insert_db(info)
51 |
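
# --- Hedged sketch, not part of the original file: the INSERT above interpolates the
# --- values straight into the SQL string, so a quote inside a keyword breaks it. A
# --- parameterized variant over the same table/columns avoids that; `rank` is
# --- backtick-quoted because it became a reserved word in newer MySQL versions.
def handle_insert_db_parameterized(cursor, db, item):
    sql_insert = ("INSERT INTO seo_baidu_m_keyword_ziran (keyword, `rank`, crawl_date) "
                  "VALUES (%s, %s, %s);")
    try:
        cursor.execute(sql_insert, (item['keyword'], item['rank'], item['crawl_date']))
        db.commit()
    except Exception:
        db.rollback()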
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/spiders/douban_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from douban.items import DoubanItem
4 |
5 |
6 | class DoubanSpiderSpider(scrapy.Spider):
7 | # spider name (used by the scrapy crawl command)
8 | name = 'douban_spider'
9 | allowed_domains = ['douban.com']
10 | # start URL
11 | start_urls = ['https://movie.douban.com/top250?start=0&filter=']
12 | custom_settings = {
13 | 'USER_AGENT':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0'
14 | }
15 |
16 | # parse the Top 250 list page
17 | def parse(self, response):
18 | movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
19 | for i_item in movie_list:
20 | douban_item = DoubanItem()
21 | douban_item['serial_number'] = i_item.xpath(".//div[@class='item']//em/text()").extract_first()
22 | douban_item['movie_name'] = i_item.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()
23 | content = i_item.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract()
24 | # strip the whitespace from every line and keep them all, instead of only the last one
25 | content_s = ";".join("".join(i_content.split()) for i_content in content)
26 | douban_item['introduce'] = content_s
27 | douban_item['star'] = i_item.xpath(".//span[@class='rating_num']/text()").extract_first()
28 | douban_item['evaluate'] = i_item.xpath(".//div[@class='star']//span[4]/text()").extract_first()
29 | douban_item['describe'] = i_item.xpath(".//p[@class='quote']/span/text()").extract_first()
30 | # yield the item to the pipeline; ITEM_PIPELINES must be enabled in settings, otherwise nothing is stored
31 | yield douban_item
32 |
33 | nextLink = response.xpath('//span[@class="next"]/link/@href').extract()
34 | # page 10 is the last page and has no next-page link
35 | if nextLink:
36 | nextLink = nextLink[0]
37 | print(nextLink)
38 | yield scrapy.Request('https://movie.douban.com/top250'+nextLink, callback=self.parse)
39 | # recursively feed the next-page address back into this method to keep crawling
40 |
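
# --- Hedged sketch, not shown in this excerpt: the DoubanItem imported above needs at
# --- least the fields the spider fills in; the repo's real douban/items.py may differ.
import scrapy

class DoubanItem(scrapy.Item):
    serial_number = scrapy.Field()  # ranking number
    movie_name = scrapy.Field()     # title
    introduce = scrapy.Field()      # director / cast / year line
    star = scrapy.Field()           # rating score
    evaluate = scrapy.Field()       # number of ratings
    describe = scrapy.Field()       # one-line quote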
--------------------------------------------------------------------------------
/login_github/handle_login.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 |
4 |
5 | class Login(object):
6 | def __init__(self):
7 | self.login_session = requests.session()
8 | self.header = {
9 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
10 | }
11 | self.city_token = ""
12 |
13 |
14 | def handle_city_token(self):
15 | """
16 | Fetch the authenticity_token (CSRF token) that the login form requires
17 | :return: self.city_token
18 | """
19 | login_url = "https://github.com/login"
20 | response = self.login_session.get(url=login_url,headers=self.header)
21 | city_token_search = re.compile(r'name="authenticity_token"\svalue="(.*?)"\s\/>')
22 | self.city_token = city_token_search.search(response.text).group(1)
23 |
24 | def handle_login_github(self):
25 | """
26 | Perform the login
27 | :return: the string matched after login
28 | """
29 | login_name = input("请输入用户名:")
30 | login_password = input("请输入密码:")
31 | self.handle_city_token()
32 | # warm up the session so the login cookies are set
33 | self.login_session.get(url="https://github.com/manifest.json",headers=self.header)
34 | data = {
35 | "commit": "Sign in",
36 | "utf8": "✓",
37 | "authenticity_token":self.city_token,
38 | "login": login_name,
39 | "password": login_password,
40 | "webauthn-support": "supported",
41 | }
42 | session_url = "https://github.com/session"
43 | self.header['Referer'] = "https://github.com/login"
44 | # submit the login form
45 | self.login_session.post(url=session_url,headers=self.header,data=data)
46 | self.header.pop('Referer')
47 | # request the settings page to verify the login
48 | response = self.login_session.get(url="https://github.com/settings/profile",headers=self.header)
49 | search_email = re.compile(login_name)
50 | # after a successful login the profile page contains our own username
51 | print(search_email.search(response.text).group())
52 | if __name__ == '__main__':
53 | github = Login()
54 | github.handle_login_github()
55 |
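
# --- Hedged sketch, not part of the original file: after handle_login_github()
# --- succeeds, the same session keeps the authentication cookies, so any further
# --- page behind the login can be fetched directly (the URL below is illustrative).
def fetch_authenticated_page(github_login, url="https://github.com/settings/emails"):
    response = github_login.login_session.get(url=url, headers=github_login.header)
    return response.status_code, response.text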
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/spiders/crawl_dongqiudi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from ..items import DongqiudiItem
4 | import time
5 | import json
6 |
7 |
8 | class CrawlDongqiudiSpider(scrapy.Spider):
9 | name = 'crawl_dongqiudi'
10 | allowed_domains = ['dongqiudi.com']
11 | start_urls = ['http://dongqiudi.com/']
12 |
13 | # inspecting dongqiudi.com with the browser dev tools (XHR tab) shows the asynchronous JSON feed requests
14 | def start_requests(self,time_value=None):
15 | # the initial timestamp is built from time.time()
16 | if time_value is None:
17 | time_value = int(time.time())
18 | # these are the column (tab) ids of the news feed
19 | for item_value in [56,232,57,3,4,5,6]:
20 | # e.g. https://dongqiudi.com/api/app/tabs/web/56.json?after=1572577395&page=1
21 | # 56 is the column id, after is the timestamp and page is the page number
22 | page_url = "https://dongqiudi.com/api/app/tabs/web/%s.json?after=%s&page=1"%(item_value,time_value)
23 | yield scrapy.Request(url=page_url,callback=self.handle_page_response,dont_filter=True)
24 |
25 | # handle the response of a feed-page request
26 | def handle_page_response(self,response):
27 | response_dict = json.loads(response.text)
28 | # the response carries the URL of the next feed page
29 | next_url = response_dict.get('next')
30 | if next_url:
31 | # request the next page
32 | yield scrapy.Request(url=next_url,callback=self.handle_page_response,dont_filter=True)
33 |
34 | # parse the article list
35 | news_list = response_dict.get('articles')
36 | if news_list:
37 | for item in news_list:
38 | info = {}
39 | # article URL
40 | info['from_url'] = item.get('url')
41 | # article title
42 | info['title'] = item.get('title')
43 | # publish time
44 | info['release_time'] = item.get('published_at')
45 | yield scrapy.Request(url=info['from_url'],callback=self.handle_detail,dont_filter=True,meta=info)
46 |
47 | # parse the article detail page
48 | def handle_detail(self,response):
49 | dongqiudi = DongqiudiItem()
50 | # author
51 | dongqiudi['author'] = response.xpath("//header/h2/a/text()").extract_first()
52 | # body text
53 | dongqiudi['content'] = ''.join(response.xpath("//div[@class='con']/p/text()").extract())
54 | # article images
55 | dongqiudi['image_urls'] = response.xpath("//div[@class='con']/p/img/@data-src").extract()
56 | # crawl time
57 | dongqiudi['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime())
58 | # article title
59 | dongqiudi['title'] = response.request.meta['title']
60 | # source URL
61 | dongqiudi['from_url'] = response.request.meta['from_url']
62 | # publish time
63 | dongqiudi['release_time'] = response.request.meta['release_time']
64 | # yield the item to the pipeline
65 | yield dongqiudi
66 |
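
# --- Hedged sketch, not shown in this excerpt: the DongqiudiItem imported above needs
# --- at least the fields the spider fills in; image_urls/images is the field pair the
# --- scrapy ImagesPipeline expects. The repo's real items.py may differ.
import scrapy

class DongqiudiItem(scrapy.Item):
    title = scrapy.Field()         # article title
    author = scrapy.Field()        # author name
    content = scrapy.Field()       # body text
    image_urls = scrapy.Field()    # image URLs consumed by the image pipeline
    images = scrapy.Field()        # download results filled in by the image pipeline
    from_url = scrapy.Field()      # source URL
    release_time = scrapy.Field()  # publish time
    crawl_time = scrapy.Field()    # crawl time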
--------------------------------------------------------------------------------
/synchronous/handle_spider.py:
--------------------------------------------------------------------------------
1 | # from handle_redis import RedisQueue
2 | from handle_queue import DangdangQueue
3 | from handle_request import DangdangRequest
4 | from lxml import etree
5 | import time
6 |
7 |
8 | class Spider(object):
9 | headers = {
10 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
11 | "Chrome/86.0.4240.75 Safari/537.36 "
12 | }
13 |
14 | # queue = RedisQueue()
15 | queue = DangdangQueue()
16 |
17 | def start(self):
18 | """爬虫起始方法"""
19 | for page in range(1, 26):
20 | start_url = "http://bang.dangdang.com/books/fivestars/2-{page}".format(page=page)
21 | dangdang_request = DangdangRequest(url=start_url, callback=self.parse_item, headers=Spider.headers)
22 | Spider.queue.insert_data(data=dangdang_request)
23 |
24 | def do_request(self, request):
25 | """发送请求"""
26 | response = request.send_request()
27 | return response
28 |
29 | def parse_item(self, response):
30 | """解析数据"""
31 | data = []
32 | html = etree.HTML(response.text)
33 | items = html.xpath("//ul[@class='bang_list']/li")
34 | for item in items:
35 | title = item.xpath(".//div[@class='name']/a/text()")
36 | if title:
37 | data.extend(title)
38 | yield data
39 |
40 | def error(self, request):
41 | """请求错误后返回队列"""
42 | request.fail_time = request.fail_time + 1
43 | if request.fail_time < 20:
44 | print("该请求异常{url}, 将该请求放回队列".format(url=request))
45 | Spider.queue.insert_data(data=request)
46 |
47 | def schedule(self):
48 | """任务调度"""
49 | start_time = time.time()
50 | while not Spider.queue.database_empty():
51 | dangdang_request = self.queue.get_data()
52 | if dangdang_request:
53 | print("当前调度:", dangdang_request)
54 | callback = dangdang_request.callback
55 | response = self.do_request(dangdang_request)
56 | if not isinstance(response, DangdangRequest):
57 | # parse via the stored callback
58 | result = callback(response)
59 | for item in result:
60 | print(item)
61 | else:
62 | dangdang_request = DangdangRequest(url=response.url, headers=Spider.headers, callback=self.parse_item)
63 | # error handling: requeue the failed request
64 | self.error(dangdang_request)
65 | print("共耗时:", time.time()-start_time)
66 |
67 | def run(self):
68 | self.start()
69 | self.schedule()
70 |
71 |
72 | if __name__ == '__main__':
73 | s = Spider()
74 | s.run()
75 |
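
# --- Hedged sketch, not shown in this excerpt: the helpers imported above are assumed
# --- to expose roughly this interface (insert_data/get_data/database_empty on the
# --- queue, send_request/fail_time/callback on the request). The repo's real
# --- handle_queue.py and handle_request.py may differ.
from collections import deque
import requests


class DangdangQueue(object):
    """In-memory FIFO of pending DangdangRequest objects."""
    def __init__(self):
        self._queue = deque()

    def insert_data(self, data):
        self._queue.append(data)

    def get_data(self):
        return self._queue.popleft() if self._queue else None

    def database_empty(self):
        return len(self._queue) == 0


class DangdangRequest(object):
    """A request description: URL, headers, parse callback and a failure counter."""
    def __init__(self, url, callback, headers=None):
        self.url = url
        self.callback = callback
        self.headers = headers or {}
        self.fail_time = 0

    def send_request(self):
        # on network errors the request itself is returned, which the scheduler
        # above treats as a failure and hands to Spider.error()
        try:
            return requests.get(self.url, headers=self.headers, timeout=10)
        except requests.RequestException:
            return self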
--------------------------------------------------------------------------------
/douban_movie_top250/crawl_douban_movie_info_top250.py:
--------------------------------------------------------------------------------
1 | import re
2 | from concurrent.futures import ThreadPoolExecutor
3 | import requests
4 | from lxml import etree
5 | from handle_mongo import douban_mongo
6 |
7 |
8 | class HandleDoubanMovieTop250(object):
9 | def __init__(self):
10 | self.header = {
11 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
12 | "Accept-Encoding":"gzip, deflate, br",
13 | "Accept-Language":"zh-CN,zh;q=0.9",
14 | "Connection":"keep-alive",
15 | "Host":"movie.douban.com",
16 | "Upgrade-Insecure-Requests":"1",
17 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
18 | }
19 | self.page_url = []
20 |
21 | def handle_page_url(self):
22 | # the paging pattern is visible in the page URLs
23 | # build the start offsets with range: from 0 to 249 in steps of 25
24 | for i in range(0,250,25):
25 | url = "https://movie.douban.com/top250?start=%s"%i
26 | self.page_url.append(url)
27 |
28 | # request helper
29 | def handle_request(self,url):
30 | response = requests.get(url=url,headers=self.header)
31 | return response.text
32 |
33 |
34 | # parse one list page
35 | def handle_page_detail(self,url):
36 | print(url)
37 | # regex that strips whitespace and control characters
38 | sub_search = re.compile(r"[\s\r\t]")
39 | response = self.handle_request(url=url)
40 | html = etree.HTML(response)
41 | # parse the movie entries on the current page
42 | item_list = html.xpath("//ol[@class='grid_view']/li")
43 | for item in item_list:
44 | info = {}
45 | # movie name, with special characters removed
46 | info['movie_name'] = sub_search.sub('',''.join(item.xpath(".//div[@class='hd']/a//span/text()")))
47 | info['actors_information'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/p/text()")))
48 | info['score'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/div[@class='star']/span[2]/text()")))
49 | info['evaluate'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/div[@class='star']/span[4]/text()")))
50 | info['describe'] = sub_search.sub('',''.join(item.xpath(".//p[@class='quote']/span/text()")))
51 | info['from_url'] = url
52 | # save to MongoDB
53 | douban_mongo.handle_save_data(info)
54 |
55 | # start method
56 | def run(self):
57 | self.handle_page_url()
58 | # create a thread pool
59 | t = ThreadPoolExecutor()
60 | for i in self.page_url:
61 | t.submit(self.handle_page_detail,i)
62 | t.shutdown()
63 |
64 | # entry point
65 | def main():
66 | douban = HandleDoubanMovieTop250()
67 | douban.run()
68 |
69 | if __name__ == '__main__':
70 | # call the entry point
71 | main()
72 |
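
# --- Hedged sketch, not shown in this excerpt: handle_mongo.douban_mongo is assumed to
# --- expose a handle_save_data() helper roughly like this; the host, database and
# --- collection names are illustrative and the repo's real handle_mongo.py may differ.
import pymongo


class HandleDoubanMongo(object):
    def __init__(self):
        client = pymongo.MongoClient(host="127.0.0.1", port=27017)
        self.collection = client["douban"]["movie_top250"]

    def handle_save_data(self, item):
        # upsert on the movie name so repeated runs do not create duplicates
        self.collection.update_one({"movie_name": item["movie_name"]}, {"$set": item}, upsert=True)


douban_mongo = HandleDoubanMongo()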
--------------------------------------------------------------------------------
/synchronous/spider_multiprocess.py:
--------------------------------------------------------------------------------
1 | # from handle_redis import RedisQueue
2 | import multiprocessing
3 | from handle_queue import DangdangQueue
4 | from handle_request import DangdangRequest
5 | from lxml import etree
6 | import time
7 |
8 |
9 | class Spider(object):
10 | headers = {
11 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
12 | "Chrome/86.0.4240.75 Safari/537.36 "
13 | }
14 |
15 | # queue = RedisQueue()
16 | queue = DangdangQueue()
17 |
18 | def start(self):
19 | """爬虫起始方法"""
20 | for page in range(1, 26):
21 | start_url = "http://bang.dangdang.com/books/fivestars/2-{page}".format(page=page)
22 | dangdang_request = DangdangRequest(url=start_url, callback=self.parse_item, headers=Spider.headers)
23 | Spider.queue.insert_data(data=dangdang_request)
24 |
25 | def do_request(self, request):
26 | """发送请求"""
27 | response = request.send_request()
28 | return response
29 |
30 | def parse_item(self, response):
31 | """解析数据"""
32 | data = []
33 | html = etree.HTML(response.text)
34 | items = html.xpath("//ul[@class='bang_list']/li")
35 | for item in items:
36 | title = item.xpath(".//div[@class='name']/a/text()")
37 | if title:
38 | data.extend(title)
39 | yield data
40 |
41 | def error(self, request):
42 | """请求错误后返回队列"""
43 | request.fail_time = request.fail_time + 1
44 | if request.fail_time < 20:
45 | print("该请求异常{url}, 将该请求放回队列".format(url=request))
46 | Spider.queue.insert_data(data=request)
47 |
48 | def handle_worker(self, request):
49 | print("{name}调度{url}".format(name=multiprocessing.current_process().name, url=request.url))
50 | callback = request.callback
51 | response = self.do_request(request)
52 | if not isinstance(response, DangdangRequest):
53 | # parse via the stored callback
54 | result = callback(response)
55 | for item in result:
56 | print(item)
57 | else:
58 | dangdang_request = DangdangRequest(url=response.url, headers=Spider.headers, callback=self.parse_item)
59 | # error handling: requeue the failed request
60 | self.error(dangdang_request)
61 |
62 | def schedule(self):
63 | """任务调度"""
64 | start_time = time.time()
65 | pool = multiprocessing.Pool(multiprocessing.cpu_count())
66 | while not Spider.queue.database_empty():
67 | dangdang_request = self.queue.get_data()
68 | if dangdang_request:
69 | pool.apply_async(func=self.handle_worker, args=(dangdang_request,))
70 | pool.close()
71 | pool.join()
72 | print("共耗时:", time.time()-start_time)
73 |
74 | def run(self):
75 | self.start()
76 | self.schedule()
77 |
78 |
79 | if __name__ == '__main__':
80 | s = Spider()
81 | s.run()
82 |
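
# --- Hedged sketch, not part of the original file (it reuses the Spider class and the
# --- multiprocessing import above): exceptions raised inside apply_async workers are
# --- silently dropped unless the results are collected or an error_callback is given.
def schedule_with_error_reporting(spider):
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    while not Spider.queue.database_empty():
        request = spider.queue.get_data()
        if request:
            pool.apply_async(func=spider.handle_worker, args=(request,),
                             error_callback=lambda exc: print("worker failed:", exc))
    pool.close()
    pool.join()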
--------------------------------------------------------------------------------
/video/lishipin/crawl_lishipin.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | from lxml import etree
4 | import re
5 |
6 | class HandleLishipin(object):
7 | def __init__(self):
8 | self.header = {
9 | "Connection":"keep-alive",
10 | "Upgrade-Insecure-Requests":"1",
11 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
12 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
13 | "Accept-Encoding":"gzip, deflate, br",
14 | "Accept-Language":"zh-CN,zh;q=0.9",
15 | }
16 |
17 | def handle_html(self,url):
18 | response = requests.get(url=url,headers=self.header)
19 | return response.text
20 |
21 | if __name__ == '__main__':
22 | l = HandleLishipin()
23 | list_url = [
24 | {"name":"新知","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=10&start=%d&sort=%d"},
25 | {"name":"社会","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=1&start=%d&sort=%d"},
26 | {"name":"世界","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=2&start=%d&sort=%d"},
27 | {"name":"生活","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=5&start=%d&sort=%d"},
28 | {"name":"娱乐","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=4&start=%d&sort=%d"},
29 | {"name":"财富","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=3&start=%d&sort=%d"},
30 | {"name":"美食","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=6&start=%d&sort=%d"},
31 | {"name":"音乐","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=59&start=%d&sort=%d"},
32 | ]
33 | for item in list_url:
34 | for i in range(0,110,10):
35 | item_url =item['item_url']%(i,i)
36 | detail_text = l.handle_html(item_url)
37 | detail_html = etree.HTML(detail_text)
38 | detail_url = detail_html.xpath("//li[@class='popularem clearfix']//a[@class='actplay']/@href")
39 | video_url_search = re.compile(r'srcUrl="(.*?)"')
40 | video_name_search = re.compile(r'<h1 class="video-tt">(.*?)</h1>')
41 | for url in detail_url:
42 | url = "https://www.pearvideo.com/"+url
43 | video_text = l.handle_html(url)
44 | video_url = video_url_search.search(video_text).group(1)
45 | video_name = video_name_search.search(video_text).group(1)
46 | info = {}
47 | info['video_url'] = video_url
48 | info['name'] = video_name
49 | info['type'] = item['name']
50 | info['from_url'] = url
51 | info['crawl_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
52 |
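
# --- Hedged sketch, not part of the original file (it reuses the requests import above):
# --- the loop above only builds the info dict and never stores it. One way to actually
# --- save a video is to stream the mp4 URL that was just extracted; the file name is
# --- taken directly from info['name'] and is illustrative only.
def save_video(headers, info, save_dir="."):
    import os
    response = requests.get(url=info['video_url'], headers=headers, stream=True)
    path = os.path.join(save_dir, "{}.mp4".format(info['name']))
    with open(path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 64):
            f.write(chunk)
    return path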
--------------------------------------------------------------------------------
/douban_movie_top250_scrapy/douban/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for douban project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'douban'
13 |
14 | SPIDER_MODULES = ['douban.spiders']
15 | NEWSPIDER_MODULE = 'douban.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'douban.middlewares.DoubanSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | 'douban.middlewares.ProxyMiddleware': 543,
57 | }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'douban.pipelines.DoubanPipeline': 300,
69 | 'douban.pipelines.DoubanJsonPipeline': 301,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | REDIRECT_ENABLED = False
94 | HTTPERROR_ALLOWED_CODES= [302]
95 |
96 | RETRY_ENABLED = True
97 | RETRY_HTTP_CODES = [503]
98 | RETRY_TIMES = 5
99 |
--------------------------------------------------------------------------------
/dongqiudi/dongqiudi/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for dongqiudi project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'dongqiudi'
13 |
14 | SPIDER_MODULES = ['dongqiudi.spiders']
15 | NEWSPIDER_MODULE = 'dongqiudi.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | CONCURRENT_REQUESTS = 2
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | # DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'dongqiudi.middlewares.DongqiudiSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | # 'dongqiudi.middlewares.DongqiudiDownloaderMiddleware': 543,
57 | 'dongqiudi.middlewares.DongqiudiProxyMiddleware': 543,
58 | }
59 |
60 | # Enable or disable extensions
61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # directory where downloaded images are saved
67 | IMAGES_STORE = './dongqiudi_pic'
68 |
69 | # Configure item pipelines
70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
71 |
72 | ITEM_PIPELINES = {
73 | 'dongqiudi.pipelines.DongqiudiPipeline': 300,
74 | # IMAGES_STORE must be set, otherwise this image pipeline does nothing
75 | 'dongqiudi.pipelines.DongqiudiImagePipeline': 209
76 | }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | #AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | #AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | #AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | #AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | #HTTPCACHE_ENABLED = True
94 | #HTTPCACHE_EXPIRATION_SECS = 0
95 | #HTTPCACHE_DIR = 'httpcache'
96 | #HTTPCACHE_IGNORE_HTTP_CODES = []
97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
--------------------------------------------------------------------------------
/mafengwo/mafengwo/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for mafengwo project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'mafengwo'
13 |
14 | SPIDER_MODULES = ['mafengwo.spiders']
15 | NEWSPIDER_MODULE = 'mafengwo.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'mafengwo (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | CONCURRENT_REQUESTS = 2  # set according to the number of proxy tunnels
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 0.1
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | # "Host":"www.mafengwo.cn",
44 | # "Connection":"keep-alive",
45 | # "Upgrade-Insecure-Requests":"1",
46 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
47 | # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
48 | # "Accept-Encoding":"gzip, deflate",
49 | # "Accept-Language":"zh-CN,zh;q=0.9",
50 | }
51 |
52 | # Enable or disable spider middlewares
53 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
54 | #SPIDER_MIDDLEWARES = {
55 | # 'mafengwo.middlewares.MafengwoSpiderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable downloader middlewares
59 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
60 | DOWNLOADER_MIDDLEWARES = {
61 | # 'mafengwo.middlewares.MafengwoDownloaderMiddleware': 543,
62 | 'mafengwo.middlewares.MafengwoProxyMiddleware': 543,
63 | }
64 |
65 | # Enable or disable extensions
66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
67 | #EXTENSIONS = {
68 | # 'scrapy.extensions.telnet.TelnetConsole': None,
69 | #}
70 |
71 | # Configure item pipelines
72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
73 | ITEM_PIPELINES = {
74 | 'mafengwo.pipelines.MafengwoPipeline': 300,
75 | 'mafengwo.pipelines.MafengwoImagePipeline': 301,
76 | }
77 |
78 | IMAGES_STORE="./mafengwo_images"
79 |
80 | # Enable and configure the AutoThrottle extension (disabled by default)
81 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
82 | #AUTOTHROTTLE_ENABLED = True
83 | # The initial download delay
84 | #AUTOTHROTTLE_START_DELAY = 5
85 | # The maximum download delay to be set in case of high latencies
86 | #AUTOTHROTTLE_MAX_DELAY = 60
87 | # The average number of requests Scrapy should be sending in parallel to
88 | # each remote server
89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
90 | # Enable showing throttling stats for every response received:
91 | #AUTOTHROTTLE_DEBUG = False
92 |
93 | # Enable and configure HTTP caching (disabled by default)
94 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
95 | #HTTPCACHE_ENABLED = True
96 | #HTTPCACHE_EXPIRATION_SECS = 0
97 | #HTTPCACHE_DIR = 'httpcache'
98 | #HTTPCACHE_IGNORE_HTTP_CODES = []
99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
100 | DOWNLOAD_TIMEOUT = 10
101 |
--------------------------------------------------------------------------------
/boss_zhipin/crawl_boss_zhipin.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urljoin
2 | import requests
3 | import pymongo
4 | from pymongo.collection import Collection
5 | import time
6 | import json
7 | from lxml import etree
8 | from concurrent.futures.thread import ThreadPoolExecutor
9 |
10 |
11 |
12 | class HandleBossZhiPin(object):
13 | def __init__(self):
14 | self.header = {
15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
16 | }
17 | self.city_list = ""
18 | boss_client = pymongo.MongoClient(host="127.0.0.1", port=27017)
19 | self.boss_db = boss_client['boss']
20 | self.city_list = []
21 |
22 | def handle_city(self):
23 | city_api_url = "https://www.zhipin.com/wapi/zpCommon/data/city.json"
24 | city_response = self.handle_request(method='GET',url=city_api_url)
25 | for province in json.loads(city_response)['zpData']['cityList']:
26 | for city in province['subLevelModelList']:
27 | self.city_list.append(city)
28 |
29 | def handle_job_request(self,job,city):
30 | print(city['name'])
31 | for page in range(1,11):
32 | job_url = "https://www.zhipin.com/c%s/?query=%s&page=%s"%(city['code'],job,page)
33 | print(job_url)
34 | response = self.handle_request(method='GET',url=job_url)
35 | html = etree.HTML(response)
36 | job_list = html.xpath("//div[@class='job-list']/ul/li")
37 | for item in job_list:
38 | info = {}
39 | info['job_title'] = item.xpath(".//div[@class='job-title']/text()")[0]
40 | if '实习' in info['job_title']:
41 | continue
42 | info['price'] = item.xpath(".//span[@class='red']/text()")[0]
43 | describe_1 = item.xpath(".//div[@class='info-primary']/p/text()")
44 | if len(describe_1) == 3:
45 | info['location'] = describe_1[0]
46 | info['working_life'] = describe_1[1]
47 | info['education'] = describe_1[2]
48 | info['company_name'] = item.xpath(".//div[@class='info-company']//h3[@class='name']/a/text()")[0]
49 | describe_2 = item.xpath(".//div[@class='info-company']//p/text()")
50 | info['company_type'] = describe_2[0]
51 | info['job_id'] = urljoin("https://www.zhipin.com",item.xpath(".//h3/a/@href")[0])
52 | info['city'] = city['name']
53 | self.handle_save_data(item=info)
54 | if not html.xpath("//div[@class='page']/a[@class='next']"):
55 | break
56 |
57 |
58 | def handle_job_detail(self,response):
59 | pass
60 |
61 | def handle_save_data(self,item):
62 | boss_collection = Collection(self.boss_db, "boss_data")
63 | boss_collection.update({"job_id": item['job_id']}, item, True)
64 |
65 | def handle_request(self,method,url,data=None):
66 | while True:
67 | proxy="http://HTK32673HL02BK2D:50125D2D38937C94@http-dyn.abuyun.com:9020"
68 | proxies = {
69 | "http":proxy,
70 | "https":proxy
71 | }
72 | try:
73 | if method == "GET":
74 | response = requests.get(url=url,headers=self.header,proxies=proxies)
75 | elif method == "POST":
76 | response = requests.post(url=url,headers=self.header,data=data,proxies=proxies,timeout=3)
77 | except Exception as e:
78 | print(e)
79 | time.sleep(2)
80 | continue
81 | else:
82 | return response.text
83 |
84 | def run(self):
85 | self.handle_city()
86 | t = ThreadPoolExecutor(max_workers=3)
87 | for city in self.city_list:
88 | t.submit(self.handle_job_request, job='python', city=city)  # submit the callable and its kwargs instead of calling it inline
89 | t.shutdown()
90 |
91 | def main():
92 | boss = HandleBossZhiPin()
93 | boss.run()
94 |
95 | if __name__ == '__main__':
96 | main()
97 |
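
# --- Hedged sketch, not part of the original file: Collection.update() as used in
# --- handle_save_data() was removed in PyMongo 4; on current PyMongo the same upsert
# --- would be written explicitly with update_one().
def handle_save_data_pymongo4(boss_db, item):
    boss_db["boss_data"].update_one({"job_id": item["job_id"]}, {"$set": item}, upsert=True)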
--------------------------------------------------------------------------------
/dasouche/handle_dasouche.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import re
4 | import pymongo
5 | from pymongo.collection import Collection
6 | from concurrent.futures.thread import ThreadPoolExecutor
7 |
8 |
9 | class HandleDaSouChe(object):
10 | def __init__(self):
11 | # URL of the paged search API
12 | self.page_url = "https://aolai.souche.com/v1/searchApi/searchCar.json?_security_token=undefined"
13 | self.header = {
14 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
15 | }
16 | self.item_url_list = []
17 | mongo_client = pymongo.MongoClient(host="10.70.120.156", port=27017)
18 | self.db_data = mongo_client['oreo']
19 |
20 | def handle_save_data(self,item):
21 | db_collection = Collection(self.db_data, 'dasouche_data')
22 | db_collection.update({'carId':item['carId']},item,True)
23 |
24 | def handle_page(self):
25 | for page in range(1,5):
26 | # POST form data; each page can return 500 records and there are 4 pages
27 | data = {
28 | "keyword":"",
29 | "brandCode":"",
30 | "seriesCode":"",
31 | "price":"",
32 | "carModel":"",
33 | "carAge":"",
34 | "mileage":"",
35 | "gearboxType":"",
36 | "displacement":"",
37 | "emissionStandard":"",
38 | "bodyColor":"",
39 | "fuelType":"",
40 | "seatingCapacity":"",
41 | "drivingMode":"",
42 | "country":"",
43 | "pageNo":page,
44 | "pageSize":"500",
45 | "from":"pc",
46 | "cityCode":"",
47 | "shopCode":"",
48 | "sort":"newsOnShelf",
49 | }
50 | page_result = self.handle_request(method='POST',url=self.page_url,data=data)
51 | for item in json.loads(page_result)['data']['items']:
52 | self.item_url_list.append(item['detailUrl'])
53 |
54 | # parse a detail page
55 | def handle_detail(self,url):
56 | id_search = re.compile(r"carId=(.*?)&shopCode=(\d+)")
57 | car_id = id_search.search(url).group(1)
58 | shop_id = id_search.search(url).group(2)
59 | # car detail information
60 | car_detail_url = "https://aolai.souche.com//v1/carDetailsApi/carDetailInfo.json?carId=%s"%car_id
61 | car_detail = self.handle_request(method='GET',url=car_detail_url)
62 | car_detail_result = json.loads(car_detail)['data']
63 | # information about the selling shop
64 | shop_detail_url = "https://aolai.souche.com//v1/shopApi/queryTangecheShopInfo.json?carId=%s&citycode=%s&shopCode=%s"%(car_id,car_detail_result['baseCarInfoView']['cityCode'],shop_id)
65 | shop_detail_result = self.handle_request(method='GET',url=shop_detail_url)
66 | car_detail_result.update(json.loads(shop_detail_result)['data'])
67 | # manufacturer configuration of the car
68 | car_config_url = "https://aolai.souche.com/v1/carDetailsApi/carConfigDetailInfo.json?_security_token=undefined&carId=%s"%car_id
69 | car_config_result = self.handle_request(method='GET',url=car_config_url)
70 | car_detail_result.update(json.loads(car_config_result)['data'])
71 | car_detail_result['from_url'] = url
72 | self.handle_save_data(car_detail_result)
73 |
74 |
75 |
76 | def handle_request(self,method,url,data=None):
77 | if method == 'POST':
78 | response = requests.post(url=url,headers=self.header,data=data)
79 | return response.text
80 | elif method == 'GET':
81 | response = requests.get(url=url,headers=self.header)
82 | return response.text
83 |
84 |
85 | def run(self):
86 | self.handle_page()
87 | t = ThreadPoolExecutor()
88 | for url in self.item_url_list:
89 | t.submit(self.handle_detail,url)
90 | t.shutdown()
91 |
92 |
93 | def main():
94 | dasouche = HandleDaSouChe()
95 | dasouche.run()
96 |
97 |
98 | if __name__ == '__main__':
99 | main()
100 |
--------------------------------------------------------------------------------
/lagou/crawl_lagou_job_old.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import requests
4 | import time
5 | import multiprocessing
6 | from handle_mysql import lagou_mysql
7 | import random
8 |
9 |
10 |
11 | class HandleLaGou(object):
12 | def __init__(self):
13 | self.lagou_session = requests.session()
14 | self.header = {
15 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
16 | }
17 | self.city_list = ""
18 |
19 | def handle_city(self):
20 | city_search = re.compile(r'zhaopin/">(.*?)</a>')
21 | city_url = "https://www.lagou.com/jobs/allCity.html"
22 | city_result = self.handle_request(method='GET',url=city_url)
23 | self.city_list = city_search.findall(city_result)
24 | # clear cookies
25 | self.lagou_session.cookies.clear()
26 |
27 | def handle_city_job(self,city):
28 | for page in range(1,31):
29 | data = {
30 | "pn":str(page),
31 | "kd":"python",
32 | }
33 | job_index_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
34 | self.handle_request(method='GET',url=job_index_url)
35 | page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"%city
36 | self.header['Referer'] = job_index_url.encode()
37 | job_result = self.handle_request(method='POST',url=page_url,data=data)
38 | try:
39 | lagou_data = json.loads(job_result)
40 | except:
41 | continue
42 | else:
43 | job_list = lagou_data['content']['positionResult']['result']
44 | if job_list:
45 | for job in job_list:
46 | job['crawl_date'] = time.strftime("%Y-%m-%d", time.localtime())
47 | lagou_mysql.insert_item(job)
48 | else:
49 | break
50 |
51 | def handle_request(self,method,url,data=None):
52 | while True:
53 | proxyinfo = "http://%s:%s@%s:%s" %('H1V32R6470A7G90D','CD217C660A9143C3','http-dyn.abuyun.com','9020')
54 | proxy = {
55 | "http": proxyinfo,
56 | "https": proxyinfo,
57 | }
58 |
59 | try:
60 | if method == "GET":
61 | response = self.lagou_session.get(url=url,headers=self.header,proxies=proxy,timeout=6)
62 | elif method == "POST":
63 | response = self.lagou_session.post(url=url,headers=self.header,data=data,proxies=proxy,timeout=6)
64 | except Exception as e:
65 | print(e)
66 | else:
67 | if '您操作太频繁,请稍后再访问' in response.text:
68 | print('频繁')
69 | self.lagou_session.cookies.clear()
70 | # job_index_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
71 | # self.handle_request(method='GET',url=job_index_url)
72 | # time.sleep(random.choice(range(3,11)))
73 | time.sleep(1)
74 | continue
75 | elif '爬虫行为' in response.text:
76 | print('爬虫')
77 | self.lagou_session.cookies.clear()
78 | time.sleep(1)
79 | # time.sleep(random.choice(range(3,11)))
80 | continue
81 | else:
82 | return response.text
83 |
84 | def run(self):
85 | self.handle_city()
86 | print(self.city_list)
87 | # for city in self.city_list:
88 | # self.handle_city_job(city=city)
89 | pool = multiprocessing.Pool(2)
90 | for city in self.city_list:
91 | pool.apply_async(self.handle_city_job,args=(city,))
92 | pool.close()
93 | pool.join()
94 |
95 |
96 | def main():
97 | lagou = HandleLaGou()
98 | lagou.run()
99 |
100 | if __name__ == '__main__':
101 | main()
102 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for mafengwo project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'mafengwo'
13 |
14 | SPIDER_MODULES = ['mafengwo.spiders']
15 | NEWSPIDER_MODULE = 'mafengwo.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 0.5
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | # "Host":"www.mafengwo.cn",
44 | # "Connection":"keep-alive",
45 | # "Upgrade-Insecure-Requests":"1",
46 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
47 | # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
48 | # "Accept-Encoding":"gzip, deflate",
49 | # "Accept-Language":"zh-CN,zh;q=0.9",
50 | }
51 |
52 | # Enable or disable spider middlewares
53 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
54 | #SPIDER_MIDDLEWARES = {
55 | # 'mafengwo.middlewares.MafengwoSpiderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable downloader middlewares
59 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
60 | DOWNLOADER_MIDDLEWARES = {
61 | # 'mafengwo.middlewares.MafengwoDownloaderMiddleware': 543,
62 | 'mafengwo.middlewares.MafengwoProxyMiddleware': 543,
63 | }
64 |
65 | # Enable or disable extensions
66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
67 | #EXTENSIONS = {
68 | # 'scrapy.extensions.telnet.TelnetConsole': None,
69 | #}
70 |
71 | # Configure item pipelines
72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
73 | ITEM_PIPELINES = {
74 | 'mafengwo.pipelines.MafengwoPipeline': 300,
75 | #'mafengwo.pipelines.MafengwoImagePipeline': 301,
76 | }
77 |
78 | IMAGES_STORE="./mafengwo_images"
79 |
80 | # Enable and configure the AutoThrottle extension (disabled by default)
81 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
82 | #AUTOTHROTTLE_ENABLED = True
83 | # The initial download delay
84 | #AUTOTHROTTLE_START_DELAY = 5
85 | # The maximum download delay to be set in case of high latencies
86 | #AUTOTHROTTLE_MAX_DELAY = 60
87 | # The average number of requests Scrapy should be sending in parallel to
88 | # each remote server
89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
90 | # Enable showing throttling stats for every response received:
91 | #AUTOTHROTTLE_DEBUG = False
92 |
93 | # Enable and configure HTTP caching (disabled by default)
94 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
95 | #HTTPCACHE_ENABLED = True
96 | #HTTPCACHE_EXPIRATION_SECS = 0
97 | #HTTPCACHE_DIR = 'httpcache'
98 | #HTTPCACHE_IGNORE_HTTP_CODES = []
99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
100 | DOWNLOAD_TIMEOUT = 10
101 | IMAGES_EXPIRES = 90  # images fetched within the last 90 days are not re-downloaded
102 | RETRY_TIMES = 100
103 | # LOG_LEVEL = 'INFO'
104 | proxy_url = '代理库URL'  # placeholder: URL of the proxy pool
105 |
--------------------------------------------------------------------------------
/lagou/handle_mysql.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine
2 | from sqlalchemy.ext.declarative import declarative_base
3 | from sqlalchemy import Column,Integer,String,Float,Date
4 | from sqlalchemy.orm import sessionmaker
5 | import time
6 |
7 |
8 | # create the database connection
9 | engine = create_engine("mysql+pymysql://root:abcd1234@127.0.0.1:3306/lagou?charset=utf8")
10 |
11 | # declarative base class
12 | Base = declarative_base()
13 |
14 | # database operations go through a session
15 | Session = sessionmaker(bind=engine)
16 |
17 | class Lagoutables(Base):
18 | __tablename__ = 'lagou_data'
19 |
20 | # id
21 | id = Column(Integer,primary_key=True,autoincrement=True)
22 | # position ID
23 | positionId = Column(Integer,nullable=False)
24 | # longitude
25 | longitude = Column(Float,nullable=False)
26 | # latitude
27 | latitude = Column(Float,nullable=False)
28 | # position name
29 | positionName = Column(String(length=50),nullable=False)
30 | # years of experience
31 | workYear = Column(String(length=20),nullable=False)
32 | # education
33 | education = Column(String(length=20),nullable=False)
34 | # job nature (full-time / part-time)
35 | jobNature = Column(String(length=20),nullable=True)
36 | # financing stage of the company
37 | financeStage = Column(String(length=30),nullable=True)
38 | # company size
39 | companySize = Column(String(length=30),nullable=True)
40 | # industry / business field
41 | industryField = Column(String(length=30),nullable=True)
42 | # city
43 | city = Column(String(length=10),nullable=False)
44 | # position highlights
45 | positionAdvantage = Column(String(length=200),nullable=True)
46 | # company short name
47 | companyShortName = Column(String(length=50),nullable=True)
48 | # company full name
49 | companyFullName = Column(String(length=200),nullable=True)
50 | # company district
51 | district = Column(String(length=20),nullable=True)
52 | # company benefit labels
53 | companyLabelList = Column(String(length=200),nullable=True)
54 | # salary
55 | salary = Column(String(length=20),nullable=False)
56 | # crawl date
57 | crawl_date = Column(Date,nullable=False)
58 |
59 | # create the table (run once)
60 | # Lagoutables.metadata.create_all(engine)
61 |
62 | class HandleLagouData(object):
63 | def __init__(self):
64 | self.mysql_session = Session()
65 | self.item = Lagoutables()
66 |
67 | def insert_item(self,item):
68 | date = time.strftime("%Y-%m-%d", time.localtime())
69 | data = Lagoutables(
70 | # 岗位ID
71 | positionId = item['positionId'],
72 | # 经度
73 | longitude = item['longitude'],
74 | # 纬度
75 | latitude = item['latitude'],
76 | # 岗位名称
77 | positionName = item['positionName'],
78 | # 工作年限
79 | workYear = item['workYear'],
80 | # 学历
81 | education = item['education'],
82 | # 岗位性质
83 | jobNature = item['jobNature'],
84 | # 公司类型
85 | financeStage = item['financeStage'],
86 | # 公司规模
87 | companySize = item['companySize'],
88 | # 业务方向
89 | industryField = item['industryField'],
90 | # 所在城市
91 | city = item['city'],
92 | # 岗位标签
93 | positionAdvantage = item['positionAdvantage'],
94 | # 公司简称
95 | companyShortName = item['companyShortName'],
96 | # 公司全称
97 | companyFullName = item['companyFullName'],
98 | # 公司所在区
99 | district = item['district'],
100 | # 公司福利标签
101 | companyLabelList = ','.join(item['companyLabelList']),
102 | salary = item['salary'],
103 | # 抓取日期
104 | crawl_date = item['crawl_date']
105 | )
106 | query_result = self.mysql_session.query(Lagoutables).filter(Lagoutables.crawl_date==date,Lagoutables.positionId==item['positionId']).first()
107 | if query_result:
108 | print('该岗位信息已存在%s:%s:%s'%(item['positionId'],item['city'],item['positionName']))
109 | else:
110 | self.mysql_session.add(data)
111 | self.mysql_session.commit()
112 | print('新增岗位信息%s'%item['positionId'])
113 | return self.item
114 |
115 | lagou_mysql = HandleLagouData()
116 | # item = {'positionId':6009711}
117 | # lagou_mysql.insert_item(item)
118 |
--------------------------------------------------------------------------------
/baidu_m_keyword_ranks/baidu_m_keyword.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import re
3 | import requests
4 | from lxml import etree
5 | from concurrent.futures import ThreadPoolExecutor
6 | from baidu_m_keyword_ziran.handle_mysql import mysql
7 | from baidu_m_keyword_ziran.handle_mongo import mongo
8 | import time
9 |
10 |
11 | class Handle_baidu_m(object):
12 | def __init__(self):
13 | self.header = {
14 | "Host":"m.baidu.com",
15 | "Connection":"keep-alive",
16 | "Upgrade-Insecure-Requests":"1",
17 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
18 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
19 | "Accept-Encoding":"gzip, deflate",
20 | "Accept-Language":"zh-CN,zh;q=0.9",
21 | }
22 |
23 | # escape the special characters in a result title
24 | def handle_title(self,title):
25 | search = re.compile('"|“|”|{|}')
26 | search_list = search.findall(title)
27 | for value in search_list:
28 | return re.sub(search,urllib.parse.quote(value),title)
29 | else:
30 | return title
31 |
32 | # handle one keyword task
33 | def handle_task(self,keyword):
34 | print(keyword)
35 | result = {}
36 | result_list = []
37 | result['keyword'] = keyword
38 | url_list = ["http://m.baidu.com/s?pn=0&word="+keyword,"http://m.baidu.com/s?pn=10&word="+keyword,"http://m.baidu.com/s?pn=20&word="+keyword]
39 | for url in url_list:
40 | response = requests.get(url=url,headers=self.header)
41 | baidu_html = etree.HTML(response.text)
42 | item_list = baidu_html.xpath("//div[@id='results']/div")
43 | for item in item_list:
44 | info = {}
45 | # get the result title
46 | title = item.xpath(".//span[contains(@class,'title')]//text()|.//header[@class='c-row']/a/h3[@class='c-title']//text()")
47 | if title:
48 | info['title'] = self.handle_title(''.join(title)).replace("'","")
49 | if '百度百科' in info['title']:
50 | info['target_url'] = "https://wapbaike.baidu.com/item/"+keyword
51 | if '其他人还在搜' in info['title']:
52 | continue
53 | if '相关词语' in info['title']:
54 | continue
55 | if '相关平台' in info['title']:
56 | continue
57 | if '相关品牌' in info['title']:
58 | continue
59 | if '相关网站' in info['title']:
60 | continue
61 | if keyword+' - 资讯' in info['title']:
62 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=realtime&word='+keyword
63 | if keyword+' - 视频' in info['title']:
64 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=video&atn=index&tn=vsearch&word='+keyword
65 | if keyword+' - 小视频' in info['title']:
66 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=xsp&atn=index&tn=vsearch&word='+keyword
67 | else:
68 | target_url = eval(item.xpath("./@data-log")[0].encode('utf-8').decode())['mu']
69 | if target_url:
70 | info['target_url'] = target_url
71 | else:
72 | if '_企业信息' in info['title']:
73 | info['target_url'] = item.xpath("//a[@class='c-blocka']/@data-url")[0]
74 | result_list.append(info)
75 | else:
76 | continue
77 | result['rank'] = result_list
78 | result['crawl_time'] = time.strftime("%Y-%m-%d", time.localtime())
79 | print(result)
80 | # mongo.insert_item_in_db('baidu_m_keyword_ziran',result)
81 | # mysql.handle_insert_db(result)
82 |
83 | if __name__ == '__main__':
84 | baidu_m = Handle_baidu_m()
85 | # baidu_m.handle_task('盐城二手奥迪a1')
86 | # thread pool
87 | t = ThreadPoolExecutor()
88 | thread_list = []
89 | # fetch the keyword tasks from MySQL
90 | task = mysql.handle_task()
91 | for keyword in task:
92 | thread = t.submit(baidu_m.handle_task,keyword[0])
93 | thread_list.append(thread)
94 | t.shutdown()
95 | # print([thread.result() for thread in thread_list])
96 |
--------------------------------------------------------------------------------
/kolesa/crawl_kolesa.py:
--------------------------------------------------------------------------------
1 | import re
2 | from lxml import etree
3 | import requests
4 | import json
5 | from concurrent.futures import ThreadPoolExecutor
6 | import multiprocessing
7 | from handle_mongo import kolesa_mongo
8 |
9 | class Crawl_kolesa(object):
10 | def __init__(self):
11 | # index URL
12 | self.index_url = "https://kolesa.kz/cars/"
13 | self.header = {
14 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
15 | "Accept-Encoding":"gzip, deflate, br",
16 | "Accept-Language":"zh-CN,zh;q=0.9",
17 | "Connection":"keep-alive",
18 | "Host":"kolesa.kz",
19 | "Upgrade-Insecure-Requests":"1",
20 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36",
21 | }
22 | self.brand_list_url = ""
23 |
24 | # request helper
25 | def handle_request(self,url):
26 | response = requests.get(url=url,headers=self.header)
27 | return response.text
28 |
29 | # collect the brand links from the index page
30 | def handle_brand(self):
31 | response = self.handle_request(url=self.index_url)
32 | html = etree.HTML(response)
33 | # parse the brand link list
34 | self.brand_list_url = html.xpath("//div[@class='cross-links'][2]/div[@class='cross-links-container']/ul[@class='col-sm-4 cross-links-list']/li/a/@href")
35 |
36 | # walk the paged listing of one brand filter
37 | def handle_brand_page(self,url):
38 | detail_info_search = re.compile(r"listing.items.push\((.*?)\);")
39 | # the site only exposes 1000 pages
40 | for page in range(1,1001):
41 | #https://kolesa.kz/cars/gaz/?sort_by=add_date-asc&page=2
42 | # build the brand page URL
43 | brand_url = "https://kolesa.kz%s?sort_by=add_date-asc&page=%s"%(url,page)
44 | print(brand_url)
45 | # request the brand page
46 | response = self.handle_request(url=brand_url)
47 | # the per-listing JSON pushed onto each page
48 | detail_list = detail_info_search.findall(response)
49 | if detail_list:
50 | for detail in detail_list:
51 | detail = json.loads(detail)
52 | detail_info = {}
53 | detail_info['car_name'] = detail.get("name",None)
54 | detail_info['id'] = detail.get("id",None)
55 | detail_info['car_model'] = detail['attributes']['model']
56 | detail_info['car_brand'] = detail['attributes']['brand']
57 | detail_info['price'] = detail.get("unitPrice",None)
58 | detail_info['from_url'] = detail.get("url",None)
59 | # hand the task over to MongoDB
60 | kolesa_mongo.handle_save_task(detail_info)
61 |
62 | # parse a detail page
63 | def handle_detail(self,item):
64 | response = self.handle_request(item['from_url'])
65 | html = etree.HTML(response)
66 | item['year'] = html.xpath("//span[@class='year']/text()")[0].strip()
67 | item_list = html.xpath("//div[@class='offer__parameters']/dl")
68 | for i in item_list:
69 | name = i.xpath("./dt/span/text()")[0].strip()
70 | if name == "Пробег":
71 | #公里数
72 | item['mileage'] = i.xpath("./dd/text()")[0].strip()
73 | elif name == "Коробка передач":
74 | #变速箱
75 | item['gearbox'] = i.xpath("./dd/text()")[0].strip()
76 | elif name == "Руль":
77 | # Steering wheel side
78 | item['steering_wheel'] = i.xpath("./dd/text()")[0].strip()
79 | if not item.get('mileage'):
80 | item['mileage'] = 'no data'
81 | if not item.get('gearbox'):
82 | item['gearbox'] = 'no data'
83 | if not item.get('steering_wheel'):
84 | item['steering_wheel'] = 'no data'
85 | # Save the record
86 | kolesa_mongo.handle_save_data(item)
87 |
88 |
89 | # Build the task queue
90 | def handle_task(self):
91 | self.handle_brand()
92 | print("处理品牌")
93 | t = ThreadPoolExecutor()
94 | for url in self.brand_list_url:
95 | t.submit(self.handle_brand_page,url)
96 | t.shutdown()
97 |
98 | # Consume queued tasks and build the final records
99 | def handle_data(self):
100 | t = ThreadPoolExecutor()
101 | while True:
102 | task = kolesa_mongo.handle_get_task()
103 | if task:
104 | t.submit(self.handle_detail, task)
105 | else:
106 | break
107 | t.shutdown()
108 |
109 | # Spider entry point
110 | def run(self):
111 | m1 = multiprocessing.Process(target=self.handle_task)
112 | m1.start()
113 | m1.join()
114 |
115 | m2 = multiprocessing.Process(target=self.handle_data)
116 | m2.start()
117 | m2.join()
118 |
119 |
120 |
121 |
122 |
123 | if __name__ == '__main__':
124 | kolesa = Crawl_kolesa()
125 | kolesa.run()
126 |
--------------------------------------------------------------------------------
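handle_request in the kolesa spider above issues a bare requests.get with no timeout and no error handling, so a single dropped connection kills the worker thread that called it. A defensive variant might look like the sketch below; the retry count and back-off are assumptions, and only self.header comes from the class above.

import time
import requests

def handle_request(self, url, retries=3):
    # retry a few times with an increasing pause before giving up
    for attempt in range(retries):
        try:
            response = requests.get(url=url, headers=self.header, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as exc:
            print("request to %s failed (%s), attempt %d/%d" % (url, exc, attempt + 1, retries))
            time.sleep(2 ** attempt)
    return ""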
/lagou/crawl_lagou_job_new.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import requests
4 | import time
5 | import multiprocessing
6 | from handle_mysql import lagou_mysql
7 |
8 | class HandleLaGou(object):
9 | def __init__(self):
10 | # Use a session to keep the cookies
11 | self.lagou_session = requests.session()
12 | self.header = {
13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
14 | }
15 | self.city_list = ""
16 |
17 | def handle_request(self,method,url,data=None,info=None):
18 | '''
19 | Request helper.
20 | :param method: HTTP method
21 | :param url: request URL
22 | :param data: payload for POST requests
23 | :return: response text for the caller to store
24 | '''
25 | # The proxy is unreliable, so retry inside a while loop
26 | while True:
27 | # Abuyun dynamic proxy
28 | proxyinfo = "http://%s:%s@%s:%s" %('H1V32R6470A7G90D','CD217C660A9143C3','http-dyn.abuyun.com','9020')
29 | proxy = {
30 | "http": proxyinfo,
31 | "https": proxyinfo,
32 | }
33 | try:
34 | if method == "GET":
35 | response = self.lagou_session.get(url=url,headers=self.header,proxies=proxy,timeout=6)
36 | elif method == "POST":
37 | response = self.lagou_session.post(url=url,headers=self.header,data=data,proxies=proxy,timeout=6)
38 | except Exception as e:
39 | print(e)
40 | else:
41 | # Anti-crawling responses force a retry
42 | if '频繁' in response.text:
43 | print('频繁')
44 | # Clear the current cookies first
45 | self.lagou_session.cookies.clear()
46 | # Re-request the cookies, then sleep for 10 seconds
47 | self.lagou_session.get(
48 | url="https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info,
49 | headers=self.header)
50 | time.sleep(10)
51 | continue
52 | elif '错误网关' in response.text:
53 | print('错误网关')
54 | time.sleep(1)
55 | continue
56 | elif '页面加载中' in response.text:
57 | print('页面加载中')
58 | time.sleep(2)
59 | continue
60 | else:
61 | return response.text
62 |
63 | def handle_city(self):
64 | '''
65 | Fetch the cities that have job listings on Lagou.
66 | :return: city list
67 | '''
68 | city_search = re.compile(r'zhaopin/">(.*?)</a>')
69 | city_url = "https://www.lagou.com/jobs/allCity.html"
70 | city_result = self.handle_request(method='GET',url=city_url)
71 | self.city_list = city_search.findall(city_result)
72 | # Clear the cookies
73 | self.lagou_session.cookies.clear()
74 |
75 | def handle_city_job(self,city):
76 | '''
77 | :param city: city name
78 | :return: job data for that city, stored in MySQL
79 | '''
80 | # First request: pick up cookies and the total page count
81 | first_request_url="https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="%city
82 | first_response = self.handle_request(method='GET',url=first_request_url)
83 | total_page_search = re.compile(r'class="span\stotalNum">(\d+)')
84 | try:
85 | total_page = total_page_search.search(first_response).group(1)
86 | # No job listings for this city, so return
87 | except Exception as e:
88 | return
89 | else:
90 | # Each city shows at most 30 pages
91 | for i in range(1,int(total_page)+1):
92 | data = {
93 | "pn":i,
94 | "kd":"python"
95 | }
96 | # The job-listing request must carry a Referer
97 | referer_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="%city
98 | self.header["Referer"]=referer_url.encode()
99 | page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"%city
100 | response = self.handle_request(method='POST',url=page_url,data=data,info=city)
101 | lagou_data = json.loads(response)
102 | job_list = lagou_data['content']['positionResult']['result']
103 | if job_list:
104 | for job in job_list:
105 | job['crawl_date'] = time.strftime("%Y-%m-%d", time.localtime())
106 | lagou_mysql.insert_item(job)
107 |
108 | if __name__ == '__main__':
109 | lagou = HandleLaGou()
110 | lagou.handle_city()
111 | print(lagou.city_list)
112 | pool = multiprocessing.Pool(2)
113 | for city in lagou.city_list:
114 | pool.apply_async(lagou.handle_city_job,args=(city,))
115 | pool.close()
116 | pool.join()
117 | # for city in lagou.city_list:
118 | # lagou.handle_city_job(city)
119 |
--------------------------------------------------------------------------------
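The Abuyun account and password in handle_request above are hard-coded into the source. One way to keep the script committable without leaking credentials is to read them from the environment; this is a small sketch of that idea, and the environment variable names below are illustrative, not part of the original code.

import os

ABUYUN_USER = os.environ.get("ABUYUN_PROXY_USER", "")
ABUYUN_PASS = os.environ.get("ABUYUN_PROXY_PASS", "")

# same proxy URL shape the spider builds, with the secrets injected at run time
proxyinfo = "http://%s:%s@%s:%s" % (ABUYUN_USER, ABUYUN_PASS, "http-dyn.abuyun.com", "9020")
proxy = {"http": proxyinfo, "https": proxyinfo}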
/mafengwo_article_spider/mafengwo/handle_task.py:
--------------------------------------------------------------------------------
1 | import random
2 | import time
3 | import requests
4 | import re
5 | import json
6 | from handle_mongo import mongo
7 | from settings import proxy_url
8 | from concurrent.futures.thread import ThreadPoolExecutor
9 | import multiprocessing
10 |
11 |
12 | class HandleMaFengWoTask(object):
13 | def __init__(self):
14 | self.header = {
15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
16 | }
17 | self.proxy_list = []
18 |
19 | def handle_proxy(self):
20 | response = requests.get(url=proxy_url)
21 | data = json.loads(response.text)
22 | sum = 0
23 | # Each call adds up to 200 proxies
24 | for proxy in data['proxys']:
25 | sum = sum + 1
26 | if sum > 200:
27 | break
28 | proxy_dict = {
29 | "http": proxy['proxy'],
30 | "https": proxy['proxy']
31 | }
32 | self.proxy_list.append(proxy_dict)
33 |
34 |
35 | # Latest travel notes
36 | def handle_new_article(self,page):
37 | article_url_search = re.compile(r'a\shref="/i/(\d+)\.html"')
38 | info = {}
39 | info['flag'] = 'GET'
40 | info['url'] = 'https://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":3,"objid":0,"page":%s,"ajax":1,"retina":0}'%page
41 | print(info['url'])
42 | new_article = self.handle_request(info)
43 | try:
44 | html = json.loads(new_article)['data']['html']
45 | except:
46 | return
47 | article_url_list = article_url_search.findall(html)
48 | for article_id in set(article_url_list):
49 | insert_mongo = {}
50 | insert_mongo['id'] = article_id
51 | insert_mongo['url'] = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={"iid":"%s"}'%article_id
52 | insert_mongo['item_type'] = 'head_item'
53 | print(insert_mongo)
54 | mongo.insert_task(insert_mongo)
55 |
56 | # Popular travel notes
57 | def handle_hot_article(self,page):
58 | article_url_search = re.compile(r'a\shref="/i/(\d+)\.html"')
59 | info = {}
60 | info['flag'] = 'GET'
61 | info['url'] = 'https://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":0,"objid":0,"page":%s,"ajax":1,"retina":0}' % page
62 | print(info['url'])
63 | new_article = self.handle_request(info)
64 | try:
65 | html = json.loads(new_article)['data']['html']
66 | except:
67 | return
68 | article_url_list = article_url_search.findall(html)
69 | for article_id in set(article_url_list):
70 | insert_mongo = {}
71 | insert_mongo['id'] = article_id
72 | insert_mongo[
73 | 'url'] = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={"iid":"%s"}' % article_id
74 | insert_mongo['item_type'] = 'head_item'
75 | print(insert_mongo)
76 | mongo.insert_task(insert_mongo)
77 |
78 | def handle_new_column(self):
79 | column_url_search = re.compile(r'/traveller/article.php\?id=\d+')
80 | for i in range(0,2000,10):
81 | info= {}
82 | info['flag'] = 'GET'
83 | info['url'] = 'https://www.mafengwo.cn/traveller/ajax.php?action=getMoreArticles&sort=ctime&start=%s'%i
84 | new_column = self.handle_request(info)
85 | html = json.loads(new_column)['html']
86 | column_list = column_url_search.findall(html)
87 | for column in set(column_list):
88 | url = 'https://www.mafengwo.cn'+column
89 | print(url)
90 | break
91 |
92 | def handle_hot_column(self):
93 | column_url_search = re.compile(r'/traveller/article.php\?id=\d+')
94 | for i in range(0,2000,10):
95 | info= {}
96 | info['flag'] = 'GET'
97 | info['url'] = 'https://www.mafengwo.cn/traveller/ajax.php?action=getMoreArticles&sort=hot&start=%s'%i
98 | new_column = self.handle_request(info)
99 | html = json.loads(new_column)['html']
100 | column_list = column_url_search.findall(html)
101 | for column in set(column_list):
102 | url = 'https://www.mafengwo.cn'+column
103 | print(url)
104 | break
105 |
106 | def handle_request(self,info):
107 | # Refill the proxy pool when fewer than 10 proxies remain
108 | if len(self.proxy_list)<10:
109 | self.handle_proxy()
110 | if info['flag'] == 'GET':
111 | while True:
112 | try:
113 | response = requests.get(url=info['url'],headers=self.header,proxies=self.proxy_list.pop(0),timeout=6)
114 | except Exception as e:
115 | print(e)
116 | time.sleep(2)
117 | continue
118 | else:
119 | return response.text
120 | elif info['flag'] == 'POST':
121 | response = requests.post(url=info['url'],headers=self.header,data=info['data'],proxies=self.proxy_list.pop(0),timeout=6)
122 | return response.text
123 |
124 | # Process that queues the latest travel notes
125 | def process_1(self):
126 | t1 = ThreadPoolExecutor()
127 | for page in range(1,8):
128 | print(page)
129 | t1.submit(self.handle_new_article,page)
130 | t1.shutdown()
131 |
132 | # Process that queues the popular travel notes
133 | def process_2(self):
134 | t2 = ThreadPoolExecutor()
135 | for page in range(1,8):
136 | print(page)
137 | t2.submit(self.handle_hot_article,page)
138 | t2.shutdown()
139 | # self.handle_new_column()
140 | # self.handle_hot_column()
141 |
142 | def run(self):
143 | m1 = multiprocessing.Process(target=self.process_1)
144 | m2 = multiprocessing.Process(target=self.process_2)
145 | m1.start()
146 | m2.start()
147 | m1.join()
148 | m2.join()
149 |
150 | def main():
151 | mafengwo_task = HandleMaFengWoTask()
152 | mafengwo_task.run()
153 |
154 | if __name__ == '__main__':
155 | main()
156 |
--------------------------------------------------------------------------------
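handle_request above pops proxies off a plain list that all worker threads in a process share, so the pool can run dry between the length check and the pop and raise IndexError. A thread-safe alternative is to wrap the pool in queue.Queue; this is a rough sketch under that assumption, not the repository's code (fetch_batch stands in for the handle_proxy logic above).

import queue

class ProxyPool(object):
    def __init__(self, fetch_batch):
        self._queue = queue.Queue()
        self._fetch_batch = fetch_batch  # callable returning a list of proxy dicts

    def get(self):
        # refill from the proxy API when the pool is nearly empty
        if self._queue.qsize() < 10:
            for proxy_dict in self._fetch_batch():
                self._queue.put(proxy_dict)
        return self._queue.get()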
/mafengwo_article_spider/mafengwo/spiders/crawl_mafengwo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import re
4 | import time
5 | import scrapy
6 | from ..items import MafengwoItem
7 | from mafengwo.handle_mongo import mongo
8 |
9 |
10 | class CrawlMafengwoSpider(scrapy.Spider):
11 | name = 'crawl_mafengwo'
12 | allowed_domains = ['mafengwo.cn']
13 |
14 | # Pull tasks from the task collection
15 | def start_requests(self):
16 | for i in range(1):
17 | task = mongo.get_task()
18 | # Only proceed if a task exists
19 | if task:
20 | if '_id' in task:
21 | task.pop('_id')
22 | print(task)
23 | if task['item_type'] == 'head_item':
24 | yield scrapy.Request(url=task['url'],callback=self.handle_detail_head,dont_filter=True,meta=task)
25 | elif task['item_type'] == 'article_item':
26 | yield scrapy.Request(url=task['url'],callback=self.handle_detail,dont_filter=True,meta=task)
27 |
28 | # Parse the header info of each travel note
29 | def handle_detail_head(self,response):
30 | read_comment_search = re.compile(r'(.*?)')
31 | name_search = re.compile(r'class="per_name"\stitle="(.*?)">')
32 | star_search = re.compile(r'(\d+)收藏')
33 | release_time_search = re.compile(r'(.*?)')
34 | html = json.loads(response.text)['data']['html']
35 | info = {}
36 | read_comment = read_comment_search.search(html).group(1).split('/')
37 | info['read_sum'] = read_comment[0]
38 | info['comment_sum'] = read_comment[1]
39 | info['name'] = name_search.search(html).group(1)
40 | info['star_sum'] = star_search.search(html).group(1)
41 | info['release_time'] = release_time_search.search(html).group(1)
42 | info['item_type'] = 'article_item'
43 | info['url'] = 'http://www.mafengwo.cn/i/%s.html'%(response.request.meta['id'])
44 | mongo.insert_task(info)
45 |
46 | # Parse the travel note
47 | def handle_detail(self,response):
48 | id_search = re.compile(r"window.Env\s=\s(.*);")
49 | seq_search = re.compile(r'data-seq="(\d+)"')
50 | try:
51 | id_result = json.loads(id_search.search(response.text).group(1))
52 | except:
53 | return
54 | id = id_result['iid']
55 | iid = id_result.get('new_iid')
56 | # There is a next page
57 | if iid:
58 | print(response.url + " has more pages")
59 | response.request.meta['id'] = id
60 | response.request.meta['iid'] = iid
61 | # Article title
62 | response.request.meta['title'] = response.xpath("//title/text()").extract_first()
63 | # Article body
64 | response.request.meta['content'] = response.xpath("//div[@class='_j_content_box']").extract()
65 | # Source URL
66 | response.request.meta['from_url'] = response.url
67 | # ID used to request the next page
68 | next_request_seq = seq_search.findall(response.text)[-1]
69 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (id, iid, next_request_seq)
70 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta)
71 | # No next page
72 | else:
73 | # Handle the travel note
74 | m3u8_search = re.compile(r'data-url="(.*\.m3u8)"')
75 | mafengwo_data = MafengwoItem()
76 | mafengwo_data['title'] = response.xpath("//title/text()").extract_first()
77 | mafengwo_data['from_url'] = response.request.meta['from_url']
78 | mafengwo_data['read_sum'] = response.request.meta['read_sum']
79 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum']
80 | mafengwo_data['star_sum'] = response.request.meta['star_sum']
81 | # mafengwo_data['support_sum'] = response.request.meta['support_sum']
82 | mafengwo_data['release_time'] = response.request.meta['release_time']
83 | mafengwo_data['name'] = response.request.meta['name']
84 | mafengwo_data['id'] = id
85 | mafengwo_data['content'] = self.handle_img_src(''.join(response.xpath("//div[@id='pnl_contentinfo']").extract_first()))
86 | photo_url_search = re.compile(r'data-src="(.*?)\?')
87 | mafengwo_data['video_urls'] = m3u8_search.findall(mafengwo_data['content'])
88 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content'])
89 | mafengwo_data['upload_status'] = 0
90 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime())
91 | yield mafengwo_data
92 |
93 | def handle_detail_json(self,response):
94 | m3u8_search = re.compile(r'data-url="(.*\.m3u8)"')
95 | seq_search = re.compile(r'data-seq="(\d+)"')
96 | html_text = json.loads(response.text)['data']
97 | if html_text['html'] == "":
98 | mafengwo_data = MafengwoItem()
99 | mafengwo_data['title'] = response.request.meta['title']
100 | mafengwo_data['from_url'] = response.request.meta['from_url']
101 | mafengwo_data['read_sum'] = response.request.meta['read_sum']
102 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum']
103 | mafengwo_data['star_sum'] = response.request.meta['star_sum']
104 | # mafengwo_data['support_sum'] = response.request.meta['support_sum']
105 | mafengwo_data['release_time'] = response.request.meta['release_time']
106 | mafengwo_data['name'] = response.request.meta['name']
107 | mafengwo_data['id'] = response.request.meta['id']
108 | mafengwo_data['content'] = self.handle_img_src(''.join(response.request.meta['content']))
109 | mafengwo_data['upload_status'] = 0
110 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime())
111 | photo_url_search = re.compile(r'data-src="(.*?)\?')
112 | mafengwo_data['video_urls'] = m3u8_search.findall(mafengwo_data['content'])
113 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content'])
114 | yield mafengwo_data
115 | else:
116 | html = html_text['html']
117 | response.request.meta['content'].append(html)
118 | next_request_seq = seq_search.findall(html)[-1]
119 | if next_request_seq:
120 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], response.request.meta['iid'], next_request_seq)
121 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta)
122 |
123 | # Fix the image URLs inside the note body
124 | def handle_img_src(self, text):
125 | img_search = re.compile(r"|")
126 | img_data_src_search = re.compile(r'data-src="(.*?)\?')
127 | src_search = re.compile(r'[^-]src="(.*?)"')
128 | img_list = img_search.findall(text)
129 | for img in img_list:
130 | try:
131 | img_data_src = img_data_src_search.search(img).group(1)
132 | src = src_search.search(img).group(1)
133 | img_new = img.replace(src, img_data_src)
134 | text = text.replace(img, img_new)
135 | except:
136 | pass
137 | return text
138 |
--------------------------------------------------------------------------------
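start_requests above pulls exactly one task from Mongo (the range(1) loop), so the crawl ends as soon as that request chain is exhausted. A common pattern for queue-driven Scrapy spiders is to hook the spider_idle signal and feed the engine another task instead of letting it close. The sketch below shows the Scrapy 1.x-era shape of that hook, as two methods that could be added to CrawlMafengwoSpider; mongo.get_task and the callback are the ones used above.

from scrapy import signals
from scrapy.exceptions import DontCloseSpider

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(CrawlMafengwoSpider, cls).from_crawler(crawler, *args, **kwargs)
    crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
    return spider

def spider_idle(self):
    task = mongo.get_task()
    if task:
        task.pop('_id', None)
        # schedule the next queued request and keep the spider alive
        self.crawler.engine.crawl(scrapy.Request(url=task['url'], callback=self.handle_detail_head, meta=task, dont_filter=True), self)
        raise DontCloseSpider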
/mafengwo/mafengwo/spiders/crawl_mafengwo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import re
4 | import scrapy
5 | from scrapy import Selector
6 | from ..items import MafengwoItem
7 | import time
8 |
9 |
10 | class CrawlMafengwoSpider(scrapy.Spider):
11 | name = 'crawl_mafengwo'
12 | allowed_domains = ['mafengwo.cn']
13 | # start_urls = ['http://www.mafengwo.cn/u/wenhao/note.html']
14 |
15 | # Request the index pages
16 | def start_requests(self):
17 | # Build the page URLs directly, e.g. 200 pages of popular travel notes
18 | for page in range(1,200):
19 | url = 'http://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":0,"objid":0,"page":%s,"ajax":1,"retina":0}'%page
20 | yield scrapy.Request(url=url,callback=self.handle_page,dont_filter=True)
21 |
22 | # Find the travel notes on the page, then build and request the read-count/info URL for each
23 | def handle_page(self, response):
24 | # Article IDs in the page response
25 | article_id_search = re.compile(r'(.*?)')
37 | name_search = re.compile(r'class="per_name"\stitle="(.*?)">')
38 | star_search = re.compile(r'(\d+)收藏')
39 | release_time_search = re.compile(r'(.*?)')
40 | html = json.loads(response.text)['data']['html']
41 | info = {}
42 | read_comment = read_comment_search.search(html).group(1).split('/')
43 | info['read_sum'] = read_comment[0]
44 | info['comment_sum'] = read_comment[1]
45 | info['name'] = name_search.search(html).group(1)
46 | info['star_sum'] = star_search.search(html).group(1)
47 | info['release_time'] = release_time_search.search(html).group(1)
48 | info['id'] = response.request.meta['article_id']
49 | info['url'] = 'http://www.mafengwo.cn/i/%s.html' % (response.request.meta['article_id'])
50 | print(info)
51 | yield scrapy.Request(url=info['url'],callback=self.handle_detail,meta=info,dont_filter=True)
52 |
53 | # Parse the travel note
54 | def handle_detail(self, response):
55 | id_search = re.compile(r"window.Env\s=\s(.*);")
56 | seq_search = re.compile(r'data-seq="(\d+)"')
57 | try:
58 | id_result = json.loads(id_search.search(response.text).group(1))
59 | except:
60 | return
61 | # Flag indicating whether a next page exists
62 | iid = id_result.get('new_iid')
63 | # There is a next page
64 | if iid:
65 | print(response.url + " has more pages")
66 | response.request.meta['iid'] = iid
67 | # Article title
68 | response.request.meta['title'] = response.xpath("//title/text()").extract_first()
69 | # Article body
70 | response.request.meta['content'] = response.xpath("//div[@class='_j_content_box']").extract()
71 | # Source URL
72 | response.request.meta['from_url'] = response.url
73 | # ID used to request the next page
74 | next_request_seq = seq_search.findall(response.text)[-1]
75 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], iid, next_request_seq)
76 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta)
77 | # No next page
78 | else:
79 | # Handle the travel note
80 | mafengwo_data = MafengwoItem()
81 | mafengwo_data['title'] = response.xpath("//title/text()").extract_first()
82 | mafengwo_data['from_url'] = response.request.meta['from_url']
83 | mafengwo_data['read_sum'] = response.request.meta['read_sum']
84 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum']
85 | mafengwo_data['star_sum'] = response.request.meta['star_sum']
86 | mafengwo_data['release_time'] = response.request.meta['release_time']
87 | mafengwo_data['name'] = response.request.meta['name']
88 | mafengwo_data['id'] = response.request.meta['id']
89 | mafengwo_data['content'] = self.handle_img_src(''.join(response.xpath("//div[@id='pnl_contentinfo']").extract_first()))
90 | # Collect all image URLs in the article
91 | photo_url_search = re.compile(r'data-src="(.*?)\?')
92 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content'])
93 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime())
94 | yield mafengwo_data
95 |
96 | def handle_detail_json(self, response):
97 | seq_search = re.compile(r'data-seq="(\d+)"')
98 | html_text = json.loads(response.text)['data']
99 | # Reached the last page
100 | if html_text['html'] == "":
101 | mafengwo_data = MafengwoItem()
102 | mafengwo_data['title'] = response.request.meta['title']
103 | mafengwo_data['from_url'] = response.request.meta['from_url']
104 | mafengwo_data['read_sum'] = response.request.meta['read_sum']
105 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum']
106 | mafengwo_data['star_sum'] = response.request.meta['star_sum']
107 | mafengwo_data['release_time'] = response.request.meta['release_time']
108 | mafengwo_data['name'] = response.request.meta['name']
109 | mafengwo_data['id'] = response.request.meta['id']
110 | mafengwo_data['content'] = self.handle_img_src(''.join(response.request.meta['content']))
111 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime())
112 | photo_url_search = re.compile(r'data-src="(.*?)\?')
113 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content'])
114 | yield mafengwo_data
115 | # Keep requesting the next page
116 | else:
117 | html = html_text['html']
118 | response.request.meta['content'].append(html)
119 | next_request_seq = seq_search.findall(html)[-1]
120 | if next_request_seq:
121 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], response.request.meta['iid'], next_request_seq)
122 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta)
123 |
124 | # Fix the image URLs inside the note body
125 | def handle_img_src(self, text):
126 | img_search = re.compile(r"|")
127 | img_data_src_search = re.compile(r'data-src="(.*?)\?')
128 | src_search = re.compile(r'[^-]src="(.*?)"')
129 | img_list = img_search.findall(text)
130 | for img in img_list:
131 | try:
132 | img_data_src = img_data_src_search.search(img).group(1)
133 | src = src_search.search(img).group(1)
134 | img_new = img.replace(src, img_data_src)
135 | text = text.replace(img, img_new)
136 | except:
137 | pass
138 | return text
139 |
140 |
--------------------------------------------------------------------------------
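The item yielded above carries an image_urls field, which is the field Scrapy's built-in ImagesPipeline reads; the mafengwo_images/full/ path visible in this project is that pipeline's default output layout (files named by the SHA1 of the URL). For reference, enabling it in settings.py looks roughly like this; the store path is an assumption based on the directory in the repository.

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = 'mafengwo_images'  # downloaded files are written to IMAGES_STORE/full/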
/mafengwo/mafengwo/url_list.txt:
--------------------------------------------------------------------------------
1 | http://www.mafengwo.cn/u/wenhao/note.html
2 | http://www.mafengwo.cn/u/5295777/note.html
3 | http://www.mafengwo.cn/u/85713126/note.html
4 | http://www.mafengwo.cn/u/18015577/note.html
5 | http://www.mafengwo.cn/u/60798801/note.html
6 | http://www.mafengwo.cn/u/yiyinotes/note.html
7 | https://www.mafengwo.cn/u/88358953/note.html
8 | https://www.mafengwo.cn/u/daxigua/note.html
9 | https://www.mafengwo.cn/u/47448074/note.html
10 | https://www.mafengwo.cn/u/36909470/note.html
11 | https://www.mafengwo.cn/u/76823294/note.html
12 | https://www.mafengwo.cn/u/32216322/note.html
13 | https://www.mafengwo.cn/u/10704640/note.html
14 | https://www.mafengwo.cn/u/dearsummar/note.html
15 | https://www.mafengwo.cn/u/19894572/note.html
16 | https://www.mafengwo.cn/u/321294/note.html
17 | https://www.mafengwo.cn/u/5172228/note.html
18 | https://www.mafengwo.cn/u/5017124/note.html
19 | https://www.mafengwo.cn/u/hwf520/note.html
20 | https://www.mafengwo.cn/u/kido37/note.html
21 | https://www.mafengwo.cn/u/41037525/note.html
22 | https://www.mafengwo.cn/u/joyii0513/note.html
23 | https://www.mafengwo.cn/u/69709753/note.html
24 | https://www.mafengwo.cn/u/wayzhenyan/note.html
25 | https://www.mafengwo.cn/u/78343168/note.html
26 | https://www.mafengwo.cn/u/46337998/note.html
27 | https://www.mafengwo.cn/u/sellnuan/note.html
28 | https://www.mafengwo.cn/u/846867/note.html
29 | https://www.mafengwo.cn/u/54041143/note.html
30 | https://www.mafengwo.cn/u/17074212/note.html
31 | https://www.mafengwo.cn/u/5602249/note.html
32 | https://www.mafengwo.cn/u/45793678/note.html
33 | https://www.mafengwo.cn/u/42370376/note.html
34 | https://www.mafengwo.cn/u/81676700/note.html
35 | https://www.mafengwo.cn/u/78838404/note.html
36 | https://www.mafengwo.cn/u/5663320/note.html
37 | https://www.mafengwo.cn/u/56213436/note.html
38 | https://www.mafengwo.cn/u/68691572/note.html
39 | https://www.mafengwo.cn/u/67165115/note.html
40 | https://www.mafengwo.cn/u/45907046/note.html
41 | https://www.mafengwo.cn/u/samwong/note.html
42 | https://www.mafengwo.cn/u/48737554/note.html
43 | https://www.mafengwo.cn/u/5366541/note.html
44 | https://www.mafengwo.cn/u/1047345/note.html
45 | https://www.mafengwo.cn/u/73297474/note.html
46 | https://www.mafengwo.cn/u/64898562/note.html
47 | https://www.mafengwo.cn/u/ariel690/note.html
48 | https://www.mafengwo.cn/u/5133407/note.html
49 | https://www.mafengwo.cn/u/63932781/note.html
50 | https://www.mafengwo.cn/u/49231278/note.html
51 | https://www.mafengwo.cn/u/69833564/note.html
52 | https://www.mafengwo.cn/u/52482820/note.html
53 | https://www.mafengwo.cn/u/374140/note.html
54 | https://www.mafengwo.cn/u/5363625/note.html
55 | https://www.mafengwo.cn/u/64582645/note.html
56 | https://www.mafengwo.cn/u/32228262/note.html
57 | https://www.mafengwo.cn/u/68295140/note.html
58 | https://www.mafengwo.cn/u/93296829/note.html
59 | https://www.mafengwo.cn/u/biggun/note.html
60 | https://www.mafengwo.cn/u/57892379/note.html
61 | https://www.mafengwo.cn/u/76823294.html
62 | https://www.mafengwo.cn/u/pinkyvision/note.html
63 | https://www.mafengwo.cn/u/69536526/note.html
64 | https://www.mafengwo.cn/u/37311913/note.html
65 | https://www.mafengwo.cn/u/10345585/note.html
66 | https://www.mafengwo.cn/u/37369363/note.html
67 | https://www.mafengwo.cn/u/inlaoban5/note.html
68 | https://www.mafengwo.cn/u/75471465/note.html
69 | https://www.mafengwo.cn/u/40682663/note.html
70 | https://www.mafengwo.cn/u/799727/note.html
71 | https://www.mafengwo.cn/u/19560416/note.html
72 | https://www.mafengwo.cn/u/summer7/note.html
73 | https://www.mafengwo.cn/u/zhenmeiqu/note.html
74 | https://www.mafengwo.cn/u/93808795/note.html
75 | https://www.mafengwo.cn/u/ruanzhonghua/note.html
76 | https://www.mafengwo.cn/u/59633694/note.html
77 | https://www.mafengwo.cn/u/5172228/note.html
78 | https://www.mafengwo.cn/u/79862907/note.html
79 | https://www.mafengwo.cn/u/5119335/note.html
80 | https://www.mafengwo.cn/u/iiibiz/note.html
81 | https://www.mafengwo.cn/u/92990277/note.html
82 | https://www.mafengwo.cn/u/83736375.html
83 | https://www.mafengwo.cn/u/66016397/note.html
84 | https://www.mafengwo.cn/u/75334068/note.html
85 | https://www.mafengwo.cn/u/10606831/note.html
86 | https://www.mafengwo.cn/u/73953374/note.html
87 | https://www.mafengwo.cn/u/5328159/note.html
88 | https://www.mafengwo.cn/u/72226812/note.html
89 | https://www.mafengwo.cn/u/75867238/note.html
90 | https://www.mafengwo.cn/u/ruogu2/note.html
91 | https://www.mafengwo.cn/u/459268/note.html
92 | https://www.mafengwo.cn/u/5037685/note.html
93 | https://www.mafengwo.cn/u/32358313/note.html
94 | https://www.mafengwo.cn/u/ymy817/note.html
95 | https://www.mafengwo.cn/u/44131359/note.html
96 | https://www.mafengwo.cn/u/flyingwsh/note.html
97 | https://www.mafengwo.cn/u/36953718/note.html
98 | https://www.mafengwo.cn/u/830821/note.html
99 | https://www.mafengwo.cn/u/72465054/note.html
100 | https://www.mafengwo.cn/u/816643/note.html
101 | https://www.mafengwo.cn/u/5547423/note.html
102 | https://www.mafengwo.cn/u/85055587/note.html
103 | https://www.mafengwo.cn/u/77259555/note.html
104 | https://www.mafengwo.cn/u/58085128/note.html
105 | https://www.mafengwo.cn/u/85782763/note.html
106 | https://www.mafengwo.cn/u/448785/note.html
107 | https://www.mafengwo.cn/u/shanfeng/note.html
108 | https://www.mafengwo.cn/u/30730200/note.html
109 | https://www.mafengwo.cn/u/82532600/note.html
110 | https://www.mafengwo.cn/u/sellnuan/note.html
111 | https://www.mafengwo.cn/u/85205385/note.html
112 | https://www.mafengwo.cn/u/40525484/note.html
113 | https://www.mafengwo.cn/u/92931036/note.html
114 | https://www.mafengwo.cn/u/60022265/note.html
115 | https://www.mafengwo.cn/u/45066857.html
116 | https://www.mafengwo.cn/u/34957278/note.html
117 | https://www.mafengwo.cn/u/90472994/note.html
118 | https://www.mafengwo.cn/u/5295777/note.html
119 | https://www.mafengwo.cn/u/86494331/note.html
120 | https://www.mafengwo.cn/u/42395202.html
121 | https://www.mafengwo.cn/u/heididsy/note.html
122 | https://www.mafengwo.cn/u/42694746/note.html
123 | https://www.mafengwo.cn/u/yimeng/note.html
124 | https://www.mafengwo.cn/u/5172228/note.html
125 | https://www.mafengwo.cn/u/17639643.html
126 | https://www.mafengwo.cn/u/wuweixiang/note.html
127 | https://www.mafengwo.cn/u/92931036/note.html
128 | https://www.mafengwo.cn/u/49231278/note.html
129 | https://www.mafengwo.cn/u/5481686.html
130 | https://www.mafengwo.cn/u/19014378/note.html
131 | https://www.mafengwo.cn/u/seacen/note.html
132 | https://www.mafengwo.cn/u/beslan/note.html
133 | https://www.mafengwo.cn/u/ruanzhonghua/note.html
134 | https://www.mafengwo.cn/u/187367/note.html
135 | https://www.mafengwo.cn/u/32216322/note.html
136 | https://www.mafengwo.cn/u/93157709/note.html
137 | https://www.mafengwo.cn/u/13105932/note.html
138 | https://www.mafengwo.cn/u/86494331/note.html
139 | https://www.mafengwo.cn/u/10911951.html
140 | https://www.mafengwo.cn/u/77243222/note.html
141 | https://www.mafengwo.cn/u/yolichic/note.html
142 | https://www.mafengwo.cn/u/88371807/note.html
143 | https://www.mafengwo.cn/u/jklouise/note.html
144 | https://www.mafengwo.cn/u/85558645/note.html
145 | https://www.mafengwo.cn/u/69200064/note.html
146 | https://www.mafengwo.cn/u/88358953/note.html
147 | https://www.mafengwo.cn/u/54534899/note.html
148 | https://www.mafengwo.cn/u/kido37/note.html
149 | https://www.mafengwo.cn/u/ruogu2/note.html
150 | https://www.mafengwo.cn/u/32228262/note.html
151 | https://www.mafengwo.cn/u/208077/note.html
152 | https://www.mafengwo.cn/u/xmulazio/note.html
153 | https://www.mafengwo.cn/u/74369556/note.html
154 | https://www.mafengwo.cn/u/5028192/note.html
155 | https://www.mafengwo.cn/u/ptah0622/note.html
156 | https://www.mafengwo.cn/u/5203896/note.html
157 | https://www.mafengwo.cn/u/35296229/note.html
158 | https://www.mafengwo.cn/u/69709753/note.html
159 | https://www.mafengwo.cn/u/71897854/note.html
160 | https://www.mafengwo.cn/u/73941769/note.html
161 | https://www.mafengwo.cn/u/79167497/note.html
162 | https://www.mafengwo.cn/u/5648583/note.html
163 | https://www.mafengwo.cn/u/840399/note.html
164 | https://www.mafengwo.cn/u/34260694/note.html
165 | https://www.mafengwo.cn/u/89214773/note.html
166 | https://www.mafengwo.cn/u/47448074/note.html
167 | https://www.mafengwo.cn/u/90344916/note.html
168 | https://www.mafengwo.cn/u/5673085/note.html
169 | https://www.mafengwo.cn/u/fantasist/note.html
170 | https://www.mafengwo.cn/u/gemmakyoto/note.html
171 | https://www.mafengwo.cn/u/kidd1110/note.html
172 | https://www.mafengwo.cn/u/459539/note.html
173 | https://www.mafengwo.cn/u/clijsters/note.html
174 | https://www.mafengwo.cn/u/53816690/note.html
175 | https://www.mafengwo.cn/u/85224198/note.html
176 | https://www.mafengwo.cn/u/1115956/note.html
177 | https://www.mafengwo.cn/u/kevlee/note.html
178 | https://www.mafengwo.cn/u/sarahontheroad.html
179 | https://www.mafengwo.cn/u/10525543/note.html
180 | https://www.mafengwo.cn/u/374140/note.html
181 | https://www.mafengwo.cn/u/19268018/note.html
182 | https://www.mafengwo.cn/u/70816697/note.html
183 | https://www.mafengwo.cn/u/102065/note.html
184 | https://www.mafengwo.cn/u/yolichic/note.html
185 | https://www.mafengwo.cn/u/49130101/note.html
186 | https://www.mafengwo.cn/u/49221414/note.html
187 | https://www.mafengwo.cn/u/sicilia/note.html
188 | https://www.mafengwo.cn/u/zhangxiaofan/note.html
189 | https://www.mafengwo.cn/u/fantastic/note.html
190 | https://www.mafengwo.cn/u/193656/note.html
191 | https://www.mafengwo.cn/u/after17/note.html
192 | https://www.mafengwo.cn/u/guaiiiii/note.html
193 | https://www.mafengwo.cn/u/tianpinan/note.html
194 | https://www.mafengwo.cn/u/52233524/note.html
195 | https://www.mafengwo.cn/u/75151343/note.html
196 | https://www.mafengwo.cn/u/88358953/note.html
197 | https://www.mafengwo.cn/u/83796483/note.html
198 | https://www.mafengwo.cn/u/79297765/note.html
199 | https://www.mafengwo.cn/u/72512443/note.html
200 | https://www.mafengwo.cn/u/niuniu/note.html
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/url_list.txt:
--------------------------------------------------------------------------------
1 | http://www.mafengwo.cn/u/wenhao/note.html
2 | http://www.mafengwo.cn/u/5295777/note.html
3 | http://www.mafengwo.cn/u/85713126/note.html
4 | http://www.mafengwo.cn/u/18015577/note.html
5 | http://www.mafengwo.cn/u/60798801/note.html
6 | http://www.mafengwo.cn/u/yiyinotes/note.html
7 | https://www.mafengwo.cn/u/88358953/note.html
8 | https://www.mafengwo.cn/u/daxigua/note.html
9 | https://www.mafengwo.cn/u/47448074/note.html
10 | https://www.mafengwo.cn/u/36909470/note.html
11 | https://www.mafengwo.cn/u/76823294/note.html
12 | https://www.mafengwo.cn/u/32216322/note.html
13 | https://www.mafengwo.cn/u/10704640/note.html
14 | https://www.mafengwo.cn/u/dearsummar/note.html
15 | https://www.mafengwo.cn/u/19894572/note.html
16 | https://www.mafengwo.cn/u/321294/note.html
17 | https://www.mafengwo.cn/u/5172228/note.html
18 | https://www.mafengwo.cn/u/5017124/note.html
19 | https://www.mafengwo.cn/u/hwf520/note.html
20 | https://www.mafengwo.cn/u/kido37/note.html
21 | https://www.mafengwo.cn/u/41037525/note.html
22 | https://www.mafengwo.cn/u/joyii0513/note.html
23 | https://www.mafengwo.cn/u/69709753/note.html
24 | https://www.mafengwo.cn/u/wayzhenyan/note.html
25 | https://www.mafengwo.cn/u/78343168/note.html
26 | https://www.mafengwo.cn/u/46337998/note.html
27 | https://www.mafengwo.cn/u/sellnuan/note.html
28 | https://www.mafengwo.cn/u/846867/note.html
29 | https://www.mafengwo.cn/u/54041143/note.html
30 | https://www.mafengwo.cn/u/17074212/note.html
31 | https://www.mafengwo.cn/u/5602249/note.html
32 | https://www.mafengwo.cn/u/45793678/note.html
33 | https://www.mafengwo.cn/u/42370376/note.html
34 | https://www.mafengwo.cn/u/81676700/note.html
35 | https://www.mafengwo.cn/u/78838404/note.html
36 | https://www.mafengwo.cn/u/5663320/note.html
37 | https://www.mafengwo.cn/u/56213436/note.html
38 | https://www.mafengwo.cn/u/68691572/note.html
39 | https://www.mafengwo.cn/u/67165115/note.html
40 | https://www.mafengwo.cn/u/45907046/note.html
41 | https://www.mafengwo.cn/u/samwong/note.html
42 | https://www.mafengwo.cn/u/48737554/note.html
43 | https://www.mafengwo.cn/u/5366541/note.html
44 | https://www.mafengwo.cn/u/1047345/note.html
45 | https://www.mafengwo.cn/u/73297474/note.html
46 | https://www.mafengwo.cn/u/64898562/note.html
47 | https://www.mafengwo.cn/u/ariel690/note.html
48 | https://www.mafengwo.cn/u/5133407/note.html
49 | https://www.mafengwo.cn/u/63932781/note.html
50 | https://www.mafengwo.cn/u/49231278/note.html
51 | https://www.mafengwo.cn/u/69833564/note.html
52 | https://www.mafengwo.cn/u/52482820/note.html
53 | https://www.mafengwo.cn/u/374140/note.html
54 | https://www.mafengwo.cn/u/5363625/note.html
55 | https://www.mafengwo.cn/u/64582645/note.html
56 | https://www.mafengwo.cn/u/32228262/note.html
57 | https://www.mafengwo.cn/u/68295140/note.html
58 | https://www.mafengwo.cn/u/93296829/note.html
59 | https://www.mafengwo.cn/u/biggun/note.html
60 | https://www.mafengwo.cn/u/57892379/note.html
61 | https://www.mafengwo.cn/u/76823294/note.html
62 | https://www.mafengwo.cn/u/pinkyvision/note.html
63 | https://www.mafengwo.cn/u/69536526/note.html
64 | https://www.mafengwo.cn/u/37311913/note.html
65 | https://www.mafengwo.cn/u/10345585/note.html
66 | https://www.mafengwo.cn/u/37369363/note.html
67 | https://www.mafengwo.cn/u/inlaoban5/note.html
68 | https://www.mafengwo.cn/u/75471465/note.html
69 | https://www.mafengwo.cn/u/40682663/note.html
70 | https://www.mafengwo.cn/u/799727/note.html
71 | https://www.mafengwo.cn/u/19560416/note.html
72 | https://www.mafengwo.cn/u/summer7/note.html
73 | https://www.mafengwo.cn/u/zhenmeiqu/note.html
74 | https://www.mafengwo.cn/u/93808795/note.html
75 | https://www.mafengwo.cn/u/ruanzhonghua/note.html
76 | https://www.mafengwo.cn/u/59633694/note.html
77 | https://www.mafengwo.cn/u/5172228/note.html
78 | https://www.mafengwo.cn/u/79862907/note.html
79 | https://www.mafengwo.cn/u/5119335/note.html
80 | https://www.mafengwo.cn/u/iiibiz/note.html
81 | https://www.mafengwo.cn/u/92990277/note.html
82 | https://www.mafengwo.cn/u/83736375/note.html
83 | https://www.mafengwo.cn/u/66016397/note.html
84 | https://www.mafengwo.cn/u/75334068/note.html
85 | https://www.mafengwo.cn/u/10606831/note.html
86 | https://www.mafengwo.cn/u/73953374/note.html
87 | https://www.mafengwo.cn/u/5328159/note.html
88 | https://www.mafengwo.cn/u/72226812/note.html
89 | https://www.mafengwo.cn/u/75867238/note.html
90 | https://www.mafengwo.cn/u/ruogu2/note.html
91 | https://www.mafengwo.cn/u/459268/note.html
92 | https://www.mafengwo.cn/u/5037685/note.html
93 | https://www.mafengwo.cn/u/32358313/note.html
94 | https://www.mafengwo.cn/u/ymy817/note.html
95 | https://www.mafengwo.cn/u/44131359/note.html
96 | https://www.mafengwo.cn/u/flyingwsh/note.html
97 | https://www.mafengwo.cn/u/36953718/note.html
98 | https://www.mafengwo.cn/u/830821/note.html
99 | https://www.mafengwo.cn/u/72465054/note.html
100 | https://www.mafengwo.cn/u/816643/note.html
101 | https://www.mafengwo.cn/u/5547423/note.html
102 | https://www.mafengwo.cn/u/85055587/note.html
103 | https://www.mafengwo.cn/u/77259555/note.html
104 | https://www.mafengwo.cn/u/58085128/note.html
105 | https://www.mafengwo.cn/u/85782763/note.html
106 | https://www.mafengwo.cn/u/448785/note.html
107 | https://www.mafengwo.cn/u/shanfeng/note.html
108 | https://www.mafengwo.cn/u/30730200/note.html
109 | https://www.mafengwo.cn/u/82532600/note.html
110 | https://www.mafengwo.cn/u/sellnuan/note.html
111 | https://www.mafengwo.cn/u/85205385/note.html
112 | https://www.mafengwo.cn/u/40525484/note.html
113 | https://www.mafengwo.cn/u/92931036/note.html
114 | https://www.mafengwo.cn/u/60022265/note.html
115 | https://www.mafengwo.cn/u/45066857/note.html
116 | https://www.mafengwo.cn/u/34957278/note.html
117 | https://www.mafengwo.cn/u/90472994/note.html
118 | https://www.mafengwo.cn/u/5295777/note.html
119 | https://www.mafengwo.cn/u/86494331/note.html
120 | https://www.mafengwo.cn/u/42395202/note.html
121 | https://www.mafengwo.cn/u/heididsy/note.html
122 | https://www.mafengwo.cn/u/42694746/note.html
123 | https://www.mafengwo.cn/u/yimeng/note.html
124 | https://www.mafengwo.cn/u/5172228/note.html
125 | https://www.mafengwo.cn/u/17639643/note.html
126 | https://www.mafengwo.cn/u/wuweixiang/note.html
127 | https://www.mafengwo.cn/u/92931036/note.html
128 | https://www.mafengwo.cn/u/49231278/note.html
129 | https://www.mafengwo.cn/u/5481686/note.html
130 | https://www.mafengwo.cn/u/19014378/note.html
131 | https://www.mafengwo.cn/u/seacen/note.html
132 | https://www.mafengwo.cn/u/beslan/note.html
133 | https://www.mafengwo.cn/u/ruanzhonghua/note.html
134 | https://www.mafengwo.cn/u/187367/note.html
135 | https://www.mafengwo.cn/u/32216322/note.html
136 | https://www.mafengwo.cn/u/93157709/note.html
137 | https://www.mafengwo.cn/u/13105932/note.html
138 | https://www.mafengwo.cn/u/86494331/note.html
139 | https://www.mafengwo.cn/u/10911951/note.html
140 | https://www.mafengwo.cn/u/77243222/note.html
141 | https://www.mafengwo.cn/u/yolichic/note.html
142 | https://www.mafengwo.cn/u/88371807/note.html
143 | https://www.mafengwo.cn/u/jklouise/note.html
144 | https://www.mafengwo.cn/u/85558645/note.html
145 | https://www.mafengwo.cn/u/69200064/note.html
146 | https://www.mafengwo.cn/u/88358953/note.html
147 | https://www.mafengwo.cn/u/54534899/note.html
148 | https://www.mafengwo.cn/u/kido37/note.html
149 | https://www.mafengwo.cn/u/ruogu2/note.html
150 | https://www.mafengwo.cn/u/32228262/note.html
151 | https://www.mafengwo.cn/u/208077/note.html
152 | https://www.mafengwo.cn/u/xmulazio/note.html
153 | https://www.mafengwo.cn/u/74369556/note.html
154 | https://www.mafengwo.cn/u/5028192/note.html
155 | https://www.mafengwo.cn/u/ptah0622/note.html
156 | https://www.mafengwo.cn/u/5203896/note.html
157 | https://www.mafengwo.cn/u/35296229/note.html
158 | https://www.mafengwo.cn/u/69709753/note.html
159 | https://www.mafengwo.cn/u/71897854/note.html
160 | https://www.mafengwo.cn/u/73941769/note.html
161 | https://www.mafengwo.cn/u/79167497/note.html
162 | https://www.mafengwo.cn/u/5648583/note.html
163 | https://www.mafengwo.cn/u/840399/note.html
164 | https://www.mafengwo.cn/u/34260694/note.html
165 | https://www.mafengwo.cn/u/89214773/note.html
166 | https://www.mafengwo.cn/u/47448074/note.html
167 | https://www.mafengwo.cn/u/90344916/note.html
168 | https://www.mafengwo.cn/u/5673085/note.html
169 | https://www.mafengwo.cn/u/fantasist/note.html
170 | https://www.mafengwo.cn/u/gemmakyoto/note.html
171 | https://www.mafengwo.cn/u/kidd1110/note.html
172 | https://www.mafengwo.cn/u/459539/note.html
173 | https://www.mafengwo.cn/u/clijsters/note.html
174 | https://www.mafengwo.cn/u/53816690/note.html
175 | https://www.mafengwo.cn/u/85224198/note.html
176 | https://www.mafengwo.cn/u/1115956/note.html
177 | https://www.mafengwo.cn/u/kevlee/note.html
178 | https://www.mafengwo.cn/u/sarahontheroad/note.html
179 | https://www.mafengwo.cn/u/10525543/note.html
180 | https://www.mafengwo.cn/u/374140/note.html
181 | https://www.mafengwo.cn/u/19268018/note.html
182 | https://www.mafengwo.cn/u/70816697/note.html
183 | https://www.mafengwo.cn/u/102065/note.html
184 | https://www.mafengwo.cn/u/yolichic/note.html
185 | https://www.mafengwo.cn/u/49130101/note.html
186 | https://www.mafengwo.cn/u/49221414/note.html
187 | https://www.mafengwo.cn/u/sicilia/note.html
188 | https://www.mafengwo.cn/u/zhangxiaofan/note.html
189 | https://www.mafengwo.cn/u/fantastic/note.html
190 | https://www.mafengwo.cn/u/193656/note.html
191 | https://www.mafengwo.cn/u/after17/note.html
192 | https://www.mafengwo.cn/u/guaiiiii/note.html
193 | https://www.mafengwo.cn/u/tianpinan/note.html
194 | https://www.mafengwo.cn/u/52233524/note.html
195 | https://www.mafengwo.cn/u/75151343/note.html
196 | https://www.mafengwo.cn/u/88358953/note.html
197 | https://www.mafengwo.cn/u/83796483/note.html
198 | https://www.mafengwo.cn/u/79297765/note.html
199 | https://www.mafengwo.cn/u/72512443/note.html
200 | https://www.mafengwo.cn/u/niuniu/note.html
201 |
--------------------------------------------------------------------------------
/mafengwo_article_spider/mafengwo/js/tool_decode_index.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 | /** @type {!Array} */
3 | var _0xb483 = ["_decode", "http://www.sojson.com/javascriptobfuscator.html"];
4 | (function(metaWindow) {
5 | metaWindow[_0xb483[0]] = _0xb483[1];
6 | })(window);
7 | /** @type {!Array} */
8 | var __Ox2133f = ["use strict", "$", "SparkMD5", "charCodeAt", "length", "substring", "match", "subarray", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f", "", "join", "reset", "hello", "5d41402abc4b2a76b9719d911017c592", "append", "prototype", "test", "appendBinary", "_buff", "_length", "substr", "end", "_state", "_finish", "destroy", "hash", "hashBinary", "ArrayBuffer", "byteLength", "_concatArrayBuffer", "set", "c9d6618dbc657b41a66eb0af952906f1", "name", "slice",
9 | "call", "toString", "Object", "value", "Array", "Null", "Undefined", "map", "push", "sort", "forEach", "stringify", "data", "extend", "_sn", "_ts", "getTime", "&_ts=", "&_sn=", "_ts=", "ajaxPrefilter"];
10 | (function() {
11 | /**
12 | * @param {!Object} index
13 | * @return {?}
14 | */
15 | function view(index) {
16 | /**
17 | * @param {!Object} it
18 | * @return {?}
19 | */
20 | function render(it) {
21 | /** @type {!Array} */
22 | var command_codes = [];
23 | var out = {};
24 | var j;
25 | for (j in it) {
26 | var data = {};
27 | /** @type {string} */
28 | data["name"] = j;
29 | var _0xe7fex3c = Object["prototype"]["toString"]["call"](it[j])["slice"](8, -1);
30 | if (_0xe7fex3c === "Object") {
31 | data["value"] = render(it[j]);
32 | } else {
33 | if (_0xe7fex3c === "Array") {
34 | data["value"]] = it[j]["map"](function(slackName) {
35 | var _0xe7fex3e = Object["prototype"]["toString"]["call"](it[j])["slice"](8, -1);
36 | if (_0xe7fex3e === "Null" || _0xe7fex3e === "Undefined") {
37 | return "";
38 | }
39 | return String(slackName);
40 | });
41 | } else {
42 | if (_0xe7fex3c === "Null" || _0xe7fex3c === "Undefined") {
43 | data["value"] = "";
44 | } else {
45 | /** @type {string} */
46 | data["value"] = String(it[j]);
47 | }
48 | }
49 | }
50 | command_codes["push"](data);
51 | }
52 | command_codes["sort"](function(boxA, boxB) {
53 | return boxA["name"] > boxB["name"] ? 1 : boxA["name"] < boxB["name"] ? -1 : 0;
54 | });
55 | command_codes["forEach"](function(line) {
56 | out[line["name"]] = line["value"];
57 | });
58 | return out;
59 | }
60 | // function call
61 | var input = render(index);
62 | // return taiji[__Ox2133f[40]](JSON[__Ox2133f[60]](input) + code)[__Ox2133f[48]](2, 12);
63 | return window['SparkMD5']["hash"](JSON["stringify"](input) + "c9d6618dbc657b41a66eb0af952906f1")["slice"](2, 12);
64 | }
65 | __Ox2133f[0];
66 | var obj = window[__Ox2133f[1]];
67 | var taiji = window['SparkMD5'] = function() {
68 | __Ox2133f[0];
69 | /**
70 | * @param {number} name
71 | * @param {number} _
72 | * @return {?}
73 | */
74 | var $ = function(name, _) {
75 | return name + _ & 4294967295;
76 | };
77 | /**
78 | * @param {number} next
79 | * @param {number} a
80 | * @param {number} s
81 | * @param {number} v
82 | * @param {number} b
83 | * @param {number} target
84 | * @return {?}
85 | */
86 | var log = function(next, a, s, v, b, target) {
87 | a = $($(a, next), $(v, target));
88 | return $(a << b | a >>> 32 - b, s);
89 | };
90 | /**
91 | * @param {undefined} o
92 | * @param {number} n
93 | * @param {number} t
94 | * @param {number} a
95 | * @param {undefined} user
96 | * @param {number} token
97 | * @param {number} data
98 | * @return {?}
99 | */
100 | var callback = function(o, n, t, a, user, token, data) {
101 | return log(n & t | ~n & a, o, n, user, token, data);
102 | };
103 | /**
104 | * @param {undefined} s
105 | * @param {number} n
106 | * @param {number} t
107 | * @param {number} a
108 | * @param {undefined} user
109 | * @param {number} url
110 | * @param {number} data
111 | * @return {?}
112 | */
113 | var load = function(s, n, t, a, user, url, data) {
114 | return log(n & a | t & ~a, s, n, user, url, data);
115 | };
116 | /**
117 | * @param {undefined} params
118 | * @param {number} type
119 | * @param {number} index
120 | * @param {number} prop
121 | * @param {undefined} msg
122 | * @param {number} url
123 | * @param {number} data
124 | * @return {?}
125 | */
126 | var fn = function(params, type, index, prop, msg, url, data) {
127 | return log(type ^ index ^ prop, params, type, msg, url, data);
128 | };
129 | /**
130 | * @param {undefined} o
131 | * @param {number} n
132 | * @param {?} t
133 | * @param {?} a
134 | * @param {undefined} c
135 | * @param {number} data
136 | * @param {number} value
137 | * @return {?}
138 | */
139 | var print = function(o, n, t, a, c, data, value) {
140 | return log(t ^ (n | ~a), o, n, c, data, value);
141 | };
142 | /**
143 | * @param {!Array} args
144 | * @param {!Array} obj
145 | * @return {undefined}
146 | */
147 | var test = function(args, obj) {
148 | var name = args[0];
149 | var value = args[1];
150 | var options = args[2];
151 | var key = args[3];
152 | name = callback(name, value, options, key, obj[0], 7, -680876936);
153 | key = callback(key, name, value, options, obj[1], 12, -389564586);
154 | options = callback(options, key, name, value, obj[2], 17, 606105819);
155 | value = callback(value, options, key, name, obj[3], 22, -1044525330);
156 | name = callback(name, value, options, key, obj[4], 7, -176418897);
157 | key = callback(key, name, value, options, obj[5], 12, 1200080426);
158 | options = callback(options, key, name, value, obj[6], 17, -1473231341);
159 | value = callback(value, options, key, name, obj[7], 22, -45705983);
160 | name = callback(name, value, options, key, obj[8], 7, 1770035416);
161 | key = callback(key, name, value, options, obj[9], 12, -1958414417);
162 | options = callback(options, key, name, value, obj[10], 17, -42063);
163 | value = callback(value, options, key, name, obj[11], 22, -1990404162);
164 | name = callback(name, value, options, key, obj[12], 7, 1804603682);
165 | key = callback(key, name, value, options, obj[13], 12, -40341101);
166 | options = callback(options, key, name, value, obj[14], 17, -1502002290);
167 | value = callback(value, options, key, name, obj[15], 22, 1236535329);
168 | name = load(name, value, options, key, obj[1], 5, -165796510);
169 | key = load(key, name, value, options, obj[6], 9, -1069501632);
170 | options = load(options, key, name, value, obj[11], 14, 643717713);
171 | value = load(value, options, key, name, obj[0], 20, -373897302);
172 | name = load(name, value, options, key, obj[5], 5, -701558691);
173 | key = load(key, name, value, options, obj[10], 9, 38016083);
174 | options = load(options, key, name, value, obj[15], 14, -660478335);
175 | value = load(value, options, key, name, obj[4], 20, -405537848);
176 | name = load(name, value, options, key, obj[9], 5, 568446438);
177 | key = load(key, name, value, options, obj[14], 9, -1019803690);
178 | options = load(options, key, name, value, obj[3], 14, -187363961);
179 | value = load(value, options, key, name, obj[8], 20, 1163531501);
180 | name = load(name, value, options, key, obj[13], 5, -1444681467);
181 | key = load(key, name, value, options, obj[2], 9, -51403784);
182 | options = load(options, key, name, value, obj[7], 14, 1735328473);
183 | value = load(value, options, key, name, obj[12], 20, -1926607734);
184 | name = fn(name, value, options, key, obj[5], 4, -378558);
185 | key = fn(key, name, value, options, obj[8], 11, -2022574463);
186 | options = fn(options, key, name, value, obj[11], 16, 1839030562);
187 | value = fn(value, options, key, name, obj[14], 23, -35309556);
188 | name = fn(name, value, options, key, obj[1], 4, -1530992060);
189 | key = fn(key, name, value, options, obj[4], 11, 1272893353);
190 | options = fn(options, key, name, value, obj[7], 16, -155497632);
191 | value = fn(value, options, key, name, obj[10], 23, -1094730640);
192 | name = fn(name, value, options, key, obj[13], 4, 681279174);
193 | key = fn(key, name, value, options, obj[0], 11, -358537222);
194 | options = fn(options, key, name, value, obj[3], 16, -722521979);
195 | value = fn(value, options, key, name, obj[6], 23, 76029189);
196 | name = fn(name, value, options, key, obj[9], 4, -640364487);
197 | key = fn(key, name, value, options, obj[12], 11, -421815835);
198 | options = fn(options, key, name, value, obj[15], 16, 530742520);
199 | value = fn(value, options, key, name, obj[2], 23, -995338651);
200 | name = print(name, value, options, key, obj[0], 6, -198630844);
201 | key = print(key, name, value, options, obj[7], 10, 1126891415);
202 | options = print(options, key, name, value, obj[14], 15, -1416354905);
203 | value = print(value, options, key, name, obj[5], 21, -57434055);
204 | name = print(name, value, options, key, obj[12], 6, 1700485571);
205 | key = print(key, name, value, options, obj[3], 10, -1894986606);
206 | options = print(options, key, name, value, obj[10], 15, -1051523);
207 | value = print(value, options, key, name, obj[1], 21, -2054922799);
208 | name = print(name, value, options, key, obj[8], 6, 1873313359);
209 | key = print(key, name, value, options, obj[15], 10, -30611744);
210 | options = print(options, key, name, value, obj[6], 15, -1560198380);
211 | value = print(value, options, key, name, obj[13], 21, 1309151649);
212 | name = print(name, value, options, key, obj[4], 6, -145523070);
213 | key = print(key, name, value, options, obj[11], 10, -1120210379);
214 | options = print(options, key, name, value, obj[2], 15, 718787259);
215 | value = print(value, options, key, name, obj[9], 21, -343485551);
216 | args[0] = $(name, args[0]);
217 | args[1] = $(value, args[1]);
218 | args[2] = $(options, args[2]);
219 | args[3] = $(key, args[3]);
220 | };
221 | /**
222 | * @param {?} validator
223 | * @return {?}
224 | */
225 | var extend = function(validator) {
226 | /** @type {!Array} */
227 | var wavetones = [];
228 | var value;
229 | /** @type {number} */
230 | value = 0;
231 | for (; value < 64; value = value + 4) {
232 | wavetones[value >> 2] = validator[__Ox2133f[3]](value) + (validator[__Ox2133f[3]](value + 1) << 8) + (validator[__Ox2133f[3]](value + 2) << 16) + (validator[__Ox2133f[3]](value + 3) << 24);
233 | }
234 | return wavetones;
235 | };
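// extend() packs one 64-character block of a binary string into sixteen
// little-endian 32-bit words (index [3] of the obfuscated string table is
// presumably charCodeAt); parse() below does the same for a raw byte array.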
236 | /**
237 | * @param {!Object} n
238 | * @return {?}
239 | */
240 | var parse = function(n) {
241 | /** @type {!Array} */
242 | var input = [];
243 | var i;
244 | /** @type {number} */
245 | i = 0;
246 | for (; i < 64; i = i + 4) {
247 | input[i >> 2] = n[i] + (n[i + 1] << 8) + (n[i + 2] << 16) + (n[i + 3] << 24);
248 | }
249 | return input;
250 | };
251 | /**
252 | * @param {string} args
253 | * @return {?}
254 | */
255 | var get = function(args) {
256 | var val = args[__Ox2133f[4]];
257 | /** @type {!Array} */
258 | var item = [1732584193, -271733879, -1732584194, 271733878];
259 | var data;
260 | var condition;
261 | var p;
262 | var duration;
263 | var minWidth;
264 | var myPundit;
265 | /** @type {number} */
266 | data = 64;
267 | for (; data <= val; data = data + 64) {
268 | test(item, extend(args[__Ox2133f[5]](data - 64, data)));
269 | }
270 | args = args[__Ox2133f[5]](data - 64);
271 | condition = args[__Ox2133f[4]];
272 | /** @type {!Array} */
273 | p = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
274 | /** @type {number} */
275 | data = 0;
276 | for (; data < condition; data = data + 1) {
277 | p[data >> 2] |= args[__Ox2133f[3]](data) << (data % 4 << 3);
278 | }
279 | p[data >> 2] |= 128 << (data % 4 << 3);
280 | if (data > 55) {
281 | test(item, p);
282 | /** @type {number} */
283 | data = 0;
284 | for (; data < 16; data = data + 1) {
285 | /** @type {number} */
286 | p[data] = 0;
287 | }
288 | }
289 | /** @type {number} */
290 | duration = val * 8;
291 | duration = duration.toString(16)[__Ox2133f[6]](/(.*?)(.{0,8})$/);
292 | /** @type {number} */
293 | minWidth = parseInt(duration[2], 16);
294 | /** @type {number} */
295 | myPundit = parseInt(duration[1], 16) || 0;
296 | /** @type {number} */
297 | p[14] = minWidth;
298 | /** @type {number} */
299 | p[15] = myPundit;
300 | test(item, p);
301 | return item;
302 | };
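// get() is the full MD5 driver for a string: hash every complete 64-byte block,
// then apply the standard padding to the tail (a 0x80 byte plus the message
// length in bits stored in words 14 and 15) and compress once more. cb() below
// is the same routine for a Uint8Array taken from an ArrayBuffer.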
303 | /**
304 | * @param {?} data
305 | * @return {?}
306 | */
307 | var cb = function(data) {
308 | var p = data[__Ox2133f[4]];
309 | /** @type {!Array} */
310 | var cb = [1732584193, -271733879, -1732584194, 271733878];
311 | var pos;
312 | var last;
313 | var ret;
314 | var t;
315 | var energy;
316 | var document;
317 | /** @type {number} */
318 | pos = 64;
319 | for (; pos <= p; pos = pos + 64) {
320 | test(cb, parse(data[__Ox2133f[7]](pos - 64, pos)));
321 | }
322 | data = pos - 64 < p ? data[__Ox2133f[7]](pos - 64) : new Uint8Array(0);
323 | last = data[__Ox2133f[4]];
324 | /** @type {!Array} */
325 | ret = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
326 | /** @type {number} */
327 | pos = 0;
328 | for (; pos < last; pos = pos + 1) {
329 | ret[pos >> 2] |= data[pos] << (pos % 4 << 3);
330 | }
331 | ret[pos >> 2] |= 128 << (pos % 4 << 3);
332 | if (pos > 55) {
333 | test(cb, ret);
334 | /** @type {number} */
335 | pos = 0;
336 | for (; pos < 16; pos = pos + 1) {
337 | /** @type {number} */
338 | ret[pos] = 0;
339 | }
340 | }
341 | /** @type {number} */
342 | t = p * 8;
343 | t = t.toString(16)[__Ox2133f[6]](/(.*?)(.{0,8})$/);
344 | /** @type {number} */
345 | energy = parseInt(t[2], 16);
346 | /** @type {number} */
347 | document = parseInt(t[1], 16) || 0;
348 | /** @type {number} */
349 | ret[14] = energy;
350 | /** @type {number} */
351 | ret[15] = document;
352 | test(cb, ret);
353 | return cb;
354 | };
355 | /** @type {!Array} */
356 | var _0xe7fexe = [__Ox2133f[8], __Ox2133f[9], __Ox2133f[10], __Ox2133f[11], __Ox2133f[12], __Ox2133f[13], __Ox2133f[14], __Ox2133f[15], __Ox2133f[16], __Ox2133f[17], __Ox2133f[18], __Ox2133f[19], __Ox2133f[20], __Ox2133f[21], __Ox2133f[22], __Ox2133f[23]];
357 | /**
358 | * @param {number} value
359 | * @return {?}
360 | */
361 | var expect = function(value) {
362 | var chain = __Ox2133f[24];
363 | var i;
364 | /** @type {number} */
365 | i = 0;
366 | for (; i < 4; i = i + 1) {
367 | chain = chain + (_0xe7fexe[value >> i * 8 + 4 & 15] + _0xe7fexe[value >> i * 8 & 15]);
368 | }
369 | return chain;
370 | };
371 | /**
372 | * @param {!Array} state
373 | * @return {?}
374 | */
375 | var resolve = function(state) {
376 | var reducerKey;
377 | /** @type {number} */
378 | reducerKey = 0;
379 | for (; reducerKey < state[__Ox2133f[4]]; reducerKey = reducerKey + 1) {
380 | state[reducerKey] = expect(state[reducerKey]);
381 | }
382 | return state[__Ox2133f[25]](__Ox2133f[24]);
383 | };
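// expect() renders one 32-bit state word as eight hex characters, least
// significant byte first; resolve() joins the four words into the familiar
// 32-character hex MD5 digest.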
384 | /**
385 | * @param {string} config
386 | * @return {?}
387 | */
388 | var gettingStartedGateCheck = function(config) {
389 | return resolve(get(config));
390 | };
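// gettingStartedGateCheck(s) is simply hexMD5(s). The _0xe7fex2 constructor
// below is an incremental hasher; its prototype methods (resolved from the
// obfuscated string table) line up with SparkMD5's append/appendBinary/end/
// reset/destroy, plus static hash/hashBinary helpers and an ArrayBuffer
// variant, so this decoded file appears to inline the SparkMD5 library.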
391 | /**
392 | * @return {undefined}
393 | */
394 | var _0xe7fex2 = function() {
395 | this[__Ox2133f[26]]();
396 | };
397 | if (gettingStartedGateCheck(__Ox2133f[27]) !== __Ox2133f[28]) {
398 | /**
399 | * @param {number} a
400 | * @param {number} b
401 | * @return {?}
402 | */
403 | $ = function(a, b) {
404 | /** @type {number} */
405 | var uch = (a & 65535) + (b & 65535);
406 | /** @type {number} */
407 | var dwch = (a >> 16) + (b >> 16) + (uch >> 16);
408 | return dwch << 16 | uch & 65535;
409 | };
410 | }
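// The check above is a self-test: hash a known constant and, if the digest
// comes out wrong (an engine whose addition misbehaves on 32-bit overflow),
// swap $ for the classic "safe add" that sums the operands in 16-bit halves.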
411 | /**
412 | * @param {string} value
413 | * @return {?}
414 | */
415 | _0xe7fex2[__Ox2133f[30]][__Ox2133f[29]] = function(value) {
416 | if (/[\u0080-\uFFFF]/[__Ox2133f[31]](value)) {
417 | /** @type {string} */
418 | value = unescape(encodeURIComponent(value));
419 | }
420 | this[__Ox2133f[32]](value);
421 | return this;
422 | };
423 | /**
424 | * @param {?} canCreateDiscussions
425 | * @return {?}
426 | */
427 | _0xe7fex2[__Ox2133f[30]][__Ox2133f[32]] = function(canCreateDiscussions) {
428 | this[__Ox2133f[33]] += canCreateDiscussions;
429 | this[__Ox2133f[34]] += canCreateDiscussions[__Ox2133f[4]];
430 | var _zAdjPortWidth = this[__Ox2133f[33]][__Ox2133f[4]];
431 | var _xpos;
432 | /** @type {number} */
433 | _xpos = 64;
434 | for (; _xpos <= _zAdjPortWidth; _xpos = _xpos + 64) {
435 | test(this._state, extend(this[__Ox2133f[33]][__Ox2133f[5]](_xpos - 64, _xpos)));
436 | }
437 | this[__Ox2133f[33]] = this[__Ox2133f[33]][__Ox2133f[35]](_xpos - 64);
438 | return this;
439 | };
440 | /**
441 | * @param {?} canCreateDiscussions
442 | * @return {?}
443 | */
444 | _0xe7fex2[__Ox2133f[30]][__Ox2133f[36]] = function(canCreateDiscussions) {
445 | var ref = this[__Ox2133f[33]];
446 | var length = ref[__Ox2133f[4]];
447 | var value;
448 | /** @type {!Array} */
449 | var hexDigits = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
450 | var _0xe7fex2c;
451 | /** @type {number} */
452 | value = 0;
453 | for (; value < length; value = value + 1) {
454 | hexDigits[value >> 2] |= ref[__Ox2133f[3]](value) << (value % 4 << 3);
455 | }
456 | this._finish(hexDigits, length);
457 | _0xe7fex2c = !!canCreateDiscussions ? this[__Ox2133f[37]] : resolve(this._state);
458 | this[__Ox2133f[26]]();
459 | return _0xe7fex2c;
460 | };
461 | /**
462 | * @param {!Array} proto
463 | * @param {number} s
464 | * @return {undefined}
465 | */
466 | _0xe7fex2[__Ox2133f[30]][__Ox2133f[38]] = function(proto, s) {
467 | /** @type {number} */
468 | var style = s;
469 | var expectedZIndices;
470 | var f;
471 | var val;
472 | proto[style >> 2] |= 128 << (style % 4 << 3);
473 | if (style > 55) {
474 | test(this._state, proto);
475 | /** @type {number} */
476 | style = 0;
477 | for (; style < 16; style = style + 1) {
478 | /** @type {number} */
479 | proto[style] = 0;
480 | }
481 | }
482 | /** @type {number} */
483 | expectedZIndices = this[__Ox2133f[34]] * 8;
484 | expectedZIndices = expectedZIndices.toString(16)[__Ox2133f[6]](/(.*?)(.{0,8})$/);
485 | /** @type {number} */
486 | f = parseInt(expectedZIndices[2], 16);
487 | /** @type {number} */
488 | val = parseInt(expectedZIndices[1], 16) || 0;
489 | /** @type {number} */
490 | proto[14] = f;
491 | /** @type {number} */
492 | proto[15] = val;
493 | test(this._state, proto);
494 | };
495 | /**
496 | * @return {?}
497 | */
498 | _0xe7fex2[__Ox2133f[30]][__Ox2133f[26]] = function() {
499 | this[__Ox2133f[33]] = __Ox2133f[24];
500 | /** @type {number} */
501 | this[__Ox2133f[34]] = 0;
502 | /** @type {!Array} */
503 | this[__Ox2133f[37]] = [1732584193, -271733879, -1732584194, 271733878];
504 | return this;
505 | };
506 | /**
507 | * @return {undefined}
508 | */
509 | _0xe7fex2[__Ox2133f[30]][__Ox2133f[39]] = function() {
510 | delete this[__Ox2133f[37]];
511 | delete this[__Ox2133f[33]];
512 | delete this[__Ox2133f[34]];
513 | };
514 | /**
515 | * @param {string} message
516 | * @param {?} canCreateDiscussions
517 | * @return {?}
518 | */
519 | _0xe7fex2[__Ox2133f[40]] = function(message, canCreateDiscussions) {
520 | if (/[\u0080-\uFFFF]/[__Ox2133f[31]](message)) {
521 | /** @type {string} */
522 | message = unescape(encodeURIComponent(message));
523 | }
524 | var ret = get(message);
525 | return !!canCreateDiscussions ? ret : resolve(ret);
526 | };
527 | /**
528 | * @param {string} title
529 | * @param {?} canCreateDiscussions
530 | * @return {?}
531 | */
532 | _0xe7fex2[__Ox2133f[41]] = function(title, canCreateDiscussions) {
533 | var ret = get(title);
534 | return !!canCreateDiscussions ? ret : resolve(ret);
535 | };
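// Everything below is the ArrayBuffer/Uint8Array flavour of the same hasher
// (the SparkMD5.ArrayBuffer-style API): append() concatenates incoming buffers
// and hashes the complete 64-byte blocks, end() pads and finalises, and the
// static hash() digests a whole buffer in one call.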
536 | /**
537 | * @return {undefined}
538 | */
539 | _0xe7fex2[__Ox2133f[42]] = function() {
540 | this[__Ox2133f[26]]();
541 | };
542 | /**
543 | * @param {?} arr
544 | * @return {?}
545 | */
546 | _0xe7fex2[__Ox2133f[42]][__Ox2133f[30]][__Ox2133f[29]] = function(arr) {
547 | var buff = this._concatArrayBuffer(this._buff, arr);
548 | var length = buff[__Ox2133f[4]];
549 | var i;
550 | this[__Ox2133f[34]] += arr[__Ox2133f[43]];
551 | /** @type {number} */
552 | i = 64;
553 | for (; i <= length; i = i + 64) {
554 | test(this._state, parse(buff[__Ox2133f[7]](i - 64, i)));
555 | }
556 | this[__Ox2133f[33]] = i - 64 < length ? buff[__Ox2133f[7]](i - 64) : new Uint8Array(0);
557 | return this;
558 | };
559 | /**
560 | * @param {?} canCreateDiscussions
561 | * @return {?}
562 | */
563 | _0xe7fex2[__Ox2133f[42]][__Ox2133f[30]][__Ox2133f[36]] = function(canCreateDiscussions) {
564 | var array = this[__Ox2133f[33]];
565 | var length = array[__Ox2133f[4]];
566 | /** @type {!Array} */
567 | var tail = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
568 | var i;
569 | var _0xe7fex2c;
570 | /** @type {number} */
571 | i = 0;
572 | for (; i < length; i = i + 1) {
573 | tail[i >> 2] |= array[i] << (i % 4 << 3);
574 | }
575 | this._finish(tail, length);
576 | _0xe7fex2c = !!canCreateDiscussions ? this[__Ox2133f[37]] : resolve(this._state);
577 | this[__Ox2133f[26]]();
578 | return _0xe7fex2c;
579 | };
580 | _0xe7fex2[__Ox2133f[42]][__Ox2133f[30]][__Ox2133f[38]] = _0xe7fex2[__Ox2133f[30]][__Ox2133f[38]];
581 | /**
582 | * @return {?}
583 | */
584 | _0xe7fex2[__Ox2133f[42]][__Ox2133f[30]][__Ox2133f[26]] = function() {
585 | /** @type {!Uint8Array} */
586 | this[__Ox2133f[33]] = new Uint8Array(0);
587 | /** @type {number} */
588 | this[__Ox2133f[34]] = 0;
589 | /** @type {!Array} */
590 | this[__Ox2133f[37]] = [1732584193, -271733879, -1732584194, 271733878];
591 | return this;
592 | };
593 | _0xe7fex2[__Ox2133f[42]][__Ox2133f[30]][__Ox2133f[39]] = _0xe7fex2[__Ox2133f[30]][__Ox2133f[39]];
594 | /**
595 | * @param {?} row
596 | * @param {?} b
597 | * @return {?}
598 | */
599 | _0xe7fex2[__Ox2133f[42]][__Ox2133f[30]][__Ox2133f[44]] = function(row, b) {
600 | var val = row[__Ox2133f[4]];
601 | /** @type {!Uint8Array} */
602 | var c = new Uint8Array(val + b[__Ox2133f[43]]);
603 | c[__Ox2133f[45]](row);
604 | c[__Ox2133f[45]](new Uint8Array(b), val);
605 | return c;
606 | };
607 | /**
608 | * @param {?} output
609 | * @param {?} canCreateDiscussions
610 | * @return {?}
611 | */
612 | _0xe7fex2[__Ox2133f[42]][__Ox2133f[40]] = function(output, canCreateDiscussions) {
613 | var ret = cb(new Uint8Array(output));
614 | return !!canCreateDiscussions ? ret : resolve(ret);
615 | };
616 | return _0xe7fex2;
617 | }();
618 | // var code = __Ox2133f[46];
619 | var code = 'c9d6618dbc657b41a66eb0af952906f1';
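// The commented-out line above shows the original obfuscated lookup; `code` is
// the fixed salt string it decodes to, presumably mixed into the request
// parameters by view() (defined earlier in this file) when the _sn signature
// is computed.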
620 | obj['ajaxPrefilter'](function(boardManager, isSlidingUp) {
621 | // boardManager is an object that contains the request data
622 | // obj[__Ox2133f[69]](function(boardManager, isSlidingUp) {
623 | // var p3 = obj[__Ox2133f[62]](true, {}, isSlidingUp[__Ox2133f[61]] || {});
624 | // jQuery's extend() method copies the object's properties (the leading true makes it a deep copy)
625 | var p3 = obj['extend'](true, {}, isSlidingUp['data'] || {});
626 | // _sn
627 | // if an _sn field already exists, remove it first
628 | if (p3["_sn"]) {
629 | delete p3["_sn"];
630 | }
631 | // get the current timestamp
632 | p3["_ts"] = (new Date)[__Ox2133f[65]]();
633 | // call the view function to compute vroot, which is the _sn value
634 | // var vroot = view(obj[__Ox2133f[62]](true, {}, p3));
635 | var vroot = view(obj["extend"](true, {}, p3));
636 | if ("data" in boardManager) {
637 | // boardManager[__Ox2133f[61]] += __Ox2133f[66] + p3[__Ox2133f[64]] + __Ox2133f[67] + vroot;
638 | boardManager["data"] += "&_ts=" + p3["_ts"] + "&_sn=" + vroot;
639 | } else {
640 | // boardManager[__Ox2133f[61]] = __Ox2133f[68] + p3[__Ox2133f[64]] + __Ox2133f[67] + vroot;
641 | boardManager["data"] = "_ts=" + p3["_ts"] + "&_sn=" + vroot;
642 | }
643 | });
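// Net effect: every jQuery AJAX request sent by the page gets a fresh _ts
// timestamp and a recomputed _sn signature appended to its data string, so a
// crawler has to regenerate both for every request. Rather than re-implementing
// view() in Python, the spider can execute this decoded file in a JS runtime
// and call view() with the same parameters; handle_sn.py in this directory
// presumably takes that approach.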
644 | })();
--------------------------------------------------------------------------------