├── kolesa ├── __init__.py ├── README.md ├── handle_mongo.py └── crawl_kolesa.py ├── boss_zhipin ├── __init__.py ├── README.md └── crawl_boss_zhipin.py ├── dasouche ├── __init__.py ├── README.md └── handle_dasouche.py ├── synchronous ├── __init__.py ├── sample │ ├── __init__.py │ ├── multiprocess_pool.py │ ├── thread_test1.py │ ├── multiprocess_test3.py │ ├── multiprocess_test2.py │ ├── process_not_share.py │ ├── multiprocess_test1.py │ ├── multiprocess_class.py │ └── multiprocess_share.py ├── test1.py ├── handle_queue.py ├── handle_redis.py ├── handle_request.py ├── handle_spider.py └── spider_multiprocess.py ├── login_github ├── __init__.py ├── README.md └── handle_login.py ├── dongqiudi ├── dongqiudi │ ├── __init__.py │ ├── main.py │ ├── dongqiudi_pic │ │ ├── 西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑 │ │ │ ├── 7 │ │ │ ├── ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg │ │ │ ├── ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg │ │ │ ├── ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif │ │ │ ├── ChNLklztqZOAFKI4AANLRhtfxnE659.jpg │ │ │ ├── ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg │ │ │ ├── ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg │ │ │ └── ChNLklztvpmAGjgfAAFd4X3svKc014.jpg │ │ └── C罗与法拉利车手勒克莱尔同场较劲! │ │ │ ├── ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg │ │ │ ├── ChO2w1zuISOAF511AAFaT82ScyE114.jpg │ │ │ └── ChONolzuIOiASesEAAEgiz_2cMw359.jpg │ ├── spiders │ │ ├── __init__.py │ │ └── crawl_dongqiudi.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ └── settings.py ├── scrapy.cfg └── README.md ├── douban_movie_top250 ├── __init__.py ├── README.md ├── handle_mongo.py └── crawl_douban_movie_info_top250.py ├── mafengwo ├── mafengwo │ ├── __init__.py │ ├── main.py │ ├── spiders │ │ ├── __init__.py │ │ └── crawl_mafengwo.py │ ├── mafengwo_images │ │ └── full │ │ │ └── 0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg │ ├── items.py │ ├── pipelines.py │ ├── middlewares.py │ ├── settings.py │ └── url_list.txt └── scrapy.cfg ├── douban_movie_top250_scrapy ├── douban │ ├── __init__.py │ ├── main.py │ ├── douban.json │ ├── spiders │ │ ├── __init__.py │ │ └── douban_spider.py │ ├── items.py │ ├── pipelines.py │ ├── middlewares.py │ └── settings.py ├── README.md ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── douban.iml │ ├── deployment.xml │ └── workspace.xml └── scrapy.cfg ├── mafengwo_article_spider ├── mafengwo │ ├── __init__.py │ ├── main.py │ ├── spiders │ │ ├── __init__.py │ │ └── crawl_mafengwo.py │ ├── js │ │ ├── README.md │ │ ├── handle_sn.py │ │ └── tool_decode_index.js │ ├── middlewares.py │ ├── handle_mongo.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ ├── handle_task.py │ └── url_list.txt ├── README.md ├── .idea │ ├── encodings.xml │ ├── misc.xml │ ├── modules.xml │ └── mafengwo.iml └── scrapy.cfg ├── video ├── README.md └── lishipin │ └── crawl_lishipin.py ├── baidu_m_keyword_ranks ├── README.md ├── setting.py ├── handle_mysql.py └── baidu_m_keyword.py ├── lagou ├── README.md ├── handle_mongo.py ├── crawl_lagou_job_old.py ├── handle_mysql.py └── crawl_lagou_job_new.py ├── .idea ├── vcs.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── small-spider-project.iml ├── deployment.xml └── workspace.xml ├── README.md └── .gitignore /kolesa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /boss_zhipin/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/dasouche/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /synchronous/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /login_github/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /douban_movie_top250/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /synchronous/sample/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /video/README.md: -------------------------------------------------------------------------------- 1 | #### 1、lishipin 梨视频数据抓取 2 | -------------------------------------------------------------------------------- /dasouche/README.md: -------------------------------------------------------------------------------- 1 | # 大搜车爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/README.md: -------------------------------------------------------------------------------- 1 | # 百度M站搜索关键字去除广告后的排名抓取 2 | ## python3.6 多线程 3 | -------------------------------------------------------------------------------- /kolesa/README.md: -------------------------------------------------------------------------------- 1 | # kolesa爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /login_github/README.md: -------------------------------------------------------------------------------- 1 | # 登录github 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /boss_zhipin/README.md: -------------------------------------------------------------------------------- 1 | # boos直聘python岗位全国爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /douban_movie_top250/README.md: -------------------------------------------------------------------------------- 1 | # douban电影top250爬虫 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/main.py: 
-------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl crawl_dongqiudi".split()) -------------------------------------------------------------------------------- /mafengwo/mafengwo/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl crawl_mafengwo".split()) -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/README.md: -------------------------------------------------------------------------------- 1 | # douban电影top250爬虫-通过scrapy框架抓取 2 | 3 | #### bug:dazhuang_python@sina.com 4 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl douban_spider".split()) -------------------------------------------------------------------------------- /lagou/README.md: -------------------------------------------------------------------------------- 1 | # 拉钩python岗位全国爬虫 2 | 3 | ##### 不能在__init__方法中写mongo信息,否则多进程无法启动 4 | 5 | #### bug:dazhuang_python@sina.com 6 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/main.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | cmdline.execute("scrapy crawl crawl_mafengwo".split()) -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/douban.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/douban_movie_top250_scrapy/douban/douban.json -------------------------------------------------------------------------------- /mafengwo_article_spider/README.md: -------------------------------------------------------------------------------- 1 | # small-spider-project 2 | ## 日常爬虫 3 | 4 | #### mafengwo_article_spider 马蜂窝最新,最热游记抓取 5 | 6 | 7 | 8 | #### bug:dazhuang_python@sina.com 9 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/setting.py: -------------------------------------------------------------------------------- 1 | mysql_ip = '127.0.0.1' 2 | mysql_port = 3306 3 | mysql_database = '库名' 4 | mysql_table = '' 5 | mysql_username = '用户名' 6 | mysql_password = '密码' 7 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/7 -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/mafengwo_images/full/0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/mafengwo/mafengwo/mafengwo_images/full/0b5ae5f0ae9aaaa661bc67bbd23eae74bb71bc46.jpg -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuIRiAIJuDAAJrbFscpoU610.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuISOAF511AAFaT82ScyE114.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChO2w1zuISOAF511AAFaT82ScyE114.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChONolzuIOiASesEAAEgiz_2cMw359.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/C罗与法拉利车手勒克莱尔同场较劲!/ChONolzuIOiASesEAAEgiz_2cMw359.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzSXFKAcY7DAAHvfisNUuY153.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8FzgKFSARnpeAAGp9Jfeq-A752.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChMf8Fztq1-AXiXEADuUv2VgHRs890.gif -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqZOAFKI4AANLRhtfxnE659.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqZOAFKI4AANLRhtfxnE659.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqnaAEJHSAAK8ZOypb-k480.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztqrmAFEVQAAGNxQf4M3s194.jpg -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztvpmAGjgfAAFd4X3svKc014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/freedom-wy/small-spider-project/HEAD/dongqiudi/dongqiudi/dongqiudi_pic/西甲两连冠仍存在下课危机,巴尔韦德巴萨生涯的红与黑/ChNLklztvpmAGjgfAAFd4X3svKc014.jpg -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /dongqiudi/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dongqiudi.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dongqiudi 12 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /mafengwo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mafengwo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mafengwo 12 | -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # 
https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = douban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douban 12 | -------------------------------------------------------------------------------- /mafengwo_article_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mafengwo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mafengwo 12 | -------------------------------------------------------------------------------- /douban_movie_top250/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | class Handle_Mongo(object): 6 | def __init__(self): 7 | mongo_client = pymongo.MongoClient(host="127.0.0.1",port=27017) 8 | self.db_data = mongo_client['douban'] 9 | 10 | def handle_save_data(self,item): 11 | task_collection = Collection(self.db_data,'douban_data') 12 | task_collection.insert(item) 13 | 14 | douban_mongo = Handle_Mongo() 15 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/js/README.md: -------------------------------------------------------------------------------- 1 | # 马蜂窝生成sn的js解析 2 | 3 | #### index.js是马蜂窝网站上原有JS文件 4 | #### tool_decode_index.js是通过 http://jsnice.org/格式化和半解密 5 | #### handle_sn.py对SN进行破解,请求时发现无需传递SN,晕菜... 6 | 7 | 8 | ##### 619行:salt值:c9d6618dbc657b41a66eb0af952906f1 9 | ##### 632行: 获取时间戳p3["_ts"] = (new Date)[__Ox2133f[65]](); 10 | ##### 635行: 调用VIEW函数获取sn值var vroot = view(obj["extend"](true, {}, p3)); 11 | ##### 63行: 返回sn值,md5并切片 12 | 13 | 14 | ### 交流:dazhuang_python@sina.com 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # small-spider-project 2 | ## 日常爬虫 3 | 4 | #### 1、baidu_m_keyword_ranks 百度M站关键词搜索去除广告后的抓取 5 | #### 2、video 视频抓取 6 | #### 3、mafengwo 马蜂窝游记和图片抓取 7 | #### 4、kolesa kolesa数据抓取 8 | #### 5、douban_movie_top250 豆瓣电影top250数据抓取 9 | #### 6、douban_movie_top250_scrapy 豆瓣电影top250数据抓取-通过scrapy框架抓取 10 | #### 7、mafengwo_article_spider 马蜂窝所有游记抓取 11 | #### 8、dasouche 大搜车数据抓取 12 | #### 9、dongqiudi 懂球帝新闻数据抓取 13 | #### 10、github 登录github 14 | #### 11、synchronous 同步爬虫 15 | 16 | 17 | 18 | #### bug:dazhuang_python@sina.com 19 | -------------------------------------------------------------------------------- /dongqiudi/README.md: -------------------------------------------------------------------------------- 1 | # 懂球帝新闻爬虫 2 | ### 需求 3 | 抓取懂球帝新闻https://dongqiudi.com/news 4 | ### 项目结构 5 | ```text 6 | dongqiudi 7 | dongqiudi_pic 图片目录 8 | spiders 爬虫解析文件 9 | items.py 项目字段定义文件 10 | middlewares.py 中间件,包含下载代理中间件 11 | pipelines.py 数据管道,包含mongo数据存储和图片下载 12 | settings.py 配置文件 13 | main.py 启动文件 14 | ``` 15 | ### 说明 16 | ```text 17 | 在pipelines.py中定义mongodb的ip地址和端口号 18 | 在settings.py中定义是否开启中间件,下载延迟等选项 19 | ``` 20 | 21 | 22 | #### bug:dazhuang_python@sina.com 23 | -------------------------------------------------------------------------------- /mafengwo_article_spider/.idea/mafengwo.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 
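The dongqiudi README earlier on this line notes that the MongoDB host/port live in pipelines.py and that settings.py controls whether the proxy middleware is enabled and what download delay is used, but settings.py itself is not reproduced in this dump. The lines below are only a hedged sketch of what those entries would look like, built from the DongqiudiProxyMiddleware, DongqiudiPipeline and DongqiudiImagePipeline classes shown later in this dump; the numeric priorities, the DOWNLOAD_DELAY value and the IMAGES_STORE path are assumptions.

# Hedged sketch -- the real dongqiudi/settings.py is not included in this dump.
BOT_NAME = 'dongqiudi'
SPIDER_MODULES = ['dongqiudi.spiders']
NEWSPIDER_MODULE = 'dongqiudi.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1  # assumed value; the README only says the delay is configured here

DOWNLOADER_MIDDLEWARES = {
    # toggle the paid-proxy middleware defined in middlewares.py
    'dongqiudi.middlewares.DongqiudiProxyMiddleware': 543,
}

ITEM_PIPELINES = {
    'dongqiudi.pipelines.DongqiudiPipeline': 300,       # MongoDB storage
    'dongqiudi.pipelines.DongqiudiImagePipeline': 200,  # image download
}
IMAGES_STORE = 'dongqiudi_pic'  # assumed to match the image directory in the tree above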
-------------------------------------------------------------------------------- /lagou/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | 6 | class Handle_lagou_mongo(object): 7 | def __init__(self): 8 | lagou_client = pymongo.MongoClient(host="127.0.0.1",port=27017) 9 | self.lagou_db = lagou_client['lagou'] 10 | 11 | def handle_save_data(self,item): 12 | print(item) 13 | lagou_collection = Collection(self.lagou_db,"lagou_data") 14 | lagou_collection.update({"positionId":item['positionId']},item,True) 15 | 16 | 17 | lagou_mongo = Handle_lagou_mongo() -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/douban.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #序号 15 | serial_number = scrapy.Field() 16 | #电影名称 17 | movie_name = scrapy.Field() 18 | #电影介绍 19 | introduce = scrapy.Field() 20 | #星级 21 | star = scrapy.Field() 22 | #评价 23 | evaluate = scrapy.Field() 24 | #电影描述 25 | describe = scrapy.Field() 26 | -------------------------------------------------------------------------------- /synchronous/test1.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | 4 | headers = { 5 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 6 | "Chrome/86.0.4240.75 Safari/537.36 " 7 | } 8 | 9 | 10 | async def sample_get(): 11 | # 发送一个简单的get请求 12 | async with aiohttp.ClientSession() as session: 13 | async with session.get(url="https://www.baidu.com", headers=headers) as response: 14 | print(response.status) 15 | print(await response.text()) 16 | 17 | 18 | if __name__ == '__main__': 19 | loop = asyncio.get_event_loop() 20 | loop.run_until_complete(sample_get()) 21 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_pool.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | 5 | def work(item): 6 | time.sleep(0.05) 7 | return "进程ID:{id},进程名称{name},执行任务item:{item}".format(id=multiprocessing.current_process().pid, 8 | name=multiprocessing.current_process().name, item=item) 9 | 10 | 11 | def main(): 12 | # 进程池大小为4 13 | pool = multiprocessing.Pool(processes=4) 14 | for item in range(100): 15 | result = pool.apply_async(func=work, args=(item,)) 16 | print(result.get()) 17 | pool.close() 18 | pool.join() 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /.idea/small-spider-project.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | 14 | 16 | 
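synchronous/sample/multiprocess_pool.py earlier on this line calls result.get() inside the submission loop, which blocks on each task in turn, so the four-worker pool effectively runs serially. A minimal sketch (same work function and timing) that submits all tasks first and only then collects the results, keeping the workers busy in parallel:

import multiprocessing
import time


def work(item):
    time.sleep(0.05)
    return "pid:{id}, item:{item}".format(
        id=multiprocessing.current_process().pid, item=item)


def main():
    pool = multiprocessing.Pool(processes=4)
    # Submit all 100 tasks before blocking on any result, so the pool
    # actually runs four tasks at a time instead of one after another.
    results = [pool.apply_async(func=work, args=(item,)) for item in range(100)]
    for result in results:
        print(result.get())
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()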
-------------------------------------------------------------------------------- /synchronous/handle_queue.py: -------------------------------------------------------------------------------- 1 | import queue 2 | from handle_request import DangdangRequest 3 | 4 | 5 | class DangdangQueue(object): 6 | def __init__(self): 7 | self.queue = queue.Queue() 8 | 9 | def insert_data(self, data): 10 | print("添加抓取任务: ", data) 11 | if isinstance(data, DangdangRequest): 12 | self.queue.put(data) 13 | return False 14 | 15 | def get_data(self): 16 | if not self.queue.empty(): 17 | data = self.queue.get() 18 | print("取出任务:", data) 19 | return data 20 | else: 21 | return False 22 | 23 | def database_empty(self): 24 | return self.queue.qsize() == 0 25 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DongqiudiItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #抓取URL 15 | from_url = scrapy.Field() 16 | #新闻标题 17 | title = scrapy.Field() 18 | #发表时间 19 | release_time = scrapy.Field() 20 | #作者 21 | author = scrapy.Field() 22 | #新闻内容 23 | content = scrapy.Field() 24 | # 抓取时间 25 | crawl_time = scrapy.Field() 26 | images = scrapy.Field() 27 | image_urls = scrapy.Field() 28 | image_paths = scrapy.Field() 29 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | import base64 9 | 10 | 11 | class DongqiudiProxyMiddleware(object): 12 | # 设置代理策略 13 | def process_request(self, request, spider): 14 | # proxy,主机头和端口号 15 | request.meta['proxy'] = 'http://http-dyn.abuyun.com:9020' 16 | # 用户名:密码,当前代理必须要有费用 17 | # 你自己买的代理,用户名和密码肯定和我的不一样 18 | proxy_name_pass = 'HTK32673HL02BK2D:50125D2D38937C94'.encode('utf-8') 19 | encode_pass_name = base64.b64encode(proxy_name_pass) 20 | # 将代理信息设置到头部去 21 | # 注意!!!!!Basic后面有一个空格 22 | request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode() -------------------------------------------------------------------------------- /kolesa/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | class Handle_Mongo(object): 6 | def __init__(self): 7 | mongo_client = pymongo.MongoClient(host="127.0.0.1",port=27017) 8 | self.db_data = mongo_client['kolesa'] 9 | 10 | def handle_save_task(self,item): 11 | task_collection = Collection(self.db_data,'kolesa_task') 12 | task_collection.update({'id':item['id']},item,True) 13 | 14 | def handle_get_task(self): 15 | task_collection = Collection(self.db_data,'kolesa_task') 16 | return task_collection.find_one_and_delete({}) 17 | 18 | def handle_save_data(self,item): 19 | task_collection = Collection(self.db_data,'kolesa_data') 20 | task_collection.update({'id':item['id']},item,True) 21 | 22 | kolesa_mongo = Handle_Mongo() 
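kolesa/handle_mongo.py just above, like the other Mongo handlers and pipelines in this repo, upserts with the legacy Collection.update(filter, doc, True) call, which PyMongo deprecated in 3.x and removed in 4.0. A minimal sketch of the equivalent upsert with update_one, assuming the same local MongoDB instance; the task document shown is hypothetical:

import pymongo

# Assumes the local MongoDB instance used by Handle_Mongo above.
client = pymongo.MongoClient(host="127.0.0.1", port=27017)
task_collection = client["kolesa"]["kolesa_task"]

item = {"id": "12345", "url": "https://kolesa.kz/a/show/12345"}  # hypothetical task
# update_one(..., upsert=True) replaces the deprecated update(filter, doc, True)
task_collection.update_one({"id": item["id"]}, {"$set": item}, upsert=True)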
-------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import requests 10 | import json 11 | import random 12 | 13 | 14 | class MafengwoProxyMiddleware(object): 15 | 16 | def process_response(self, request, response, spider): 17 | if 'mafengwo.net' in request.url: 18 | return response 19 | elif response is None: 20 | return request 21 | elif response.status == 302: 22 | return request 23 | elif response.status == 403: 24 | return request 25 | elif 'flashcookie.sw' in response.text: 26 | return request 27 | else: 28 | return response 29 | -------------------------------------------------------------------------------- /synchronous/sample/thread_test1.py: -------------------------------------------------------------------------------- 1 | import _thread 2 | import threading 3 | import time 4 | 5 | 6 | def _thread_handle(thread_name, delay): 7 | for num in range(10): 8 | time.sleep(delay) 9 | print("{}的num:{}".format(thread_name, num)) 10 | 11 | 12 | def threading_handle(delay=1): 13 | for num in range(10): 14 | time.sleep(delay) 15 | print("{}-num-{}".format(threading.current_thread().name, num)) 16 | 17 | 18 | def main(): 19 | # for item in range(10): 20 | # _thread.start_new_thread(_thread_handle, ("Thread - {}".format(item), 1)) 21 | # # 和进程不同,如果进程死亡,则线程也会死亡 22 | # time.sleep(200) 23 | for item in range(10): 24 | # thread = threading.Thread(target=threading_handle, args=(1,), name="执行线程-{}".format(item)) 25 | thread = threading.Thread(target=threading_handle, args=(1,)) 26 | thread.start() 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_test3.py: -------------------------------------------------------------------------------- 1 | import time 2 | import multiprocessing 3 | 4 | 5 | def status(): 6 | """守护进程方法""" 7 | while True: 8 | print("守护进程ID:{id},守护进程名称:{name}".format(id=multiprocessing.current_process().pid, 9 | name=multiprocessing.current_process().name)) 10 | time.sleep(1) 11 | 12 | 13 | def worker(): 14 | """具体执行工作的方法""" 15 | # 创建守护进程,daemon为TRUE 16 | daemon_process = multiprocessing.Process(target=status, name="守护进程", daemon=True) 17 | daemon_process.start() 18 | for item in range(10): 19 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, name=multiprocessing.current_process().name)) 20 | time.sleep(2) 21 | 22 | 23 | def main(): 24 | process = multiprocessing.Process(target=worker, name="工作进程") 25 | process.start() 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_test2.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | 5 | def send(msg): 6 | time.sleep(5) 7 | print("进程ID:{id},进程名称:{name},发送消息:{msg}".format(id=multiprocessing.current_process().pid, 8 | name=multiprocessing.current_process().name, msg=msg)) 9 | 10 | 11 | def main(): 12 | process = multiprocessing.Process(target=send, name="TEST", args=("发送消息测试",)) 13 | process.start() 14 | # 
阻塞主进程执行,将等待子进程执行完毕后再执行主进程 15 | # process.join() 16 | time.sleep(2) 17 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, 18 | name=multiprocessing.current_process().name)) 19 | # 中断进程前判断进程是否存活 20 | if process.is_alive(): 21 | # 中断进程 22 | process.terminate() 23 | print("进程被中断:{name}".format(name=process.name)) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /synchronous/sample/process_not_share.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import threading 3 | 4 | # 多进程修改值 5 | value = 0 6 | lock = multiprocessing.Lock() 7 | 8 | 9 | def test1(lock=None): 10 | global value 11 | for i in range(1000000): 12 | # 使用锁解决多线程共享变量时的不安全问题 13 | lock.acquire() 14 | value = value + 1 15 | lock.release() 16 | 17 | 18 | def multiprocess_value(): 19 | p1 = multiprocessing.Process(target=test1) 20 | p2 = multiprocessing.Process(target=test1) 21 | p1.start() 22 | p2.start() 23 | p1.join() 24 | p2.join() 25 | 26 | 27 | def thread_value(): 28 | t1 = threading.Thread(target=test1, args=(lock, )) 29 | t2 = threading.Thread(target=test1, args=(lock, )) 30 | t1.start() 31 | t2.start() 32 | t1.join() 33 | t2.join() 34 | 35 | 36 | if __name__ == '__main__': 37 | # 进程与进程之间不共享数据 38 | # multiprocess_value() 39 | # print(value) 40 | # 多线程间共享数据 41 | thread_value() 42 | print(value) 43 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_test1.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import cpu_count 2 | 3 | print("cpu内核数量为:{count}".format(count=cpu_count())) 4 | import multiprocessing 5 | import sys 6 | import time 7 | 8 | 9 | def worker(delay, count): 10 | for num in range(count): 11 | print("{process}进程ID:{id},进程名称:{name}".format(process=num, id=multiprocessing.current_process().pid, 12 | name=multiprocessing.current_process().name)) 13 | time.sleep(delay) 14 | 15 | 16 | def main(): 17 | # 创建三个进程 18 | for item in range(3): 19 | # 传入参数和进程名称 20 | process = multiprocessing.Process(target=worker, args=(1, 10,), name="item-{item}".format(item=item)) 21 | process.start() 22 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, name=multiprocessing.current_process().name)) 23 | # 未设置进程阻塞,主进程即使退出也不会影响子进程执行 24 | print("主进程退出") 25 | sys.exit(0) 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MafengwoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #文章数量 15 | article_sum = scrapy.Field() 16 | #文章标题 17 | title = scrapy.Field() 18 | #作者名称 19 | name = scrapy.Field() 20 | #id 21 | id = scrapy.Field() 22 | #文章发表时间 23 | release_time = scrapy.Field() 24 | #评论数 25 | comment_sum = scrapy.Field() 26 | #收藏数 27 | star_sum = scrapy.Field() 28 | #顶 29 | support_sum = scrapy.Field() 30 | #阅读数 31 | read_sum = scrapy.Field() 32 | #文章内容 33 | content = scrapy.Field() 34 | #抓取URL 35 | from_url = scrapy.Field() 36 | #抓取时间 37 | crawl_time = 
scrapy.Field() 38 | images = scrapy.Field() 39 | image_urls = scrapy.Field() 40 | image_paths = scrapy.Field() 41 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/handle_mongo.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from pymongo.collection import Collection 3 | 4 | 5 | 6 | 7 | class Mafengwo_mongo(object): 8 | def __init__(self): 9 | # mongo_client = pymongo.MongoClient(host='127.0.0.1', port=39070) 10 | mongo_client = pymongo.MongoClient(host='10.70.120.156', port=27017) 11 | self.db_data = mongo_client['oreo'] 12 | 13 | def get_from_url(self, item): 14 | db_collections = Collection(self.db_data, 'mafengwo_article') 15 | result = db_collections.find_one({'from_url':item}) 16 | if result: 17 | return True 18 | else: 19 | return False 20 | #return False 21 | 22 | def insert_task(self,item): 23 | db_collections = Collection(self.db_data, 'mafengwo_article_task') 24 | db_collections.insert_one(item) 25 | 26 | def get_task(self): 27 | db_collections = Collection(self.db_data, 'mafengwo_article_task') 28 | return db_collections.find_one_and_delete({}) 29 | 30 | 31 | mongo = Mafengwo_mongo() 32 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/js/handle_sn.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import time 3 | import requests 4 | import json 5 | 6 | 7 | 8 | 9 | for i in range(1,301): 10 | input_value = { 11 | "cost":"0", 12 | "days":"0", 13 | "mddid":"10065", 14 | "month":"0", 15 | "page":i, 16 | "pageid":"mdd_index", 17 | "sort":"1", 18 | "tagid":"0", 19 | "_ts":"1558433973256" 20 | } 21 | salt = "c9d6618dbc657b41a66eb0af952906f1" 22 | str = json.dumps(input_value)+salt 23 | 24 | # 创建md5对象 25 | hl = hashlib.md5() 26 | hl.update(str.encode(encoding='utf-8')) 27 | md5_result = hl.hexdigest()[2:12] 28 | # input_value['_sn'] = md5_result 29 | 30 | 31 | 32 | url = 'http://www.mafengwo.cn/gonglve/ajax.php?act=get_travellist' 33 | header = { 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 35 | } 36 | response = requests.post(url=url,headers=header,data=input_value) 37 | print(response.text) 38 | time.sleep(1) 39 | -------------------------------------------------------------------------------- /synchronous/handle_redis.py: -------------------------------------------------------------------------------- 1 | import redis 2 | from pickle import dumps, loads 3 | from handle_request import DangdangRequest 4 | 5 | 6 | class RedisQueue(object): 7 | def __init__(self): 8 | pool = redis.ConnectionPool(host="192.168.149.129", port=6379) 9 | self.r = redis.Redis(connection_pool=pool) 10 | 11 | def insert_data(self, data): 12 | print("添加抓取任务: ", data) 13 | if isinstance(data, DangdangRequest): 14 | self.r.rpush("TEST", dumps(data)) 15 | return False 16 | 17 | def get_data(self): 18 | if self.r.llen("TEST"): 19 | data = loads(self.r.lpop("TEST")) 20 | print("取出任务:", data) 21 | return data 22 | else: 23 | return False 24 | 25 | def database_empty(self): 26 | return self.r.llen("TEST") == 0 27 | 28 | 29 | if __name__ == '__main__': 30 | db = RedisQueue() 31 | start_url = "https://www.baidu.com" 32 | baidu_request = DangdangRequest(url=start_url, callback="hello", need_proxy=True) 33 | db.insert_data(data=baidu_request) 34 | request = db.get_data() 35 | 
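The __main__ demo at the end of handle_redis.py just above builds DangdangRequest(url=..., callback="hello", need_proxy=True) without a headers argument, but handle_request.py later in this dump defines __init__(self, url, headers, callback, ...) with no default for headers, so that demo raises a TypeError. A corrected usage sketch:

from handle_redis import RedisQueue
from handle_request import DangdangRequest

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
}
db = RedisQueue()
baidu_request = DangdangRequest(url="https://www.baidu.com", headers=headers,
                                callback="hello", need_proxy=True)
db.insert_data(data=baidu_request)
request = db.get_data()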
-------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MafengwoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #文章数量 15 | article_sum = scrapy.Field() 16 | #文章标题 17 | title = scrapy.Field() 18 | #作者名称 19 | name = scrapy.Field() 20 | #id 21 | id = scrapy.Field() 22 | #文章发表时间 23 | release_time = scrapy.Field() 24 | #评论数 25 | comment_sum = scrapy.Field() 26 | #收藏数 27 | star_sum = scrapy.Field() 28 | #顶 29 | support_sum = scrapy.Field() 30 | #阅读数 31 | read_sum = scrapy.Field() 32 | #文章内容 33 | content = scrapy.Field() 34 | #抓取URL 35 | from_url = scrapy.Field() 36 | upload_status = scrapy.Field() 37 | #抓取时间 38 | crawl_time = scrapy.Field() 39 | images = scrapy.Field() 40 | image_urls = scrapy.Field() 41 | image_paths = scrapy.Field() 42 | video_urls = scrapy.Field() 43 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo,json 9 | from pymongo.collection import Collection 10 | 11 | class DoubanPipeline(object): 12 | def __init__(self): 13 | mongo_client = pymongo.MongoClient(host='127.0.0.1', port=27017) 14 | self.db_data = mongo_client['douban_scrapy'] 15 | 16 | def process_item(self, item, spider): 17 | #指定数据库和表 18 | douban_collection = Collection(self.db_data,'douban') 19 | douban_collection.insert(dict(item)) 20 | return item 21 | 22 | class DoubanJsonPipeline(object): 23 | def __init__(self): 24 | self.file = open('douban.json','w') 25 | 26 | def process_item(self, item, spider): 27 | # json数据中添加逗号和换行符 28 | content = json.dumps(dict(item),ensure_ascii = False) + ",\n" 29 | self.file.write(content) 30 | return item 31 | 32 | def close_spider(self,spider): 33 | self.file.close() 34 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_class.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import sys 3 | import time 4 | 5 | 6 | # 继承multiprocessing.Process类 7 | class MyProcess(multiprocessing.Process): 8 | def __init__(self, name, delay, count): 9 | # 调用父类方法传入名称 10 | super().__init__(name=name) 11 | self.delay = delay 12 | self.count = count 13 | 14 | # 多进程类具体执行方法 15 | def run(self) -> None: 16 | for num in range(self.count): 17 | print("{process}进程ID:{id},进程名称:{name}".format(process=num, id=multiprocessing.current_process().pid, 18 | name=multiprocessing.current_process().name)) 19 | time.sleep(self.delay) 20 | 21 | 22 | def main(): 23 | for item in range(3): 24 | process = MyProcess(name="item-{id}".format(id=item), delay=1, count=10) 25 | # 多进程类start方法会调用run方法 26 | process.start() 27 | 28 | print("进程ID:{id},进程名称:{name}".format(id=multiprocessing.current_process().pid, 29 | name=multiprocessing.current_process().name)) 30 | print("主进程退出") 31 | sys.exit(0) 32 | 
33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from pymongo.collection import Collection 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy import Request 11 | 12 | class MafengwoPipeline(object): 13 | def __init__(self): 14 | mongo_client = pymongo.MongoClient(host='127.0.0.1', port=27017) 15 | self.db_data = mongo_client['mafengwo'] 16 | 17 | def process_item(self, item, spider): 18 | db_collections = Collection(self.db_data, 'mafengwo_article') 19 | db_collections.update({'from_url':item['from_url']},item,True) 20 | return item 21 | 22 | 23 | class MafengwoImagePipeline(ImagesPipeline): 24 | def get_media_requests(self, item, info): 25 | for image_url in item['image_urls']: 26 | yield Request(url=image_url) 27 | 28 | def item_completed(self, results, item, info): 29 | image_paths = [x['path'] for ok, x in results if ok] 30 | if not image_paths: 31 | pass 32 | item['image_paths'] = image_paths 33 | return item 34 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import base64 8 | import random 9 | 10 | class ProxyMiddleware(object): 11 | def __init__(self): 12 | self.proxy_info = [ 13 | {'proxy_url': 'ip4.hahado.cn:35410', 'proxy_user_pass': b'duoipbpvzyymn:tRf6NnfsBi7k0'}, 14 | {'proxy_url': 'ip4.hahado.cn:35164', 'proxy_user_pass': b'duoipcnezxjlvkv:xXuXTPES9XPwp'}, 15 | {'proxy_url': 'ip4.hahado.cn:35401', 'proxy_user_pass': b'duoipwpdlrfwc:888888'}, 16 | {'proxy_url': 'ip4.hahado.cn:35404', 'proxy_user_pass': b'duoipcnxgfzfsyp:TjgLhDqqEj0Pe'}, 17 | {'proxy_url': 'ip4.hahado.cn:35413', 'proxy_user_pass': b'duoipvriezfde:bq4RYrQiWuQzv'}, 18 | ] 19 | 20 | def process_request(self, request, spider): 21 | proxy = random.choice(self.proxy_info) 22 | request.meta['proxy'] = proxy['proxy_url'] 23 | proxy_user_pass = proxy['proxy_user_pass'] 24 | encoded_user_pass = base64.b64encode(proxy_user_pass) 25 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass.decode() 26 | # return None 27 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import base64 8 | 
9 | 10 | class MafengwoProxyMiddleware(object): 11 | #设置代理策略 12 | def process_request(self, request, spider): 13 | # proxy,主机头和端口号 14 | request.meta['proxy'] = 'http://http-dyn.abuyun.com:9020' 15 | # 用户名:密码,当前代理必须要有费用 16 | # 你自己买的代理,用户名和密码肯定和我的不一样 17 | proxy_name_pass = 'HTK32673HL02BK2D:50125D2D38937C94'.encode('utf-8') 18 | encode_pass_name = base64.b64encode(proxy_name_pass) 19 | # 将代理信息设置到头部去 20 | # 注意!!!!!Basic后面有一个空格 21 | request.headers['Proxy-Authorization'] = 'Basic ' + encode_pass_name.decode() 22 | 23 | #通过response判断下载是否成功 24 | def process_response(self, request, response, spider): 25 | if 'mafengwo.net' in request.url: 26 | return response 27 | elif response is None: 28 | return request 29 | elif response.status == 302: 30 | return request 31 | elif response.status == 403: 32 | return request 33 | elif 'flashcookie.sw' in response.text: 34 | return request 35 | else: 36 | return response 37 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | import pymongo 10 | from pymongo.collection import Collection 11 | from scrapy.pipelines.images import ImagesPipeline 12 | from scrapy import Request 13 | 14 | #存储数据 15 | class DongqiudiPipeline(object): 16 | def __init__(self): 17 | mongo_client = pymongo.MongoClient(host='192.168.7.142',port=27017) 18 | self.dongqiudi_db = mongo_client['dongqiudi_data'] 19 | def process_item(self, item, spider): 20 | dongqiudi_collection = Collection(self.dongqiudi_db,"dongqiudi") 21 | dongqiudi_collection.update({'from_url':item['from_url']},item,True) 22 | return item 23 | 24 | #下载图片 25 | class DongqiudiImagePipeline(ImagesPipeline): 26 | def get_media_requests(self, item, info): 27 | for image_url in item['image_urls']: 28 | yield Request(url=image_url,meta={'img_name':image_url,'photo_id':item['title']}) 29 | 30 | def item_completed(self, results, item, info): 31 | image_paths = [x['path'] for ok, x in results if ok] 32 | if not image_paths: 33 | pass 34 | return item 35 | 36 | def file_path(self, request, response=None, info=None): 37 | filename = './' + str(request.meta['photo_id'])+'/'+request.meta['img_name'].split("/")[-1] 38 | return filename 39 | -------------------------------------------------------------------------------- /synchronous/handle_request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import traceback 3 | 4 | 5 | class DangdangRequest(object): 6 | def __init__(self, url, headers, callback, method="GET", need_proxy=False, fail_time=0, timeout=(5, 5)): 7 | self.callback = callback 8 | self.need_proxy = need_proxy 9 | self.fail_time = fail_time 10 | self.timeout = timeout 11 | self.headers = headers 12 | self.url = url 13 | self.method = method 14 | 15 | def __str__(self): 16 | return self.url 17 | 18 | def send_request(self): 19 | print("请求{url}".format(url=self.url)) 20 | proxy_info = {} 21 | if self.method == "GET": 22 | try: 23 | if not self.need_proxy: 24 | response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout) 25 | else: 26 | response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout, 27 | proxies=proxy_info) 28 | except Exception as e: 29 | 
print(traceback.format_exc()) 30 | return self 31 | else: 32 | return response 33 | 34 | 35 | if __name__ == '__main__': 36 | headers = { 37 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36 " 38 | } 39 | q = DangdangRequest(url="https://www.baidu.com", headers=headers, callback="hello") 40 | response = q.send_request() 41 | print(response.text) 42 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from pymongo.collection import Collection 9 | from scrapy.pipelines.images import ImagesPipeline 10 | from scrapy import Request 11 | 12 | class MafengwoPipeline(object): 13 | def __init__(self): 14 | # mongo_client = pymongo.MongoClient(host='127.0.0.1', port=39070) 15 | mongo_client = pymongo.MongoClient(host='10.70.120.156', port=27017) 16 | self.db_data = mongo_client['oreo'] 17 | 18 | def process_item(self, item, spider): 19 | db_collections = Collection(self.db_data, 'mafengwo_article') 20 | db_collections.update({'from_url':item['from_url']},item,True) 21 | return item 22 | 23 | 24 | class MafengwoImagePipeline(ImagesPipeline): 25 | def get_media_requests(self, item, info): 26 | for image_url in item['image_urls']: 27 | yield Request(url=image_url,meta={'img_name':image_url,'photo_id':item['id']}) 28 | 29 | def item_completed(self, results, item, info): 30 | image_paths = [x['path'] for ok, x in results if ok] 31 | if not image_paths: 32 | pass 33 | #item['image_paths'] = image_paths 34 | return item 35 | 36 | def file_path(self, request, response=None, info=None): 37 | filename = './' + str(request.meta['photo_id'])+'/'+request.meta['img_name'].split("/")[-1] 38 | return filename 39 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 50 | -------------------------------------------------------------------------------- /synchronous/sample/multiprocess_share.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | value = 1 5 | 6 | 7 | def send_data(conn): 8 | global value 9 | value = value + 1 10 | conn.send(value) 11 | 12 | 13 | def receive_data(conn): 14 | print("接收到的数据为:{data}".format(data=conn.recv())) 15 | 16 | 17 | def pipe_main(): 18 | # 进程通信管道 19 | conn_recv, conn_send = multiprocessing.Pipe() 20 | process_send = multiprocessing.Process(target=send_data, args=(conn_send,)) 21 | process_send.start() 22 | process_send.join() 23 | process_recv = multiprocessing.Process(target=receive_data, args=(conn_recv,)) 24 | process_recv.start() 25 | process_recv.join() 26 | 27 | 28 | def worker(dict, lock): 29 | while True: 30 | # lock.acquire() 31 | with lock: 32 | number = dict.get("ticket") 33 | if number > 0: 34 | time.sleep(1) 35 | number = number - 1 36 
| print("{}-ticket={}".format(multiprocessing.current_process().name, number)) 37 | dict.update({"ticket": number}) 38 | else: 39 | print("无票") 40 | break 41 | # lock.release() 42 | 43 | 44 | def main(): 45 | # 使用manager操作字典共享 46 | manager = multiprocessing.Manager() 47 | mgr_dict = manager.dict(ticket=5) 48 | lock = multiprocessing.Lock() 49 | print(mgr_dict) 50 | job_process = [multiprocessing.Process(target=worker, args=(mgr_dict, lock,), name="售票员-{item}".format(item=item)) 51 | for item in range(3)] 52 | for job in job_process: 53 | job.start() 54 | 55 | for end in job_process: 56 | end.join() 57 | 58 | 59 | if __name__ == '__main__': 60 | # pipe_main() 61 | main() 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/handle_mysql.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | import time 3 | import setting 4 | import csv 5 | 6 | 7 | 8 | class Handle_mysql(object): 9 | def __init__(self): 10 | self.db = pymysql.connect(host=setting.mysql_ip,port=setting.mysql_port,database=setting.mysql_database,user=setting.mysql_username,password=setting.mysql_password) 11 | self.cursor = self.db.cursor() 12 | 13 | def __del__(self): 14 | self.cursor.close() 15 | self.db.close() 16 | 17 | def handle_task(self): 18 | #获取任务关键字 19 | sql = "SELECT search_word FROM seo_fast_rankings WHERE state=1;" 20 | self.cursor.execute(sql) 21 | result = self.cursor.fetchall() 22 | return result 23 | 24 | #插入和更新数据 25 | def handle_insert_db(self,item=None): 26 | sql_insert = """ INSERT INTO seo_baidu_m_keyword_ziran (keyword,rank,crawl_date) VALUES 
("%s",'%s',"%s");""" % (item['keyword'],item['rank'],item['crawl_date']) 27 | try: 28 | self.cursor.execute(sql_insert) 29 | self.db.commit() 30 | except: 31 | pass 32 | # print(sql_insert) 33 | 34 | mysql = Handle_mysql() 35 | if __name__ == '__main__': 36 | #插入数据前先删除当日数据 37 | date = time.strftime("%Y-%m-%d", time.localtime()) 38 | sql_delete = """ DELETE FROM seo_baidu_m_keyword_ziran where crawl_date='%s'"""%date 39 | mysql.cursor.execute(sql_delete) 40 | mysql.db.commit() 41 | #导入当日数据 42 | with open('baidu_m_keyword_ziran.csv','r',encoding='utf-8') as f: 43 | csv_reader = csv.reader(f) 44 | data = next(csv_reader) 45 | for i in csv_reader: 46 | info = {} 47 | info['keyword'] = i[0] 48 | info['rank'] = i[1] 49 | info['crawl_date'] = i[2] 50 | mysql.handle_insert_db(info) 51 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/spiders/douban_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from douban.items import DoubanItem 4 | 5 | 6 | class DoubanSpiderSpider(scrapy.Spider): 7 | # scrapy项目名称 8 | name = 'douban_spider' 9 | allowed_domains = ['douban.com'] 10 | # 起始URL 11 | start_urls = ['https://movie.douban.com/top250?start=0&filter='] 12 | custom_settings = { 13 | 'USER_AGENT':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0' 14 | } 15 | 16 | # 解析方法 17 | def parse(self, response): 18 | movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li") 19 | for i_item in movie_list: 20 | douban_item = DoubanItem() 21 | douban_item['serial_number'] = i_item.xpath(".//div[@class='item']//em/text()").extract_first() 22 | douban_item['movie_name'] = i_item.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first() 23 | content = i_item.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract() 24 | for i_content in content: 25 | content_s = "".join(i_content.split()) 26 | douban_item['introduce'] = content_s 27 | douban_item['star'] = i_item.xpath(".//span[@class='rating_num']/text()").extract_first() 28 | douban_item['evaluate'] = i_item.xpath(".//div[@class='star']//span[4]/text()").extract_first() 29 | douban_item['describe'] = i_item.xpath(".//p[@class='quote']/span/text()").extract_first() 30 | # yield到pipeline,settings中需要启用,否则无法存储数据 31 | yield douban_item 32 | 33 | nextLink = response.xpath('//span[@class="next"]/link/@href').extract() 34 | # 第10页是最后一页,没有下一页的链接 35 | if nextLink: 36 | nextLink = nextLink[0] 37 | print (nextLink) 38 | yield scrapy.Request('https://movie.douban.com/top250'+nextLink, callback=self.parse) 39 | # # 递归将下一页的地址传给这个函数自己,在进行爬取 40 | -------------------------------------------------------------------------------- /login_github/handle_login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | 5 | class Login(object): 6 | def __init__(self): 7 | self.login_session = requests.session() 8 | self.header = { 9 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" 10 | } 11 | self.city_token = "" 12 | 13 | 14 | def handle_city_token(self): 15 | """ 16 | 获取city_token,为登录做准备 17 | :return:self.city_token 18 | """ 19 | login_url = "https://github.com/login" 20 | response = self.login_session.get(url=login_url,headers=self.header) 21 | city_token_search = 
re.compile(r'name="authenticity_token"\svalue="(.*?)"\s\/>') 22 | self.city_token = city_token_search.search(response.text).group(1) 23 | 24 | def handle_login_github(self): 25 | """ 26 | 执行登录 27 | :return: 登录后匹配的字符串 28 | """ 29 | login_name = input("请输入用户名:") 30 | login_password = input("请输入密码:") 31 | self.handle_city_token() 32 | #获取登录cookie 33 | self.login_session.get(url="https://github.com/manifest.json",headers=self.header) 34 | data = { 35 | "commit": "Sign in", 36 | "utf8": "✓", 37 | "authenticity_token":self.city_token, 38 | "login": login_name, 39 | "password": login_password, 40 | "webauthn-support": "supported", 41 | } 42 | session_url = "https://github.com/session" 43 | self.header['Referer'] = "https://github.com/login" 44 | # 登录 45 | self.login_session.post(url=session_url,headers=self.header,data=data) 46 | self.header.pop('Referer') 47 | #请求设置页 48 | response = self.login_session.get(url="https://github.com/settings/profile",headers=self.header) 49 | search_email = re.compile(login_name) 50 | # 登陆成功后可以获取到自己的登录名称 51 | print(search_email.search(response.text).group()) 52 | if __name__ == '__main__': 53 | github = Login() 54 | github.handle_login_github() 55 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/spiders/crawl_dongqiudi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from ..items import DongqiudiItem 4 | import time 5 | import json 6 | 7 | 8 | class CrawlDongqiudiSpider(scrapy.Spider): 9 | name = 'crawl_dongqiudi' 10 | allowed_domains = ['dongqiudi.com'] 11 | start_urls = ['http://dongqiudi.com/'] 12 | 13 | #分析懂球帝页面,通过浏览器开发者工具xhr,可以看到异步json请求 14 | def start_requests(self,time_value=None): 15 | #初始时间使用time.time()构造 16 | if time_value == None: 17 | time_value = int(time.time()) 18 | #分析页面新闻结构 19 | for item_value in [56,232,57,3,4,5,6]: 20 | #如该请求https://dongqiudi.com/api/app/tabs/web/56.json?after=1572577395&page=1 21 | #其中56为栏目编号,after为时间戳,page为页码 22 | page_url = "https://dongqiudi.com/api/app/tabs/web/%s.json?after=%s&page=1"%(item_value,time_value) 23 | yield scrapy.Request(url=page_url,callback=self.handle_page_response,dont_filter=True) 24 | 25 | #处理页码请求的返回 26 | def handle_page_response(self,response): 27 | response_dict = json.loads(response.text) 28 | #从返回中获取下一页链接 29 | next_url = response_dict.get('next') 30 | if next_url: 31 | #请求下一页 32 | yield scrapy.Request(url=next_url,callback=self.handle_page_response,dont_filter=True) 33 | 34 | #解析新闻列表 35 | news_list = response_dict.get('articles') 36 | if news_list: 37 | for item in news_list: 38 | info = {} 39 | #新闻URL 40 | info['from_url'] = item.get('url') 41 | #新闻标题 42 | info['title'] = item.get('title') 43 | #新闻发表时间 44 | info['release_time'] = item.get('published_at') 45 | yield scrapy.Request(url=info['from_url'],callback=self.handle_detail,dont_filter=True,meta=info) 46 | 47 | #处理新闻详情页 48 | def handle_detail(self,response): 49 | dongqiudi = DongqiudiItem() 50 | #作者 51 | dongqiudi['author'] = response.xpath("//header/h2/a/text()").extract_first() 52 | #内容 53 | dongqiudi['content'] = ''.join(response.xpath("//div[@class='con']/p/text()").extract()) 54 | #新闻图片 55 | dongqiudi['image_urls'] = response.xpath("//div[@class='con']/p/img/@data-src").extract() 56 | #抓取时间 57 | dongqiudi['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 58 | #新闻标题 59 | dongqiudi['title'] = response.request.meta['title'] 60 | #抓取url 61 | dongqiudi['from_url'] = 
response.request.meta['from_url'] 62 | #发表时间 63 | dongqiudi['release_time'] = response.request.meta['release_time'] 64 | #yield到pipeline中 65 | yield dongqiudi 66 | -------------------------------------------------------------------------------- /synchronous/handle_spider.py: -------------------------------------------------------------------------------- 1 | # from handle_redis import RedisQueue 2 | from handle_queue import DangdangQueue 3 | from handle_request import DangdangRequest 4 | from lxml import etree 5 | import time 6 | 7 | 8 | class Spider(object): 9 | headers = { 10 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 11 | "Chrome/86.0.4240.75 Safari/537.36 " 12 | } 13 | 14 | # queue = RedisQueue() 15 | queue = DangdangQueue() 16 | 17 | def start(self): 18 | """爬虫起始方法""" 19 | for page in range(1, 26): 20 | start_url = "http://bang.dangdang.com/books/fivestars/2-{page}".format(page=page) 21 | dangdang_request = DangdangRequest(url=start_url, callback=self.parse_item, headers=Spider.headers) 22 | Spider.queue.insert_data(data=dangdang_request) 23 | 24 | def do_request(self, request): 25 | """发送请求""" 26 | response = request.send_request() 27 | return response 28 | 29 | def parse_item(self, response): 30 | """解析数据""" 31 | data = [] 32 | html = etree.HTML(response.text) 33 | items = html.xpath("//ul[@class='bang_list']/li") 34 | for item in items: 35 | title = item.xpath(".//div[@class='name']/a/text()") 36 | if title: 37 | data.extend(title) 38 | yield data 39 | 40 | def error(self, request): 41 | """请求错误后返回队列""" 42 | request.fail_time = request.fail_time + 1 43 | if request.fail_time < 20: 44 | print("该请求异常{url}, 将该请求放回队列".format(url=request)) 45 | Spider.queue.insert_data(data=request) 46 | 47 | def schedule(self): 48 | """任务调度""" 49 | start_time = time.time() 50 | while not Spider.queue.database_empty(): 51 | dangdang_request = self.queue.get_data() 52 | if dangdang_request: 53 | print("当前调度:", dangdang_request) 54 | callback = dangdang_request.callback 55 | response = self.do_request(dangdang_request) 56 | if not isinstance(response, DangdangRequest): 57 | # 通过回调方法解析 58 | result = callback(response) 59 | for item in result: 60 | print(item) 61 | else: 62 | dangdang_request = DangdangRequest(url=response.url, headers=Spider.headers, callback=self.parse_item) 63 | # 错误处理 64 | self.error(dangdang_request) 65 | print("共耗时:", time.time()-start_time) 66 | 67 | def run(self): 68 | self.start() 69 | self.schedule() 70 | 71 | 72 | if __name__ == '__main__': 73 | s = Spider() 74 | s.run() 75 | -------------------------------------------------------------------------------- /douban_movie_top250/crawl_douban_movie_info_top250.py: -------------------------------------------------------------------------------- 1 | import re 2 | from concurrent.futures import ThreadPoolExecutor 3 | import requests 4 | from lxml import etree 5 | from handle_mongo import douban_mongo 6 | 7 | 8 | class HandleDoubanMovieTop250(object): 9 | def __init__(self): 10 | self.header = { 11 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 12 | "Accept-Encoding":"gzip, deflate, br", 13 | "Accept-Language":"zh-CN,zh;q=0.9", 14 | "Connection":"keep-alive", 15 | "Host":"movie.douban.com", 16 | "Upgrade-Insecure-Requests":"1", 17 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 18 | } 19 | self.page_url = [] 20 | 21 | 
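# A minimal, self-contained sketch of the queue + callback scheduling idea used by
# synchronous/handle_spider.py above: requests wait in a queue, the scheduler sends each one,
# and every successful response is handed to the request's callback. SimpleRequest, parse_titles
# and the two-page range below are illustrative stand-ins, not the project's
# DangdangRequest/DangdangQueue classes.
from collections import deque

import requests
from lxml import etree


class SimpleRequest(object):
    def __init__(self, url, callback, fail_time=0):
        self.url = url
        self.callback = callback
        self.fail_time = fail_time


def parse_titles(response):
    # Same extraction idea as Spider.parse_item: book titles on the dangdang five-star list
    html = etree.HTML(response.text)
    return html.xpath("//ul[@class='bang_list']/li//div[@class='name']/a/text()")


def schedule(queue, max_fail=3):
    while queue:
        request = queue.popleft()
        try:
            response = requests.get(request.url, timeout=10)
        except requests.RequestException:
            request.fail_time += 1
            if request.fail_time < max_fail:
                # put the failed request back, the same recovery step Spider.error takes
                queue.append(request)
            continue
        for title in request.callback(response):
            print(title)


if __name__ == '__main__':
    q = deque(SimpleRequest("http://bang.dangdang.com/books/fivestars/2-%d" % page, parse_titles)
              for page in range(1, 3))
    schedule(q)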
def handle_page_url(self): 22 | #通过分析页面URL可以得知 23 | #通过range构造页码变量,从0开始,到249结束,步长为25 24 | for i in range(0,250,25): 25 | url = "https://movie.douban.com/top250?start=%s"%i 26 | self.page_url.append(url) 27 | 28 | #处理请求方法 29 | def handle_request(self,url): 30 | response = requests.get(url=url,headers=self.header) 31 | return response.text 32 | 33 | 34 | #处理页码页 35 | def handle_page_detail(self,url): 36 | print(url) 37 | #处理特殊字符 38 | sub_search = re.compile(r"[\s\r\t]") 39 | response = self.handle_request(url=url) 40 | html = etree.HTML(response) 41 | #解析当前页面有多少个电影信息 42 | item_list = html.xpath("//ol[@class='grid_view']/li") 43 | for item in item_list: 44 | info = {} 45 | #电影名称,将特殊字符替换为空 46 | info['movie_name'] = sub_search.sub('',''.join(item.xpath(".//div[@class='hd']/a//span/text()"))) 47 | info['actors_information'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/p/text()"))) 48 | info['score'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/div[@class='star']/span[2]/text()"))) 49 | info['evaluate'] = sub_search.sub('',''.join(item.xpath(".//div[@class='bd']/div[@class='star']/span[4]/text()"))) 50 | info['describe'] = sub_search.sub('',''.join(item.xpath(".//p[@class='quote']/span/text()"))) 51 | info['from_url'] = url 52 | #数据入库 53 | douban_mongo.handle_save_data(info) 54 | 55 | #启动方法 56 | def run(self): 57 | self.handle_page_url() 58 | #创建线程池 59 | t = ThreadPoolExecutor() 60 | for i in self.page_url: 61 | t.submit(self.handle_page_detail,i) 62 | t.shutdown() 63 | 64 | #入口函数 65 | def main(): 66 | douban = HandleDoubanMovieTop250() 67 | douban.run() 68 | 69 | if __name__ == '__main__': 70 | #入口函数调用 71 | main() 72 | -------------------------------------------------------------------------------- /synchronous/spider_multiprocess.py: -------------------------------------------------------------------------------- 1 | # from handle_redis import RedisQueue 2 | import multiprocessing 3 | from handle_queue import DangdangQueue 4 | from handle_request import DangdangRequest 5 | from lxml import etree 6 | import time 7 | 8 | 9 | class Spider(object): 10 | headers = { 11 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " 12 | "Chrome/86.0.4240.75 Safari/537.36 " 13 | } 14 | 15 | # queue = RedisQueue() 16 | queue = DangdangQueue() 17 | 18 | def start(self): 19 | """爬虫起始方法""" 20 | for page in range(1, 26): 21 | start_url = "http://bang.dangdang.com/books/fivestars/2-{page}".format(page=page) 22 | dangdang_request = DangdangRequest(url=start_url, callback=self.parse_item, headers=Spider.headers) 23 | Spider.queue.insert_data(data=dangdang_request) 24 | 25 | def do_request(self, request): 26 | """发送请求""" 27 | response = request.send_request() 28 | return response 29 | 30 | def parse_item(self, response): 31 | """解析数据""" 32 | data = [] 33 | html = etree.HTML(response.text) 34 | items = html.xpath("//ul[@class='bang_list']/li") 35 | for item in items: 36 | title = item.xpath(".//div[@class='name']/a/text()") 37 | if title: 38 | data.extend(title) 39 | yield data 40 | 41 | def error(self, request): 42 | """请求错误后返回队列""" 43 | request.fail_time = request.fail_time + 1 44 | if request.fail_time < 20: 45 | print("该请求异常{url}, 将该请求放回队列".format(url=request)) 46 | Spider.queue.insert_data(data=request) 47 | 48 | def handle_worker(self, request): 49 | print("{name}调度{url}".format(name=multiprocessing.current_process().name, url=request.url)) 50 | callback = request.callback 51 | response = self.do_request(request) 52 | if not 
isinstance(response, DangdangRequest): 53 | # 通过回调方法解析 54 | result = callback(response) 55 | for item in result: 56 | print(item) 57 | else: 58 | dangdang_request = DangdangRequest(url=response.url, headers=Spider.headers, callback=self.parse_item) 59 | # 错误处理 60 | self.error(dangdang_request) 61 | 62 | def schedule(self): 63 | """任务调度""" 64 | start_time = time.time() 65 | pool = multiprocessing.Pool(multiprocessing.cpu_count()) 66 | while not Spider.queue.database_empty(): 67 | dangdang_request = self.queue.get_data() 68 | if dangdang_request: 69 | pool.apply_async(func=self.handle_worker, args=(dangdang_request,)) 70 | pool.close() 71 | pool.join() 72 | print("共耗时:", time.time()-start_time) 73 | 74 | def run(self): 75 | self.start() 76 | self.schedule() 77 | 78 | 79 | if __name__ == '__main__': 80 | s = Spider() 81 | s.run() 82 | -------------------------------------------------------------------------------- /video/lishipin/crawl_lishipin.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | from lxml import etree 4 | import re 5 | 6 | class HandleLishipin(object): 7 | def __init__(self): 8 | self.header = { 9 | "Connection":"keep-alive", 10 | "Upgrade-Insecure-Requests":"1", 11 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 12 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 13 | "Accept-Encoding":"gzip, deflate, br", 14 | "Accept-Language":"zh-CN,zh;q=0.9", 15 | } 16 | 17 | def handle_html(self,url): 18 | response = requests.get(url=url,headers=self.header) 19 | return response.text 20 | 21 | if __name__ == '__main__': 22 | l = HandleLishipin() 23 | list_url = [ 24 | {"name":"新知","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=10&start=%d&sort=%d"}, 25 | {"name":"社会","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=1&start=%d&sort=%d"}, 26 | {"name":"世界","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=2&start=%d&sort=%d"}, 27 | {"name":"生活","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=5&start=%d&sort=%d"}, 28 | {"name":"娱乐","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=4&start=%d&sort=%d"}, 29 | {"name":"财富","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=3&start=%d&sort=%d"}, 30 | {"name":"美食","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=6&start=%d&sort=%d"}, 31 | {"name":"音乐","item_url":"https://www.pearvideo.com/popular_loading.jsp?reqType=1&categoryId=59&start=%d&sort=%d"}, 32 | ] 33 | for item in list_url: 34 | for i in range(0,110,10): 35 | item_url =item['item_url']%(i,i) 36 | detail_text = l.handle_html(item_url) 37 | detail_html = etree.HTML(detail_text) 38 | detail_url = detail_html.xpath("//li[@class='popularem clearfix']//a[@class='actplay']/@href") 39 | video_url_search = re.compile(r'srcUrl="(.*?)"') 40 | video_name_search = re.compile(r'(.*?)') 41 | for url in detail_url: 42 | url = "https://www.pearvideo.com/"+url 43 | video_text = l.handle_html(url) 44 | video_url = video_url_search.search(video_text).group(1) 45 | video_name = video_name_search.search(video_text).group(1) 46 | info = {} 47 | info['video_url'] = video_url 48 | info['name'] = video_name 49 | info['type'] = item['name'] 50 | info['from_url'] = url 51 | 
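# The surrounding loop only assembles metadata for each pearvideo clip (the direct MP4 address
# ends up in info['video_url']); nothing is actually written to disk. If the files themselves
# are wanted, a streamed download along the following lines would work — the save directory and
# chunk size here are illustrative assumptions, not part of the original crawler.
import os

import requests


def download_video(video_url, name, save_dir="./lishipin_videos"):
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, "%s.mp4" % name)
    # stream=True keeps the whole video from being read into memory at once
    with requests.get(video_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(path, "wb") as f:
            for chunk in response.iter_content(chunk_size=256 * 1024):
                if chunk:
                    f.write(chunk)
    return path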
info['crawl_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 52 | -------------------------------------------------------------------------------- /douban_movie_top250_scrapy/douban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for douban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'douban' 13 | 14 | SPIDER_MODULES = ['douban.spiders'] 15 | NEWSPIDER_MODULE = 'douban.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'douban.middlewares.DoubanSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'douban.middlewares.ProxyMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'douban.pipelines.DoubanPipeline': 300, 69 | 'douban.pipelines.DoubanJsonPipeline': 301, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See 
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | REDIRECT_ENABLED = False 94 | HTTPERROR_ALLOWED_CODES = [302] 95 | 96 | RETRY_ENABLED = True 97 | RETRY_HTTP_CODES = [503] 98 | RETRY_TIMES = 5 99 | -------------------------------------------------------------------------------- /dongqiudi/dongqiudi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dongqiudi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dongqiudi' 13 | 14 | SPIDER_MODULES = ['dongqiudi.spiders'] 15 | NEWSPIDER_MODULE = 'dongqiudi.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 2 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'dongqiudi.middlewares.DongqiudiSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | # 'dongqiudi.middlewares.DongqiudiDownloaderMiddleware': 543, 57 | 'dongqiudi.middlewares.DongqiudiProxyMiddleware': 543, 58 | } 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | #设置图片保存路径 67 | IMAGES_STORE = './dongqiudi_pic' 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | 72 | ITEM_PIPELINES = { 73 | 'dongqiudi.pipelines.DongqiudiPipeline': 300, 74 | #必须设置IMAGES_STORE,否则这条中间件不起作用 75 | 'dongqiudi.pipelines.DongqiudiImagePipeline': 209 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See
https://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mafengwo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mafengwo' 13 | 14 | SPIDER_MODULES = ['mafengwo.spiders'] 15 | NEWSPIDER_MODULE = 'mafengwo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'mafengwo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 2#根据代理隧道数确定请求数 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 0.1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | # "Host":"www.mafengwo.cn", 44 | # "Connection":"keep-alive", 45 | # "Upgrade-Insecure-Requests":"1", 46 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 47 | # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 48 | # "Accept-Encoding":"gzip, deflate", 49 | # "Accept-Language":"zh-CN,zh;q=0.9", 50 | } 51 | 52 | # Enable or disable spider middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'mafengwo.middlewares.MafengwoSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 60 | DOWNLOADER_MIDDLEWARES = { 61 | # 'mafengwo.middlewares.MafengwoDownloaderMiddleware': 
543, 62 | 'mafengwo.middlewares.MafengwoProxyMiddleware': 543, 63 | } 64 | 65 | # Enable or disable extensions 66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | 'mafengwo.pipelines.MafengwoPipeline': 300, 75 | 'mafengwo.pipelines.MafengwoImagePipeline': 301, 76 | } 77 | 78 | IMAGES_STORE="./mafengwo_images" 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | DOWNLOAD_TIMEOUT = 10 101 | -------------------------------------------------------------------------------- /boss_zhipin/crawl_boss_zhipin.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urljoin 2 | import requests 3 | import pymongo 4 | from pymongo.collection import Collection 5 | import time 6 | import json 7 | from lxml import etree 8 | from concurrent.futures.thread import ThreadPoolExecutor 9 | 10 | 11 | 12 | class HandleBossZhiPin(object): 13 | def __init__(self): 14 | self.header = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 16 | } 17 | self.city_list = "" 18 | boss_client = pymongo.MongoClient(host="127.0.0.1", port=27017) 19 | self.boss_db = boss_client['boss'] 20 | self.city_list = [] 21 | 22 | def handle_city(self): 23 | city_api_url = "https://www.zhipin.com/wapi/zpCommon/data/city.json" 24 | city_response = self.handle_request(method='GET',url=city_api_url) 25 | for province in json.loads(city_response)['zpData']['cityList']: 26 | for city in province['subLevelModelList']: 27 | self.city_list.append(city) 28 | 29 | def handle_job_request(self,job,city): 30 | print(city['name']) 31 | for page in range(1,11): 32 | job_url = "https://www.zhipin.com/c%s/?query=%s&page=%s"%(city['code'],job,page) 33 | print(job_url) 34 | response = self.handle_request(method='GET',url=job_url) 35 | html = etree.HTML(response) 36 | job_list = html.xpath("//div[@class='job-list']/ul/li") 37 | for item in job_list: 38 | info = {} 39 | info['job_title'] = item.xpath(".//div[@class='job-title']/text()")[0] 40 | if '实习' in info['job_title']: 41 | continue 42 | info['price'] = item.xpath(".//span[@class='red']/text()")[0] 43 | describe_1 = item.xpath(".//div[@class='info-primary']/p/text()") 44 | if len(describe_1) == 3: 45 | info['location'] = describe_1[0] 46 | info['working_life'] = describe_1[1] 47 | info['education'] = 
describe_1[2] 48 | info['company_name'] = item.xpath(".//div[@class='info-company']//h3[@class='name']/a/text()")[0] 49 | describe_2 = item.xpath(".//div[@class='info-company']//p/text()") 50 | info['company_type'] = describe_2[0] 51 | info['job_id'] = urljoin("https://www.zhipin.com",item.xpath(".//h3/a/@href")[0]) 52 | info['city'] = city['name'] 53 | self.handle_save_data(item=info) 54 | if not html.xpath("//div[@class='page']/a[@class='next']"): 55 | break 56 | 57 | 58 | def handle_job_detail(self,response): 59 | pass 60 | 61 | def handle_save_data(self,item): 62 | boss_collection = Collection(self.boss_db, "boss_data") 63 | boss_collection.update({"job_id": item['job_id']}, item, True) 64 | 65 | def handle_request(self,method,url,data=None): 66 | while True: 67 | proxy="http://HTK32673HL02BK2D:50125D2D38937C94@http-dyn.abuyun.com:9020" 68 | proxies = { 69 | "http":proxy, 70 | "https":proxy 71 | } 72 | try: 73 | if method == "GET": 74 | response = requests.get(url=url,headers=self.header,proxies=proxies) 75 | elif method == "POST": 76 | response = requests.post(url=url,headers=self.header,data=data,proxies=proxies,timeout=3) 77 | except Exception as e: 78 | print(e) 79 | time.sleep(2) 80 | continue 81 | else: 82 | return response.text 83 | 84 | def run(self): 85 | self.handle_city() 86 | t = ThreadPoolExecutor(max_workers=3) 87 | for city in self.city_list: 88 | t.submit(self.handle_job_request(job='python',city=city)) 89 | t.shutdown() 90 | 91 | def main(): 92 | boss = HandleBossZhiPin() 93 | boss.run() 94 | 95 | if __name__ == '__main__': 96 | main() 97 | -------------------------------------------------------------------------------- /dasouche/handle_dasouche.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import re 4 | import pymongo 5 | from pymongo.collection import Collection 6 | from concurrent.futures.thread import ThreadPoolExecutor 7 | 8 | 9 | class HandleDaSouChe(object): 10 | def __init__(self): 11 | #页码请求URL 12 | self.page_url = "https://aolai.souche.com/v1/searchApi/searchCar.json?_security_token=undefined" 13 | self.header = { 14 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 15 | } 16 | self.item_url_list = [] 17 | mongo_client = pymongo.MongoClient(host="10.70.120.156", port=27017) 18 | self.db_data = mongo_client['oreo'] 19 | 20 | def handle_save_data(self,item): 21 | db_collection = Collection(self.db_data, 'dasouche_data') 22 | db_collection.update({'carId':item['carId']},item,True) 23 | 24 | def handle_page(self): 25 | for page in range(1,5): 26 | #构造请求数据POST,每页可现实500条数据,共4页 27 | data = { 28 | "keyword":"", 29 | "brandCode":"", 30 | "seriesCode":"", 31 | "price":"", 32 | "carModel":"", 33 | "carAge":"", 34 | "mileage":"", 35 | "gearboxType":"", 36 | "displacement":"", 37 | "emissionStandard":"", 38 | "bodyColor":"", 39 | "fuelType":"", 40 | "seatingCapacity":"", 41 | "drivingMode":"", 42 | "country":"", 43 | "pageNo":page, 44 | "pageSize":"500", 45 | "from":"pc", 46 | "cityCode":"", 47 | "shopCode":"", 48 | "sort":"newsOnShelf", 49 | } 50 | page_result = self.handle_request(method='POST',url=self.page_url,data=data) 51 | for item in json.loads(page_result)['data']['items']: 52 | self.item_url_list.append(item['detailUrl']) 53 | 54 | #处理详情页 55 | def handle_detail(self,url): 56 | id_search = re.compile(r"carId=(.*?)&shopCode=(\d+)") 57 | car_id = id_search.search(url).group(1) 58 | shop_id = 
id_search.search(url).group(2) 59 | #车辆详情信息 60 | car_detail_url = "https://aolai.souche.com//v1/carDetailsApi/carDetailInfo.json?carId=%s"%car_id 61 | car_detail = self.handle_request(method='GET',url=car_detail_url) 62 | car_detail_result = json.loads(car_detail)['data'] 63 | #售卖商店信息 64 | shop_detail_url = "https://aolai.souche.com//v1/shopApi/queryTangecheShopInfo.json?carId=%s&citycode=%s&shopCode=%s"%(car_id,car_detail_result['baseCarInfoView']['cityCode'],shop_id) 65 | shop_detail_result = self.handle_request(method='GET',url=shop_detail_url) 66 | car_detail_result.update(json.loads(shop_detail_result)['data']) 67 | #车辆厂商配置信息 68 | car_config_url = "https://aolai.souche.com/v1/carDetailsApi/carConfigDetailInfo.json?_security_token=undefined&carId=%s"%car_id 69 | car_config_result = self.handle_request(method='GET',url=car_config_url) 70 | car_detail_result.update(json.loads(car_config_result)['data']) 71 | car_detail_result['from_url'] = url 72 | self.handle_save_data(car_detail_result) 73 | 74 | 75 | 76 | def handle_request(self,method,url,data=None): 77 | if method == 'POST': 78 | response = requests.post(url=url,headers=self.header,data=data) 79 | return response.text 80 | elif method == 'GET': 81 | response = requests.get(url=url,headers=self.header) 82 | return response.text 83 | 84 | 85 | def run(self): 86 | self.handle_page() 87 | t = ThreadPoolExecutor() 88 | for url in self.item_url_list: 89 | t.submit(self.handle_detail,url) 90 | t.shutdown() 91 | 92 | 93 | def main(): 94 | dasouche = HandleDaSouChe() 95 | dasouche.run() 96 | 97 | 98 | if __name__ == '__main__': 99 | main() 100 | -------------------------------------------------------------------------------- /lagou/crawl_lagou_job_old.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | import time 5 | import multiprocessing 6 | from handle_mysql import lagou_mysql 7 | import random 8 | 9 | 10 | 11 | class HandleLaGou(object): 12 | def __init__(self): 13 | self.lagou_session = requests.session() 14 | self.header = { 15 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36', 16 | } 17 | self.city_list = "" 18 | 19 | def handle_city(self): 20 | city_search = re.compile(r'zhaopin/">(.*?)') 21 | city_url = "https://www.lagou.com/jobs/allCity.html" 22 | city_result = self.handle_request(method='GET',url=city_url) 23 | self.city_list = city_search.findall(city_result) 24 | #清除cookie 25 | self.lagou_session.cookies.clear() 26 | 27 | def handle_city_job(self,city): 28 | for page in range(1,31): 29 | data = { 30 | "pn":str(page), 31 | "kd":"python", 32 | } 33 | job_index_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=" 34 | self.handle_request(method='GET',url=job_index_url) 35 | page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"%city 36 | self.header['Referer'] = job_index_url.encode() 37 | job_result = self.handle_request(method='POST',url=page_url,data=data) 38 | try: 39 | lagou_data = json.loads(job_result) 40 | except: 41 | continue 42 | else: 43 | job_list = lagou_data['content']['positionResult']['result'] 44 | if job_list: 45 | for job in job_list: 46 | job['crawl_date'] = time.strftime("%Y-%m-%d", time.localtime()) 47 | lagou_mysql.insert_item(job) 48 | else: 49 | break 50 | 51 | def handle_request(self,method,url,data=None): 52 | while True: 53 | proxyinfo = "http://%s:%s@%s:%s" 
%('H1V32R6470A7G90D','CD217C660A9143C3','http-dyn.abuyun.com','9020') 54 | proxy = { 55 | "http": proxyinfo, 56 | "https": proxyinfo, 57 | } 58 | 59 | try: 60 | if method == "GET": 61 | response = self.lagou_session.get(url=url,headers=self.header,proxies=proxy,timeout=6) 62 | elif method == "POST": 63 | response = self.lagou_session.post(url=url,headers=self.header,data=data,proxies=proxy,timeout=6) 64 | except Exception as e: 65 | print(e) 66 | else: 67 | if '您操作太频繁,请稍后再访问' in response.text: 68 | print('频繁') 69 | self.lagou_session.cookies.clear() 70 | # job_index_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=" 71 | # self.handle_request(method='GET',url=job_index_url) 72 | # time.sleep(random.choice(range(3,11))) 73 | time.sleep(1) 74 | continue 75 | elif '爬虫行为' in response.text: 76 | print('爬虫') 77 | self.lagou_session.cookies.clear() 78 | time.sleep(1) 79 | # time.sleep(random.choice(range(3,11))) 80 | continue 81 | else: 82 | return response.text 83 | 84 | def run(self): 85 | self.handle_city() 86 | print(self.city_list) 87 | # for city in self.city_list: 88 | # self.handle_city_job(city=city) 89 | pool = multiprocessing.Pool(2) 90 | for city in self.city_list: 91 | pool.apply_async(self.handle_city_job,args=(city,)) 92 | pool.close() 93 | pool.join() 94 | 95 | 96 | def main(): 97 | lagou = HandleLaGou() 98 | lagou.run() 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mafengwo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mafengwo' 13 | 14 | SPIDER_MODULES = ['mafengwo.spiders'] 15 | NEWSPIDER_MODULE = 'mafengwo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 0.5 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | # "Host":"www.mafengwo.cn", 44 | # "Connection":"keep-alive", 45 | # "Upgrade-Insecure-Requests":"1", 46 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 47 | # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 48 | # "Accept-Encoding":"gzip, deflate", 49 | # "Accept-Language":"zh-CN,zh;q=0.9", 50 | } 51 | 52 | # Enable or disable spider middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 54 | #SPIDER_MIDDLEWARES = { 55 | # 'mafengwo.middlewares.MafengwoSpiderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable downloader middlewares 59 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 60 | DOWNLOADER_MIDDLEWARES = { 61 | # 'mafengwo.middlewares.MafengwoDownloaderMiddleware': 543, 62 | 'mafengwo.middlewares.MafengwoProxyMiddleware': 543, 63 | } 64 | 65 | # Enable or disable extensions 66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | 'mafengwo.pipelines.MafengwoPipeline': 300, 75 | #'mafengwo.pipelines.MafengwoImagePipeline': 301, 76 | } 77 | 78 | IMAGES_STORE="./mafengwo_images" 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # 
See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | DOWNLOAD_TIMEOUT = 10 101 | IMAGES_EXPIRES = 90 #90天内抓取的都不会被重抓 102 | RETRY_TIMES = 100 103 | # LOG_LEVEL = 'INFO' 104 | proxy_url = '代理库URL' 105 | -------------------------------------------------------------------------------- /lagou/handle_mysql.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy.ext.declarative import declarative_base 3 | from sqlalchemy import Column,Integer,String,Float,Date 4 | from sqlalchemy.orm import sessionmaker 5 | import time 6 | 7 | 8 | #创建数据库连接 9 | engine = create_engine("mysql+pymysql://root:abcd1234@127.0.0.1:3306/lagou?charset=utf8") 10 | 11 | #声明一个基类 12 | Base = declarative_base() 13 | 14 | #操作数据库需要使用session 15 | Session = sessionmaker(bind=engine) 16 | 17 | class Lagoutables(Base): 18 | __tablename__ = 'lagou_data' 19 | 20 | #id 21 | id = Column(Integer,primary_key=True,autoincrement=True) 22 | #岗位ID 23 | positionId = Column(Integer,nullable=False) 24 | #经度 25 | longitude = Column(Float,nullable=False) 26 | #纬度 27 | latitude = Column(Float,nullable=False) 28 | #岗位名称 29 | positionName = Column(String(length=50),nullable=False) 30 | #工作年限 31 | workYear = Column(String(length=20),nullable=False) 32 | #学历 33 | education = Column(String(length=20),nullable=False) 34 | #岗位性质 35 | jobNature = Column(String(length=20),nullable=True) 36 | #公司类型 37 | financeStage = Column(String(length=30),nullable=True) 38 | #公司规模 39 | companySize = Column(String(length=30),nullable=True) 40 | #业务方向 41 | industryField = Column(String(length=30),nullable=True) 42 | #所在城市 43 | city = Column(String(length=10),nullable=False) 44 | #岗位标签 45 | positionAdvantage = Column(String(length=200),nullable=True) 46 | #公司简称 47 | companyShortName = Column(String(length=50),nullable=True) 48 | #公司全称 49 | companyFullName = Column(String(length=200),nullable=True) 50 | #公司所在区 51 | district = Column(String(length=20),nullable=True) 52 | #公司福利标签 53 | companyLabelList = Column(String(length=200),nullable=True) 54 | #工资 55 | salary = Column(String(length=20),nullable=False) 56 | #抓取日期 57 | crawl_date = Column(Date,nullable=False) 58 | 59 | #创建表 60 | # Lagoutables.metadata.create_all(engine) 61 | 62 | class HandleLagouData(object): 63 | def __init__(self): 64 | self.mysql_session = Session() 65 | self.item = Lagoutables() 66 | 67 | def insert_item(self,item): 68 | date = time.strftime("%Y-%m-%d", time.localtime()) 69 | data = Lagoutables( 70 | # 岗位ID 71 | positionId = item['positionId'], 72 | # 经度 73 | longitude = item['longitude'], 74 | # 纬度 75 | latitude = item['latitude'], 76 | # 岗位名称 77 | positionName = item['positionName'], 78 | # 工作年限 79 | workYear = item['workYear'], 80 | # 学历 81 | education = item['education'], 82 | # 岗位性质 83 | jobNature = item['jobNature'], 84 | # 公司类型 85 | financeStage = item['financeStage'], 86 | # 公司规模 87 | companySize = item['companySize'], 88 | # 业务方向 89 | industryField = item['industryField'], 90 | # 所在城市 91 | city = item['city'], 92 | # 岗位标签 93 | positionAdvantage = item['positionAdvantage'], 94 | # 公司简称 95 | companyShortName = item['companyShortName'], 96 | # 公司全称 97 | companyFullName = item['companyFullName'], 98 | # 公司所在区 99 | district = item['district'], 
100 | # 公司福利标签 101 | companyLabelList = ','.join(item['companyLabelList']), 102 | salary = item['salary'], 103 | # 抓取日期 104 | crawl_date = item['crawl_date'] 105 | ) 106 | query_result = self.mysql_session.query(Lagoutables).filter(Lagoutables.crawl_date==date,Lagoutables.positionId==item['positionId']).first() 107 | if query_result: 108 | print('该岗位信息已存在%s:%s:%s'%(item['positionId'],item['city'],item['positionName'])) 109 | else: 110 | self.mysql_session.add(data) 111 | self.mysql_session.commit() 112 | print('新增岗位信息%s'%item['positionId']) 113 | return self.item 114 | 115 | lagou_mysql = HandleLagouData() 116 | # item = {'positionId':6009711} 117 | # lagou_mysql.insert_item(item) 118 | -------------------------------------------------------------------------------- /baidu_m_keyword_ranks/baidu_m_keyword.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import re 3 | import requests 4 | from lxml import etree 5 | from concurrent.futures import ThreadPoolExecutor 6 | from baidu_m_keyword_ziran.handle_mysql import mysql 7 | from baidu_m_keyword_ziran.handle_mongo import mongo 8 | import time 9 | 10 | 11 | class Handle_baidu_m(object): 12 | def __init__(self): 13 | self.header = { 14 | "Host":"m.baidu.com", 15 | "Connection":"keep-alive", 16 | "Upgrade-Insecure-Requests":"1", 17 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36", 18 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 19 | "Accept-Encoding":"gzip, deflate", 20 | "Accept-Language":"zh-CN,zh;q=0.9", 21 | } 22 | 23 | #处理标题中的特殊字符 24 | def handle_title(self,title): 25 | search = re.compile('"|“|”|{|}') 26 | search_list = search.findall(title) 27 | for value in search_list: 28 | return re.sub(search,urllib.parse.quote(value),title) 29 | else: 30 | return title 31 | 32 | #处理任务 33 | def handle_task(self,keyword): 34 | print(keyword) 35 | result = {} 36 | result_list = [] 37 | result['keyword'] = keyword 38 | url_list = ["http://m.baidu.com/s?pn=0&word="+keyword,"http://m.baidu.com/s?pn=10&word="+keyword,"http://m.baidu.com/s?pn=20&word="+keyword] 39 | for url in url_list: 40 | response = requests.get(url=url,headers=self.header) 41 | baidu_html = etree.HTML(response.text) 42 | item_list = baidu_html.xpath("//div[@id='results']/div") 43 | for item in item_list: 44 | info = {} 45 | #获取标题 46 | title = item.xpath(".//span[contains(@class,'title')]//text()|.//header[@class='c-row']/a/h3[@class='c-title']//text()") 47 | if title: 48 | info['title'] = self.handle_title(''.join(title)).replace("'","") 49 | if '百度百科' in info['title']: 50 | info['target_url'] = "https://wapbaike.baidu.com/item/"+keyword 51 | if '其他人还在搜' in info['title']: 52 | continue 53 | if '相关词语' in info['title']: 54 | continue 55 | if '相关平台' in info['title']: 56 | continue 57 | if '相关品牌' in info['title']: 58 | continue 59 | if '相关网站' in info['title']: 60 | continue 61 | if keyword+' - 资讯' in info['title']: 62 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=realtime&word='+keyword 63 | if keyword+' - 视频' in info['title']: 64 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=video&atn=index&tn=vsearch&word='+keyword 65 | if keyword+' - 小视频' in info['title']: 66 | info['target_url'] = 'http://m.baidu.com/sf/vsearch?pd=xsp&atn=index&tn=vsearch&word='+keyword 67 | else: 68 | target_url = 
eval(item.xpath("./@data-log")[0].encode('utf-8').decode())['mu'] 69 | if target_url: 70 | info['target_url'] = target_url 71 | else: 72 | if '_企业信息' in info['title']: 73 | info['target_url'] = item.xpath("//a[@class='c-blocka']/@data-url")[0] 74 | result_list.append(info) 75 | else: 76 | continue 77 | result['rank'] = result_list 78 | result['crawl_time'] = time.strftime("%Y-%m-%d", time.localtime()) 79 | print(result) 80 | # mongo.insert_item_in_db('baidu_m_keyword_ziran',result) 81 | # mysql.handle_insert_db(result) 82 | 83 | if __name__ == '__main__': 84 | baidu_m = Handle_baidu_m() 85 | # baidu_m.handle_task('盐城二手奥迪a1') 86 | #线程池 87 | t = ThreadPoolExecutor() 88 | thread_list = [] 89 | #获取任务 90 | task = mysql.handle_task() 91 | for keyword in task: 92 | thread = t.submit(baidu_m.handle_task,keyword[0]) 93 | thread_list.append(thread) 94 | t.shutdown() 95 | # print([thread.result() for thread in thread_list]) 96 | -------------------------------------------------------------------------------- /kolesa/crawl_kolesa.py: -------------------------------------------------------------------------------- 1 | import re 2 | from lxml import etree 3 | import requests 4 | import json 5 | from concurrent.futures import ThreadPoolExecutor 6 | import multiprocessing 7 | from handle_mongo import kolesa_mongo 8 | 9 | class Crawl_kolesa(object): 10 | def __init__(self): 11 | #首页URL 12 | self.index_url = "https://kolesa.kz/cars/" 13 | self.header = { 14 | "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 15 | "Accept-Encoding":"gzip, deflate, br", 16 | "Accept-Language":"zh-CN,zh;q=0.9", 17 | "Connection":"keep-alive", 18 | "Host":"kolesa.kz", 19 | "Upgrade-Insecure-Requests":"1", 20 | "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36", 21 | } 22 | self.brand_list_url = "" 23 | 24 | #处理请求方法 25 | def handle_request(self,url): 26 | response = requests.get(url=url,headers=self.header) 27 | return response.text 28 | 29 | #处理品牌方法 30 | def handle_brand(self): 31 | response = self.handle_request(url=self.index_url) 32 | html = etree.HTML(response) 33 | #解析品牌列表 34 | self.brand_list_url = html.xpath("//div[@class='cross-links'][2]/div[@class='cross-links-container']/ul[@class='col-sm-4 cross-links-list']/li/a/@href") 35 | 36 | #解析品牌筛选条件下的页码页 37 | def handle_brand_page(self,url): 38 | detail_info_search = re.compile(r"listing.items.push\((.*?)\);") 39 | #网站仅显示1000页 40 | for page in range(1,1001): 41 | #https://kolesa.kz/cars/gaz/?sort_by=add_date-asc&page=2 42 | #构造品牌页码URL 43 | brand_url = "https://kolesa.kz%s?sort_by=add_date-asc&page=%s"%(url,page) 44 | print(brand_url) 45 | #请求品牌页码页 46 | response = self.handle_request(url=brand_url) 47 | #每页的详情数据 48 | detail_list = detail_info_search.findall(response) 49 | if detail_list: 50 | for detail in detail_list: 51 | detail = json.loads(detail) 52 | detail_info = {} 53 | detail_info['car_name'] = detail.get("name",None) 54 | detail_info['id'] = detail.get("id",None) 55 | detail_info['car_model'] = detail['attributes']['model'] 56 | detail_info['car_brand'] = detail['attributes']['brand'] 57 | detail_info['price'] = detail.get("unitPrice",None) 58 | detail_info['from_url'] = detail.get("url",None) 59 | #对接mongo 60 | kolesa_mongo.handle_save_task(detail_info) 61 | 62 | #处理详情页 63 | def handle_detail(self,item): 64 | response = self.handle_request(item['from_url']) 65 | html = etree.HTML(response) 66 | item['year'] 
= html.xpath("//span[@class='year']/text()")[0].strip() 67 | item_list = html.xpath("//div[@class='offer__parameters']/dl") 68 | for i in item_list: 69 | name = i.xpath("./dt/span/text()")[0].strip() 70 | if name == "Пробег": 71 | #公里数 72 | item['mileage'] = i.xpath("./dd/text()")[0].strip() 73 | elif name == "Коробка передач": 74 | #变速箱 75 | item['gearbox'] = i.xpath("./dd/text()")[0].strip() 76 | elif name == "Руль": 77 | #方向盘方向 78 | item['steering_wheel'] = i.xpath("./dd/text()")[0].strip() 79 | if not item.get('mileage'): 80 | item['mileage'] = 'no data' 81 | if not item.get('gearbox'): 82 | item['geargox'] = 'no data' 83 | if not item.get('streering_wheel'): 84 | item['steering_wheel'] = 'no data' 85 | #保存数据 86 | kolesa_mongo.handle_save_data(item) 87 | 88 | 89 | #处理任务方法 90 | def handle_task(self): 91 | self.handle_brand() 92 | print("处理品牌") 93 | t = ThreadPoolExecutor() 94 | for url in self.brand_list_url: 95 | t.submit(self.handle_brand_page,url) 96 | t.shutdown() 97 | 98 | #处理最终数据方法 99 | def handle_data(self): 100 | t = ThreadPoolExecutor() 101 | while True: 102 | task = kolesa_mongo.handle_get_task() 103 | if task: 104 | t.submit(self.handle_detail, task) 105 | else: 106 | break 107 | t.shutdown() 108 | 109 | #爬虫启动方法 110 | def run(self): 111 | m1 = multiprocessing.Process(target=self.handle_task) 112 | m1.start() 113 | m1.join() 114 | 115 | m2 = multiprocessing.Process(target=self.handle_data) 116 | m2.start() 117 | m2.join() 118 | 119 | 120 | 121 | 122 | 123 | if __name__ == '__main__': 124 | kolesa = Crawl_kolesa() 125 | kolesa.run() 126 | -------------------------------------------------------------------------------- /lagou/crawl_lagou_job_new.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import requests 4 | import time 5 | import multiprocessing 6 | from handle_mysql import lagou_mysql 7 | 8 | class HandleLaGou(object): 9 | def __init__(self): 10 | #使用session保存cookie信息 11 | self.lagou_session = requests.session() 12 | self.header = { 13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36', 14 | } 15 | self.city_list = "" 16 | 17 | def handle_request(self,method,url,data=None,info=None): 18 | ''' 19 | 处理请求方法 20 | :param method: 请求方法 21 | :param url: 请求url 22 | :param data: post请求的数据 23 | :return: 数据入库 24 | ''' 25 | # 由于代理不稳定,所以使用while循环 26 | while True: 27 | # 动态版阿布云代理 28 | proxyinfo = "http://%s:%s@%s:%s" %('H1V32R6470A7G90D','CD217C660A9143C3','http-dyn.abuyun.com','9020') 29 | proxy = { 30 | "http": proxyinfo, 31 | "https": proxyinfo, 32 | } 33 | try: 34 | if method == "GET": 35 | response = self.lagou_session.get(url=url,headers=self.header,proxies=proxy,timeout=6) 36 | elif method == "POST": 37 | response = self.lagou_session.post(url=url,headers=self.header,data=data,proxies=proxy,timeout=6) 38 | except Exception as e: 39 | print(e) 40 | else: 41 | # 由于反爬虫造成的continue 42 | if '频繁' in response.text: 43 | print('频繁') 44 | # 首先清除当前存在的cookie信息 45 | self.lagou_session.cookies.clear() 46 | # 重新请求cookie信息,并休眠10秒 47 | self.lagou_session.get( 48 | url="https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info, 49 | headers=self.header) 50 | time.sleep(10) 51 | continue 52 | elif '错误网关' in response.text: 53 | print('错误网关') 54 | time.sleep(1) 55 | continue 56 | elif '页面加载中' in response.text: 57 | print('页面加载中') 58 | time.sleep(2) 59 | continue 60 | else: 61 | return response.text 62 
| 63 | def handle_city(self): 64 | ''' 65 | 获取拉勾网岗位信息城市 66 | :return: 城市列表 67 | ''' 68 | city_search = re.compile(r'zhaopin/">(.*?)') 69 | city_url = "https://www.lagou.com/jobs/allCity.html" 70 | city_result = self.handle_request(method='GET',url=city_url) 71 | self.city_list = city_search.findall(city_result) 72 | #清除cookie 73 | self.lagou_session.cookies.clear() 74 | 75 | def handle_city_job(self,city): 76 | ''' 77 | :param city: 城市信息 78 | :return: 最终岗位数据,存储到Mysql 79 | ''' 80 | #发出第一个请求,获取cookies信息和页码信息 81 | first_request_url="https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="%city 82 | first_response = self.handle_request(method='GET',url=first_request_url) 83 | total_page_search = re.compile(r'class="span\stotalNum">(\d+)') 84 | try: 85 | total_page = total_page_search.search(first_response).group(1) 86 | #由于无岗位信息而return 87 | except Exception as e: 88 | return 89 | else: 90 | #经过分析,每个地区最多显示30页 91 | for i in range(1,int(total_page)+1): 92 | data = { 93 | "pn":i, 94 | "kd":"python" 95 | } 96 | #请求岗位信息时必须带上Referer 97 | referer_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="%city 98 | self.header["Referer"]=referer_url.encode() 99 | page_url = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"%city 100 | response = self.handle_request(method='POST',url=page_url,data=data,info=city) 101 | lagou_data = json.loads(response) 102 | job_list = lagou_data['content']['positionResult']['result'] 103 | if job_list: 104 | for job in job_list: 105 | job['crawl_date'] = time.strftime("%Y-%m-%d", time.localtime()) 106 | lagou_mysql.insert_item(job) 107 | 108 | if __name__ == '__main__': 109 | lagou = HandleLaGou() 110 | lagou.handle_city() 111 | print(lagou.city_list) 112 | pool = multiprocessing.Pool(2) 113 | for city in lagou.city_list: 114 | pool.apply_async(lagou.handle_city_job,args=(city,)) 115 | pool.close() 116 | pool.join() 117 | # for city in lagou.city_list: 118 | # lagou.handle_city_job(city) 119 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/handle_task.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import requests 4 | import re 5 | import json 6 | from handle_mongo import mongo 7 | from settings import proxy_url 8 | from concurrent.futures.thread import ThreadPoolExecutor 9 | import multiprocessing 10 | 11 | 12 | class HandleMaFengWoTask(object): 13 | def __init__(self): 14 | self.header = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 16 | } 17 | self.proxy_list = [] 18 | 19 | def handle_proxy(self): 20 | response = requests.get(url=proxy_url) 21 | data = json.loads(response.text) 22 | sum = 0 23 | #每请求一次加入200个代理 24 | for proxy in data['proxys']: 25 | sum = sum + 1 26 | if sum > 200: 27 | break 28 | proxy_dict = { 29 | "http": proxy['proxy'], 30 | "https": proxy['proxy'] 31 | } 32 | self.proxy_list.append(proxy_dict) 33 | 34 | 35 | #最新游记 36 | def handle_new_article(self,page): 37 | article_url_search = re.compile(r'a\shref="/i/(\d+)\.html"') 38 | info = {} 39 | info['flag'] = 'GET' 40 | info['url'] = 'https://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":3,"objid":0,"page":%s,"ajax":1,"retina":0}'%page 41 | print(info['url']) 42 | new_article = self.handle_request(info) 43 | try: 44 | html = 
json.loads(new_article)['data']['html'] 45 | except: 46 | return 47 | article_url_list = article_url_search.findall(html) 48 | for article_id in set(article_url_list): 49 | insert_mongo = {} 50 | insert_mongo['id'] = article_id 51 | insert_mongo['url'] = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={"iid":"%s"}'%article_id 52 | insert_mongo['item_type'] = 'head_item' 53 | print(insert_mongo) 54 | mongo.insert_task(insert_mongo) 55 | 56 | #热门游记 57 | def handle_hot_article(self,page): 58 | article_url_search = re.compile(r'a\shref="/i/(\d+)\.html"') 59 | info = {} 60 | info['flag'] = 'GET' 61 | info['url'] = 'https://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":0,"objid":0,"page":%s,"ajax":1,"retina":0}' % page 62 | print(info['url']) 63 | new_article = self.handle_request(info) 64 | try: 65 | html = json.loads(new_article)['data']['html'] 66 | except: 67 | return 68 | article_url_list = article_url_search.findall(html) 69 | for article_id in set(article_url_list): 70 | insert_mongo = {} 71 | insert_mongo['id'] = article_id 72 | insert_mongo[ 73 | 'url'] = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={"iid":"%s"}' % article_id 74 | insert_mongo['item_type'] = 'head_item' 75 | print(insert_mongo) 76 | mongo.insert_task(insert_mongo) 77 | 78 | def handle_new_column(self): 79 | column_url_search = re.compile(r'/traveller/article.php\?id=\d+') 80 | for i in range(0,2000,10): 81 | info= {} 82 | info['flag'] = 'GET' 83 | info['url'] = 'https://www.mafengwo.cn/traveller/ajax.php?action=getMoreArticles&sort=ctime&start=%s'%i 84 | new_column = self.handle_request(info) 85 | html = json.loads(new_column)['html'] 86 | column_list = column_url_search.findall(html) 87 | for column in set(column_list): 88 | url = 'https://www.mafengwo.cn'+column 89 | print(url) 90 | break 91 | 92 | def handle_hot_column(self): 93 | column_url_search = re.compile(r'/traveller/article.php\?id=\d+') 94 | for i in range(0,2000,10): 95 | info= {} 96 | info['flag'] = 'GET' 97 | info['url'] = 'https://www.mafengwo.cn/traveller/ajax.php?action=getMoreArticles&sort=hot&start=%s'%i 98 | new_column = self.handle_request(info) 99 | html = json.loads(new_column)['html'] 100 | column_list = column_url_search.findall(html) 101 | for column in set(column_list): 102 | url = 'https://www.mafengwo.cn'+column 103 | print(url) 104 | break 105 | 106 | def handle_request(self,info): 107 | #判断代理数量,如果小于10则更新代理 108 | if len(self.proxy_list)<10: 109 | self.handle_proxy() 110 | if info['flag'] == 'GET': 111 | while True: 112 | try: 113 | response = requests.get(url=info['url'],headers=self.header,proxies=self.proxy_list.pop(0),timeout=6) 114 | except Exception as e: 115 | print(e) 116 | time.sleep(2) 117 | continue 118 | else: 119 | return response.text 120 | elif info['flag'] == 'POST': 121 | response = requests.post(url=info['url'],headers=self.header,data=info['data'],proxies=self.proxy_list.pop(0),timeout=6) 122 | return response.text 123 | 124 | #最新游记处理进程 125 | def process_1(self): 126 | t1 = ThreadPoolExecutor() 127 | for page in range(1,8): 128 | print(page) 129 | t1.submit(self.handle_new_article,page) 130 | t1.shutdown() 131 | 132 | #热门游记处理进程 133 | def process_2(self): 134 | t2 = ThreadPoolExecutor() 135 | for page in range(1,8): 136 | print(page) 137 | t2.submit(self.handle_hot_article,page) 138 | t2.shutdown() 139 | # self.handle_new_column() 140 | # self.handle_hot_column() 141 | 142 | def run(self): 143 | m1 = multiprocessing.Process(target=self.process_1) 144 | m2 = 
multiprocessing.Process(target=self.process_2) 145 | m1.start() 146 | m2.start() 147 | m1.join() 148 | m2.join() 149 | 150 | def main(): 151 | mafengwo_task = HandleMaFengWoTask() 152 | mafengwo_task.run() 153 | 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/spiders/crawl_mafengwo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import re 4 | import time 5 | import scrapy 6 | from ..items import MafengwoItem 7 | from mafengwo.handle_mongo import mongo 8 | 9 | 10 | class CrawlMafengwoSpider(scrapy.Spider): 11 | name = 'crawl_mafengwo' 12 | allowed_domains = ['mafengwo.cn'] 13 | 14 | #从task库中取出任务 15 | def start_requests(self): 16 | for i in range(1): 17 | task = mongo.get_task() 18 | #如果有任务则执行 19 | if task: 20 | if '_id' in task: 21 | task.pop('_id') 22 | print(task) 23 | if task['item_type'] == 'head_item': 24 | yield scrapy.Request(url=task['url'],callback=self.handle_detail_head,dont_filter=True,meta=task) 25 | elif task['item_type'] == 'article_item': 26 | yield scrapy.Request(url=task['url'],callback=self.handle_detail,dont_filter=True,meta=task) 27 | 28 | #解析美篇游记的头部信息 29 | def handle_detail_head(self,response): 30 | read_comment_search = re.compile(r'(.*?)') 31 | name_search = re.compile(r'class="per_name"\stitle="(.*?)">') 32 | star_search = re.compile(r'(\d+)收藏') 33 | release_time_search = re.compile(r'(.*?)') 34 | html = json.loads(response.text)['data']['html'] 35 | info = {} 36 | read_comment = read_comment_search.search(html).group(1).split('/') 37 | info['read_sum'] = read_comment[0] 38 | info['comment_sum'] = read_comment[1] 39 | info['name'] = name_search.search(html).group(1) 40 | info['star_sum'] = star_search.search(html).group(1) 41 | info['release_time'] = release_time_search.search(html).group(1) 42 | info['item_type'] = 'article_item' 43 | info['url'] = 'http://www.mafengwo.cn/i/%s.html'%(response.request.meta['id']) 44 | mongo.insert_task(info) 45 | 46 | #解析游记 47 | def handle_detail(self,response): 48 | id_search = re.compile(r"window.Env\s=\s(.*);") 49 | seq_search = re.compile(r'data-seq="(\d+)"') 50 | try: 51 | id_result = json.loads(id_search.search(response.text).group(1)) 52 | except: 53 | return 54 | id = id_result['iid'] 55 | iid = id_result.get('new_iid') 56 | #存在下一页 57 | if iid: 58 | print(response.url+"存在多页") 59 | response.request.meta['id'] = id 60 | response.request.meta['iid'] = iid 61 | #文章标题 62 | response.request.meta['title'] = response.xpath("//title/text()").extract_first() 63 | #文章内容 64 | response.request.meta['content'] = response.xpath("//div[@class='_j_content_box']").extract() 65 | #请求URL 66 | response.request.meta['from_url'] = response.url 67 | #请求下一页所使用的ID 68 | next_request_seq = seq_search.findall(response.text)[-1] 69 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (id, iid, next_request_seq) 70 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 71 | # 不存在下一页 72 | else: 73 | #处理游记 74 | m3u8_search = re.compile(r'data-url="(.*\.m3u8)"') 75 | mafengwo_data = MafengwoItem() 76 | mafengwo_data['title'] = response.xpath("//title/text()").extract_first() 77 | mafengwo_data['from_url'] = response.request.meta['from_url'] 78 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 79 | 
mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 80 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 81 | # mafengwo_data['support_sum'] = response.request.meta['support_sum'] 82 | mafengwo_data['release_time'] = response.request.meta['release_time'] 83 | mafengwo_data['name'] = response.request.meta['name'] 84 | mafengwo_data['id'] = id 85 | mafengwo_data['content'] = self.handle_img_src(''.join(response.xpath("//div[@id='pnl_contentinfo']").extract_first())) 86 | photo_url_search = re.compile(r'data-src="(.*?)\?') 87 | mafengwo_data['video_urls'] = m3u8_search.findall(mafengwo_data['content']) 88 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 89 | mafengwo_data['upload_status'] = 0 90 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 91 | yield mafengwo_data 92 | 93 | def handle_detail_json(self,response): 94 | m3u8_search = re.compile(r'data-url="(.*\.m3u8)"') 95 | seq_search = re.compile(r'data-seq="(\d+)"') 96 | html_text = json.loads(response.text)['data'] 97 | if html_text['html'] == "": 98 | mafengwo_data = MafengwoItem() 99 | mafengwo_data['title'] = response.request.meta['title'] 100 | mafengwo_data['from_url'] = response.request.meta['from_url'] 101 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 102 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 103 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 104 | # mafengwo_data['support_sum'] = response.request.meta['support_sum'] 105 | mafengwo_data['release_time'] = response.request.meta['release_time'] 106 | mafengwo_data['name'] = response.request.meta['name'] 107 | mafengwo_data['id'] = response.request.meta['id'] 108 | mafengwo_data['content'] = self.handle_img_src(''.join(response.request.meta['content'])) 109 | mafengwo_data['upload_status'] = 0 110 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 111 | photo_url_search = re.compile(r'data-src="(.*?)\?') 112 | mafengwo_data['video_urls'] = m3u8_search.findall(mafengwo_data['content']) 113 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 114 | yield mafengwo_data 115 | else: 116 | html = html_text['html'] 117 | response.request.meta['content'].append(html) 118 | next_request_seq = seq_search.findall(html)[-1] 119 | if next_request_seq: 120 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], response.request.meta['iid'], next_request_seq) 121 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 122 | 123 | #处理游记中的图片URL 124 | def handle_img_src(self, text): 125 | img_search = re.compile(r"|") 126 | img_data_src_search = re.compile(r'data-src="(.*?)\?') 127 | src_search = re.compile(r'[^-]src="(.*?)"') 128 | img_list = img_search.findall(text) 129 | for img in img_list: 130 | try: 131 | img_data_src = img_data_src_search.search(img).group(1) 132 | src = src_search.search(img).group(1) 133 | img_new = img.replace(src, img_data_src) 134 | text = text.replace(img, img_new) 135 | except: 136 | pass 137 | return text 138 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/spiders/crawl_mafengwo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import re 4 | import scrapy 5 | 
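# Note: this spider fills item['image_urls'], and the mafengwo_images/full/ directory in
# the repository indicates the pictures are downloaded by Scrapy's ImagesPipeline. The
# project's settings.py is not reproduced here, so the following is only a sketch of the
# settings such a pipeline would typically need, not this project's actual configuration:
#
#     ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
#     IMAGES_STORE = "mafengwo_images"
#     IMAGES_URLS_FIELD = "image_urls"   # default field name read by ImagesPipeline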
from scrapy import Selector 6 | from ..items import MafengwoItem 7 | import time 8 | 9 | 10 | class CrawlMafengwoSpider(scrapy.Spider): 11 | name = 'crawl_mafengwo' 12 | allowed_domains = ['mafengwo.cn'] 13 | # start_urls = ['http://www.mafengwo.cn/u/wenhao/note.html'] 14 | 15 | #请求首页 16 | def start_requests(self): 17 | #直接构造请求页码URL,如请求200页,热门游记 18 | for page in range(1,200): 19 | url = 'http://pagelet.mafengwo.cn/note/pagelet/recommendNoteApi?params={"type":0,"objid":0,"page":%s,"ajax":1,"retina":0}'%page 20 | yield scrapy.Request(url=url,callback=self.handle_page,dont_filter=True) 21 | 22 | #解析有多少篇游记,构造游记阅读量等信息URL并请求 23 | def handle_page(self, response): 24 | #获取页码页返回中的文章ID 25 | article_id_search = re.compile(r'(.*?)') 37 | name_search = re.compile(r'class="per_name"\stitle="(.*?)">') 38 | star_search = re.compile(r'(\d+)收藏') 39 | release_time_search = re.compile(r'(.*?)') 40 | html = json.loads(response.text)['data']['html'] 41 | info = {} 42 | read_comment = read_comment_search.search(html).group(1).split('/') 43 | info['read_sum'] = read_comment[0] 44 | info['comment_sum'] = read_comment[1] 45 | info['name'] = name_search.search(html).group(1) 46 | info['star_sum'] = star_search.search(html).group(1) 47 | info['release_time'] = release_time_search.search(html).group(1) 48 | info['id'] = response.request.meta['article_id'] 49 | info['url'] = 'http://www.mafengwo.cn/i/%s.html' % (response.request.meta['article_id']) 50 | print(info) 51 | yield scrapy.Request(url=info['url'],callback=self.handle_detail,meta=info,dont_filter=True) 52 | 53 | # 解析游记 54 | def handle_detail(self, response): 55 | id_search = re.compile(r"window.Env\s=\s(.*);") 56 | seq_search = re.compile(r'data-seq="(\d+)"') 57 | try: 58 | id_result = json.loads(id_search.search(response.text).group(1)) 59 | except: 60 | return 61 | #获取是否存在下一页标志 62 | iid = id_result.get('new_iid') 63 | # 存在下一页 64 | if iid: 65 | print(response.url + "存在多页") 66 | response.request.meta['iid'] = iid 67 | # 文章标题 68 | response.request.meta['title'] = response.xpath("//title/text()").extract_first() 69 | # 文章内容 70 | response.request.meta['content'] = response.xpath("//div[@class='_j_content_box']").extract() 71 | # 请求URL 72 | response.request.meta['from_url'] = response.url 73 | # 请求下一页所使用的ID 74 | next_request_seq = seq_search.findall(response.text)[-1] 75 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], iid, next_request_seq) 76 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 77 | # 不存在下一页 78 | else: 79 | # 处理游记 80 | mafengwo_data = MafengwoItem() 81 | mafengwo_data['title'] = response.xpath("//title/text()").extract_first() 82 | mafengwo_data['from_url'] = response.request.meta['from_url'] 83 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 84 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 85 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 86 | mafengwo_data['release_time'] = response.request.meta['release_time'] 87 | mafengwo_data['name'] = response.request.meta['name'] 88 | mafengwo_data['id'] = response.request.meta['id'] 89 | mafengwo_data['content'] = self.handle_img_src(''.join(response.xpath("//div[@id='pnl_contentinfo']").extract_first())) 90 | #获取文章中所有图片URL 91 | photo_url_search = re.compile(r'data-src="(.*?)\?') 92 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 93 | 
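# image_urls above is pulled from the lazy-load data-src attributes; because the regex stops
# at the first "?", any resize/quality query string is dropped and only the bare image URLs
# are kept on the item.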
mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 94 | yield mafengwo_data 95 | 96 | def handle_detail_json(self, response): 97 | seq_search = re.compile(r'data-seq="(\d+)"') 98 | html_text = json.loads(response.text)['data'] 99 | #请求到末页 100 | if html_text['html'] == "": 101 | mafengwo_data = MafengwoItem() 102 | mafengwo_data['title'] = response.request.meta['title'] 103 | mafengwo_data['from_url'] = response.request.meta['from_url'] 104 | mafengwo_data['read_sum'] = response.request.meta['read_sum'] 105 | mafengwo_data['comment_sum'] = response.request.meta['comment_sum'] 106 | mafengwo_data['star_sum'] = response.request.meta['star_sum'] 107 | mafengwo_data['release_time'] = response.request.meta['release_time'] 108 | mafengwo_data['name'] = response.request.meta['name'] 109 | mafengwo_data['id'] = response.request.meta['id'] 110 | mafengwo_data['content'] = self.handle_img_src(''.join(response.request.meta['content'])) 111 | mafengwo_data['crawl_time'] = time.strftime("%Y%m%d %H:%M:%S", time.localtime()) 112 | photo_url_search = re.compile(r'data-src="(.*?)\?') 113 | mafengwo_data['image_urls'] = photo_url_search.findall(mafengwo_data['content']) 114 | yield mafengwo_data 115 | #继续请求下一页 116 | else: 117 | html = html_text['html'] 118 | response.request.meta['content'].append(html) 119 | next_request_seq = seq_search.findall(html)[-1] 120 | if next_request_seq: 121 | next_detail_url = "http://www.mafengwo.cn/note/ajax/detail/getNoteDetailContentChunk?id=%s&iid=%s&seq=%s&back=0" % (response.request.meta['id'], response.request.meta['iid'], next_request_seq) 122 | yield scrapy.Request(url=next_detail_url, callback=self.handle_detail_json, dont_filter=True,meta=response.request.meta) 123 | 124 | # 处理游记中的图片URL 125 | def handle_img_src(self, text): 126 | img_search = re.compile(r"|") 127 | img_data_src_search = re.compile(r'data-src="(.*?)\?') 128 | src_search = re.compile(r'[^-]src="(.*?)"') 129 | img_list = img_search.findall(text) 130 | for img in img_list: 131 | try: 132 | img_data_src = img_data_src_search.search(img).group(1) 133 | src = src_search.search(img).group(1) 134 | img_new = img.replace(src, img_data_src) 135 | text = text.replace(img, img_new) 136 | except: 137 | pass 138 | return text 139 | 140 | -------------------------------------------------------------------------------- /mafengwo/mafengwo/url_list.txt: -------------------------------------------------------------------------------- 1 | http://www.mafengwo.cn/u/wenhao/note.html 2 | http://www.mafengwo.cn/u/5295777/note.html 3 | http://www.mafengwo.cn/u/85713126/note.html 4 | http://www.mafengwo.cn/u/18015577/note.html 5 | http://www.mafengwo.cn/u/60798801/note.html 6 | http://www.mafengwo.cn/u/yiyinotes/note.html 7 | https://www.mafengwo.cn/u/88358953/note.html 8 | https://www.mafengwo.cn/u/daxigua/note.html 9 | https://www.mafengwo.cn/u/47448074/note.html 10 | https://www.mafengwo.cn/u/36909470/note.html 11 | https://www.mafengwo.cn/u/76823294/note.html 12 | https://www.mafengwo.cn/u/32216322/note.html 13 | https://www.mafengwo.cn/u/10704640/note.html 14 | https://www.mafengwo.cn/u/dearsummar/note.html 15 | https://www.mafengwo.cn/u/19894572/note.html 16 | https://www.mafengwo.cn/u/321294/note.html 17 | https://www.mafengwo.cn/u/5172228/note.html 18 | https://www.mafengwo.cn/u/5017124/note.html 19 | https://www.mafengwo.cn/u/hwf520/note.html 20 | https://www.mafengwo.cn/u/kido37/note.html 21 | https://www.mafengwo.cn/u/41037525/note.html 22 | https://www.mafengwo.cn/u/joyii0513/note.html 
23 | https://www.mafengwo.cn/u/69709753/note.html 24 | https://www.mafengwo.cn/u/wayzhenyan/note.html 25 | https://www.mafengwo.cn/u/78343168/note.html 26 | https://www.mafengwo.cn/u/46337998/note.html 27 | https://www.mafengwo.cn/u/sellnuan/note.html 28 | https://www.mafengwo.cn/u/846867/note.html 29 | https://www.mafengwo.cn/u/54041143/note.html 30 | https://www.mafengwo.cn/u/17074212/note.html 31 | https://www.mafengwo.cn/u/5602249/note.html 32 | https://www.mafengwo.cn/u/45793678/note.html 33 | https://www.mafengwo.cn/u/42370376/note.html 34 | https://www.mafengwo.cn/u/81676700/note.html 35 | https://www.mafengwo.cn/u/78838404/note.html 36 | https://www.mafengwo.cn/u/5663320/note.html 37 | https://www.mafengwo.cn/u/56213436/note.html 38 | https://www.mafengwo.cn/u/68691572/note.html 39 | https://www.mafengwo.cn/u/67165115/note.html 40 | https://www.mafengwo.cn/u/45907046/note.html 41 | https://www.mafengwo.cn/u/samwong/note.html 42 | https://www.mafengwo.cn/u/48737554/note.html 43 | https://www.mafengwo.cn/u/5366541/note.html 44 | https://www.mafengwo.cn/u/1047345/note.html 45 | https://www.mafengwo.cn/u/73297474/note.html 46 | https://www.mafengwo.cn/u/64898562/note.html 47 | https://www.mafengwo.cn/u/ariel690/note.html 48 | https://www.mafengwo.cn/u/5133407/note.html 49 | https://www.mafengwo.cn/u/63932781/note.html 50 | https://www.mafengwo.cn/u/49231278/note.html 51 | https://www.mafengwo.cn/u/69833564/note.html 52 | https://www.mafengwo.cn/u/52482820/note.html 53 | https://www.mafengwo.cn/u/374140/note.html 54 | https://www.mafengwo.cn/u/5363625/note.html 55 | https://www.mafengwo.cn/u/64582645/note.html 56 | https://www.mafengwo.cn/u/32228262/note.html 57 | https://www.mafengwo.cn/u/68295140/note.html 58 | https://www.mafengwo.cn/u/93296829/note.html 59 | https://www.mafengwo.cn/u/biggun/note.html 60 | https://www.mafengwo.cn/u/57892379/note.html 61 | https://www.mafengwo.cn/u/76823294.html 62 | https://www.mafengwo.cn/u/pinkyvision/note.html 63 | https://www.mafengwo.cn/u/69536526/note.html 64 | https://www.mafengwo.cn/u/37311913/note.html 65 | https://www.mafengwo.cn/u/10345585/note.html 66 | https://www.mafengwo.cn/u/37369363/note.html 67 | https://www.mafengwo.cn/u/inlaoban5/note.html 68 | https://www.mafengwo.cn/u/75471465/note.html 69 | https://www.mafengwo.cn/u/40682663/note.html 70 | https://www.mafengwo.cn/u/799727/note.html 71 | https://www.mafengwo.cn/u/19560416/note.html 72 | https://www.mafengwo.cn/u/summer7/note.html 73 | https://www.mafengwo.cn/u/zhenmeiqu/note.html 74 | https://www.mafengwo.cn/u/93808795/note.html 75 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 76 | https://www.mafengwo.cn/u/59633694/note.html 77 | https://www.mafengwo.cn/u/5172228/note.html 78 | https://www.mafengwo.cn/u/79862907/note.html 79 | https://www.mafengwo.cn/u/5119335/note.html 80 | https://www.mafengwo.cn/u/iiibiz/note.html 81 | https://www.mafengwo.cn/u/92990277/note.html 82 | https://www.mafengwo.cn/u/83736375.html 83 | https://www.mafengwo.cn/u/66016397/note.html 84 | https://www.mafengwo.cn/u/75334068/note.html 85 | https://www.mafengwo.cn/u/10606831/note.html 86 | https://www.mafengwo.cn/u/73953374/note.html 87 | https://www.mafengwo.cn/u/5328159/note.html 88 | https://www.mafengwo.cn/u/72226812/note.html 89 | https://www.mafengwo.cn/u/75867238/note.html 90 | https://www.mafengwo.cn/u/ruogu2/note.html 91 | https://www.mafengwo.cn/u/459268/note.html 92 | https://www.mafengwo.cn/u/5037685/note.html 93 | https://www.mafengwo.cn/u/32358313/note.html 94 | 
https://www.mafengwo.cn/u/ymy817/note.html 95 | https://www.mafengwo.cn/u/44131359/note.html 96 | https://www.mafengwo.cn/u/flyingwsh/note.html 97 | https://www.mafengwo.cn/u/36953718/note.html 98 | https://www.mafengwo.cn/u/830821/note.html 99 | https://www.mafengwo.cn/u/72465054/note.html 100 | https://www.mafengwo.cn/u/816643/note.html 101 | https://www.mafengwo.cn/u/5547423/note.html 102 | https://www.mafengwo.cn/u/85055587/note.html 103 | https://www.mafengwo.cn/u/77259555/note.html 104 | https://www.mafengwo.cn/u/58085128/note.html 105 | https://www.mafengwo.cn/u/85782763/note.html 106 | https://www.mafengwo.cn/u/448785/note.html 107 | https://www.mafengwo.cn/u/shanfeng/note.html 108 | https://www.mafengwo.cn/u/30730200/note.html 109 | https://www.mafengwo.cn/u/82532600/note.html 110 | https://www.mafengwo.cn/u/sellnuan/note.html 111 | https://www.mafengwo.cn/u/85205385/note.html 112 | https://www.mafengwo.cn/u/40525484/note.html 113 | https://www.mafengwo.cn/u/92931036/note.html 114 | https://www.mafengwo.cn/u/60022265/note.html 115 | https://www.mafengwo.cn/u/45066857.html 116 | https://www.mafengwo.cn/u/34957278/note.html 117 | https://www.mafengwo.cn/u/90472994/note.html 118 | https://www.mafengwo.cn/u/5295777/note.html 119 | https://www.mafengwo.cn/u/86494331/note.html 120 | https://www.mafengwo.cn/u/42395202.html 121 | https://www.mafengwo.cn/u/heididsy/note.html 122 | https://www.mafengwo.cn/u/42694746/note.html 123 | https://www.mafengwo.cn/u/yimeng/note.html 124 | https://www.mafengwo.cn/u/5172228/note.html 125 | https://www.mafengwo.cn/u/17639643.html 126 | https://www.mafengwo.cn/u/wuweixiang/note.html 127 | https://www.mafengwo.cn/u/92931036/note.html 128 | https://www.mafengwo.cn/u/49231278/note.html 129 | https://www.mafengwo.cn/u/5481686.html 130 | https://www.mafengwo.cn/u/19014378/note.html 131 | https://www.mafengwo.cn/u/seacen/note.html 132 | https://www.mafengwo.cn/u/beslan/note.html 133 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 134 | https://www.mafengwo.cn/u/187367/note.html 135 | https://www.mafengwo.cn/u/32216322/note.html 136 | https://www.mafengwo.cn/u/93157709/note.html 137 | https://www.mafengwo.cn/u/13105932/note.html 138 | https://www.mafengwo.cn/u/86494331/note.html 139 | https://www.mafengwo.cn/u/10911951.html 140 | https://www.mafengwo.cn/u/77243222/note.html 141 | https://www.mafengwo.cn/u/yolichic/note.html 142 | https://www.mafengwo.cn/u/88371807/note.html 143 | https://www.mafengwo.cn/u/jklouise/note.html 144 | https://www.mafengwo.cn/u/85558645/note.html 145 | https://www.mafengwo.cn/u/69200064/note.html 146 | https://www.mafengwo.cn/u/88358953/note.html 147 | https://www.mafengwo.cn/u/54534899/note.html 148 | https://www.mafengwo.cn/u/kido37/note.html 149 | https://www.mafengwo.cn/u/ruogu2/note.html 150 | https://www.mafengwo.cn/u/32228262/note.html 151 | https://www.mafengwo.cn/u/208077/note.html 152 | https://www.mafengwo.cn/u/xmulazio/note.html 153 | https://www.mafengwo.cn/u/74369556/note.html 154 | https://www.mafengwo.cn/u/5028192/note.html 155 | https://www.mafengwo.cn/u/ptah0622/note.html 156 | https://www.mafengwo.cn/u/5203896/note.html 157 | https://www.mafengwo.cn/u/35296229/note.html 158 | https://www.mafengwo.cn/u/69709753/note.html 159 | https://www.mafengwo.cn/u/71897854/note.html 160 | https://www.mafengwo.cn/u/73941769/note.html 161 | https://www.mafengwo.cn/u/79167497/note.html 162 | https://www.mafengwo.cn/u/5648583/note.html 163 | https://www.mafengwo.cn/u/840399/note.html 164 | 
https://www.mafengwo.cn/u/34260694/note.html 165 | https://www.mafengwo.cn/u/89214773/note.html 166 | https://www.mafengwo.cn/u/47448074/note.html 167 | https://www.mafengwo.cn/u/90344916/note.html 168 | https://www.mafengwo.cn/u/5673085/note.html 169 | https://www.mafengwo.cn/u/fantasist/note.html 170 | https://www.mafengwo.cn/u/gemmakyoto/note.html 171 | https://www.mafengwo.cn/u/kidd1110/note.html 172 | https://www.mafengwo.cn/u/459539/note.html 173 | https://www.mafengwo.cn/u/clijsters/note.html 174 | https://www.mafengwo.cn/u/53816690/note.html 175 | https://www.mafengwo.cn/u/85224198/note.html 176 | https://www.mafengwo.cn/u/1115956/note.html 177 | https://www.mafengwo.cn/u/kevlee/note.html 178 | https://www.mafengwo.cn/u/sarahontheroad.html 179 | https://www.mafengwo.cn/u/10525543/note.html 180 | https://www.mafengwo.cn/u/374140/note.html 181 | https://www.mafengwo.cn/u/19268018/note.html 182 | https://www.mafengwo.cn/u/70816697/note.html 183 | https://www.mafengwo.cn/u/102065/note.html 184 | https://www.mafengwo.cn/u/yolichic/note.html 185 | https://www.mafengwo.cn/u/49130101/note.html 186 | https://www.mafengwo.cn/u/49221414/note.html 187 | https://www.mafengwo.cn/u/sicilia/note.html 188 | https://www.mafengwo.cn/u/zhangxiaofan/note.html 189 | https://www.mafengwo.cn/u/fantastic/note.html 190 | https://www.mafengwo.cn/u/193656/note.html 191 | https://www.mafengwo.cn/u/after17/note.html 192 | https://www.mafengwo.cn/u/guaiiiii/note.html 193 | https://www.mafengwo.cn/u/tianpinan/note.html 194 | https://www.mafengwo.cn/u/52233524/note.html 195 | https://www.mafengwo.cn/u/75151343/note.html 196 | https://www.mafengwo.cn/u/88358953/note.html 197 | https://www.mafengwo.cn/u/83796483/note.html 198 | https://www.mafengwo.cn/u/79297765/note.html 199 | https://www.mafengwo.cn/u/72512443/note.html 200 | https://www.mafengwo.cn/u/niuniu/note.html -------------------------------------------------------------------------------- /mafengwo_article_spider/mafengwo/url_list.txt: -------------------------------------------------------------------------------- 1 | http://www.mafengwo.cn/u/wenhao/note.html 2 | http://www.mafengwo.cn/u/5295777/note.html 3 | http://www.mafengwo.cn/u/85713126/note.html 4 | http://www.mafengwo.cn/u/18015577/note.html 5 | http://www.mafengwo.cn/u/60798801/note.html 6 | http://www.mafengwo.cn/u/yiyinotes/note.html 7 | https://www.mafengwo.cn/u/88358953/note.html 8 | https://www.mafengwo.cn/u/daxigua/note.html 9 | https://www.mafengwo.cn/u/47448074/note.html 10 | https://www.mafengwo.cn/u/36909470/note.html 11 | https://www.mafengwo.cn/u/76823294/note.html 12 | https://www.mafengwo.cn/u/32216322/note.html 13 | https://www.mafengwo.cn/u/10704640/note.html 14 | https://www.mafengwo.cn/u/dearsummar/note.html 15 | https://www.mafengwo.cn/u/19894572/note.html 16 | https://www.mafengwo.cn/u/321294/note.html 17 | https://www.mafengwo.cn/u/5172228/note.html 18 | https://www.mafengwo.cn/u/5017124/note.html 19 | https://www.mafengwo.cn/u/hwf520/note.html 20 | https://www.mafengwo.cn/u/kido37/note.html 21 | https://www.mafengwo.cn/u/41037525/note.html 22 | https://www.mafengwo.cn/u/joyii0513/note.html 23 | https://www.mafengwo.cn/u/69709753/note.html 24 | https://www.mafengwo.cn/u/wayzhenyan/note.html 25 | https://www.mafengwo.cn/u/78343168/note.html 26 | https://www.mafengwo.cn/u/46337998/note.html 27 | https://www.mafengwo.cn/u/sellnuan/note.html 28 | https://www.mafengwo.cn/u/846867/note.html 29 | https://www.mafengwo.cn/u/54041143/note.html 30 | 
https://www.mafengwo.cn/u/17074212/note.html 31 | https://www.mafengwo.cn/u/5602249/note.html 32 | https://www.mafengwo.cn/u/45793678/note.html 33 | https://www.mafengwo.cn/u/42370376/note.html 34 | https://www.mafengwo.cn/u/81676700/note.html 35 | https://www.mafengwo.cn/u/78838404/note.html 36 | https://www.mafengwo.cn/u/5663320/note.html 37 | https://www.mafengwo.cn/u/56213436/note.html 38 | https://www.mafengwo.cn/u/68691572/note.html 39 | https://www.mafengwo.cn/u/67165115/note.html 40 | https://www.mafengwo.cn/u/45907046/note.html 41 | https://www.mafengwo.cn/u/samwong/note.html 42 | https://www.mafengwo.cn/u/48737554/note.html 43 | https://www.mafengwo.cn/u/5366541/note.html 44 | https://www.mafengwo.cn/u/1047345/note.html 45 | https://www.mafengwo.cn/u/73297474/note.html 46 | https://www.mafengwo.cn/u/64898562/note.html 47 | https://www.mafengwo.cn/u/ariel690/note.html 48 | https://www.mafengwo.cn/u/5133407/note.html 49 | https://www.mafengwo.cn/u/63932781/note.html 50 | https://www.mafengwo.cn/u/49231278/note.html 51 | https://www.mafengwo.cn/u/69833564/note.html 52 | https://www.mafengwo.cn/u/52482820/note.html 53 | https://www.mafengwo.cn/u/374140/note.html 54 | https://www.mafengwo.cn/u/5363625/note.html 55 | https://www.mafengwo.cn/u/64582645/note.html 56 | https://www.mafengwo.cn/u/32228262/note.html 57 | https://www.mafengwo.cn/u/68295140/note.html 58 | https://www.mafengwo.cn/u/93296829/note.html 59 | https://www.mafengwo.cn/u/biggun/note.html 60 | https://www.mafengwo.cn/u/57892379/note.html 61 | https://www.mafengwo.cn/u/76823294/note.html 62 | https://www.mafengwo.cn/u/pinkyvision/note.html 63 | https://www.mafengwo.cn/u/69536526/note.html 64 | https://www.mafengwo.cn/u/37311913/note.html 65 | https://www.mafengwo.cn/u/10345585/note.html 66 | https://www.mafengwo.cn/u/37369363/note.html 67 | https://www.mafengwo.cn/u/inlaoban5/note.html 68 | https://www.mafengwo.cn/u/75471465/note.html 69 | https://www.mafengwo.cn/u/40682663/note.html 70 | https://www.mafengwo.cn/u/799727/note.html 71 | https://www.mafengwo.cn/u/19560416/note.html 72 | https://www.mafengwo.cn/u/summer7/note.html 73 | https://www.mafengwo.cn/u/zhenmeiqu/note.html 74 | https://www.mafengwo.cn/u/93808795/note.html 75 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 76 | https://www.mafengwo.cn/u/59633694/note.html 77 | https://www.mafengwo.cn/u/5172228/note.html 78 | https://www.mafengwo.cn/u/79862907/note.html 79 | https://www.mafengwo.cn/u/5119335/note.html 80 | https://www.mafengwo.cn/u/iiibiz/note.html 81 | https://www.mafengwo.cn/u/92990277/note.html 82 | https://www.mafengwo.cn/u/83736375/note.html 83 | https://www.mafengwo.cn/u/66016397/note.html 84 | https://www.mafengwo.cn/u/75334068/note.html 85 | https://www.mafengwo.cn/u/10606831/note.html 86 | https://www.mafengwo.cn/u/73953374/note.html 87 | https://www.mafengwo.cn/u/5328159/note.html 88 | https://www.mafengwo.cn/u/72226812/note.html 89 | https://www.mafengwo.cn/u/75867238/note.html 90 | https://www.mafengwo.cn/u/ruogu2/note.html 91 | https://www.mafengwo.cn/u/459268/note.html 92 | https://www.mafengwo.cn/u/5037685/note.html 93 | https://www.mafengwo.cn/u/32358313/note.html 94 | https://www.mafengwo.cn/u/ymy817/note.html 95 | https://www.mafengwo.cn/u/44131359/note.html 96 | https://www.mafengwo.cn/u/flyingwsh/note.html 97 | https://www.mafengwo.cn/u/36953718/note.html 98 | https://www.mafengwo.cn/u/830821/note.html 99 | https://www.mafengwo.cn/u/72465054/note.html 100 | https://www.mafengwo.cn/u/816643/note.html 101 | 
https://www.mafengwo.cn/u/5547423/note.html 102 | https://www.mafengwo.cn/u/85055587/note.html 103 | https://www.mafengwo.cn/u/77259555/note.html 104 | https://www.mafengwo.cn/u/58085128/note.html 105 | https://www.mafengwo.cn/u/85782763/note.html 106 | https://www.mafengwo.cn/u/448785/note.html 107 | https://www.mafengwo.cn/u/shanfeng/note.html 108 | https://www.mafengwo.cn/u/30730200/note.html 109 | https://www.mafengwo.cn/u/82532600/note.html 110 | https://www.mafengwo.cn/u/sellnuan/note.html 111 | https://www.mafengwo.cn/u/85205385/note.html 112 | https://www.mafengwo.cn/u/40525484/note.html 113 | https://www.mafengwo.cn/u/92931036/note.html 114 | https://www.mafengwo.cn/u/60022265/note.html 115 | https://www.mafengwo.cn/u/45066857/note.html 116 | https://www.mafengwo.cn/u/34957278/note.html 117 | https://www.mafengwo.cn/u/90472994/note.html 118 | https://www.mafengwo.cn/u/5295777/note.html 119 | https://www.mafengwo.cn/u/86494331/note.html 120 | https://www.mafengwo.cn/u/42395202/note.html 121 | https://www.mafengwo.cn/u/heididsy/note.html 122 | https://www.mafengwo.cn/u/42694746/note.html 123 | https://www.mafengwo.cn/u/yimeng/note.html 124 | https://www.mafengwo.cn/u/5172228/note.html 125 | https://www.mafengwo.cn/u/17639643/note.html 126 | https://www.mafengwo.cn/u/wuweixiang/note.html 127 | https://www.mafengwo.cn/u/92931036/note.html 128 | https://www.mafengwo.cn/u/49231278/note.html 129 | https://www.mafengwo.cn/u/5481686/note.html 130 | https://www.mafengwo.cn/u/19014378/note.html 131 | https://www.mafengwo.cn/u/seacen/note.html 132 | https://www.mafengwo.cn/u/beslan/note.html 133 | https://www.mafengwo.cn/u/ruanzhonghua/note.html 134 | https://www.mafengwo.cn/u/187367/note.html 135 | https://www.mafengwo.cn/u/32216322/note.html 136 | https://www.mafengwo.cn/u/93157709/note.html 137 | https://www.mafengwo.cn/u/13105932/note.html 138 | https://www.mafengwo.cn/u/86494331/note.html 139 | https://www.mafengwo.cn/u/10911951/note.html 140 | https://www.mafengwo.cn/u/77243222/note.html 141 | https://www.mafengwo.cn/u/yolichic/note.html 142 | https://www.mafengwo.cn/u/88371807/note.html 143 | https://www.mafengwo.cn/u/jklouise/note.html 144 | https://www.mafengwo.cn/u/85558645/note.html 145 | https://www.mafengwo.cn/u/69200064/note.html 146 | https://www.mafengwo.cn/u/88358953/note.html 147 | https://www.mafengwo.cn/u/54534899/note.html 148 | https://www.mafengwo.cn/u/kido37/note.html 149 | https://www.mafengwo.cn/u/ruogu2/note.html 150 | https://www.mafengwo.cn/u/32228262/note.html 151 | https://www.mafengwo.cn/u/208077/note.html 152 | https://www.mafengwo.cn/u/xmulazio/note.html 153 | https://www.mafengwo.cn/u/74369556/note.html 154 | https://www.mafengwo.cn/u/5028192/note.html 155 | https://www.mafengwo.cn/u/ptah0622/note.html 156 | https://www.mafengwo.cn/u/5203896/note.html 157 | https://www.mafengwo.cn/u/35296229/note.html 158 | https://www.mafengwo.cn/u/69709753/note.html 159 | https://www.mafengwo.cn/u/71897854/note.html 160 | https://www.mafengwo.cn/u/73941769/note.html 161 | https://www.mafengwo.cn/u/79167497/note.html 162 | https://www.mafengwo.cn/u/5648583/note.html 163 | https://www.mafengwo.cn/u/840399/note.html 164 | https://www.mafengwo.cn/u/34260694/note.html 165 | https://www.mafengwo.cn/u/89214773/note.html 166 | https://www.mafengwo.cn/u/47448074/note.html 167 | https://www.mafengwo.cn/u/90344916/note.html 168 | https://www.mafengwo.cn/u/5673085/note.html 169 | https://www.mafengwo.cn/u/fantasist/note.html 170 | https://www.mafengwo.cn/u/gemmakyoto/note.html 171 | 
https://www.mafengwo.cn/u/kidd1110/note.html 172 | https://www.mafengwo.cn/u/459539/note.html 173 | https://www.mafengwo.cn/u/clijsters/note.html 174 | https://www.mafengwo.cn/u/53816690/note.html 175 | https://www.mafengwo.cn/u/85224198/note.html 176 | https://www.mafengwo.cn/u/1115956/note.html 177 | https://www.mafengwo.cn/u/kevlee/note.html 178 | https://www.mafengwo.cn/u/sarahontheroad/note.html 179 | https://www.mafengwo.cn/u/10525543/note.html 180 | https://www.mafengwo.cn/u/374140/note.html 181 | https://www.mafengwo.cn/u/19268018/note.html 182 | https://www.mafengwo.cn/u/70816697/note.html 183 | https://www.mafengwo.cn/u/102065/note.html 184 | https://www.mafengwo.cn/u/yolichic/note.html 185 | https://www.mafengwo.cn/u/49130101/note.html 186 | https://www.mafengwo.cn/u/49221414/note.html 187 | https://www.mafengwo.cn/u/sicilia/note.html 188 | https://www.mafengwo.cn/u/zhangxiaofan/note.html 189 | https://www.mafengwo.cn/u/fantastic/note.html 190 | https://www.mafengwo.cn/u/193656/note.html 191 | https://www.mafengwo.cn/u/after17/note.html 192 | https://www.mafengwo.cn/u/guaiiiii/note.html 193 | https://www.mafengwo.cn/u/tianpinan/note.html 194 | https://www.mafengwo.cn/u/52233524/note.html 195 | https://www.mafengwo.cn/u/75151343/note.html 196 | https://www.mafengwo.cn/u/88358953/note.html 197 | https://www.mafengwo.cn/u/83796483/note.html 198 | https://www.mafengwo.cn/u/79297765/note.html 199 | https://www.mafengwo.cn/u/72512443/note.html 200 | https://www.mafengwo.cn/u/niuniu/note.html 201 | --------------------------------------------------------------------------------