├── .gitignore
├── 01-Cnblogs
├── README.md
├── cnblogs.py
├── demo
│ └── demo.gif
└── screenshot
│ ├── README.md
│ ├── db_01.jpg
│ └── db_02.jpg
├── 02-Golory_of_Kings
├── Glory_of_Kings.py
├── README.md
└── result.jpg
├── 03-MaoYan_Top100
├── MaoYan_Top100.py
├── README.md
└── result.txt
├── 04-Selenium_Taobao
├── README.md
├── __pycache__
│ └── xdaili.cpython-36.pyc
├── demo
│ └── demo.gif
├── result
│ ├── README.md
│ ├── db_01.jpg
│ └── db_02.jpg
├── taobao.py
├── utils
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── config.cpython-36.pyc
│ ├── config.py
│ └── proxy_auth_plugin.zip
└── xdaili.py
├── 05-Moments
├── .idea
│ ├── 05-Moments.iml
│ ├── encodings.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── Readme.md
├── __pycache__
│ ├── config.cpython-36.pyc
│ └── processor.cpython-36.pyc
├── config.py
├── moments.py
├── plates
│ ├── SDK.png
│ ├── content.png
│ ├── contents.png
│ ├── datetime.png
│ ├── device_name.png
│ ├── login.png
│ ├── moment_db.png
│ ├── nickname.png
│ └── yes-no.png
└── processor.py
├── 06-Github_Login
├── README.md
└── github_login.py
├── 07-IGetGet
├── README.md
├── __pycache__
│ └── script.cpython-36.pyc
├── dedao.json
├── script.py
└── utils
│ ├── app_error.jpg
│ ├── charles安装证书页面.png
│ ├── connect_error.png
│ ├── demo.gif
│ ├── mitmproxy证书.png
│ ├── mongo_server_error.png
│ ├── result_json.png
│ ├── 乱码_01.png
│ ├── 乱码_02.png
│ └── 手机证书安装.png
├── 08-Selenium_Cnki
├── README.md
├── __pycache__
│ ├── config.cpython-36.pyc
│ └── handle_code.cpython-36.pyc
├── chaojiying.py
├── cnki.py
├── demo
│ ├── demo.gif
│ └── 超级鹰积分.jpg
└── utils
│ ├── config.py
│ └── handle.py
├── 09-Bilibili
├── .idea
│ ├── bilibili.iml
│ ├── encodings.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── bilibili.py
├── captcha1.png
├── captcha2.png
├── require
│ ├── demo.gif
│ └── demo_location.png
└── utils
│ ├── __pycache__
│ └── config.cpython-36.pyc
│ └── config.py
├── 10-DouYin
├── README.md
├── __pycache__
│ └── script.cpython-36.pyc
├── plates
│ ├── JSONDecodeError.jpg
│ ├── TypeError.jpg
│ ├── charles.png
│ ├── demo.gif
│ ├── douyin.json
│ ├── video_demo.gif
│ ├── video_info_json.png
│ └── video_screentshot.png
└── script.py
├── 11-Jianshu
├── README.md
├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── script.cpython-36.pyc
├── action.py
├── demo
│ └── demo.gif
├── result
│ ├── __init__.py
│ └── jianshu.json
├── script.py
└── utils
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── config.cpython-36.pyc
│ └── config.py
├── 12-Crack_Jianshu
├── .idea
│ ├── Jianshu.iml
│ ├── encodings.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── captcha.png
├── jianshu.py
├── require
│ ├── chaojiying.png
│ ├── code_demo.png
│ ├── demo.gif
│ └── 超级鹰返回结果处理示例.png
└── utils
│ ├── __pycache__
│ ├── chaojiying.cpython-36.pyc
│ └── config.cpython-36.pyc
│ ├── chaojiying.py
│ └── config.py
├── 13-Pyspider_Lagou
├── README.md
├── data
│ ├── project.db
│ ├── result.db
│ ├── scheduler.1d
│ ├── scheduler.1h
│ ├── scheduler.all
│ └── task.db
├── demo.py
├── lagou.py
└── result
│ ├── db.jpg
│ ├── 单步测试结果_01.jpg
│ ├── 单步测试结果_02.jpg
│ ├── 单步测试结果_03.jpg
│ ├── 单步测试结果_04.jpg
│ └── 单步测试结果_05.jpg
├── 14-Scrapy_Tutorial
├── README.md
├── demo.gif
└── tutorial
│ ├── scrapy.cfg
│ └── tutorial
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── items.cpython-36.pyc
│ ├── pipelines.cpython-36.pyc
│ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── main.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ └── quotes.cpython-36.pyc
│ └── quotes.py
├── 15-Scrapy_Images360
├── README.md
├── images360
│ ├── images360
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── items.cpython-36.pyc
│ │ │ ├── middlewares.cpython-36.pyc
│ │ │ ├── pipelines.cpython-36.pyc
│ │ │ └── settings.cpython-36.pyc
│ │ ├── images
│ │ │ ├── t01a3ee5a4ff05fe133.jpg
│ │ │ ├── t01a5f844c4a5d5ed7d.jpg
│ │ │ ├── t01ad50ec608cde5fdc.jpg
│ │ │ ├── t01aed1278f885e26ec.jpg
│ │ │ ├── t01b29ea494ffdab388.jpg
│ │ │ ├── t01bf8bb6d4c6b93fff.jpg
│ │ │ └── t01c2bb853e048be307.jpg
│ │ ├── items.py
│ │ ├── main.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ └── images.cpython-36.pyc
│ │ │ └── images.py
│ └── scrapy.cfg
└── screenshot
│ ├── README.md
│ ├── demo.gif
│ ├── images.jpg
│ ├── mongodb.jpg
│ └── mysql.jpg
├── 16-vczh
├── .idea
│ ├── encodings.xml
│ ├── misc.xml
│ ├── modules.xml
│ ├── vczh.iml
│ └── workspace.xml
├── README.md
├── scrapy.cfg
└── vczh
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── items.cpython-36.pyc
│ ├── middlewares.cpython-36.pyc
│ ├── pipelines.cpython-36.pyc
│ ├── sendemail.cpython-36.pyc
│ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── main.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── sendemail.py
│ ├── settings.py
│ ├── spiders
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── vc.cpython-36.pyc
│ └── vc.py
│ └── utils
│ ├── db_follower.png
│ ├── email.png
│ ├── followers.png
│ ├── huaji.png
│ └── log.png
├── 17-City_58
├── City_58
│ ├── City_58
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── items.cpython-36.pyc
│ │ │ ├── middlewares.cpython-36.pyc
│ │ │ ├── pipelines.cpython-36.pyc
│ │ │ └── settings.cpython-36.pyc
│ │ ├── items.py
│ │ ├── main.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ ├── spiders
│ │ │ ├── 58.py
│ │ │ ├── __init__.py
│ │ │ └── __pycache__
│ │ │ │ ├── 58.cpython-36.pyc
│ │ │ │ └── __init__.cpython-36.pyc
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ ├── api.cpython-36.pyc
│ │ │ ├── parse.cpython-36.pyc
│ │ │ ├── proxy.cpython-36.pyc
│ │ │ └── xdaili.cpython-36.pyc
│ │ │ ├── api.py
│ │ │ ├── parse.py
│ │ │ ├── proxy.py
│ │ │ └── xdaili.py
│ └── scrapy.cfg
├── README.md
└── screenshot
│ ├── monogdb.jpg
│ ├── run_01.jpg
│ └── run_02.jpg
├── 18-36kr
├── .idea
│ ├── 36kr.iml
│ ├── encodings.xml
│ ├── inspectionProfiles
│ │ └── Project_Default.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── 36kr.py
├── README.md
└── utils
│ ├── 36kr.txt
│ ├── FZSTK.TTF
│ ├── __pycache__
│ └── word.cpython-36.pyc
│ ├── cloud.jpg
│ ├── db.png
│ ├── show.jpg
│ └── word.py
├── 19-Youku_DanMu
├── .idea
│ ├── Youku_DanMu.iml
│ ├── encodings.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── danmu.py
└── utils
│ ├── FZSTK.TTF
│ ├── cloud.jpg
│ ├── danmu.txt
│ ├── require
│ ├── danmu_content.png
│ └── danmu_json.png
│ ├── show.jpg
│ └── word.py
├── 20-Selenium_163
├── .idea
│ ├── 20-Selenium_163Email.iml
│ ├── encodings.xml
│ ├── inspectionProfiles
│ │ └── Project_Default.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── 163.py
├── README.md
├── require
│ ├── content_frame.png
│ ├── demo.gif
│ └── login_frame.png
└── utils
│ ├── __pycache__
│ └── config.cpython-36.pyc
│ └── config.py
├── 21-AutoCrawl_DouYin
├── .idea
│ ├── DouYin.iml
│ ├── encodings.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── __pycache__
│ ├── config.cpython-36.pyc
│ └── scripts.cpython-36.pyc
├── actions.py
├── config.py
├── plates
│ ├── demo.gif
│ ├── douyin_demo.gif
│ ├── start.png
│ ├── video_name.png
│ ├── video_url.png
│ └── 图形点触验证码.png
└── scripts.py
├── 22-Stackoverflow
├── .idea
│ ├── encodings.xml
│ ├── misc.xml
│ ├── modules.xml
│ ├── stackoverflow.iml
│ └── workspace.xml
├── README.md
├── scrapy.cfg
└── stackoverflow
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── items.cpython-36.pyc
│ ├── middlewares.cpython-36.pyc
│ ├── pipelines.cpython-36.pyc
│ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── main.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ ├── spiders
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── stack.cpython-36.pyc
│ └── stack.py
│ └── utils
│ ├── Error.png
│ └── db.png
├── 23-GithubLogin
├── .idea
│ ├── encodings.xml
│ ├── github.iml
│ ├── misc.xml
│ ├── modules.xml
│ └── workspace.xml
├── README.md
├── github
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── main.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ ├── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ └── logingit.cpython-36.pyc
│ │ └── logingit.py
│ └── utils
│ │ ├── __init__.py
│ │ └── acatar.jpg
└── scrapy.cfg
├── 24-Dianping
├── README.md
├── demo.py
└── utils
│ ├── prtsc1.png
│ ├── prtsc2.png
│ ├── prtsc3.png
│ ├── prtsc4.png
│ └── prtsc5.png
├── 25-DouYin
├── README.md
├── douyin.py
├── font.py
└── shareid.txt
├── README.md
└── sogou_wechat_captcha.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | /.idea
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
--------------------------------------------------------------------------------
/01-Cnblogs/README.md:
--------------------------------------------------------------------------------
1 | ## Cnblogs
2 | Use the urllib library to crawl the latest article information from the Cnblogs (博客园) homepage and store it in MongoDB, including title, author, publish time, view count, comments, etc.
3 |
4 | ## Tip
5 | The Cnblogs data API is a bit unusual and needs careful analysis. If you have questions, feel free to reach out by e-mail.
6 |
7 | ## Demo
8 |
9 |
--------------------------------------------------------------------------------
/01-Cnblogs/demo/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/01-Cnblogs/demo/demo.gif
--------------------------------------------------------------------------------
/01-Cnblogs/screenshot/README.md:
--------------------------------------------------------------------------------
1 | ## Display 1
2 | 
3 |
4 | ## Display 2
5 | 
6 |
--------------------------------------------------------------------------------
/01-Cnblogs/screenshot/db_01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/01-Cnblogs/screenshot/db_01.jpg
--------------------------------------------------------------------------------
/01-Cnblogs/screenshot/db_02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/01-Cnblogs/screenshot/db_02.jpg
--------------------------------------------------------------------------------
/02-Golory_of_Kings/Glory_of_Kings.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | Created at 21:27 on Sep 17, 2018
4 | @author: Northxw
5 | """
6 |
7 | import requests
8 | import os
9 |
10 | # URL of the full hero list
11 | herolist_url = 'https://pvp.qq.com/web201605/js/herolist.json'
12 | # Fetch the data
13 | response = requests.get(herolist_url).json()
14 |
15 | # Work out each hero's skin URLs and download the skins
16 | save_dir = "E:\Python\Spider\Ex\\01-Spider_Glory_of_Kings\hero_list\\"  # download location
17 | if not os.path.exists(save_dir):
18 |     os.mkdir(save_dir)
19 |
20 | for i in range(len(response)):
21 |     # The hero's list of skin names
22 |     skin_names = response[i]['skin_name'].split('|')
23 |     for cnt in range(len(skin_names)):
24 |         # Download every skin of the current hero
25 |         hero_num = response[i]['ename']    # hero number
26 |         hero_name = response[i]['cname']   # hero name
27 |         skin_name = skin_names[cnt]        # skin name
28 |
29 |         save_file_name = save_dir + str(hero_num) + '-' + hero_name + '-' + skin_name + '.jpg'
30 |         skin_url = 'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'.format(hero_num, hero_num, str(cnt+1))
31 |         # Fetch the image bytes (binary stream)
32 |         response_skin_content = requests.get(skin_url).content
33 |         # Save the image
34 |         with open(save_file_name, 'wb') as f:
35 |             f.write(response_skin_content)
36 |
--------------------------------------------------------------------------------
/02-Golory_of_Kings/README.md:
--------------------------------------------------------------------------------
1 | ## Glory of Kings
2 | Use the requests library to download all hero skins of 王者荣耀 (Honor of Kings).
3 |
4 | ## Introduction
5 | The request API used in the code can easily be found with the Chrome developer tools; for the detailed crawling walkthrough, see the spider section of the WeChat official account "C与python实战".
6 |
7 | ## Result
8 |
9 |
--------------------------------------------------------------------------------
/02-Golory_of_Kings/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/02-Golory_of_Kings/result.jpg
--------------------------------------------------------------------------------
/03-MaoYan_Top100/MaoYan_Top100.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | Updated at 14:33 on March 11, 2019
4 | @title: Spider Maoyan Top100
5 | @author: Northxw
6 | """
7 |
8 | import requests
9 | import re
10 | import json
11 | from requests.exceptions import RequestException
12 | from pymongo import MongoClient
13 | import time
14 |
15 | # Create the database connection
16 | client = MongoClient('mongodb://localhost:27017/')
17 | db = client.maoyan
18 | collection = db.rank
19 |
20 | def get_one_page(url):
21 |     """
22 |     Fetch the source of one list page
23 |     :param url: request URL
24 |     :return: the page text, or None on failure
25 |     """
26 |     try:
27 |         headers = {
28 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
29 |         }
30 |         response = requests.get(url=url, headers=headers)
31 |         if response.status_code == 200:
32 |             return response.text
33 |         return None
34 |     except RequestException:
35 |         return None
36 |
37 | def parse_one_page(html):
38 |     """
39 |     Parse the page with a regular expression
40 |     :param html: the page text
41 |     :return: a generator of dicts
42 |     """
43 |     pattern = re.compile(
44 |         r'<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>.'
45 |         r'*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
46 |         re.S
47 |     )
48 |     items = re.findall(pattern, html)
49 |     for item in items:
50 |         yield {
51 |             'index': item[0],
52 |             'image': item[1].split('@')[0],
53 |             'title': item[2].strip(),
54 |             'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
55 |             'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
56 |             'score': item[5].strip() + item[6].strip()
57 |         }
58 |
59 | def write_to_file(content):
60 |     with open('result.txt', 'a', encoding='utf-8') as f:
61 |         f.write(json.dumps(content, ensure_ascii=False) + '\n')
62 |
63 | def save_to_mongo(item):
64 |     """
65 |     Store one record in MongoDB
66 |     :param item: a dict of movie data
67 |     :return: None
68 |     """
69 |     collection.insert(item)
70 |
71 | def main(offset):
72 |     url = 'http://maoyan.com/board/4?offset={}'.format(str(offset))
73 |     html = get_one_page(url)
74 |     for item in parse_one_page(html):
75 |         write_to_file(item)
76 |         save_to_mongo(item)
77 |
78 | if __name__ == '__main__':
79 |     for i in range(10):
80 |         main(offset=i*10)
81 |         time.sleep(1)
--------------------------------------------------------------------------------
/03-MaoYan_Top100/README.md:
--------------------------------------------------------------------------------
1 | ## MaoYan Top100
2 | Use the requests library to crawl the Maoyan TOP100 movie ranking - title, release time, score, poster image, etc. - and save the results to a text file.
3 |
4 | ## Crawl analysis
5 | Open the target site and look at the ranking list, as shown below:
6 |
7 | The top-ranked movie is "Farewell My Concubine" (霸王别姬); the useful fields on the page are the title, starring actors, release time, release region, score and poster image.
8 | Pagination rule: scroll to the bottom of the page, click "next page" and watch how the URL and the content change, as shown below:
9 |
10 | The URL becomes http://maoyan.com/board/4?offset=10, i.e. it gains one parameter, offset=10, and the page now shows the movies ranked 11-20, so offset is presumably an offset value. Clicking "next page" again changes the URL to http://maoyan.com/board/4?offset=20, with offset=20 and the movies ranked 21-30 displayed.
11 | So the rule is: offset is the offset value; with offset n the page shows the movies ranked n+1 to n+10, ten per page. To get the TOP100 we only need ten requests, with offset set to 0, 10, 20, ..., 90, and then extract the fields from each page with a regular expression (a minimal sketch of building these URLs is appended at the end of this README).
12 |
13 | ## Other
14 | + The target fields are extracted with regular expressions (XPath, pyquery or CSS selectors would of course work just as well)
15 | + The real page source can be inspected in the Network panel of Chrome's developer tools
16 | + To write Chinese characters instead of Unicode escapes to the file, open it with encoding="utf-8" and pass ensure_ascii=False to json.dumps before calling f.write
17 |
18 | ## Result
19 |
20 |
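21 | As a small illustration of the pagination rule above, the ten list-page URLs can be generated like this (a standalone sketch; MaoYan_Top100.py builds the same URLs inside main()):
22 | ```Python
23 | # Build the ten TOP100 list pages from the offset rule: 0, 10, ..., 90.
24 | base = 'http://maoyan.com/board/4?offset={}'
25 | urls = [base.format(offset) for offset in range(0, 100, 10)]
26 | for url in urls:
27 |     print(url)
28 | ```
29 |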
--------------------------------------------------------------------------------
/04-Selenium_Taobao/README.md:
--------------------------------------------------------------------------------
1 | ## Selenium Taobao
2 | Use Selenium + Chrome + Xdaili to crawl Taobao product data, including product image, name, price, number of buyers and shop name.
3 |
4 | ## Explain
5 | This practice follows Chapter 7 of Cui's "Python3网络爬虫开发实践".
6 |
7 | ## Demo
8 |
9 |
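10 | As a supplement, here is a minimal sketch (an assumption, not part of the project files) of how the proxy-auth plugin built by Xdaili.auth() in xdaili.py is typically attached to Chrome:
11 | ```Python
12 | from selenium import webdriver
13 | from xdaili import Xdaili
14 |
15 | Xdaili().auth()  # writes ./utils/proxy_auth_plugin.zip
16 | options = webdriver.ChromeOptions()
17 | options.add_extension('./utils/proxy_auth_plugin.zip')  # load the proxy-auth extension
18 | browser = webdriver.Chrome(chrome_options=options)
19 | browser.get('https://s.taobao.com/search?q=iPad')
20 | ```
21 |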
--------------------------------------------------------------------------------
/04-Selenium_Taobao/__pycache__/xdaili.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/__pycache__/xdaili.cpython-36.pyc
--------------------------------------------------------------------------------
/04-Selenium_Taobao/demo/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/demo/demo.gif
--------------------------------------------------------------------------------
/04-Selenium_Taobao/result/README.md:
--------------------------------------------------------------------------------
1 | ### Display 1
2 | 
3 |
4 | ### Display 2
5 | 
6 |
--------------------------------------------------------------------------------
/04-Selenium_Taobao/result/db_01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/result/db_01.jpg
--------------------------------------------------------------------------------
/04-Selenium_Taobao/result/db_02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/result/db_02.jpg
--------------------------------------------------------------------------------
/04-Selenium_Taobao/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/__init__.py
--------------------------------------------------------------------------------
/04-Selenium_Taobao/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/04-Selenium_Taobao/utils/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/04-Selenium_Taobao/utils/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # Search keyword
4 | KEYWORD = 'iPad'
5 |
6 | # URL
7 | URL = 'https://s.taobao.com/search?q='
8 |
9 | # MongoDB settings
10 | MONGO_URL = 'localhost'
11 | MONGO_DB = 'taobao'
12 | MONGO_COLLECTION = 'products'
13 |
14 | # Page-load timeout (seconds)
15 | TIMEOUT = 30
16 |
17 | # Maximum number of pages to crawl
18 | MAX_PAGE = 100
--------------------------------------------------------------------------------
/04-Selenium_Taobao/utils/proxy_auth_plugin.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/04-Selenium_Taobao/utils/proxy_auth_plugin.zip
--------------------------------------------------------------------------------
/04-Selenium_Taobao/xdaili.py:
--------------------------------------------------------------------------------
1 | # -*- encoding:utf-8 -*-
2 |
3 | from selenium import webdriver
4 |
5 | import zipfile
6 |
7 | class Xdaili(object):
8 |     def __init__(self):
9 |         """
10 |         Initialization.
11 |         """
12 |         # Proxy server
13 |         self.ip = "forward.xdaili.cn"
14 |         self.port = '80'
15 |         # Order number and personal secret (purchased from the Xdaili website)
16 |         self.orderno = "ZF2018***********"
17 |         self.secert = "**********************************"
18 |
19 |     def auth(self):
20 |         """
21 |         Build the proxy-auth Chrome extension.
22 |         :return:
23 |         """
24 |         manifest_json = """
25 |         {
26 |             "version": "1.0.0",
27 |             "manifest_version": 2,
28 |             "name": "Xdaili Proxy",
29 |             "permissions": [
30 |                 "proxy",
31 |                 "tabs",
32 |                 "unlimitedStorage",
33 |                 "storage",
34 |                 "<all_urls>",
35 |                 "webRequest",
36 |                 "webRequestBlocking"
37 |             ],
38 |             "background": {
39 |                 "scripts": ["background.js"]
40 |             },
41 |             "minimum_chrome_version":"22.0.0"
42 |         }
43 |         """
44 |
45 |         background_js = """
46 |         var config = {
47 |             mode: "fixed_servers",
48 |             rules: {
49 |                 singleProxy: {
50 |                     scheme: "http",
51 |                     host: "%(ip)s",
52 |                     port: parseInt(%(port)s)
53 |                 },
54 |                 bypassList: ["foobar.com"]
55 |             }
56 |         };
57 |
58 |         chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
59 |
60 |         function callbackFn(details) {
61 |             return {
62 |                 authCredentials: {
63 |                     username: "%(orderno)s",
64 |                     password: "%(secert)s"
65 |                 }
66 |             };
67 |         }
68 |
69 |         chrome.webRequest.onAuthRequired.addListener(
70 |             callbackFn,
71 |             {urls: ["<all_urls>"]},
72 |             ['blocking']
73 |         );
74 |         """ % {'ip': self.ip, 'port': self.port, 'orderno': self.orderno, 'secert': self.secert}
75 |         playin_file = './utils/proxy_auth_plugin.zip'
76 |         with zipfile.ZipFile(playin_file, 'w') as zp:
77 |             zp.writestr("manifest.json", manifest_json)
78 |             zp.writestr("background.js", background_js)
79 |
--------------------------------------------------------------------------------
/05-Moments/.idea/05-Moments.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/05-Moments/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/05-Moments/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/05-Moments/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/05-Moments/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/05-Moments/__pycache__/processor.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/__pycache__/processor.cpython-36.pyc
--------------------------------------------------------------------------------
/05-Moments/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import os
4 |
5 | # Device platform: Android or iOS
6 | PLANTFORM = 'Android'
7 | # Device name: run "adb devices -l" on the command line to get it
8 | DEVICE_NAME = 'vivo_X7'
9 | # App package name
10 | APP_PACKAGE = 'com.tencent.mm'
11 | # Entry activity
12 | APP_ACTIVITY = '.ui.LauncherUI'
13 |
14 | # Path to the WeChat APK (if WeChat is not installed on the phone, it can be installed and launched via this capability before the rest runs)
15 | APP = os.path.abspath('.') + '/weixin.apk'
16 |
17 | # Appium server address
18 | DRIVER_SERVER = 'http://localhost:4723/wd/hub'
19 |
20 | # Element load timeout (most of the time after logging out and back in is spent on the login and data-loading screens; adjust to your device's speed)
21 | TIMEOUT = 200
22 |
23 | # Phone number and password for WeChat login
24 | USERNAME = '132********'  # your phone number
25 | PASSWORD = '123456789'    # your WeChat account password
26 |
27 | # Swipe start point and distance
28 | FLICK_START_X = 300
29 | FLICK_START_Y = 300
30 | FLICK_DISTANCE = 500
31 |
32 | # Pause between swipes (long enough for newly loaded Moments nodes to render completely)
33 | SCROLL_SLEEP_TIME = 3
34 |
35 | # MySQL settings
36 | HOST = 'localhost'
37 | USER = 'root'
38 | PASSWORD_ = '123456'
39 | PORT = 3306
40 | DB = 'wechat'
41 |
42 | # MongoDB settings
43 | MONGO_URL = 'localhost'
44 | MONGO_DB = 'wechat'
45 | MONGO_COLLECTION = 'moments'
46 |
--------------------------------------------------------------------------------
/05-Moments/plates/SDK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/SDK.png
--------------------------------------------------------------------------------
/05-Moments/plates/content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/content.png
--------------------------------------------------------------------------------
/05-Moments/plates/contents.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/contents.png
--------------------------------------------------------------------------------
/05-Moments/plates/datetime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/datetime.png
--------------------------------------------------------------------------------
/05-Moments/plates/device_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/device_name.png
--------------------------------------------------------------------------------
/05-Moments/plates/login.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/login.png
--------------------------------------------------------------------------------
/05-Moments/plates/moment_db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/moment_db.png
--------------------------------------------------------------------------------
/05-Moments/plates/nickname.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/nickname.png
--------------------------------------------------------------------------------
/05-Moments/plates/yes-no.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/05-Moments/plates/yes-no.png
--------------------------------------------------------------------------------
/05-Moments/processor.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import time
4 | import re
5 |
6 | class Processor():
7 |     def date(self, datetime):
8 |         """
9 |         Normalize a relative timestamp (e.g. "5分钟前") to a YYYY-MM-DD date string
10 |         :param datetime: raw timestamp text
11 |         :return: formatted date
12 |         """
13 |         if re.match('\d+分钟前', datetime):
14 |             minute = re.match('(\d+)', datetime).group(1)
15 |             datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - float(minute) * 60))
16 |         if re.match('\d+小时前', datetime):
17 |             hour = re.match('(\d+)', datetime).group(1)
18 |             datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - float(hour) * 60 * 60))
19 |         if re.match('昨天', datetime):
20 |             datetime = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60))
21 |         if re.match('\d+天前', datetime):
22 |             day = re.match('(\d+)', datetime).group(1)
23 |             datetime = time.strftime('%Y-%m-%d', time.localtime((time.time()) - float(day) * 24 * 60 * 60))
24 |         return datetime
--------------------------------------------------------------------------------
/06-Github_Login/README.md:
--------------------------------------------------------------------------------
1 | ## Github Login
2 | Simulate logging in to GitHub and crawl pages that are only accessible after login, including the activity feed and profile information.
3 |
4 | ## Sort
5 | **Simulated login - requests**
6 |
7 | ## Explain
8 | #### 1. Clear cookies
9 | Clear the target site's cookies in your browser first: [how to clear them](https://blog.csdn.net/panbiao1999/article/details/77880649)
10 | #### 2. Cookies set by the browser
11 | Cookies are set after the login page is requested (i.e. http://github.com/login).
12 | #### 3. The form's verification parameter
13 | The form's authenticity_token parameter can be obtained from the source of the login page (a minimal sketch follows below).
14 |
15 |
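16 | A minimal sketch of step 3 - fetch the login page within a session and pull authenticity_token out of it (the XPath is the same one used in github_login.py below):
17 | ```Python
18 | import requests
19 | from lxml import etree
20 |
21 | session = requests.Session()   # one session so the cookies set here are reused for the login POST
22 | html = session.get('https://github.com/login').text
23 | token = etree.HTML(html).xpath('//*[@id="login"]/form/input[2]/@value')  # hidden authenticity_token value
24 | print(token)
25 | ```
26 |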
--------------------------------------------------------------------------------
/06-Github_Login/github_login.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | class Login(object):
7 |     def __init__(self):
8 |         self.headers = {
9 |             'Host': 'github.com',
10 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
11 |             'Referer': 'https://github.com'
12 |         }
13 |         self.login_url = 'https://github.com/login'
14 |         self.post_url = 'https://github.com/session'
15 |         self.logined_url = 'https://github.com/settings/profile'
16 |         self.session = requests.Session()  # keep one session so cookies are handled for us
17 |
18 |     def token(self):
19 |         response = self.session.get(self.login_url, headers=self.headers)
20 |         selector = etree.HTML(response.text)
21 |         token = selector.xpath('//*[@id="login"]/form/input[2]/@value')  # value of authenticity_token
22 |         return token
23 |
24 |     def login(self, email, password):
25 |         post_data = {
26 |             'commit': 'Sign in',
27 |             'utf8': '✓',  # the ✓ value can be copied from xpath('//*[@id="login"]/form/input[1]/@value')
28 |             'authenticity_token': self.token(),  # the authenticity_token hidden in the page source
29 |             'login': email,
30 |             'password': password
31 |         }
32 |         response = self.session.post(self.post_url, data=post_data, headers=self.headers)
33 |         if response.status_code == 200:
34 |             self.dynamics(response.text)
35 |
36 |         response = self.session.get(self.logined_url, headers=self.headers)
37 |         if response.status_code == 200:
38 |             self.profile(response.text)
39 |
40 |     def dynamics(self, html):
41 |         selector = etree.HTML(html)
42 |         dynamics = selector.xpath('//div[contains(@class, "news")]/div')  # div nodes holding the activity feed
43 |         print(len(dynamics))
44 |         div_class_values = ['watch_started', 'fork', 'follow', 'repo']  # class values of the feed entries we care about
45 |         for item in dynamics:
46 |             value = item.xpath('./@class')  # class attribute of this entry; skip it if not in the list above
47 |             value = value[0] if value else ''
48 |             print(value)
49 |             if value in div_class_values:
50 |                 text = ''.join(item.xpath('.//div[contains(@class, "flex-items-baseline")]//text()')).strip()
51 |                 print(text)
52 |
53 |     def profile(self, html):
54 |         selector = etree.HTML(html)
55 |         name = selector.xpath('//input[@id="user_profile_name"]/@value')  # user name
56 |         email = selector.xpath('//select[@id="user_profile_email"]/option[@value!=""]/text()')  # verified e-mails
57 |         print(name, email)
58 |
59 | if __name__ == "__main__":
60 |     login = Login()
61 |     login.login(email="northxw@163.com", password='your_password')
62 |
--------------------------------------------------------------------------------
/07-IGetGet/README.md:
--------------------------------------------------------------------------------
1 | # Project Name
2 | **IGetGet** - use the mitmdump component of mitmproxy to crawl e-book information from the "得到" (iGetGet) app and store it in a JSON file.
3 |
4 | # Sort
5 | **Non-automated app crawling** - a Python script captures and processes the responses returned by the server.
6 |
7 | # Demand
8 | **1. Charles** - a packet-capturing tool with good cross-platform support. addr: https://www.charlesproxy.com/download/
9 |
10 | **2. mitmproxy** - a packet-capturing program supporting HTTP and HTTPS, similar in function to Fiddler and Charles but operated from the console.
11 | ```
12 | pip3 install mitmproxy
13 | ```
14 |
15 | # Process analysis
16 | #### 1. Install the Charles certificate
17 | The procedure is basically the same on every OS. Open Charles and click "Help -> SSL Proxy -> Install Charles Root Certificate" to reach the certificate installation page, as shown below:
18 |
19 |
20 |
21 | Please search for the detailed certificate installation steps yourself.
22 |
23 | #### 2. Install the certificate on the phone
24 | **Prerequisite**: make sure Charles' HTTP proxy is enabled (default port 8888), then connect the phone and the PC to the same LAN, as shown below:
25 |
26 |
27 |
28 | Then open chls.pro/ssl in the phone's browser and the certificate installs automatically (**on Android, prefer the phone's stock browser**).
29 |
30 | #### 3. Pitfalls when installing the mitmproxy certificate
31 | For the PC-side certificate installation, please search the steps yourself. Afterwards the CA certificates can be found in the .mitmproxy directory under your user directory, as shown below:
32 |
33 |
34 |
35 | When installing on the phone, don't limit yourself to "mitmproxy-ca-cert.pem" - it may not be recognized as a CA certificate. Try transferring any of the files 1-5 shown above to the phone and use whichever installs successfully.
36 |
37 | #### 4. Database writes fail
38 | During testing, as soon as the database insert was added the console stopped showing data and the phone lost its network connection (the LAN itself was fine); with the insert commented out everything displays normally. The exact cause is still unclear, so for now the data is stored as JSON. The error looks like this:
39 |
40 |
41 |
42 | # Other
43 | The remaining open problem is database storage; there is no workable solution yet. If you know one, please open an issue.
44 |
45 | # Demo
46 | #### 1.JSON
47 |
48 |
49 | #### 2.Run Screenshot
50 |
51 |
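52 | For reference, the capture is typically started by pointing mitmdump at the script below, with the listening port matching the proxy configured on the phone (8888 here):
53 | ```
54 | mitmdump -s script.py -p 8888
55 | ```
56 |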
--------------------------------------------------------------------------------
/07-IGetGet/__pycache__/script.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/__pycache__/script.cpython-36.pyc
--------------------------------------------------------------------------------
/07-IGetGet/script.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Updated at 9:45 at March 18, 2019
5 | @title: Crawl e-book information from the iGetGet app (MongoDB storage is disabled for now; results go to JSON)
6 | @author: Northxw
7 | """
8 |
9 | import time
10 | import json
11 | # import pymongo
12 | from mitmproxy import ctx
13 |
14 | """
15 | class DedaoMongo(object):
16 |     def __init__(self):
17 |         # client
18 |         self.client = pymongo.MongoClient('localhost', 27017)
19 |         # database
20 |         self.db = self.client['dedao']
21 |         # collection
22 |         self.collection = self.db['ebook']
23 |
24 |     def update_book(self, book_info):
25 |         self.collection.insert_one(book_info)
26 |
27 | """
28 |
29 | def response(flow):
30 |     """
31 |     Capture the iGetGet e-book information: book ID, title, cover image, recommendation, publish time, current price, introduction, etc.
32 |     """
33 |     # data_ = DedaoMongo()
34 |     url = 'https://entree.igetget.com/ebook2/v1/ebook/list'
35 |     if flow.request.url.startswith(url):
36 |         text = flow.response.text
37 |         data = json.loads(text)
38 |         info = ctx.log.info
39 |         books = data.get('c').get('list')
40 |
41 |         ebooks = list()
42 |         # Collect the e-book information
43 |         for book in books:
44 |             ebook_data = {
45 |                 # ID
46 |                 'id': str(book['id']),
47 |                 # Title
48 |                 'name': book['operating_title'],
49 |                 # Cover
50 |                 'ico': book['cover'],
51 |                 # Recommendation
52 |                 'share_summary': book['other_share_summary'],
53 |                 # Publish time
54 |                 'publish_time': book['datetime'],
55 |                 # Current price
56 |                 'current_price': book['current_price'],
57 |                 # Introduction
58 |                 'book_intro': book['book_intro'],
59 |             }
60 |             # data_.update_book(ebook_data)
61 |
62 |             # Print what has been captured to the console
63 |             info('ID:' + ebook_data['id'])
64 |             info('Title:' + ebook_data['name'])
65 |             info('Recommendation:' + ebook_data['share_summary'])
66 |             info('Publish time:' + ebook_data['publish_time'])
67 |             info('Current price:' + '¥{}'.format(ebook_data['current_price']))
68 |             info('Cover:' + ebook_data['ico'])
69 |             info('Introduction:' + ebook_data['book_intro'])
70 |             info('-' * 80)
71 |
72 |             # Store as JSON
73 |             with open('./dedao.json', 'a', encoding='utf-8') as f:
74 |                 f.write(json.dumps(ebook_data, indent=2, ensure_ascii=False))
75 |                 f.write(', \n')
--------------------------------------------------------------------------------
/07-IGetGet/utils/app_error.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/app_error.jpg
--------------------------------------------------------------------------------
/07-IGetGet/utils/charles安装证书页面.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/charles安装证书页面.png
--------------------------------------------------------------------------------
/07-IGetGet/utils/connect_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/connect_error.png
--------------------------------------------------------------------------------
/07-IGetGet/utils/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/demo.gif
--------------------------------------------------------------------------------
/07-IGetGet/utils/mitmproxy证书.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/mitmproxy证书.png
--------------------------------------------------------------------------------
/07-IGetGet/utils/mongo_server_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/mongo_server_error.png
--------------------------------------------------------------------------------
/07-IGetGet/utils/result_json.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/result_json.png
--------------------------------------------------------------------------------
/07-IGetGet/utils/乱码_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/乱码_01.png
--------------------------------------------------------------------------------
/07-IGetGet/utils/乱码_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/乱码_02.png
--------------------------------------------------------------------------------
/07-IGetGet/utils/手机证书安装.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/07-IGetGet/utils/手机证书安装.png
--------------------------------------------------------------------------------
/08-Selenium_Cnki/README.md:
--------------------------------------------------------------------------------
1 | # Project Name
2 | Use Selenium to register and log in to CNKI (中国知网), recognizing its image captcha along the way.
3 |
4 | # Sort
5 | **Captcha recognition** - the common 4-character mixed letters-and-digits captcha
6 |
7 | # Demand
8 | **1. Selenium** - an older version of the Python Selenium library is recommended; newer versions did not work properly with Chrome here.
9 | ```
10 | pip3 install selenium==2.48.0
11 | ```
12 | **2. chromedriver.exe** - download_addr: http://npm.taobao.org/mirrors/chromedriver/
13 |
14 | **3. Chaojiying_Python.rar** - download_addr: http://www.chaojiying.com/download/Chaojiying_Python.rar
15 |
16 | # Process analysis
17 | #### 1. Captcha type
18 | The captcha on the CNKI registration page is the common 4-character letters-and-digits type; see Chaojiying's [captcha types and price table](http://www.chaojiying.com/price.html#table-item5).
19 |
20 | #### 2. Python OCR libraries - tesserocr, pytesseract
21 | Both third-party libraries have poor accuracy - a slightly unusual font is enough to break the result. So Chaojiying is used for recognition instead. Greyscale and binarization can be applied before recognition (commented out here, since the platform's accuracy is already quite good); the code is as follows:
22 | ```Python
23 | def handle_code(image):
24 |     """
25 |     Pre-process the captcha image
26 |     :param image: Image object
27 |     :return:
28 |     """
29 |     # Greyscale
30 |     image = image.convert("L")
31 |     # Threshold of 120 (tune as needed)
32 |     threshold = 120
33 |     table = []
34 |     for i in range(256):
35 |         if i < threshold:
36 |             table.append(0)
37 |         else:
38 |             table.append(1)
39 |     # Binarization
40 |     image = image.point(table, '1')
41 |     # Recognition result from tesserocr
42 |     result_1 = tesserocr.image_to_text(image).strip()
43 |     # Recognition result from pytesseract
44 |     result_2 = pytesseract.image_to_string(image).strip()
45 |     # print('Captcha:', result)
46 |     # Only continue when both results agree, otherwise recognize again - too costly, so this approach was dropped.
47 |     return result_1, result_2
48 | ```
49 |
50 | # Other
51 | The code can be extended further, e.g. crawling CNKI literature after login and doing data visualisation.
52 |
53 | # Demo
54 |
55 |
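56 | A minimal sketch of wiring Selenium and Chaojiying together - screenshot the captcha element and send it to the platform. The element id below is only a placeholder (not the real CNKI locator), kind 1902 is the 4-character type configured in utils/config.py, and element screenshots need a reasonably recent Selenium/chromedriver combination:
57 | ```Python
58 | from selenium import webdriver
59 | from chaojiying import Chaojiying
60 |
61 | browser = webdriver.Chrome()
62 | browser.get('http://my.cnki.net/elibregister/commonRegister.aspx')
63 | # Placeholder locator - replace with the actual captcha image element on the page.
64 | captcha = browser.find_element_by_id('checkcode')
65 | chaojiying = Chaojiying('your_username', 'your_password', 'your_soft_id')
66 | result = chaojiying.PostPic(captcha.screenshot_as_png, 1902)
67 | print(result.get('pic_str'))  # the recognized 4-character text
68 | ```
69 |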
--------------------------------------------------------------------------------
/08-Selenium_Cnki/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/08-Selenium_Cnki/__pycache__/handle_code.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/__pycache__/handle_code.cpython-36.pyc
--------------------------------------------------------------------------------
/08-Selenium_Cnki/chaojiying.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import requests
4 | from hashlib import md5
5 |
6 | class Chaojiying(object):
7 |
8 |     def __init__(self, username, password, soft_id):
9 |         self.username = username
10 |         password = password.encode('utf8')
11 |         self.password = md5(password).hexdigest()
12 |         self.soft_id = soft_id
13 |         self.base_params = {
14 |             'user': self.username,
15 |             'pass2': self.password,
16 |             'softid': self.soft_id,
17 |         }
18 |         self.headers = {
19 |             'Connection': 'Keep-Alive',
20 |             'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
21 |         }
22 |
23 |     def PostPic(self, im, codetype):
24 |         """
25 |         im: image bytes
26 |         codetype: captcha kind, see http://www.chaojiying.com/price.html
27 |         """
28 |         params = {
29 |             'codetype': codetype,
30 |         }
31 |         params.update(self.base_params)
32 |         files = {'userfile': ('ccc.jpg', im)}
33 |         r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
34 |         return r.json()
35 |
36 |     def ReportError(self, im_id):
37 |         """
38 |         im_id: ID of the mis-recognized image
39 |         """
40 |         params = {
41 |             'id': im_id,
42 |         }
43 |         params.update(self.base_params)
44 |         r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
45 |         return r.json()
46 |
47 |
48 | if __name__ == '__main__':
49 |     # Sample code provided by the Chaojiying platform, unmodified.
50 |     chaojiying = Chaojiying('超级鹰用户名', '超级鹰用户名的密码', '96001')
51 |     im = open('a.jpg', 'rb').read()
52 |     print(chaojiying.PostPic(im, 1902))
53 |
54 |
--------------------------------------------------------------------------------
/08-Selenium_Cnki/demo/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/demo/demo.gif
--------------------------------------------------------------------------------
/08-Selenium_Cnki/demo/超级鹰积分.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/08-Selenium_Cnki/demo/超级鹰积分.jpg
--------------------------------------------------------------------------------
/08-Selenium_Cnki/utils/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # CNKI username
4 | USERNAME = '要注册的用户名'
5 | # CNKI password
6 | PASSWORD = '要注册用的密码'
7 | # CNKI e-mail
8 | EMAIL = 'northxw@qq.com'
9 |
10 | # Target site
11 | URL = 'http://my.cnki.net/elibregister/commonRegister.aspx'
12 |
13 | # Chaojiying username, password, software ID and captcha kind
14 | CHAIJIYING_USERNAME = 'Northxw'
15 | CHAOJIYING_PASSWORD = '**********'
16 | CHAIJIYING_SOFT_ID = ********
17 | CHAOJIYING_KIND = 1902  # 1902 is the common 4-character letters-and-digits captcha; the kinds are listed on the platform's price table
18 |
--------------------------------------------------------------------------------
/08-Selenium_Cnki/utils/handle.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tesserocr
4 | import pytesseract
5 |
6 | def handle_code(image):
7 |     """
8 |     Pre-process the captcha image
9 |     :param image: Image object
10 |     :return:
11 |     """
12 |     image = image.convert("L")  # greyscale
13 |     threshold = 120  # threshold of 120 (tune as needed)
14 |     table = []
15 |     for i in range(256):
16 |         if i < threshold:
17 |             table.append(0)
18 |         else:
19 |             table.append(1)
20 |     image = image.point(table, '1')  # binarization
21 |     result_1 = tesserocr.image_to_text(image).strip()  # recognition result from tesserocr
22 |     result_2 = pytesseract.image_to_string(image).strip()  # recognition result from pytesseract
23 |     # print('Captcha:', result_1)
24 |     return result_1, result_2
25 |
--------------------------------------------------------------------------------
/09-Bilibili/.idea/bilibili.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/09-Bilibili/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/09-Bilibili/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/09-Bilibili/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/09-Bilibili/README.md:
--------------------------------------------------------------------------------
1 | # Project name
2 | Use Selenium to simulate logging in to Bilibili and crack the slider captcha.
3 |
4 | # Sort
5 | **Captcha recognition** - cracking a slider captcha
6 |
7 | # Install
8 | **1. Selenium** - an older version of the Python Selenium library is recommended; newer versions did not work properly with Chrome here.
9 | ```
10 | pip3 install selenium==2.48.0
11 | ```
12 | **2. chromedriver.exe** - download_addr: http://npm.taobao.org/mirrors/chromedriver/; the version must match your Chrome.
13 |
14 | # Process analysis
15 | **1. The captcha node**
16 |
17 | The Bilibili captcha appears as soon as the mouse hovers over the slider; once it shows up, just locate its node. The process is a bit tedious, so here is the locator directly:
18 | ```
19 | img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'gt_box')))
20 | ```
21 |
22 | **2. Getting the coordinates**
23 |
24 | The coordinates obtained are the top-left and bottom-right corners. The page's coordinate origin is the top-left of the screen and element positions are usually relative, so this part needs careful thought. For example, on the Bilibili login page the div containing "登录" has the div with id="login-app" as its parent, as shown below:
25 |
26 |
27 |
28 | **3. The gap offset**
29 |
30 | Walk over every coordinate of the two images and compare the RGB values of the corresponding pixels. If the difference is within a threshold the pixels are treated as equal and the next pixel is checked; once the difference exceeds the threshold, the pixels differ and the current position
31 | is taken as the gap position.
32 | ```Python
33 | def get_gap(self, image1, image2):
34 |     """
35 |     Get the gap offset
36 |     :param image1: image without the gap
37 |     :param image2: image with the gap
38 |     :return: the gap offset
39 |     """
40 |     left = 60
41 |     # Compare the two images pixel by pixel; the first differing pixel marks the gap
42 |     for i in range(left, image1.size[0]):
43 |         for j in range(image1.size[1]):
44 |             if not self.is_pixel_equal(image1, image2, i, j):
45 |                 left = i
46 |                 return left
47 |     return left
48 |
49 | def is_pixel_equal(self, image1, image2, x, y):
50 |     """
51 |     Check whether two pixels are (roughly) equal
52 |     :param image1: original Geetest image
53 |     :param image2: image with the gap
54 |     :param x: position X
55 |     :param y: position Y
56 |     :return: whether the pixels are equal
57 |     """
58 |     # Load the pixel at (x, y) from both images
59 |     pixel1 = image1.load()[x, y]
60 |     pixel2 = image2.load()[x, y]
61 |     # Threshold of 60
62 |     threshold = 60
63 |     # If the absolute RGB differences are all below the threshold, treat the pixels as equal
64 |     if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs(pixel1[2] - pixel2[2]) < threshold:
65 |         return True
66 |     else:
67 |         return False
68 | ```
69 | **4. Simulating the drag**
70 |
71 | The slider drag borrows Cui's human-like track: uniform acceleration over the first part and uniform deceleration over the rest (a sketch of such a track generator is appended at the end of this README).
72 |
73 | **5. Tapping the slider to bring up the captcha**
74 |
75 | After tapping the slider, the captcha hides itself again after two or three seconds, so do not add a delay - grab it right away.
76 |
77 | # Other
78 | The code has been updated; under normal conditions the success rate should be above 50%, depending mainly on how the server judges the boundary (probably by pixel difference).
79 |
80 | # Demo
81 |
82 |
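83 | A minimal sketch of the "accelerate, then decelerate" track mentioned in step 4. This is only the general idea, not the exact code used in bilibili.py: split the distance into time slices and apply positive acceleration for the first part and negative for the rest:
84 | ```Python
85 | def get_track(distance):
86 |     """
87 |     Build a drag track: accelerate over the first 4/5 of the distance, then decelerate.
88 |     :param distance: horizontal offset to the gap (pixels)
89 |     :return: list of per-step offsets
90 |     """
91 |     track = []
92 |     current = 0              # distance covered so far
93 |     mid = distance * 4 / 5   # switch from accelerating to decelerating here
94 |     t = 0.2                  # time slice
95 |     v = 0                    # current velocity
96 |     while current < distance:
97 |         a = 2 if current < mid else -3   # acceleration, then deceleration
98 |         v0 = v
99 |         v = v0 + a * t
100 |         move = v0 * t + 1 / 2 * a * t * t
101 |         current += move
102 |         track.append(round(move))
103 |     return track
104 | ```
105 |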
--------------------------------------------------------------------------------
/09-Bilibili/captcha1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/captcha1.png
--------------------------------------------------------------------------------
/09-Bilibili/captcha2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/captcha2.png
--------------------------------------------------------------------------------
/09-Bilibili/require/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/require/demo.gif
--------------------------------------------------------------------------------
/09-Bilibili/require/demo_location.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/require/demo_location.png
--------------------------------------------------------------------------------
/09-Bilibili/utils/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/09-Bilibili/utils/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/09-Bilibili/utils/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # Bilibili account
4 | EMAIL = 'northxw@163.com'
5 | # Password
6 | PASSWORD = '******'
7 |
8 | # Bilibili login page URL
9 | URL = 'https://passport.bilibili.com/login'
10 |
11 | BORDER = 10
12 | INIT_LEFT = 51
13 |
--------------------------------------------------------------------------------
/10-DouYin/README.md:
--------------------------------------------------------------------------------
1 | # DouYin
2 | Use mitmdump to crawl short-video information from the "抖音" (Douyin) app, including title, video download URL, author, publish time, like count, etc.
3 |
4 | # Sort
5 | **Non-automated app crawling** - app data captured via mitmproxy's mitmdump component.
6 |
7 | # Explain
8 | #### 1. Get the video API with Charles
9 | Before crawling, put the phone and the PC on the same LAN, set the phone's WiFi proxy port to 8888, then open Charles to find the URL of the video request, as shown below:
10 |
11 |
12 |
13 | #### 2. Swipe up manually to trigger the video requests
14 | Automated swiping was tried, but with the skills at hand the Douyin app login could not be automated; so the requests are triggered by hand, and Charles by itself only reveals the video download links, not the other useful fields such as video title, author name, like count or share count.
15 |
16 | #### 3. A Python script extracts the video information
17 | A Python script intercepts the responses, extracts the video information, downloads the videos and stores the metadata as JSON.
18 |
19 | #### 4. The videos have no watermark
20 | As shown below:
21 |
22 |
23 |
24 | # Other
25 | The captured data cannot be stored directly in MongoDB or other databases; the exact reason is unclear - if you know it, please open an issue.
26 |
27 | # Demo Of Screenshot
28 | #### 1.JSON
29 |
30 |
31 | #### 2.VIDEO
32 |
33 |
34 | #### 3.Demo
35 |
36 |
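37 | A small aside on the download step in script.py: since the video request is made with stream=True, the file can also be written in chunks instead of reading res.content into memory at once. A minimal sketch (the URL below is a placeholder; in script.py it comes from the aweme_list entries):
38 | ```Python
39 | import requests
40 |
41 | video_url = 'https://example.com/video.mp4'   # placeholder URL
42 | res = requests.get(video_url, stream=True)
43 | with open('example.mp4', 'wb') as f:
44 |     for chunk in res.iter_content(chunk_size=64 * 1024):   # write 64 KB at a time
45 |         if chunk:
46 |             f.write(chunk)
47 | ```
48 |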
--------------------------------------------------------------------------------
/10-DouYin/__pycache__/script.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/__pycache__/script.cpython-36.pyc
--------------------------------------------------------------------------------
/10-DouYin/plates/JSONDecodeError.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/JSONDecodeError.jpg
--------------------------------------------------------------------------------
/10-DouYin/plates/TypeError.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/TypeError.jpg
--------------------------------------------------------------------------------
/10-DouYin/plates/charles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/charles.png
--------------------------------------------------------------------------------
/10-DouYin/plates/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/demo.gif
--------------------------------------------------------------------------------
/10-DouYin/plates/video_demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/video_demo.gif
--------------------------------------------------------------------------------
/10-DouYin/plates/video_info_json.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/video_info_json.png
--------------------------------------------------------------------------------
/10-DouYin/plates/video_screentshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/10-DouYin/plates/video_screentshot.png
--------------------------------------------------------------------------------
/10-DouYin/script.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Updated at 17:57 at March 19,2019
5 | @title: 爬取抖音App短视频
6 | @author: Northxw
7 | """
8 |
9 | from mitmproxy import ctx
10 | import json
11 | import requests
12 | import time
13 | import os
14 |
15 | def response(flow):
16 | """
17 | 抓取抖音标题、APP视频链接、作者、抖音ID、发布时间、获赞数、评论和转发数等信息, 并将结果保存为JSON格式.
18 | :return: None
19 | """
20 | # 通过Charles获取的抖音视频信息的URL接口
21 | url = 'https://api.amemv.com/'
22 | if flow.request.url.startswith(url):
23 | # 获取服务器返回的响应
24 | text = flow.response.text
25 | # 转化为Json格式
26 | dyjson = json.loads(text)
27 | info = ctx.log.info
28 |
29 | # 获取视频列表
30 | aweme_list = dyjson.get('aweme_list')
31 | # 遍历列表,获取每个视频的相应数据
32 | for i in range(len(aweme_list)):
33 | # 视频标题
34 | title = aweme_list[i].get('share_info').get('share_title')
35 | # 视频链接
36 | videourl = aweme_list[i].get('video').get('play_addr').get('url_list')[0]
37 | # 保存视频
38 | res = requests.get(videourl, stream=True)
39 | # 规范文件命名
40 | _str = ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '.']
41 | for _ in _str:
42 | if _ in title:
43 | title = title.replace(_, '')
44 | # 判断文件路径是否存在
45 | save_dir = './video/'
46 | if not os.path.exists(save_dir):
47 | os.mkdir(save_dir)
48 | with open('{}/{}.mp4'.format(save_dir, title), 'wb') as f:
49 | f.write(res.content)
50 |
51 | # 作者名称
52 | nickname = aweme_list[i].get('author').get('nickname')
53 | # 抖音ID
54 | short_id = aweme_list[i].get('author').get('short_id')
55 | # 发布时间
56 | create_time = aweme_list[i].get('create_time')
57 | # 格式化
58 | create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(create_time))
59 | # 获赞、评论、转发数
60 | digg_count = aweme_list[i].get('statistics').get('digg_count')
61 | comment_count = aweme_list[i].get('statistics').get('comment_count')
62 | share_count = aweme_list[i].get('statistics').get('share_count')
63 |
64 | # 显示所有获取信息
65 | info("标题:" + title)
66 | info("URL:" + videourl)
67 | info("作者: " + nickname)
68 | info("ID: " + short_id)
69 | info("发布时间: " + create_time)
70 | info("获赞:" + str(digg_count))
71 | info("评论:" + str(comment_count))
72 | info("转发:" + str(share_count))
73 | info('-'*80)
74 |
75 | # 保存为json文件
76 | data = {
77 | 'title': title,
78 | 'url': videourl,
79 | 'nickname': nickname,
80 | 'douyin_id': short_id,
81 | 'create_time': create_time,
82 | 'diggs': digg_count,
83 | 'comments': comment_count,
84 | 'shares': share_count
85 | }
86 |
87 | # 追加写入本地JSON文件
88 | with open('./douyin.json', 'a', encoding='utf-8') as f:
89 | f.write(json.dumps(data, indent=2, ensure_ascii=False))
90 | f.write(', \n')
91 |
--------------------------------------------------------------------------------
/11-Jianshu/README.md:
--------------------------------------------------------------------------------
1 | ## JianShu
2 | Combine Appium and mitmdump to automatically collect the recommended articles on Jianshu's "Discover" page, including the article title, author, number of comments, number of likes, page views and more; a short run sketch is given at the end of this README.
3 |
4 | ## Demo
5 | 
6 |
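7 | ## Run (sketch)
8 | A rough run order, assuming the Appium server is already started and the phone's HTTP proxy points at the machine running mitmdump (the port below is only an example, use whatever you configured):
9 | ```
10 | # Terminal 1: start mitmdump with the parsing addon from this folder
11 | mitmdump -s script.py -p 8889
12 | # Terminal 2: drive the Jianshu app so it requests the "Discover" feed
13 | python action.py
14 | ```
15 |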
--------------------------------------------------------------------------------
/11-Jianshu/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/11-Jianshu/__pycache__/script.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/__pycache__/script.cpython-36.pyc
--------------------------------------------------------------------------------
/11-Jianshu/action.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from appium import webdriver
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from selenium.common.exceptions import NoSuchElementException
8 | from time import sleep
9 | from utils.config import *
10 |
11 | class JianshuAction(object):
12 | def __init__(self):
13 | """
14 | 初始化信息
15 | """
16 | # 驱动配置
17 | self.desired_caps = {
18 | "platformName": PLATFORM,
19 | "deviceName": DEVICE_NAME,
20 | "appPackage": APP_PACKAGE,
21 | "appActivity": APP_ACTIVITY
22 | }
23 | self.driver = webdriver.Remote(DRIVER_SERVER, self.desired_caps)
24 | self.wait = WebDriverWait(self.driver, TIMEOUT)
25 |
26 | def login(self):
27 | """
28 | 登录
29 | :return: None
30 | """
31 | # 点击"我的"进入登录界面
32 | tab_login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/tab_more')))
33 | tab_login.click()
34 | sleep(3)
35 | # 点击"头像"登录简书
36 | image_login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/user_top_info_avatar')))
37 | image_login.click()
38 | sleep(3)
39 |
40 | # 用户
41 | # account = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/et_account')))
42 | # account.send_keys(USER_PHONENUMBER)
43 | # 密码
44 | # password = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/et_password')))
45 | # password.send_keys(PASSWORD)
46 |
47 | # 选择"微信登录"省略输入账号密码的步骤
48 | weixin_login = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/iv_wechat')))
49 | weixin_login.click()
50 | sleep(10)
51 |
52 | # 解释:因为之前已经微信授权,所以这里直接登录进入个人页面
53 |
54 | # 点击"发现"进入文章推荐页面
55 | discover = self.wait.until(EC.presence_of_element_located((By.ID, 'com.jianshu.haruki:id/tab_discover')))
56 | discover.click()
57 | sleep(3)
58 |
59 | def scroll(self):
60 | """
61 | 上滑页面、触发请求
62 | :return:None
63 | """
64 | # 由于推荐页面的文章数目很多,当前仅获取部分文章信息。
65 | count = 1000 # 可灵活配置该参数
66 | while count > 0:
67 | # 模拟拖动
68 | self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y)
69 | sleep(SCROLL_SLEEP_TIME)
70 | count = count - 1
71 |
72 | def main(self):
73 | """
74 | 主函数
75 | :return:
76 | """
77 | self.login()
78 | self.scroll()
79 |
80 | if __name__ == '__main__':
81 | action = JianshuAction()
82 | action.main()
83 |
--------------------------------------------------------------------------------
/11-Jianshu/demo/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/demo/demo.gif
--------------------------------------------------------------------------------
/11-Jianshu/result/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/result/__init__.py
--------------------------------------------------------------------------------
/11-Jianshu/script.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Created at 20:50 on Nov 13,2018
5 | @title: 自动化抓取简书文章信息
6 | @author: Northxw
7 | """
8 |
9 | from mitmproxy import ctx
10 | import json
11 |
12 | def response(flow):
13 | """
14 | 爬取简书"发现"页面的推荐文章信息,包括文章标题、作者、ID、喜欢数、评论数、获赏数、阅读量等
15 | :return: None
16 | """
17 | url = 'https://s0.jianshuapi.com/'
18 | url_ = 'https://s0.jianshuapi.com/v3/trending/now3?'
19 | if flow.request.url.startswith(url):
20 | if flow.request.url.startswith(url_):
21 | text = flow.response.text # 获取响应
22 | data = json.loads(text)
23 | info = ctx.log.info
24 |
25 | # 获取文章信息列表
26 | for i in range(len(data)):
27 | # 文章标题
28 | title = data[i].get('object').get('data').get('title')
29 | # ID
30 | id = data[i].get('object').get('data').get('user').get('id')
31 | # 作者
32 | author = data[i].get('object').get('data').get('user').get('nickname')
33 | # 获得的"喜欢"
34 | likes_count = data[i].get('object').get('data').get('likes_count')
35 | # 评论数
36 | comments_count = data[i].get('object').get('data').get('comments_count')
37 | # 获得的"赞赏"
38 | total_rewards_count = data[i].get('object').get('data').get('total_rewards_count')
39 | # 阅读数
40 | views_count = data[i].get('object').get('data').get('views_count')
41 |
42 | # 显示获取的信息
43 | info('总数据' + str(len(data)))
44 | info('文章标题:' + title)
45 | info('作者:' + author)
46 | info('ID:' + str(id))
47 | info('喜欢:' + str(likes_count))
48 | info('评论:' + str(comments_count))
49 | info('赞赏:' + str(total_rewards_count))
50 | info('阅读量:' + str(views_count))
51 | info('-'*80)
52 |
53 | # 存储为JSON文件
54 | data_ = {
55 | 'title': title,
56 | 'id': id,
57 | 'author': author,
58 | 'likes': likes_count,
59 | 'comments': comments_count,
60 | 'rewards': total_rewards_count,
61 | 'views': views_count,
62 | }
63 | with open('./result/jianshu.json', 'a', encoding='utf-8') as f:
64 | f.write(json.dumps(data_, indent=2, ensure_ascii=False))
65 | f.write(', \n')
66 |
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/11-Jianshu/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/11-Jianshu/utils/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/11-Jianshu/utils/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/11-Jianshu/utils/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # Appium 服务器
4 | DRIVER_SERVER = 'http://localhost:4723/wd/hub'
5 |
6 | # 启动参数:设备类型、名称、APP包名、入口类型
7 | PLATFORM = 'Android'
8 | DEVICE_NAME = 'vivo_X7'
9 | APP_PACKAGE = 'com.jianshu.haruki'
10 | APP_ACTIVITY = 'com.baiji.jianshu.MainActivity'
11 |
12 | # 简书账号、密码
13 | USER_PHONENUMBER = '********'
14 | PASSWORD = '********'
15 |
16 | # 等待时间
17 | TIMEOUT = 100
18 |
19 | # 滑动点
20 | FLICK_START_X = 300
21 | FLICK_START_Y = 300
22 | FLICK_DISTANCE = 600
23 |
24 | # 滑动的间隔时间
25 | SCROLL_SLEEP_TIME = 3
26 |
--------------------------------------------------------------------------------
/12-Crack_Jianshu/.idea/Jianshu.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/12-Crack_Jianshu/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/12-Crack_Jianshu/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/12-Crack_Jianshu/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/12-Crack_Jianshu/README.md:
--------------------------------------------------------------------------------
1 | # Crack Jianshu
2 | Use Selenium to simulate logging in to the Jianshu website and recognize its click-type captcha.
3 |
4 | # Category
5 | **Captcha recognition** - click-type captcha
6 |
7 | # Requirements
8 | **1. Selenium**
9 | ```
10 | pip3 install selenium==2.48.0
11 | ```
12 | **2. chromedriver.exe** - download: http://npm.taobao.org/mirrors/chromedriver/
13 |
14 | **3. Chaojiying_Python.rar** - download: http://www.chaojiying.com/download/Chaojiying_Python.rar
15 |
16 | # Process analysis
17 | ### 1. Do not run the simulated login too often
18 | If you run the simulated login and captcha recognition repeatedly, the captcha becomes more and more blurred until it is hard to recognize, and clicking the "confirm" button after recognition no longer logs you in (the login button effectively stops working). The failing spot is shown below:
19 |
20 | 
21 |
22 | ### 2. Chaojiying
23 | The [Chaojiying captcha-solving platform](http://www.chaojiying.com/) reaches a recognition rate above 90%. Registering on the platform and binding WeChat gives you 1000 free credits, which is basically enough. My credit balance is shown below:
24 |
25 | 
26 |
27 | ### 3. Chaojiying software ID and captcha type
28 | The software ID works like a badge (or passport) and must be sent with every recognition request; the captcha type has to be looked up on the [platform](http://www.chaojiying.com/price.html#table-item5). For this project the captcha type is **9004 - multiple coordinates, returns 1~4 points**.
29 |
30 | ### 4. Recognition approach (brief)
31 | First, locate the captcha and take a screenshot of the page; then crop out the captcha image and send it to Chaojiying as a byte stream; finally, convert the recognition result into coordinates and click them with Selenium to log in. A minimal sketch is given at the end of this README.
32 |
33 | # Other
34 | The functions left as pass in the code are reserved for a future feature: scraping Jianshu article information. Feel free to complete them.
35 |
36 | # Demo
37 | 
38 |
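39 | # Recognition sketch
40 | A minimal Python sketch of the flow from "Process analysis - 4". It reuses utils/chaojiying.py and utils/config.py from this folder; the element locator and the crop logic are illustrative assumptions, not the exact code in jianshu.py.
41 | ```
42 | from io import BytesIO
43 | from PIL import Image
44 | from selenium import webdriver
45 | from selenium.webdriver import ActionChains
46 | from utils.chaojiying import Chaojiying_Client
47 | from utils.config import *
48 |
49 | browser = webdriver.Chrome()
50 | browser.get(URL)
51 | # locate the click-captcha element (this class name is an assumption)
52 | captcha = browser.find_element_by_class_name('geetest_widget')
53 | # screenshot the page and crop out the captcha area
54 | left, top = captcha.location['x'], captcha.location['y']
55 | right, bottom = left + captcha.size['width'], top + captcha.size['height']
56 | page = Image.open(BytesIO(browser.get_screenshot_as_png()))
57 | captcha_image = page.crop((left, top, right, bottom))
58 | buffer = BytesIO()
59 | captcha_image.save(buffer, 'PNG')
60 | # send the image bytes to Chaojiying; type 9004 returns 1~4 coordinates
61 | client = Chaojiying_Client(CHAIJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAIJIYING_SOFT_ID)
62 | result = client.PostPic(buffer.getvalue(), CHAOJIYING_KIND)
63 | # a successful answer looks like 'x1,y1|x2,y2'; click each point inside the captcha
64 | for point in result['pic_str'].split('|'):
65 |     x, y = map(int, point.split(','))
66 |     ActionChains(browser).move_to_element_with_offset(captcha, x, y).click().perform()
67 | ```
68 |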
--------------------------------------------------------------------------------
/12-Crack_Jianshu/captcha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/captcha.png
--------------------------------------------------------------------------------
/12-Crack_Jianshu/require/chaojiying.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/chaojiying.png
--------------------------------------------------------------------------------
/12-Crack_Jianshu/require/code_demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/code_demo.png
--------------------------------------------------------------------------------
/12-Crack_Jianshu/require/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/demo.gif
--------------------------------------------------------------------------------
/12-Crack_Jianshu/require/超级鹰返回结果处理示例.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/require/超级鹰返回结果处理示例.png
--------------------------------------------------------------------------------
/12-Crack_Jianshu/utils/__pycache__/chaojiying.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/utils/__pycache__/chaojiying.cpython-36.pyc
--------------------------------------------------------------------------------
/12-Crack_Jianshu/utils/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/12-Crack_Jianshu/utils/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/12-Crack_Jianshu/utils/chaojiying.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf-8
3 |
4 | import requests
5 | from hashlib import md5
6 |
7 | class Chaojiying_Client(object):
8 |
9 | def __init__(self, username, password, soft_id):
10 | self.username = username
11 | self.password = md5(password.encode('utf-8')).hexdigest()
12 | self.soft_id = soft_id
13 | self.base_params = {
14 | 'user': self.username,
15 | 'pass2': self.password,
16 | 'softid': self.soft_id,
17 | }
18 | self.headers = {
19 | 'Connection': 'Keep-Alive',
20 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
21 | }
22 |
23 | def PostPic(self, im, codetype):
24 | """
25 | im: 图片字节
26 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html
27 | """
28 | params = {
29 | 'codetype': codetype,
30 | }
31 | params.update(self.base_params)
32 | files = {'userfile': ('ccc.jpg', im)}
33 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
34 | return r.json()
35 |
36 | def ReportError(self, im_id):
37 | """
38 | im_id:报错题目的图片ID
39 | """
40 | params = {
41 | 'id': im_id,
42 | }
43 | params.update(self.base_params)
44 | r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
45 | return r.json()
--------------------------------------------------------------------------------
/12-Crack_Jianshu/utils/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # 简书登录界面URL
4 | URL = 'https://www.jianshu.com/sign_in'
5 |
6 | # 邮箱(手机号)、密码
7 | EMAIL = 'northxw@163.com'
8 | PASSWORD = '******'
9 |
10 | # 超级鹰用户名、密码、软件ID、验证码类型
11 | CHAIJIYING_USERNAME = 'Northxw'
12 | CHAOJIYING_PASSWORD = '******'
13 | CHAIJIYING_SOFT_ID = '******'
14 | CHAOJIYING_KIND = 9004
15 |
16 | # 显式加载时间
17 | TIME_OUT = 15
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/README.md:
--------------------------------------------------------------------------------
1 | ## Pyspider Lagou
2 | This exercise uses the Pyspider framework written by **binux** to crawl job postings published on Lagou, mainly including the hiring company, position, salary, job requirements and job description.
3 |
4 | ## Explain
5 | The submitted code passed single-step testing in the Pyspider WebUI without bugs; please use demo.py for testing. If you run into the certificate-related 599 error, see "[599 Error workaround](https://www.jianshu.com/p/6900cce4e488)" and the note at the end of this README. For other bugs, feel free to contact me by email.
6 |
7 | ## Demo
8 | 
9 |
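10 | ## Note on HTTP 599
11 | The certificate-related 599 error mentioned above is usually worked around by disabling certificate validation on each request, which is exactly what demo.py and lagou.py already do:
12 | ```
13 | self.crawl('https://www.lagou.com/zhaopin/Python/',
14 |            callback=self.index_page,
15 |            validate_cert=False,
16 |            params={'labelWords': 'label'})
17 | ```
18 |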
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/data/project.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/project.db
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/data/result.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/result.db
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/data/scheduler.1d:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/scheduler.1d
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/data/scheduler.1h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/scheduler.1h
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/data/scheduler.all:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/scheduler.all
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/data/task.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/data/task.db
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Created on 2018-11-16 11:48:05
4 | # Project: Lagou
5 |
6 | from pyspider.libs.base_handler import *
7 | import time
8 |
9 | class Handler(BaseHandler):
10 | crawl_config = {
11 | 'headers': {
12 | 'Host': 'www.lagou.com',
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
14 | },
15 | }
16 |
17 | @every(minutes=24 * 60)
18 | def on_start(self):
19 | self.crawl('https://www.lagou.com/zhaopin/Python/', callback=self.index_page, validate_cert=False,
20 | params={'labelWords': 'label'})
21 |
22 | @config(age=2 * 60 * 60)
23 | def index_page(self, response):
24 | for each in response.doc('.position_link').items():
25 | self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
26 | time.sleep(1)
27 | # 获取下一页链接
28 | next = response.doc('.item_con_pager a:last-child').attr.href
29 | self.crawl(next, callback=self.index_page, validate_cert=False)
30 |
31 | @config(priority=2)
32 | def detail_page(self, response):
33 | return {
34 | "company": response.doc('.job-name > .company').text(),
35 | "job": response.doc('.job-name > .name').text(),
36 | "salary": response.doc('.salary').text(),
37 | "other": response.doc('.job_request span').text().split('/')[1:-1],
38 | "labels": response.doc('.job_request li').text(),
39 | "publish_time": "".join(response.doc('.publish_time').text().split()),
40 | "job_advantage": response.doc('.job-advantage > p').text(),
41 | "job_description": response.doc('.job_bt p').text(),
42 | "work_address": response.doc('.work_addr').text().replace('查看地图', '')
43 | }
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/lagou.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 | # Created on 2018-11-16 11:48:05
4 | # Project: Lagou
5 |
6 | from pyspider.libs.base_handler import *
7 | from pymongo import MongoClient
8 | import time
9 |
10 |
11 | class Mongo(object):
12 | def __init__(self):
13 | # 初始化数据库
14 | self.client = MongoClient()
15 | self.db = self.client['lagou']
16 | self.collection = self.db['python']
17 |
18 | def insert(self, data):
19 | # 将字典数据插入到数据库
20 | if data:
21 | self.collection.insert(data)
22 |
23 | def __del__(self):
24 | # 关闭数据库连接
25 | self.client.close()
26 |
27 |
28 | class Agent_abuyun(object):
29 | def __init__(self):
30 | self.proxyHost = "proxy.abuyun.com"
31 | self.proxyPort = "9010"
32 | self.proxyUser = "H72RXH024162Y0VD"
33 | self.proxyPass = "E8A5838333933FFE"
34 |
35 | def ip_port(self):
36 | # 代理隧道验证信息
37 | proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
38 | "host": self.proxyHost,
39 | "port": self.proxyPort,
40 | "user": self.proxyUser,
41 | "pass": self.proxyPass,
42 | }
43 | proxies = {
44 | "http": proxyMeta,
45 | "https": proxyMeta,
46 | }
47 | return proxies
48 |
49 |
50 | class Handler(BaseHandler):
51 | crawl_config = {
52 | 'headers': {
53 | 'Host': 'www.lagou.com',
54 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
55 | },
56 | 'proxy': Agent_abuyun().ip_port(),
57 | 'mongo': Mongo(),
58 | }
59 |
60 | @every(minutes=24 * 60)
61 | def on_start(self):
62 | self.crawl('https://www.lagou.com/zhaopin/Python/', callback=self.index_page, validate_cert=False,
63 | params={'labelWords': 'label'})
64 |
65 | # 设置任务有效期为两个小时(因为一般为30个页面左右)
66 | @config(age=2 * 60 * 60)
67 | def index_page(self, response):
68 | for each in response.doc('.position_link').items():
69 | self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
70 | time.sleep(1)
71 | # 获取下一页链接
72 | next = response.doc('.item_con_pager a:last-child').attr.href
73 | self.crawl(next, callback=self.index_page, validate_cert=False)
74 |
75 | @config(priority=2)
76 | def detail_page(self, response):
77 | return {
78 | "company": response.doc('.job-name > .company').text(),
79 | "job": response.doc('.job-name > .name').text(),
80 | "salary": response.doc('.salary').text(),
81 | "other": response.doc('.job_request span').text().split('/')[1:-1],
82 | "labels": response.doc('.job_request li').text(),
83 | "publish_time": "".join(response.doc('.publish_time').text().split()),
84 | "job_advantage": response.doc('.job-advantage > p').text(),
85 | "job_description": response.doc('.job_bt p').text(),
86 | "work_address": response.doc('.work_addr').text().replace('查看地图', '')
87 | }
88 |
89 | def on_result(self, data):
90 | self.crawl_config['mongo'].insert(data)
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/result/db.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/db.jpg
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/result/单步测试结果_01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_01.jpg
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/result/单步测试结果_02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_02.jpg
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/result/单步测试结果_03.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_03.jpg
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/result/单步测试结果_04.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_04.jpg
--------------------------------------------------------------------------------
/13-Pyspider_Lagou/result/单步测试结果_05.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/13-Pyspider_Lagou/result/单步测试结果_05.jpg
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/README.md:
--------------------------------------------------------------------------------
1 | ## Scrapy Tutorial
2 | A first taste of Scrapy - use the Scrapy framework to scrape the quotes.toscrape.com tutorial site, collecting the quote text, author and tags.
3 |
4 | ## Tip
5 | This exercise follows Chapter 13 of Cui Qingcai's "Python3网络爬虫开发实战", with a few small optimizations on top. The code can be run directly in PyCharm, you only need to **run main.py**. An optional JSON-export variant is shown at the end of this README.
6 |
7 | ## Demo
8 | 
9 |
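10 | ## Optional: quick JSON export
11 | If you only want a local JSON dump and no MongoDB, a small variant of main.py can rely on Scrapy's built-in feed export (the output filename is arbitrary; remove MongoPipeline from ITEM_PIPELINES if MongoDB is not running):
12 | ```
13 | from scrapy.cmdline import execute
14 |
15 | # -o writes all scraped items to a JSON feed file
16 | execute('scrapy crawl quotes -o quotes.json'.split())
17 | ```
18 |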
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/demo.gif
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = tutorial.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tutorial
12 |
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__init__.py
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class QuoteItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | text = scrapy.Field()
15 | author = scrapy.Field()
16 | tags = scrapy.Field()
17 |
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/main.py:
--------------------------------------------------------------------------------
1 | from scrapy.cmdline import execute
2 |
3 | execute('scrapy crawl quotes'.split())
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.exceptions import DropItem
9 | import pymongo
10 |
11 | class TextPipeline(object):
12 | def __init__(self):
13 | self.limit = 50
14 |
15 | def process_item(self, item, spider):
16 | if item['text']:
17 | if len(item['text']) > self.limit:
18 | item['text'] = item['text'][0:self.limit].rstrip() + '...'
19 | return item
20 | else:
21 | raise DropItem('Missing Text')
22 |
23 | class MongoPipeline(object):
24 | def __init__(self, mongo_url, mongo_db):
25 | self.mongo_url = mongo_url
26 | self.mongo_db = mongo_db
27 |
28 | @classmethod
29 | def from_crawler(cls, crawler):
30 | return cls(
31 | mongo_url=crawler.settings.get('MONGO_URL'),
32 | mongo_db = crawler.settings.get('MONGO_DB')
33 | )
34 |
35 | def open_spider(self, spider):
36 | self.client = pymongo.MongoClient(self.mongo_url)
37 | self.db = self.client[self.mongo_db]
38 |
39 | def process_item(self, item, spider):
40 | name = item.__class__.__name__
41 | self.db[name].insert(dict(item))
42 | return item
43 |
44 | def close_spider(self, spider):
45 | self.client.close()
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for tutorial project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'tutorial'
13 |
14 | SPIDER_MODULES = ['tutorial.spiders']
15 | NEWSPIDER_MODULE = 'tutorial.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'tutorial.middlewares.TutorialDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'tutorial.pipelines.TextPipeline': 300,
69 | 'tutorial.pipelines.MongoPipeline': 400,
70 | }
71 |
72 | # MongoDB SETTINGS
73 | MONGO_URL = 'localhost'
74 | MONGO_DB = 'tutorial'
75 |
76 | # Enable and configure the AutoThrottle extension (disabled by default)
77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
78 | #AUTOTHROTTLE_ENABLED = True
79 | # The initial download delay
80 | #AUTOTHROTTLE_START_DELAY = 5
81 | # The maximum download delay to be set in case of high latencies
82 | #AUTOTHROTTLE_MAX_DELAY = 60
83 | # The average number of requests Scrapy should be sending in parallel to
84 | # each remote server
85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
86 | # Enable showing throttling stats for every response received:
87 | #AUTOTHROTTLE_DEBUG = False
88 |
89 | # Enable and configure HTTP caching (disabled by default)
90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
91 | #HTTPCACHE_ENABLED = True
92 | #HTTPCACHE_EXPIRATION_SECS = 0
93 | #HTTPCACHE_DIR = 'httpcache'
94 | #HTTPCACHE_IGNORE_HTTP_CODES = []
95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
96 |
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/quotes.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/14-Scrapy_Tutorial/tutorial/tutorial/spiders/__pycache__/quotes.cpython-36.pyc
--------------------------------------------------------------------------------
/14-Scrapy_Tutorial/tutorial/tutorial/spiders/quotes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from ..items import QuoteItem
4 | from traceback import format_exc, print_exc
5 |
6 | class QuotesSpider(scrapy.Spider):
7 | name = 'quotes'
8 | allowed_domains = ['quotes.toscrape.com']
9 | start_urls = ['http://quotes.toscrape.com/']
10 |
11 | def parse(self, response):
12 | next = None
13 | quotes = response.css('.quote')
14 | for quote in quotes:
15 | item = QuoteItem()
16 | item['text'] = quote.css('.text::text').extract_first()
17 | item['author'] = quote.css('.author::text').extract_first()
18 | item['tags'] = quote.css('.tags .tag::text').extract()
19 | yield item
20 | try:
21 | next = response.css('.pager .next a::attr("href")').extract_first()
22 | except Exception as e:
23 | _ = e # 接收异常
24 | next = None
25 | if next:
26 | url = response.urljoin(next)
27 | yield scrapy.Request(url=url, callback=self.parse)
28 |
--------------------------------------------------------------------------------
/15-Scrapy_Images360/README.md:
--------------------------------------------------------------------------------
1 | ## Scrapy Images360
2 | Scrapy in practice - use the Scrapy framework to scrape image information from image.so.com, including the image ID, title and download URL.
3 |
4 | ## Tip
5 | This exercise again follows Chapter 13 of Cui Qingcai's "Python3网络爬虫开发实战"; on top of the original code it adds a UA middleware (random User-Agent), summarized briefly at the end of this README. Keep it up!
6 |
7 | ## Demo of Images
8 | 
9 |
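10 | ## Random User-Agent in short
11 | The UA middleware mentioned in the Tip boils down to two pieces, condensed here from middlewares.py and settings.py of this project:
12 | ```
13 | # settings.py: register the downloader middleware
14 | DOWNLOADER_MIDDLEWARES = {
15 |     'images360.middlewares.UAMiddleware': 543,
16 | }
17 |
18 | # middlewares.py: pick a random UA for every outgoing request
19 | def process_request(self, request, spider):
20 |     request.headers['User-Agent'] = random.choice(self.ua_list)
21 | ```
22 |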
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__init__.py
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/images/t01a3ee5a4ff05fe133.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01a3ee5a4ff05fe133.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/images/t01a5f844c4a5d5ed7d.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01a5f844c4a5d5ed7d.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/images/t01ad50ec608cde5fdc.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01ad50ec608cde5fdc.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/images/t01aed1278f885e26ec.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01aed1278f885e26ec.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/images/t01b29ea494ffdab388.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01b29ea494ffdab388.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/images/t01bf8bb6d4c6b93fff.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01bf8bb6d4c6b93fff.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/images/t01c2bb853e048be307.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/images/t01c2bb853e048be307.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 | class Images360Item(Item):
11 | # MongoDB、Mysql存储的表格名称
12 | collection = table = 'images'
13 | # ID
14 | id = Field()
15 | # 链接
16 | url = Field()
17 | # 标题
18 | title = Field()
19 | # 缩略图
20 | thumb = Field()
21 |
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from scrapy.cmdline import execute
4 |
5 | execute("scrapy crawl images".split())
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | import random
9 | import logging
10 |
11 | class UAMiddleware(object):
12 | def __init__(self):
13 | # 添加UA
14 | self.ua_list = [
15 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
16 | '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
17 | 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
18 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
19 | 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
20 | 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
21 | ]
22 |
23 | def process_request(self, request, spider):
24 | user_agent = random.choice(self.ua_list)
25 | request.headers['User-Agent'] = user_agent
26 | # 通过打印日志查看随机User-Agent
27 | # logging.info(request.url)
28 | # logging.info(request.headers['User-Agent'])
29 |
30 | def process_response(self, request, response, spider):
31 | return response
32 |
33 | def process_exception(self, request, exception, spider):
34 | pass
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy import Request
9 | from scrapy.exceptions import DropItem
10 | from scrapy.pipelines.images import ImagesPipeline
11 | import pymongo
12 | import pymysql
13 |
14 | class MongoPipeline(object):
15 | def __init__(self, mongo_url, mongo_db):
16 | self.mongo_url = mongo_url
17 | self.mongo_db = mongo_db
18 |
19 | @classmethod
20 | def from_crawler(cls, crawler):
21 | # 通过crawler对象拿到Scrapy的所有核心组件(如全局配置信息)并创建一个Pipeline实例
22 | return cls(
23 | mongo_url=crawler.settings.get('MONGO_URL'),
24 | mongo_db=crawler.settings.get('MONGO_DB')
25 | )
26 |
27 | def open_spider(self, spider):
28 | # 创建数据库连接对象
29 | self.client = pymongo.MongoClient(self.mongo_url)
30 | # 指定数据库
31 | self.db = self.client[self.mongo_db]
32 |
33 | def process_item(self, item, spider):
34 | # 将数据插入到指定的表格
35 | self.db[item.collection].insert(dict(item))
36 | return item
37 |
38 | def close_spider(self, spider):
39 | # 关闭数据库连接
40 | self.client.close()
41 |
42 |
43 | class MysqlPipeline():
44 | def __init__(self, host, database, user, password, port):
45 | self.host = host
46 | self.database = database
47 | self.user = user
48 | self.password = password
49 | self.port = port
50 |
51 | @classmethod
52 | def from_crawler(cls, crawler):
53 | return cls(
54 | host=crawler.settings.get('MYSQL_HOST'),
55 | database=crawler.settings.get('MYSQL_DATABASE'),
56 | user=crawler.settings.get('MYSQL_USER'),
57 | password=crawler.settings.get('MYSQL_PASSWORD'),
58 | port=crawler.settings.get('MYSQL_PORT'),
59 | )
60 |
61 | def open_spider(self, spider):
62 | self.db = pymysql.connect(self.host, self.user, self.password, self.database, charset='utf8',
63 | port=self.port)
64 | self.cursor = self.db.cursor()
65 |
66 | def process_item(self, item, spider):
67 | data = dict(item)
68 | keys = ', '.join(data.keys())
69 | values = ', '.join(['%s'] * len(data))
70 | sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
71 | self.cursor.execute(sql, tuple(data.values()))
72 | self.db.commit()
73 | return item
74 |
75 | def close_spider(self, spider):
76 | self.db.close()
77 |
78 | class ImagePipeline(ImagesPipeline):
79 | def file_path(self, request, response=None, info=None):
80 | url = request.url
81 | file_name = url.split('/')[-1]
82 | return file_name
83 |
84 | def item_completed(self, results, item, info):
85 | image_paths = [x['path'] for ok, x in results if ok]
86 | if not image_paths:
87 | raise DropItem('Image Downloaded Failed')
88 | return item
89 |
90 | def get_media_requests(self, item, info):
91 | yield Request(item['url'])
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for images360 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'images360'
13 |
14 | SPIDER_MODULES = ['images360.spiders']
15 | NEWSPIDER_MODULE = 'images360.spiders'
16 |
17 | MAX_PAGE = 50
18 |
19 | # MongoDB Settings
20 | MONGO_URL = 'localhost'
21 | MONGO_DB = 'images360'
22 |
23 | # Mysql Settings
24 | MYSQL_HOST = 'localhost'
25 | MYSQL_DATABASE = 'images360'
26 | MYSQL_USER = 'root'
27 | MYSQL_PASSWORD = '0000'
28 | MYSQL_PORT = 3306
29 |
30 | # Image Path
31 | IMAGES_STORE = './images'
32 |
33 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
34 | #USER_AGENT = 'images360 (+http://www.yourdomain.com)'
35 |
36 | # Obey robots.txt rules
37 | ROBOTSTXT_OBEY = False
38 |
39 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
40 | #CONCURRENT_REQUESTS = 32
41 |
42 | # Configure a delay for requests for the same website (default: 0)
43 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
44 | # See also autothrottle settings and docs
45 | #DOWNLOAD_DELAY = 3
46 | # The download delay setting will honor only one of:
47 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
48 | #CONCURRENT_REQUESTS_PER_IP = 16
49 |
50 | # Disable cookies (enabled by default)
51 | #COOKIES_ENABLED = False
52 |
53 | # Disable Telnet Console (enabled by default)
54 | #TELNETCONSOLE_ENABLED = False
55 |
56 | # Override the default request headers:
57 | #DEFAULT_REQUEST_HEADERS = {
58 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59 | # 'Accept-Language': 'en',
60 | #}
61 |
62 | # Enable or disable spider middlewares
63 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
64 | #SPIDER_MIDDLEWARES = {
65 | # 'images360.middlewares.Images360SpiderMiddleware': 543,
66 | #}
67 |
68 | # Enable or disable downloader middlewares
69 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
70 | DOWNLOADER_MIDDLEWARES = {
71 | 'images360.middlewares.UAMiddleware': 543,
72 | }
73 |
74 | # Enable or disable extensions
75 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
76 | #EXTENSIONS = {
77 | # 'scrapy.extensions.telnet.TelnetConsole': None,
78 | #}
79 |
80 | # Configure item pipelines
81 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
82 | ITEM_PIPELINES = {
83 | 'images360.pipelines.MongoPipeline': 300,
84 | 'images360.pipelines.MysqlPipeline': 301,
85 | 'images360.pipelines.ImagePipeline': 302,
86 | }
87 |
88 | # Enable and configure the AutoThrottle extension (disabled by default)
89 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
90 | #AUTOTHROTTLE_ENABLED = True
91 | # The initial download delay
92 | #AUTOTHROTTLE_START_DELAY = 5
93 | # The maximum download delay to be set in case of high latencies
94 | #AUTOTHROTTLE_MAX_DELAY = 60
95 | # The average number of requests Scrapy should be sending in parallel to
96 | # each remote server
97 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
98 | # Enable showing throttling stats for every response received:
99 | #AUTOTHROTTLE_DEBUG = False
100 |
101 | # Enable and configure HTTP caching (disabled by default)
102 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
103 | #HTTPCACHE_ENABLED = True
104 | #HTTPCACHE_EXPIRATION_SECS = 0
105 | #HTTPCACHE_DIR = 'httpcache'
106 | #HTTPCACHE_IGNORE_HTTP_CODES = []
107 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
108 |
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/spiders/__pycache__/images.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/images360/images360/spiders/__pycache__/images.cpython-36.pyc
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/images360/spiders/images.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import scrapy
4 | from scrapy import Spider, Request
5 | from urllib.parse import urlencode
6 | from ..items import Images360Item
7 | import json
8 |
9 | class ImagesSpider(scrapy.Spider):
10 | name = 'images'
11 | allowed_domains = ['image.so.com']
12 |
13 | def start_requests(self):
14 | # GET请求参数
15 | data = {
16 | 'ch': 'photography',
17 | 'listtype': 'new',
18 | }
19 | base_url = 'https://image.so.com/zj?'
20 | for page in range(1, self.settings.get('MAX_PAGE') + 1):
21 | # 偏移量参数
22 | data['sn'] = page * 30
23 | params = urlencode(data)
24 | # 完整请求链接
25 | url = base_url + params
26 | yield Request(url, self.parse)
27 |
28 | def parse(self, response):
29 | result = json.loads(response.text)
30 | for image in result.get('list'):
31 | item = Images360Item()
32 | item['id'] = image.get('imageid')
33 | item['url'] = image.get('qhimg_url')
34 | item['title'] = image.get('group_title')
35 | item['thumb'] = image.get('qhimg_thumb_url')
36 | yield item
37 |
--------------------------------------------------------------------------------
/15-Scrapy_Images360/images360/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = images360.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = images360
12 |
--------------------------------------------------------------------------------
/15-Scrapy_Images360/screenshot/README.md:
--------------------------------------------------------------------------------
1 | ## GIF
2 | 
3 |
4 | ## MongoDB
5 | 
6 |
7 | ## Mysql
8 | 
9 |
10 | ## Images
11 | 
12 |
--------------------------------------------------------------------------------
/15-Scrapy_Images360/screenshot/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/demo.gif
--------------------------------------------------------------------------------
/15-Scrapy_Images360/screenshot/images.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/images.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/screenshot/mongodb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/mongodb.jpg
--------------------------------------------------------------------------------
/15-Scrapy_Images360/screenshot/mysql.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/15-Scrapy_Images360/screenshot/mysql.jpg
--------------------------------------------------------------------------------
/16-vczh/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/16-vczh/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/16-vczh/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/16-vczh/.idea/vczh.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/16-vczh/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = vczh.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = vczh
12 |
--------------------------------------------------------------------------------
/16-vczh/vczh/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__init__.py
--------------------------------------------------------------------------------
/16-vczh/vczh/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/__pycache__/sendemail.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/sendemail.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy import Item, Field
4 |
5 | class VczhItem(Item):
6 | table = 'followig'
7 | id = Field()
8 | avatar_url = Field()
9 | name = Field()
10 | gender = Field()
11 | headline = Field()
12 | person_url = Field()
13 | follower_count = Field()
14 | answer_count = Field()
15 | articles_count = Field()
--------------------------------------------------------------------------------
/16-vczh/vczh/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from scrapy.cmdline import execute
4 |
5 | execute('scrapy crawl vc'.split())
--------------------------------------------------------------------------------
/16-vczh/vczh/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy.downloadermiddlewares.retry import RetryMiddleware
4 | from scrapy.utils.response import response_status_message
5 | from fake_useragent import UserAgent
6 | import base64
7 | import logging
8 |
9 | class UAMiddleware(object):
10 | def __init__(self):
11 | self.ua = UserAgent()
12 |
13 | def process_request(self, request, spider):
14 | request.headers['User-Agent'] = self.ua.random  # pick a fresh random User-Agent for every request
15 |
16 |
17 | class ProxyMiddleware(object):
18 | def __init__(self, proxy_server, proxy_user, proxy_pass):
19 | self.proxy_server = proxy_server
20 | self.proxy_user = proxy_user
21 | self.proxy_pass = proxy_pass
22 | self.proxy_auth = "Basic " + base64.urlsafe_b64encode(bytes((self.proxy_user + ":" + self.proxy_pass), "ascii")).decode("utf8")
23 | self.logger = logging.getLogger(__name__)
24 |
25 | @classmethod
26 | def from_crawler(cls, crawler):
27 | return cls(
28 | proxy_server = crawler.settings.get('PROXY_SERVER'),
29 | proxy_user = crawler.settings.get('PROXY_USER'),
30 | proxy_pass = crawler.settings.get('PROXY_PASS')
31 | )
32 |
33 | def process_request(self, request, spider):
34 | request.meta["proxy"] = self.proxy_server
35 | request.headers["Proxy-Authorization"] = self.proxy_auth
36 |
37 | def process_response(self, request, response, spider):
38 | try:
39 | spider.crawler.stats.inc_value('normal_response')
40 | except Exception as e:
41 | self.logger.error('Response Error: {}'.format(e.args))
42 | return response
43 |
44 | def process_exception(self, request, exception, spider):
45 | pass
46 |
47 | class DownloadRetryMiddleware(RetryMiddleware):
48 | def process_response(self, request, response, spider):
49 | if response.status in self.retry_http_codes:
50 | reason = response_status_message(response.status)
51 | return self._retry(request, reason, spider) or response
52 | return response
53 |
54 | def process_exception(self, request, exception, spider):
55 | if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
56 | and not request.meta.get('dont_retry', False):
57 | return self._retry(request, exception, spider)
--------------------------------------------------------------------------------
/16-vczh/vczh/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from scrapy import Request
4 | from scrapy.exceptions import DropItem
5 | from scrapy.pipelines.images import ImagesPipeline
6 | import pymysql
7 | import logging
8 |
9 | # Running total of successfully downloaded images
10 | COUNT_IMAGES_NUMS = {'IMAGES_NUMS': 0}
11 |
12 | class MysqlPipeline(object):
13 | def __init__(self, host, database, user, password, port):
14 | self.host = host
15 | self.database = database
16 | self.user = user
17 | self.password = password
18 | self.port = port
19 | self.logger = logging.getLogger(__name__)
20 |
21 | @classmethod
22 | def from_crawler(cls, crawler):
23 | return cls(
24 | host=crawler.settings.get('MYSQL_HOST'),
25 | database=crawler.settings.get('MYSQL_DB'),
26 | user=crawler.settings.get('MYSQL_USER'),
27 | password=crawler.settings.get('MYSQL_PASSWORD'),
28 | port=crawler.settings.get('MYSQL_PORT')
29 | )
30 |
31 | def open_spider(self, spider):
32 | self.db = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, port=self.port)
33 | self.cursor = self.db.cursor()
34 |
35 | def process_item(self, item, spider):
36 | data = dict(item)
37 | keys = ', '.join(data.keys())
38 | values = ', '.join(['%s'] * len(data))
39 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (item.table, keys, values)
40 | try:
41 | self.cursor.execute(sql, tuple(data.values()))
42 | self.db.commit()
43 | # Increment the custom stats counter success_insertdb
44 | spider.crawler.stats.inc_value('success_insertdb')
45 | except Exception as e:
46 | self.logger.error('Error: {}'.format(e.args))
47 | self.db.rollback()
48 | return item
49 |
50 | def close_spider(self, spider):
51 | self.db.close()
52 |
53 |
54 | class ImagePipeline(ImagesPipeline):
55 |
56 | def file_path(self, request, response=None, info=None):
57 | url = request.url
58 | file_name = url.split('/')[-1]
59 | return file_name
60 |
61 | def item_completed(self, results, item, info):
62 | image_paths = [x['path'] for ok, x in results if ok]
63 | if not image_paths:
64 | raise DropItem('Image Downloaded Failed')
65 | else:
66 | COUNT_IMAGES_NUMS['IMAGES_NUMS'] += 1
67 | return item
68 |
69 | def get_media_requests(self, item, info):
70 | yield Request(item['avatar_url'])
--------------------------------------------------------------------------------
/16-vczh/vczh/sendemail.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import smtplib
4 | from email.mime.text import MIMEText
5 |
6 | class EmailSender(object):
7 | def __init__(self):
8 | # Sender's SMTP server
9 | self.smtp_host = 'smtp.163.com'
10 | # Sender address (also used to log in to the SMTP server)
11 | self.smtp_user = 'northxw@163.com'
12 | # SMTP authorization code
13 | self.smtp_authcode = '123456'
14 | # Default SMTP-over-SSL port is 465
15 | self.smtp_port = 465
16 | # Sender address
17 | self.sender = 'northxw@163.com'
18 |
19 | def sendEmail(self, recipient_list, email_subject, body):
20 | """
21 | 发送邮件
22 | :param recipient_list: 收件人列表
23 | :param email_subject: 邮件主题
24 | :param body: 邮件内容
25 | :return: None
26 | """
27 | # Message body, MIME subtype and charset
28 | message = MIMEText(_text=body, _subtype='plain', _charset='utf-8')
29 | # From
30 | message['From'] = self.sender
31 | # To
32 | message['To'] = ', '.join(recipient_list)
33 | # Subject
34 | message['Subject'] = email_subject
35 | try:
36 | # Create the SMTP_SSL client
37 | smtpSSLClient = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port)
38 | # Log in
39 | loginResult = smtpSSLClient.login(self.smtp_user, self.smtp_authcode)
40 | # loginRes = (235, b'Authentication successful')
41 | print("Login Result:LoginRes = {}".format(loginResult))
42 |
43 | if loginResult and loginResult[0] == 235:
44 | print("Successful login, Code = {}".format(loginResult[0]))
45 | smtpSSLClient.sendmail(self.sender, recipient_list, message.as_string())
46 | print("Successful delivery. Message:{}".format(message.as_string()))
47 | else:
48 | print("Login failed, Code = {}".format(str(loginResult[0])))
49 |
50 | except Exception as e:
51 | print("Failed to send, Exception: e={}".format(e))
52 |
--------------------------------------------------------------------------------
/16-vczh/vczh/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for vczh project
4 |
5 | import time
6 |
7 | BOT_NAME = 'vczh'
8 |
9 | SPIDER_MODULES = ['vczh.spiders']
10 | NEWSPIDER_MODULE = 'vczh.spiders'
11 |
12 | # Obey robots.txt rules
13 | ROBOTSTXT_OBEY = False
14 |
15 | # 0.3 second download delay
16 | DOWNLOAD_DELAY = 0.3
17 |
18 | #SPIDER_MIDDLEWARES = {
19 | # 'vczh.middlewares.VczhSpiderMiddleware': 543,
20 | #}
21 |
22 | DOWNLOADER_MIDDLEWARES = {
23 | 'vczh.middlewares.DownloadRetryMiddleware': 100,
24 | 'vczh.middlewares.UAMiddleware': 543,
25 | 'vczh.middlewares.ProxyMiddleware': 544,
26 | }
27 |
28 | ITEM_PIPELINES = {
29 | 'vczh.pipelines.ImagePipeline': 300,
30 | # 'vczh.pipelines.MongoPipeline': 301,
31 | 'vczh.pipelines.MysqlPipeline': 303,
32 | }
33 |
34 | # Maximum page number to crawl
35 | MAX_PAGE = 155
36 |
37 | # MYSQL SETTINGS
38 | MYSQL_HOST = 'localhost'
39 | MYSQL_USER = 'root'
40 | MYSQL_PASSWORD = '0513'
41 | MYSQL_DB = 'vczh'
42 | MYSQL_PORT = 3306
43 |
44 | # Proxy server
45 | PROXY_SERVER = "http://http-dyn.abuyun.com:9020"
46 | # Proxy tunnel authentication credentials
47 | PROXY_USER = "HR827T805WJ4667D"
48 | PROXY_PASS = "124D18494FF76D09"
49 |
50 | # Image storage directory
51 | IMAGES_STORE = './images'
52 |
53 | # Log file name: the timestamp keeps each run's log separate and records when it was generated
54 | LOG_FILE = './logs/{}.log'.format(str(time.strftime("%Y-%m-%d %H_%M_%S")))
55 | # Log encoding
56 | # LOG_ENCODING = 'utf-8'
57 | # Log level: DEBUG is the lowest level; with DEBUG everything gets logged, which makes troubleshooting harder
58 | LOG_LEVEL = 'WARNING'
59 |
60 | # Mail sender
61 | MAIL_FROM = 'northxw@163.com'
62 | # Mail server
63 | MAIL_HOST = 'smtp.163.com'
64 | # Port
65 | MAIL_PORT = 25
66 | # Sender account
67 | MAIL_USER = 'northxw@163.com'
68 | # Authorization code
69 | MAIL_PASS = 'authcode'
70 |
71 | # Recipient list
72 | RECEIVE_LIST = ['northxw@gmail.com', 'northxw@qq.com', 'northxw@sina.com']
73 | # Email subject ("spider status report")
74 | SUBJECT = '爬虫状态报告'
--------------------------------------------------------------------------------
/16-vczh/vczh/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/16-vczh/vczh/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/spiders/__pycache__/vc.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/spiders/__pycache__/vc.cpython-36.pyc
--------------------------------------------------------------------------------
/16-vczh/vczh/spiders/vc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import scrapy
5 | import time
6 | import logging
7 | from urllib.parse import urlencode
8 | from scrapy import Request
9 | from ..items import VczhItem
10 | from scrapy.mail import MailSender
11 | from ..pipelines import COUNT_IMAGES_NUMS
12 |
13 | class VcSpider(scrapy.Spider):
14 | start = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
15 | name = 'vc'
16 | allowed_domains = ['www.zhihu.com']
17 | base_url = 'https://www.zhihu.com/api/v4/members/excited-vczh/followees?'
18 | logger = logging.getLogger(__name__)
19 |
20 | def start_requests(self):
21 | data = {
22 | 'include': 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics',
23 | 'limit': 20
24 | }
25 | for page in range(1, self.settings.get('MAX_PAGE') + 1):
26 | data['offset'] = page * 20
27 | params = urlencode(data)
28 | url = self.base_url + params
29 | yield Request(url, callback=self.parse, errback=self.error_back)
30 |
31 | def parse(self, response):
32 | result = json.loads(response.text)
33 | for data_ in result.get('data'):
34 | item = VczhItem()
35 | item['id'] = data_.get('id')
36 | item['avatar_url'] = data_.get('avatar_url').replace('_is', '')
37 | item['name'] = data_.get('name')
38 | item['gender'] = data_.get('gender')
39 | item['headline'] = data_.get('headline')
40 | item['person_url'] = data_.get('url')  # no trailing comma here, otherwise the value becomes a tuple
41 | item['follower_count'] = data_.get('follower_count')
42 | item['answer_count'] = data_.get('answer_count')
43 | item['articles_count'] = data_.get('articles_count')
44 | yield item
45 |
46 |
47 | def closed(self, reason):
48 | """
49 | 爬虫关闭发送通知邮件
50 | """
51 | # 爬虫完成时间
52 | fnished = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
53 | # 创建邮件发送对象
54 | mail = MailSender.from_settings(self.settings)
55 | # 邮件内容
56 | body = "爬虫名称: {}\n\n 开始时间: {}\n\n 请求成功总量:{}\n 图片下载总量:{}\n 数据库存储总量:{}\n\n 结束时间 : {}\n".format(
57 | '知乎轮子哥粉丝爬虫',
58 | str(self.start),
59 | str(self.crawler.stats.get_value("normal_response")),
60 | str(COUNT_IMAGES_NUMS['IMAGES_NUMS']),
61 | str(self.crawler.stats.get_value("success_insertdb")),
62 | str(str(fnished)))
63 | # 发送邮件
64 | mail.send(to=self.settings.get('RECEIVE_LIST'), subject=self.settings.get('SUBJECT'), body=body)
65 |
66 | def error_back(self, e):
67 | _ = self
68 | # Log the error
69 | self.logger.error('Error: {}'.format(e.reason))
--------------------------------------------------------------------------------
/16-vczh/vczh/utils/db_follower.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/db_follower.png
--------------------------------------------------------------------------------
/16-vczh/vczh/utils/email.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/email.png
--------------------------------------------------------------------------------
/16-vczh/vczh/utils/followers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/followers.png
--------------------------------------------------------------------------------
/16-vczh/vczh/utils/huaji.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/huaji.png
--------------------------------------------------------------------------------
/16-vczh/vczh/utils/log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/16-vczh/vczh/utils/log.png
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__init__.py
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class City58XiaoQu(scrapy.Item):
12 | """
13 | 小区详情页数据
14 | """
15 | id = scrapy.Field()
16 | name = scrapy.Field()
17 | location = scrapy.Field()
18 | price = scrapy.Field()
19 | address = scrapy.Field()
20 | times = scrapy.Field()
21 |
22 | class City58ItemChuZuInfo(scrapy.Item):
23 | """
24 | 小区出租房页数据
25 | """
26 | id = scrapy.Field() # 关联小区信息
27 | name = scrapy.Field()
28 | zu_price = scrapy.Field()
29 | mianji = scrapy.Field()
30 | type = scrapy.Field()
31 | chuzu_price_pre = scrapy.Field() # 每平米的房价
32 | url = scrapy.Field() # 出租房页面的唯一ID
33 | price_pre = scrapy.Field() # 存储每个出租房的每平米房价
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from scrapy.cmdline import execute
4 |
5 | execute("scrapy crawl 58".split())
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | from .utils.api import get_ip_port
10 |
11 | class ProxyMiddleware(object):
12 |
13 | def process_request(self, request, spider):
14 | # Fetch a premium proxy (replace this with the extraction URL generated from your own purchased API)
15 | proxy = get_ip_port('http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=***************69b51b303859ac446&orderno=*********************&returnType=2&count=1')
16 | # Attach the proxy to the request
17 | request.meta['proxy'] = proxy
18 |
19 | def process_response(self, request, response, spider):
20 | return response
21 |
22 | def process_exception(self, request, exception, spider):
23 | pass
24 |
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from scrapy.exceptions import DropItem
4 | from pymongo import MongoClient
5 | from pymongo.errors import DuplicateKeyError
6 | from traceback import format_exc
7 | from .items import City58XiaoQu, City58ItemChuZuInfo
8 |
9 |
10 |
11 | class City58Pipeline(object):
12 |
13 | def __init__(self, mongo_uri, mongo_db):
14 | self.mongo_uri = mongo_uri
15 | self.mongo_db = mongo_db
16 | self.client = None
17 | self.db = None
18 |
19 | @classmethod
20 | def from_crawler(cls, crawler):
21 | return cls(
22 | mongo_uri=crawler.settings.get('MONGODB_URI'),
23 | mongo_db=crawler.settings.get('MONGODB_DATABASE', 'items')
24 | )
25 |
26 | def open_spider(self, spider):
27 | _ = spider
28 | self.client = MongoClient(self.mongo_uri)
29 | self.db = self.client[self.mongo_db]
30 | self.db['city58_info'].create_index('id', unique=True) # unique index on id in the city58_info collection
31 | self.db['city58_chuzu_info'].create_index('url', unique=True) # unique index on url in the city58_chuzu_info collection
32 |
33 | def close_spider(self, spider):
34 | _ = spider
35 | self.client.close()
36 |
37 | def process_item(self, item, spider):
38 | try:
39 | if isinstance(item, City58XiaoQu): # community item
40 | self.db['city58_info'].update_one({'id': item['id']}, {'$set': dict(item)}, upsert=True) # upsert keyed on id: update if it exists, insert otherwise
41 | elif isinstance(item, City58ItemChuZuInfo): # rental item
42 | try:
43 | fangjia = HandleFangjiaPipline.price_per_square_meter_dict[item['id']] # average price per square meter computed by HandleFangjiaPipline for this community
44 | # del HandleFangjiaPipline.price_per_square_meter_dict[item['id']]
45 | item['price_pre'] = fangjia
46 |
47 | self.db['city58_chuzu_info'].update_one({'url': item['url']}, {'$set': dict(item)}, upsert=True) # upsert keyed on url
48 | except Exception as e:
49 | print(e)
50 |
51 | except DuplicateKeyError:
52 | spider.logger.debug(' duplicate key error collection') # unique key conflict: the record already exists
53 | except Exception as e:
54 | _ = e
55 | spider.logger.error(format_exc())
56 | return item
57 |
58 |
59 | class HandleZuFangPipline(object):
60 |
61 | def process_item(self, item, spider):
62 | _ = spider, self
63 | # self.db[self.collection_name].insert_one(dict(item))
64 | # Only handle rental items (City58ItemChuZuInfo) that carry a floor area
65 | if isinstance(item, City58ItemChuZuInfo) and 'mianji' in item:
66 | item['chuzu_price_pre'] = int(item['zu_price']) / int(item['mianji']) # rent divided by area gives rent per square meter
67 | return item
68 |
69 |
70 | class HandleFangjiaPipline(object):
71 |
72 | price_per_square_meter_dict = dict()
73 |
74 | def process_item(self, item, spider):
75 | _ = spider
76 |
77 | # Only handle the plain dicts carrying a price_list (yielded by the ershoufang listing callback)
78 | if isinstance(item, dict) and 'price_list' in item:
79 | item['price_list'] = [int(i) for i in item['price_list']]
80 | if item['price_list']:
81 | self.price_per_square_meter_dict[item['id']] = sum(item['price_list']) / len(item['price_list']) # average second-hand price per square meter for this community
82 | else:
83 | self.price_per_square_meter_dict[item['id']] = 0
84 | raise DropItem()
85 | return item
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for City_58 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'City_58'
13 |
14 | SPIDER_MODULES = ['City_58.spiders']
15 | NEWSPIDER_MODULE = 'City_58.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'City_58 (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 0.3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'City_58.middlewares.City58SpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | 'City_58.middlewares.ProxyMiddleware': 500,
57 | }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'City_58.pipelines.HandleFangjiaPipline': 300, # average price per square meter
69 | 'City_58.pipelines.HandleZuFangPipline': 310,
70 | 'City_58.pipelines.City58Pipeline': 320
71 | }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
94 | # 58.com city subdomains to crawl (test: Chengdu, "cd")
95 | HOST = ['cd']
96 |
97 | # District codes (test: Tianfu New Area)
98 | AREA_CODE = ['21611']
99 |
100 | # MongoDB configuration
101 | MONGODB_HOST = '127.0.0.1'
102 | MONGODB_PORT = '27017'
103 | MONGODB_URI = 'mongodb://{}:{}'.format(MONGODB_HOST, MONGODB_PORT)
104 | MONGODB_DATABASE = '58'
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/spiders/58.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import scrapy
4 | from scrapy import Request
5 | from traceback import format_exc
6 | from ..items import City58XiaoQu, City58ItemChuZuInfo
7 | from ..utils.parse import parse_xiaoqu, parse_xiaoqu_detail, \
8 | get_ershoufang_list_page, get_chuzu_detail_page_list_url, get_chuzu_house_info
9 |
10 | class A58Spider(scrapy.Spider):
11 | name = '58'
12 | allowed_domains = ['58.com']
13 | base_url = 'https://{}.58.com/xiaoqu/{}/'
14 |
15 | def start_requests(self):
16 | # Build the community listing URL for every configured city and district code
17 | for host in self.settings.get('HOST'):
18 | for code in self.settings.get('AREA_CODE'):
19 | url = self.base_url.format(host, code)
20 | self.logger.debug(url)
21 | yield Request(url=url, callback=self.parse)
22 |
23 | def parse(self, response):
24 | # Extract the community detail page URLs from the listing page
25 | xiaoqu_url_list = parse_xiaoqu(response)
26 | for xiaoqu_url in xiaoqu_url_list:
27 | yield Request(xiaoqu_url, callback=self.xiaoqu_detail_page, errback=self.error_back)
28 |
29 | def xiaoqu_detail_page(self, response):
30 | # Parse the community detail page
31 | xiaoqu_detail_data = parse_xiaoqu_detail(response)
32 | item = City58XiaoQu()
33 | item.update(xiaoqu_detail_data)
34 | item['id'] = response.url
35 | self.logger.debug(item)
36 | yield item
37 |
38 | # Second-hand listings of this community
39 | ershoufang_url = item['id'].rstrip('/') + '/ershoufang/' # full URL of the community's second-hand listings page
40 | yield Request(url=ershoufang_url, callback=self.ershoufang_list_page,
41 | errback=self.error_back, meta={'id': item['id']})
42 |
43 | # Rental listings of this community
44 | chuzufang_url = item['id'].rstrip('/') + '/chuzu/' # full URL of the community's rental listings page
45 | yield Request(url=chuzufang_url, callback=self.chuzufang_detail_page_url_list,
46 | errback=self.error_back, meta={'id': item['id']})
47 |
48 | def ershoufang_list_page(self, response):
49 | # Style convention: bind self even though it is unused
50 | _ = self
51 | # Extract every listed price on the second-hand page
52 | price_list = get_ershoufang_list_page(response)
53 | yield {'id': response.meta['id'], 'price_list': price_list} # only used to compute the community's average price, not stored
54 |
55 | # Pagination (not implemented)
56 |
57 | def chuzufang_detail_page_url_list(self, response):
58 | # Style convention: bind self even though it is unused
59 | _ = self
60 | # Extract every rental detail page URL from the listing page
61 | chuzufang_detail_url = get_chuzu_detail_page_list_url(response)
62 | for url in chuzufang_detail_url:
63 | yield Request(url=url, callback=self.chuzufang_detail_page,
64 | errback=self.error_back, meta={'id': response.meta['id']})
65 |
66 | # Pagination (not implemented)
67 |
68 | def chuzufang_detail_page(self, response):
69 | # Style convention: bind self even though it is unused
70 | _ = self
71 | # Extract the rental detail data (note: as of 2018/11/24 the page has used font-based anti-scraping since at least September 2018; the parsing already counters it)
72 | chuzufang_data = get_chuzu_house_info(response)
73 | item = City58ItemChuZuInfo()
74 | item.update(chuzufang_data)
75 | item['id'] = response.meta['id']
76 | item['url'] = response.url
77 | yield item
78 |
79 | def error_back(self, e):
80 | _ = e
81 | # Log the stack trace
82 | self.logger.debug(format_exc())
83 | pass
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/spiders/__pycache__/58.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/spiders/__pycache__/58.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__init__.py
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/__pycache__/api.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/api.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/__pycache__/parse.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/parse.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/__pycache__/proxy.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/proxy.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/__pycache__/xdaili.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/City_58/City_58/utils/__pycache__/xdaili.cpython-36.pyc
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/api.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import requests
4 | import json
5 | import time
6 | from fake_useragent import UserAgent
7 | import logging
8 |
9 | def get_ip_port(url):
10 | """
11 | 获取API返回的JSON数据
12 | :param url: 代理API
13 | :return: 有效IP
14 | """
15 | time.sleep(1)
16 | response = requests.get(url)
17 | response = json.loads(response.text)
18 | result = response['RESULT']
19 | agent = ''
20 | for i in range(len(result)):
21 | agent = 'https://{}:{}/'.format(result[i]['ip'], result[i]['port'])
22 | logging.debug(agent)
23 | return agent
24 |
25 | if __name__ == '__main__':
26 | # Test - I purchased Xdaili's "premium proxy" plan and extract IPs via the API-generated extraction URL. Verified working!
27 | url = ''
28 | agent = get_ip_port(url=url)
29 |
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | 轻量级ip代理池, 以89代理网站为例, ip成活率较低, 可做测试使用.
5 | """
6 |
7 | import requests
8 | from pyquery import PyQuery
9 | from fake_useragent import UserAgent
10 | import random
11 |
12 | def get_ip_port(page):
13 | """
14 | 获取网页的ip和port
15 | :param page: 页码
16 | :return: 随机ip
17 | """
18 | # 请求头(根据需要另行设置)
19 | headers = dict()
20 | # 代理池
21 | agents = list()
22 | for i in range(page):
23 | url = 'http://www.89ip.cn/index_{}.html'.format(i+1) # 格式化请求链接
24 | response = requests.get(url) # 获取网页内容
25 |
26 | if response.status_code == 200:
27 | jpy = PyQuery(response.text)
28 | tr_list = jpy('div.layui-form > table > tbody > tr').items()
29 | for tr in tr_list:
30 | ip = tr('td:nth-child(1)').text()
31 | port = tr('td:nth-child(2)').text()
32 | agent = 'http://{}:{}'.format(ip, port) # build the proxy address
33 | agents.append(agent) # add it to the pool
34 | else:
35 | print('The status code is {}, try again!'.format(response.status_code))
36 |
37 | # Validate the scraped proxies and return a random working one
38 | return random.choices(test_agent(agents))[0]
39 |
40 | def test_agent(agents):
41 | """
42 | 针对58同城测试获取的免费代理
43 | :param agents: 代理池
44 | :return: 有效的代理
45 | """
46 | agents_copy = agents
47 | for agent in agents_copy:
48 | try:
49 | res = requests.get('https://cd.58.com/', proxy=agent)
50 | except Exception as e:
51 | agents.remove(agent)
52 | continue
53 | return agents
54 |
55 | if __name__ == '__main__':
56 | print(get_ip_port(random.randint(2, 4)))
--------------------------------------------------------------------------------
/17-City_58/City_58/City_58/utils/xdaili.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | 提示:讯代理的Python3接入文档需要稍作修改,方能使用.
5 | """
6 |
7 | import sys
8 | import time
9 | import hashlib
10 | import requests
11 | # import grequests
12 | from lxml import etree
13 |
14 | class Xdaili(object):
15 | def __init__(self):
16 | # Replace the order number and secret below with your own.
17 | self.orderno = 'ZF201812********************'
18 | self.secret = 'ddde303a6*******************'
19 | self.ip = "forward.xdaili.cn"
20 | self.port = '80'
21 | self.ip_port = self.ip + ":" + self.port
22 |
23 | def proxy(self):
24 | # Timestamp
25 | timestamp = str(int(time.time()))
26 | # Signature string
27 | string = "orderno=" + self.orderno + "," + "secret=" + self.secret + "," + "timestamp=" + timestamp
28 | # Python 3 needs bytes before hashing
29 | string = string.encode()
30 | # Compute the MD5 digest
31 | md5_string = hashlib.md5(string).hexdigest()
32 | # Uppercase it to get the sign
33 | sign = md5_string.upper()
34 | # auth
35 | auth = "sign=" + sign + "&" + "orderno=" + self.orderno + "&" + "timestamp=" + timestamp
36 | proxy = {
37 | "http": "http://" + self.ip_port,
38 | "https": "https://" + self.ip_port
39 | }
40 | return [auth, proxy]
41 |
--------------------------------------------------------------------------------
/17-City_58/City_58/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = City_58.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = City_58
12 |
--------------------------------------------------------------------------------
/17-City_58/README.md:
--------------------------------------------------------------------------------
1 | ## Scrapy 58 City
2 | A hands-on Scrapy project: crawl housing data for residential communities on 58.com and store it in MongoDB (a minimal storage sketch follows this file listing).
3 |
4 | ## Tip
5 | This project combines quite a few techniques, so feel free to pick the parts that interest you. If anything in the code is unclear, open an issue or reach out by email. Good luck!
6 |
7 | ## Demo
8 | 
9 |
--------------------------------------------------------------------------------
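The storage step described in the README above is essentially a unique index plus an upsert, which is what City58Pipeline (listed earlier) does. The sketch below is a standalone illustration of that pattern only; the sample document values are made up, and it assumes a local MongoDB on the default port.

```python
# Standalone sketch of the "unique index + upsert" storage pattern used by
# City58Pipeline. The database name mirrors the project's settings
# (MONGODB_DATABASE = '58'); the sample document values are made up.
from pymongo import MongoClient

client = MongoClient('mongodb://127.0.0.1:27017')
db = client['58']
collection = db['city58_info']

# A unique index on 'id' guarantees one document per community
collection.create_index('id', unique=True)

item = {'id': 'https://cd.58.com/xiaoqu/xxxx/', 'name': 'demo community', 'price': '12000'}
# Upsert: update the existing document if the id is already stored, insert otherwise
collection.update_one({'id': item['id']}, {'$set': item}, upsert=True)
```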
/17-City_58/screenshot/monogdb.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/screenshot/monogdb.jpg
--------------------------------------------------------------------------------
/17-City_58/screenshot/run_01.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/screenshot/run_01.jpg
--------------------------------------------------------------------------------
/17-City_58/screenshot/run_02.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/17-City_58/screenshot/run_02.jpg
--------------------------------------------------------------------------------
/18-36kr/.idea/36kr.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/18-36kr/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/18-36kr/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/18-36kr/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/18-36kr/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/18-36kr/36kr.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Created at 21:04 on March 12, 2019
5 | @title: Crawl the latest 36Kr articles and save them to MySQL
6 | @author: Northxw
7 | """
8 |
9 | from tqdm import tqdm
10 | from colorama import init, Fore
11 | from icon.word import show
12 | from fake_useragent import UserAgent
13 | from requests.exceptions import RequestException
14 | import requests
15 | import pymysql
16 | import time
17 | import re
18 |
19 | init(autoreset=True)
20 |
21 | def connect_db():
22 | """
23 | Connect to the MySQL database
24 | :return: db connection
25 | """
26 | db = pymysql.connect(host='localhost', user='root', password='******', port=3306, db='36kr')
27 | # print('Database connected!')
28 | return db
29 |
30 | def get_one_page(page):
31 | """
32 | Fetch one page of the latest-articles JSON
33 | :param page: page number
34 | :return: the "items" list, or None on failure
35 | """
36 | # The real API request found via the Network panel
37 | url = 'https://36kr.com/api/search-column/mainsite?per_page=20&page={}'.format(str(page))
38 | # Request headers
39 | headers = {
40 | 'User-Agent': UserAgent().random,
41 | 'Referer': 'https://36kr.com/',
42 | 'Host': '36kr.com'
43 | }
44 | # Fetch and decode the response
45 | try:
46 | response = requests.get(url, headers=headers)
47 | if response.status_code == 200:
48 | items = response.json()['data']['items']
49 | return items
50 | return None
51 | except RequestException:
52 | return None
53 |
54 | def parse_one_page(items):
55 | """
56 | 解析获取的JSON数据
57 | :param items: 获取的JSON数据段items
58 | :return: dict
59 | """
60 | # 存储单页总数据
61 | datas = list()
62 | for item in items:
63 | data= {
64 | # 文章ID
65 | 'id': str(item['id']),
66 | # 标题
67 | 'title': item['title'],
68 | # 类别
69 | 'column_name': item['column_name'],
70 | # id
71 | 'column_id': item['column_id'],
72 | # 封面图片链接
73 | 'cover': item['cover'],
74 | # 发布时间
75 | 'publish_time': item['published_at'] ,
76 | # 文章总结
77 | 'summary': item['summary']
78 | }
79 | # 处理时间
80 | data['publish_time'] = re.search('(.*?)T(.*?)\+.*', data['publish_time']).group(1) + ' ' + re.search('(.*?)T(.*?)\+.*', data['publish_time']).group(2)
81 | # 存储
82 | datas.append(data)
83 | # 将标题写入文件.制作中文词云
84 | with open('./icon/36kr.txt', 'a', encoding='utf-8') as f:
85 | f.write(data['title'])
86 | return datas
87 |
88 | def save_to_mysql(datas):
89 | """
90 | 将解析数据存储到Mysql数据库
91 | :param item: 获取的单页有效数据
92 | :return: None
93 | """
94 | # 连接数据库
95 | db = connect_db()
96 | # 获得Mysql操作指针
97 | cursor = db.cursor()
98 | # sql
99 | sql = "INSERT INTO kr(id, article_title, colum_name, colum_id, cover, publish_time, summary) " \
100 | "VALUES(%s, %s, %s, %s, %s, %s, %s)"
101 | for _item in datas:
102 | try:
103 | # Insert one record
104 | cursor.execute(sql, (_item['id'], _item['title'], _item['column_name'],
105 | _item['column_id'], _item['cover'], _item['publish_time'], _item['summary']))
106 | # Commit
107 | db.commit()
108 | # print('Insert succeeded!')
109 | except Exception as e:
110 | # print('Insert failed!', e)
111 | db.rollback()
112 | # Close the database connection
113 | db.close()
114 |
115 | def main():
116 | """
117 | 主函数
118 | :return: None
119 | """
120 | print(Fore.RED + '提示:截止目前的总数据量是77998条, 测试仅抓取前10页的共200条数据!\n')
121 | for i in tqdm(range(10), desc='抓取进度'):
122 | # Fetch
123 | items = get_one_page(i+1)
124 | # Parse
125 | data = parse_one_page(items)
126 | # Save
127 | save_to_mysql(data)
128 | time.sleep(1)
129 |
130 | if __name__ == '__main__':
131 | main()
132 |
--------------------------------------------------------------------------------
/18-36kr/README.md:
--------------------------------------------------------------------------------
1 | ## Spider 36kr
2 | Crawl the latest articles from 36Kr, store them in MySQL and build a Chinese word cloud. The crawled fields include the article ID, title, cover image URL, publish time, column name and more.
3 |
4 | ## Explain
5 | First, confirm that 36Kr's news feed is loaded via JS. Then open Chrome DevTools, switch to the Network panel and find the real request URL. Finally, write the code that crawls the article data (a minimal fetch sketch follows this file listing).
6 |
7 | Note: the last numeric parameter of the real request URL is a timestamp; the data is still returned normally after dropping it.
8 |
9 | ## Demo
10 | 
11 |
12 | 
13 |
--------------------------------------------------------------------------------
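The Explain section above boils down to calling the JSON list API directly. Below is a minimal sketch of that call, reusing the endpoint and fields already present in 36kr.py; the live API may have changed since the project was written, so treat it as illustrative only.

```python
# Minimal sketch of fetching one page of the 36Kr list API found via the
# Network panel (same endpoint and fields as 36kr.py; the API may have changed).
import requests

def fetch_latest(page=1):
    url = 'https://36kr.com/api/search-column/mainsite?per_page=20&page={}'.format(page)
    headers = {'Referer': 'https://36kr.com/', 'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    # The useful records live under data -> items
    items = resp.json()['data']['items']
    return [{'id': it['id'], 'title': it['title'], 'published_at': it['published_at']}
            for it in items]

if __name__ == '__main__':
    for article in fetch_latest(1):
        print(article['id'], article['title'])
```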
/18-36kr/utils/FZSTK.TTF:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/FZSTK.TTF
--------------------------------------------------------------------------------
/18-36kr/utils/__pycache__/word.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/__pycache__/word.cpython-36.pyc
--------------------------------------------------------------------------------
/18-36kr/utils/cloud.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/cloud.jpg
--------------------------------------------------------------------------------
/18-36kr/utils/db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/db.png
--------------------------------------------------------------------------------
/18-36kr/utils/show.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/18-36kr/utils/show.jpg
--------------------------------------------------------------------------------
/18-36kr/utils/word.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from wordcloud import WordCloud
4 | import cv2
5 | import jieba
6 | import matplotlib.pyplot as plt
7 |
8 | def show():
9 | """
10 | 根据文章标题,制作中文词云
11 | :return: None
12 | """
13 | # 文本
14 | with open('36kr.txt', 'r', encoding='utf-8') as f:
15 | text = f.read()
16 | cut_text = " ".join(jieba.cut(text))
17 | color_mask = cv2.imread('show.jpg')
18 | cloud = WordCloud(
19 | # Font path; a CJK font is required, otherwise the words render garbled
20 | font_path = "./FZSTK.TTF",
21 | # Background color
22 | background_color = 'white',
23 | # Cloud shape mask
24 | mask = color_mask,
25 | # Maximum number of words
26 | max_words = 2000,
27 | # Maximum font size
28 | max_font_size = 40
29 | )
30 | wCloud = cloud.generate(cut_text)
31 | wCloud.to_file('cloud.jpg')
32 |
33 | plt.imshow(wCloud, interpolation='bilinear')
34 | plt.axis('off')
35 | plt.show()
36 |
37 | if __name__ == '__main__':
38 | show()
--------------------------------------------------------------------------------
/19-Youku_DanMu/.idea/Youku_DanMu.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/19-Youku_DanMu/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/19-Youku_DanMu/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/19-Youku_DanMu/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/19-Youku_DanMu/README.md:
--------------------------------------------------------------------------------
1 | ## Youku DanMu
2 | **Danmu crawling 01** - Crawl the bullet-comment (danmu) data of "Dying to Survive" (我不是药神) from the Youku web player and build a word cloud.
3 |
4 | ## Explain
5 | First, play the movie with Chrome DevTools open on the Network panel. Drag the progress bar step by step and observe how the player talks to the server, as shown below:
6 |
7 |
8 | Then confirm that the danmu data is loaded by JS in real time rather than via XHR. Note that the danmu response is not well-formed JSON (a small parsing sketch follows this file listing). As shown below:
9 |
10 |
11 | ## Other
12 | 1. The last parameter of the request URL looks like a timestamp; dropping it does not affect the returned data.
13 | 2. Do not build the GET URL with urllib.parse.urlencode(), otherwise the returned data is empty (verified).
14 |
15 | ## Demo
16 |
17 |
18 | The word cloud shows that keywords such as "membership, movie ticket, five-star recommendation, Wang Chuanjun, cancer..." stand out the most.
19 |
--------------------------------------------------------------------------------
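Because the danmu response is JSONP-style text rather than plain JSON (as noted in the README above), an alternative to the regex used in danmu.py below is to strip the callback wrapper and parse the payload as JSON. The 'result' and 'content' field names in this sketch are assumptions based on the observed response.

```python
# Sketch: strip the "jQuery...(...)" JSONP wrapper described above and parse the
# payload as JSON. The 'result' and 'content' keys are assumptions; danmu.py
# itself extracts 'content' with a regex instead.
import json

def parse_danmu(jsonp_text):
    start = jsonp_text.find('(')
    end = jsonp_text.rfind(')')
    if start == -1 or end <= start:
        return []
    payload = json.loads(jsonp_text[start + 1:end])
    return [item.get('content', '') for item in payload.get('result', [])]

if __name__ == '__main__':
    sample = 'jQuery111207_1552483671572({"result": [{"content": "五星力荐"}]})'
    print(parse_danmu(sample))
```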
/19-Youku_DanMu/danmu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Created at 22:08 on March 13, 2019
5 | @title: Crawl the danmu of "Dying to Survive" on Youku and build a word cloud
6 | @author: Northxw
7 | """
8 |
9 | from fake_useragent import UserAgent
10 | from requests.exceptions import RequestException
11 | from tqdm import tqdm
12 | import requests
13 | import time
14 | import os
15 | import re
16 |
17 | def get_data(mat):
18 | """
19 | 循环遍历爬取弹幕数据
20 | :param mat: 偏移量
21 | :return: list
22 | """
23 | # 请求链接
24 | url = 'https://service.danmu.youku.com/list?jsoncallback=jQuery111207035726936412456_1552483671572&mat={}&mcount=1&ct=1001&iid=959955945&aid=333822&cid=96&lid=0&ouid=0'.format(mat)
25 | # headers
26 | headers = {
27 | 'Referer': 'https://v.youku.com/v_show/id_XMzgzOTgyMzc4MA==.html?spm=a2h0k.11417342.soresults.dplaybutton&s=c6c62a475a5d4a14ab48',
28 | 'User-Agent': UserAgent().random
29 | }
30 | """
31 | # Query parameters
32 | params = {
33 | 'jsoncallback': 'jQuery11120003560802190473389_1552479833762',
34 | 'mat': mat,
35 | 'mcount': '1',
36 | 'ct': '1001',
37 | 'id': '959955945',
38 | 'aid': '333822',
39 | 'cid': '96',
40 | 'lid': '0',
41 | 'ouid': '0'
42 | # '_': '1552479833815' note: looks like a timestamp; dropping it does not affect the data
43 | }
44 | """
45 | # Fetch the danmu
46 | try:
47 | response = requests.get(url, headers=headers)
48 | print(response)
49 | if response.status_code == 200:
50 | html = response.text
51 | # Parse with a regex (the result is a list)
52 | results = re.findall(',\"content\":\"(.*?)\",', html, re.S)
53 | # Save the text
54 | save_path = './utils/danmu.txt'
55 | if not os.path.exists('./utils'): # create the storage directory if it does not exist
56 | os.mkdir('./utils')
57 | with open(save_path, 'a', encoding='utf-8') as f:
58 | f.write(str(results))
59 | return results
60 | return None
61 | except RequestException as e:
62 | print('Error: ', e.args)
63 | return None
64 |
65 | if __name__ == '__main__':
66 | for i in tqdm(range(10), desc='Progress'):
67 | time.sleep(1)
68 | get_data(str(i))
69 |
--------------------------------------------------------------------------------
/19-Youku_DanMu/utils/FZSTK.TTF:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/FZSTK.TTF
--------------------------------------------------------------------------------
/19-Youku_DanMu/utils/cloud.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/cloud.jpg
--------------------------------------------------------------------------------
/19-Youku_DanMu/utils/require/danmu_content.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/require/danmu_content.png
--------------------------------------------------------------------------------
/19-Youku_DanMu/utils/require/danmu_json.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/require/danmu_json.png
--------------------------------------------------------------------------------
/19-Youku_DanMu/utils/show.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/19-Youku_DanMu/utils/show.jpg
--------------------------------------------------------------------------------
/19-Youku_DanMu/utils/word.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from wordcloud import WordCloud
4 | import cv2
5 | import jieba
6 | import matplotlib.pyplot as plt
7 |
8 | def show():
9 | # Source text
10 | with open('danmu.txt', 'r', encoding='utf-8') as f:
11 | text = f.read()
12 | cut_text = " ".join(jieba.cut(text))
13 | color_mask = cv2.imread('show.jpg')
14 | cloud = WordCloud(
15 | # Font path; a CJK font is required, otherwise the words render garbled
16 | font_path = "./FZSTK.TTF",
17 | # Background color
18 | background_color = 'white',
19 | # Cloud shape mask
20 | mask = color_mask,
21 | # Maximum number of words
22 | max_words = 2000,
23 | # Maximum font size
24 | max_font_size = 40
25 | )
26 | wCloud = cloud.generate(cut_text)
27 | wCloud.to_file('cloud.jpg')
28 | plt.imshow(wCloud, interpolation='bilinear')
29 | plt.axis('off')
30 | plt.show()
31 |
32 | if __name__ == '__main__':
33 | show()
--------------------------------------------------------------------------------
/20-Selenium_163/.idea/20-Selenium_163Email.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/20-Selenium_163/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/20-Selenium_163/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/20-Selenium_163/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/20-Selenium_163/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/20-Selenium_163/README.md:
--------------------------------------------------------------------------------
1 | # Project name
2 | Log in to NetEase 163 Mail with Selenium and send an SOS email.
3 |
4 | # Sort
5 | - **Handling iframe sub-pages** - 163 Mail's multi-iframe layout is good practice for switching into and out of sub-pages.
6 |
7 | - **Simulated login** - Selenium
8 |
9 | # Install
10 | **1. Selenium** - An older release of the Python Selenium package is recommended; the newer releases did not work with Chrome for this script.
11 | ```
12 | pip3 install selenium==2.48.0
13 | ```
14 | **2. chromedriver.exe** - Download from http://npm.taobao.org/mirrors/chromedriver/ (the version must match your Chrome), then put the .exe under the "..Python\Python36\Scripts" directory.
15 |
16 | **3. pymysql**
17 | ```
18 | pip3 install pymysql
19 | ```
20 |
21 | # Process analysis
22 | **1. Login-page iframe**
23 | The iframe id carries a timestamp, so grabbing it by id is awkward; locate the node with an XPath or CSS selector instead. See the figure:
24 |
25 |
26 | 
27 |
28 | **2. The "Compose" node**
29 | Locate the compose button on its li node, not on the span child, otherwise the element is not found. When grabbing an app-level node, pick the one a level smaller.
30 |
31 | **3. Mail subject**
32 | The subject node is not interactable and cannot receive text, so it is simply left unset here.
33 |
34 | **4. Mail body**
35 | The body text box sits inside an iframe, so switch frames before typing; the iframe can be located by its class and switched into directly (a minimal sketch follows at the end of this section). See the figure:
36 |
37 | 
38 |
39 | **5. The "Send" node**
40 | Typing the body switches into a child frame, so switch back to the parent frame before clicking Send.
41 |
42 | **6. Login throttling**
43 | Do not run the Selenium login too often, or a click-captcha appears. It could certainly be cracked, but 163 is fairly lenient: normal access resumes after a short while and the IP is not banned.
44 |
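A minimal sketch of the frame handling described above, assuming the driver already sits on the 163 Mail page; the selectors are illustrative stand-ins, not copied from the live page:

```Python
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://mail.163.com/')

# The login iframe id carries a timestamp, so match it by a stable prefix (illustrative)
login_frame = driver.find_element_by_css_selector('iframe[id^="x-URS-iframe"]')
driver.switch_to.frame(login_frame)
# ... fill in the account and password, submit, then leave the iframe ...
driver.switch_to.default_content()

# The mail body editor is another iframe, located here by a class name (illustrative)
body_frame = driver.find_element_by_class_name('APP-editor-iframe')
driver.switch_to.frame(body_frame)
driver.find_element_by_css_selector('body').send_keys('SOS')

# Back to the parent frame before clicking "Send"
driver.switch_to.parent_frame()
```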
45 | # Other
46 | The commented-out code is a feature kept for later: collect the useful fields of every mail (sender, received time, body summary) and save them to a database. Node extraction ran into problems, so it stays commented out for now.
47 |
48 | # Demo
49 | 
50 |
--------------------------------------------------------------------------------
/20-Selenium_163/require/content_frame.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/require/content_frame.png
--------------------------------------------------------------------------------
/20-Selenium_163/require/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/require/demo.gif
--------------------------------------------------------------------------------
/20-Selenium_163/require/login_frame.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/require/login_frame.png
--------------------------------------------------------------------------------
/20-Selenium_163/utils/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/20-Selenium_163/utils/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/20-Selenium_163/utils/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # Login page URL
4 | URL = 'https://mail.163.com/'
5 |
6 |
7 | # Mailbox account
8 | MAIL_USER = 'northxw'
9 | # Mailbox password [replace with your own]
10 | MAIL_PASS = '******'
11 |
12 |
13 | # Recipient address [replace with the recipient you want]
14 | RECIPIENT = '******'
15 | # Mail body
16 | CONTENT = '6的二进制 !!!'
17 |
18 | """
19 | # localhost
20 | MYSQL_LOCALHOST = 'localhost'
21 | # MySQL user
22 | MYSQL_USER = 'root'
23 | # MySQL password
24 | MYSQL_PASS = '0513'
25 | # MySQL port
26 | MYSQL_PORT = 3306
27 | # MySQL database
28 | MYSQL_DB = 'mail'
29 | """
30 |
31 | # Explicit wait timeout (seconds)
32 | TIME_OUT = 10
33 |
34 |
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/.idea/DouYin.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/README.md:
--------------------------------------------------------------------------------
1 | # Project Name
2 | Crawl DouYin short videos with Appium and Mitmdump.
3 |
4 | # Sort
5 | **Automated app crawling** - an automated crawler built on Appium + Mitmdump.
6 |
7 | # Install
8 | See [Environmental_installation](https://github.com/Northxw/Python3_WebSpider/blob/master/05-Moments/Readme.md).
9 |
10 | # Explain
11 | ### 1. Do not log in to a DouYin account
12 | If you log in, the first problem is that the SMS verification code cannot be fetched automatically; the second is that a click-style image captcha appears after entering the code, as shown:
13 |
14 | 
15 |
16 | ### 2. Skipping the "swipe to see more" screen
17 | After the app starts automatically, a "swipe to see more" screen appears; it has to be dismissed by locating and tapping an element, as shown:
18 |
19 | 
20 |
21 | ### 3. Video request endpoints
22 | DouYin serves videos from quite a few endpoints; some carry more ads, others are all short videos. Everything is captured here by building 16 URL prefixes in total:
23 | ```Python
24 | nums = [1,3,6,9]
25 | for num in nums:
26 | url_first = 'http://v{}-dy.ixigua.com/'.format(str(num))
27 | url_second = 'http://v{}-dy-x.ixigua.com'.format(str(num))
28 | url_third = 'http://v{}-dy-z.ixigua.com'.format(str(num))
29 | url_fourth = 'http://v{}-dy-y.ixigua.com'.format(str(num))
30 | urls.extend([url_first, url_second, url_third, url_fourth])
31 | ```
32 |
33 | ### 4. Video file name
34 | The unique segment of the video URL is used as the saved file name (a small sketch follows this section), as shown:
35 |
36 | 
37 |
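A minimal sketch of that naming step, using a made-up URL of the same shape (the real logic lives in scripts.py):

```Python
# Hypothetical URL shaped like the ones the mitmdump hook sees; only the first path segment matters
url = 'http://v3-dy.ixigua.com/0300fb30000bexample/video.mp4'

video_name = url.split('/')[3]                   # -> '0300fb30000bexample'
save_path = './video/{}.mp4'.format(video_name)  # where the video bytes get written
print(video_name, save_path)
```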
38 | # Other
39 | Automated crawling of DouYin can only download the videos themselves; the other metadata cannot be collected, much like sites that only expose data after login.
40 |
41 | # Demo
42 | #### 1. GIF-Download_Video
43 | 
44 |
45 | #### 2. GIF-Crawl_Video
46 | 
47 |
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/__pycache__/scripts.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/__pycache__/scripts.cpython-36.pyc
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/actions.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from appium import webdriver
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from selenium.webdriver.common.by import By
7 | from selenium.common.exceptions import NoSuchElementException, TimeoutException
8 | from time import sleep
9 | from config import *
10 | import time
11 |
12 | class DouYin(object):
13 | def __init__(self):
14 | """
15 | Initialize the Appium driver and the explicit wait
16 | """
17 | # Desired capabilities used to launch the app
18 | self.desired_caps = {
19 | 'platformName': PLATFORM,
20 | 'deviceName': DEVICE_NAME,
21 | 'appPackage': APP_PACKAGE,
22 | 'appActivity': APP_ACTIVITY
23 | }
24 | self.driver = webdriver.Remote(APPIUM_SERVER, self.desired_caps)
25 | self.wait = WebDriverWait(self.driver, TIME_OUT)
26 |
27 | def open(self):
28 | """
29 | Open the DouYin app
30 | """
31 | time.sleep(5)
32 | # Skip the "swipe to see more" screen
33 | unknown = self.wait.until(
34 | EC.presence_of_element_located((By.XPATH, '//*[@class="android.widget.FrameLayout"]')))
35 | unknown.click()
36 | """
37 | try:
38 | # If the "privacy policy summary" screen appears, choose "browse only"
39 | yes = self.wait.until(EC.element_to_be_clickable((By.ID, 'com.ss.android.ugc.aweme:id/mw')))
40 | yes.click()
41 | except NoSuchElementException as e:
42 | pass
43 | # Skip the "swipe to see more" screen
44 | unknown = self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="android.widget.FrameLayout"]')))
45 | unknown.click()
46 | """
47 |
48 | def scroll(self):
49 | """
50 | Swipe through the feed continuously
51 | """
52 | while True:
53 | # Swipe up to load the next video
54 | self.driver.swipe(FLICK_START_X, FLICK_START_Y + FLICK_DISTANCE, FLICK_START_X, FLICK_START_Y)
55 | sleep(SCROLL_SLEEP_TIME)
56 |
57 | def main(self):
58 | self.open()
59 | self.scroll()
60 |
61 | if __name__ == '__main__':
62 | douyin = DouYin()
63 | douyin.main()
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # Appium server endpoint
4 | APPIUM_SERVER = 'http://localhost:4723/wd/hub'
5 |
6 |
7 | # Device name
8 | DEVICE_NAME = 'vivo_X7'
9 | # Platform (Android or iOS)
10 | PLATFORM = 'Android'
11 | # App package name
12 | APP_PACKAGE = 'com.ss.android.ugc.aweme'
13 | # Entry activity
14 | APP_ACTIVITY = '.main.MainActivity'
15 |
16 |
17 | # Element wait timeout (seconds)
18 | TIME_OUT = 300
19 |
20 | # Swipe coordinates
21 | FLICK_START_X = 300
22 | FLICK_START_Y = 300
23 | FLICK_DISTANCE = 900
24 |
25 | # Pause between swipes (seconds)
26 | SCROLL_SLEEP_TIME = 5
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/plates/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/demo.gif
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/plates/douyin_demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/douyin_demo.gif
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/plates/start.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/start.png
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/plates/video_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/video_name.png
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/plates/video_url.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/video_url.png
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/plates/图形点触验证码.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/21-AutoCrawl_DouYin/plates/图形点触验证码.png
--------------------------------------------------------------------------------
/21-AutoCrawl_DouYin/scripts.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | """
4 | Created at 07:34 on March 20, 2019
5 | @title: Automated crawling of DouYin short videos with Appium + Mitmdump
6 | @author: Northxw
7 | """
8 |
9 | import requests
10 | import os
11 |
12 | def response(flow):
13 | """
14 | Mitmdump response hook: capture and save DouYin short videos
15 | """
16 | urls = list()
17 | # DouYin short-video endpoint prefixes
18 | nums = [1,3,6,9]
19 | for num in nums:
20 | url_first = 'http://v{}-dy.ixigua.com/'.format(str(num))
21 | url_second = 'http://v{}-dy-x.ixigua.com'.format(str(num))
22 | url_third = 'http://v{}-dy-z.ixigua.com'.format(str(num))
23 | url_fourth = 'http://v{}-dy-y.ixigua.com'.format(str(num))
24 | urls.extend([url_first, url_second, url_third, url_fourth])
25 |
26 | for url in urls:
27 | if flow.request.url.startswith(url):
28 | # Use the unique URL path segment as the file name
29 | video_name = flow.request.url.split('/')[3]
30 | # Download the video bytes
31 | content = requests.get(flow.request.url, stream=True).content
32 | # Create the save directory if it does not exist
33 | save_dir = './video'
34 | if not os.path.exists(save_dir):
35 | os.mkdir(save_dir)
36 | # Full save path of the video
37 | save_dir = '{}/{}.mp4'.format(save_dir, video_name)
38 |
39 | # Write to disk
40 | with open(save_dir, 'wb') as f:
41 | f.write(content)
--------------------------------------------------------------------------------
/22-Stackoverflow/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/22-Stackoverflow/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/22-Stackoverflow/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/22-Stackoverflow/.idea/stackoverflow.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/22-Stackoverflow/README.md:
--------------------------------------------------------------------------------
1 | # Spider Stackoverflow
2 | Crawl the basic information of the top 1000 questions on **Stackoverflow**.
3 |
4 | # Sort
5 | **Scrapy** - crawling data from an overseas site.
6 |
7 | # Explain
8 | ## 1. Setting "ROBOTSTXT_OBEY = True"
9 | If you are not going through a proxy/VPN client, follow the robots.txt rules, otherwise requests get cut off. On top of that, set a **DOWNLOAD_DELAY** so the site is not hit too often, e.g. the sketch below:
10 |
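A minimal settings.py sketch of those two options (ROBOTSTXT_OBEY matches this project's settings.py; the delay value is just an example to tune):

```Python
# settings.py (excerpt)
ROBOTSTXT_OBEY = True    # respect stackoverflow.com/robots.txt
DOWNLOAD_DELAY = 1       # seconds between requests to the same site
```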
11 | ## 2. Consider a proxy/VPN client
12 | In testing, with a proxy/VPN in place the data can be fetched faster and without a crawl delay. If the proxy is a client application, switching nodes whenever requests exceed the timeout lets the crawl continue.
13 |
14 | ## 3. UAMiddleware and ProxyMiddleware
15 | A random User-Agent middleware and a proxy middleware are also added (the proxy middleware stays disabled here because a VPN client runs on this machine).
16 | ```Python
17 | from fake_useragent import UserAgent
18 |
19 | class UAMiddleware(object):
20 | def __init__(self):
21 | self.ua = UserAgent()
22 |
23 | def process_request(self, request, spider):
24 | request.headers['User-Agent'] = self.ua.random  # a fresh random UA for every request
25 | ```
26 |
27 | ## 4. Crawl approach
28 | - **start_requests()** builds the first 100 listing-page URLs
29 | - Each listing page yields the links to its question detail pages
30 | - Each detail page yields the title, vote count, body, tags and so on
31 | - The pipeline cleans the items and stores them in MongoDB
32 |
33 | Note: exceptions raised while processing a **Request()** are received by the error_back() callback and printed to the console; some questions have no code block, so that field can come back as None. The database pipeline:
34 | ```Python
35 | import pymongo
36 |
37 | class MongoPipeline(object):
38 | def __init__(self, mongo_url, mongo_db):
39 | self.mongo_url = mongo_url
40 | self.mongo_db = mongo_db
41 |
42 | @classmethod
43 | def from_crawler(cls, crawler):
44 | return cls(
45 | mongo_url=crawler.settings.get('MONGO_INIT_URL'),
46 | mongo_db=crawler.settings.get('MONGO_DB')
47 | )
48 |
49 | def open_spider(self, spider):
50 | self.client = pymongo.MongoClient(self.mongo_url)
51 | self.db = self.client[self.mongo_db]
52 |
53 | def process_item(self, item, spider):
54 | self.db[item.table].insert(dict(item))
55 | return item
56 |
57 | def close_spider(self, spider):
58 | self.client.close()
59 | ```
60 |
61 | # Other
62 | ???
63 |
64 | # Result
65 |
66 | 
67 |
--------------------------------------------------------------------------------
/22-Stackoverflow/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = stackoverflow.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = stackoverflow
12 |
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__init__.py
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Item, Field
9 |
10 | class StackoverflowItem(Item):
11 | table = 'stackoverflow'
12 | link = Field()
13 | title = Field()
14 | votes = Field()
15 | body = Field()
16 | tags = Field()
17 |
18 |
19 |
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/main.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | from scrapy.cmdline import execute
4 |
5 | execute('scrapy crawl stack'.split())
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from fake_useragent import UserAgent
4 | import base64
5 |
6 | class UAMiddleware(object):
7 | def __init__(self):
8 | self.ua = UserAgent()
9 |
10 | def process_request(self, request, spider):
11 | request.headers['User-Agent'] = self.ua.random  # a fresh random UA for every request
12 |
13 | class ProxyMiddleware(object):
14 | def __init__(self, proxy_server, proxy_user, proxy_pass):
15 | self.proxy_server = proxy_server
16 | self.proxy_user = proxy_user
17 | self.proxy_pass = proxy_pass
18 | self.proxy_auth = "Basic " + base64.urlsafe_b64encode(bytes((self.proxy_user + ":" + self.proxy_pass), "ascii")).decode("utf8")
19 |
20 | @classmethod
21 | def from_crawler(cls, crawler):
22 | return cls(
23 | proxy_server = crawler.settings.get('PROXY_SERVER'),
24 | proxy_user = crawler.settings.get('PROXY_USER'),
25 | proxy_pass = crawler.settings.get('PROXY_PASS')
26 | )
27 |
28 | def process_request(self, request, spider):
29 | request.meta["proxy"] = self.proxy_server
30 | request.headers["Proxy-Authorization"] = self.proxy_auth
31 |
32 | def process_response(self, request, response, spider):
33 | if response.status in [500, 502, 503, 504, 522, 524, 408]:
34 | return request  # retry requests that come back with a typical proxy/server error code
35 | return response
36 |
37 | def process_exception(self, request, exception, spider):
38 | pass
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import pymongo
4 |
5 | class MongoPipeline(object):
6 | def __init__(self, mongo_url, mongo_db):
7 | self.mongo_url = mongo_url
8 | self.mongo_db = mongo_db
9 |
10 | @classmethod
11 | def from_crawler(cls, crawler):
12 | return cls(
13 | mongo_url=crawler.settings.get('MONGO_INIT_URL'),
14 | mongo_db=crawler.settings.get('MONGO_DB')
15 | )
16 |
17 | def open_spider(self, spider):
18 | self.client = pymongo.MongoClient(self.mongo_url)
19 | self.db = self.client[self.mongo_db]
20 |
21 | def process_item(self, item, spider):
22 | self.db[item.table].insert(dict(item))
23 | return item
24 |
25 | def close_spider(self, spider):
26 | self.client.close()
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for stackoverflow project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'stackoverflow'
13 |
14 | SPIDER_MODULES = ['stackoverflow.spiders']
15 | NEWSPIDER_MODULE = 'stackoverflow.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'stackoverflow (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | # DOWNLOAD_DELAY = 1
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'stackoverflow.middlewares.StackoverflowSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | 'stackoverflow.middlewares.UAMiddleware': 543,
57 | # 'stackoverflow.middlewares.ProxyMiddleware':545,
58 | }
59 |
60 | # Enable or disable extensions
61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'stackoverflow.pipelines.MongoPipeline': 300,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | MAX_PAGES = 100
94 |
95 | # Proxy server
96 | PROXY_SERVER = "http://http-dyn.abuyun.com:9020"
97 |
98 | # Proxy tunnel credentials (Abuyun)
99 | PROXY_USER = "HEO8FRWV77C1H36D"
100 | PROXY_PASS = "6CF467F7135C59B6"
101 |
102 | MONGO_INIT_URL = 'localhost'
103 | MONGO_DB = 'stackoverflow'
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/spiders/__pycache__/stack.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/spiders/__pycache__/stack.cpython-36.pyc
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/spiders/stack.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy import Request
4 | from urllib.parse import urlencode
5 | from stackoverflow.items import StackoverflowItem
6 |
7 | class StackSpider(scrapy.Spider):
8 | name = 'stack'
9 | allowed_domains = ['stackoverflow.com']
10 | base_url = 'https://stackoverflow.com/questions?'
11 |
12 | def start_requests(self):
13 | """
14 | Build the listing-page requests
15 | """
16 | for i in range(1, self.settings.get('MAX_PAGES') + 1):
17 | params = {'sort': 'votes', 'page': i}
18 | url = self.base_url + urlencode(params)
19 | yield Request(url, callback=self.parse_question_list, errback=self.error_back)
20 |
21 | def parse_question_list(self, response):
22 | """
23 | Collect the question links on each listing page
24 | """
25 | for href in response.xpath('//*[@class="summary"]/h3/a/@href'):
26 | url = response.urljoin(href.extract())
27 | yield Request(url, callback=self.parse_question, errback=self.error_back, dont_filter=True)
28 |
29 | def parse_question(self, response):
30 | """
31 | Extract the data from a question detail page
32 | """
33 | self.logger.debug('Already into Pipeline!')
34 | item = StackoverflowItem()
35 | item['link'] = response.url
36 | item['title'] = response.xpath('//*[@id="question-header"]/h1/a/text()').extract_first()
37 | item['votes'] = response.xpath('//*[@id="question"]/div/div[1]/div/div/text()').extract_first()
38 | item['body'] = response.css('.post-text').xpath('.//*[contains(@class, "prettyprint")]').extract()
39 | item['tags'] = response.css('.question .post-tag::text').extract()
40 | yield item
41 |
42 | def error_back(self, failure):
43 | # Log any failure that Scrapy passes to the errback
44 | self.logger.debug('Error: {}'.format(failure))
45 |
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/utils/Error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/utils/Error.png
--------------------------------------------------------------------------------
/22-Stackoverflow/stackoverflow/utils/db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/22-Stackoverflow/stackoverflow/utils/db.png
--------------------------------------------------------------------------------
/23-GithubLogin/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/23-GithubLogin/.idea/github.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/23-GithubLogin/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/23-GithubLogin/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/23-GithubLogin/README.md:
--------------------------------------------------------------------------------
1 | ## Github Login
2 | Simulate a Github login with Scrapy's FormRequest.
3 |
4 | ## Sort
5 | **Simulated login - FormRequest**
6 |
7 | ## Analysis
8 | #### 1. Clear cookies
9 | Clear the target site's cookies before looking for the POST form parameters.
10 |
11 | #### 2. The login form
12 | Open the Github login page, press F12 to open DevTools and select All, log in normally, then find the session request in the request list and inspect its POST parameters.
13 |
14 | #### 3. Form parameter - authenticity_token
15 | This parameter is set by the server when the login page is rendered and can be found in the page source.
16 |
17 | #### 4. Cookies
18 | With Scrapy's FormRequest there is no need to save cookies manually the way a requests-based login does, because later Requests carry the earlier cookies by default (a condensed sketch follows below).
19 |
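A condensed, self-contained sketch of the FormRequest step; the token XPath and form field names follow spiders/logingit.py, and the account values are placeholders:

```Python
import scrapy

class GithubLoginSketch(scrapy.Spider):
    name = 'github_login_sketch'
    start_urls = ['https://github.com/login']

    def parse(self, response):
        # authenticity_token is embedded in the login page source
        token = response.xpath('//*[@id="login"]/form/input[2]/@value').extract_first()
        yield scrapy.FormRequest(
            url='https://github.com/session',
            formdata={
                'commit': 'Sign in',
                'utf8': '✓',
                'authenticity_token': token,
                'login': 'your_account',        # placeholder
                'password': 'your_password',    # placeholder
            },
            callback=self.after_login,
        )

    def after_login(self, response):
        # Cookies from the login are carried over automatically by Scrapy
        self.logger.info('Logged in, status %s', response.status)
```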
20 | ## Tip
21 | As of 2019/4/2 19:50 the code runs without errors.
22 |
--------------------------------------------------------------------------------
/23-GithubLogin/github/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/__init__.py
--------------------------------------------------------------------------------
/23-GithubLogin/github/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/23-GithubLogin/github/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/23-GithubLogin/github/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class GithubItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/23-GithubLogin/github/main.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | from scrapy.cmdline import execute
4 |
5 | execute('scrapy crawl logingit'.split())
--------------------------------------------------------------------------------
/23-GithubLogin/github/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class GithubPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/23-GithubLogin/github/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for github project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'github'
13 |
14 | SPIDER_MODULES = ['github.spiders']
15 | NEWSPIDER_MODULE = 'github.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'github (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | 'Host': 'github.com',
44 | 'Referer': 'https://github.com',
45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
46 | }
47 |
48 | # Enable or disable spider middlewares
49 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
50 | #SPIDER_MIDDLEWARES = {
51 | # 'github.middlewares.GithubSpiderMiddleware': 543,
52 | #}
53 |
54 | # Enable or disable downloader middlewares
55 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
56 | #DOWNLOADER_MIDDLEWARES = {
57 | # 'github.middlewares.GithubDownloaderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable extensions
61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
68 | #ITEM_PIPELINES = {
69 | # 'github.pipelines.GithubPipeline': 300,
70 | #}
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 | # Account and password
94 | ACCOUNT = 'northxw@163.com'
95 | PASSWORD = '123456'
--------------------------------------------------------------------------------
/23-GithubLogin/github/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/23-GithubLogin/github/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/23-GithubLogin/github/spiders/__pycache__/logingit.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/spiders/__pycache__/logingit.cpython-36.pyc
--------------------------------------------------------------------------------
/23-GithubLogin/github/spiders/logingit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import requests
4 |
5 | class LogingitSpider(scrapy.Spider):
6 | name = 'logingit'
7 | allowed_domains = ['github.com']
8 | # 登陆界面的URL
9 | login_url = 'https://github.com/login'
10 | # POST表单数据的URL
11 | post_url = 'https://github.com/session'
12 | # 登陆后URL
13 | logined_url = 'https://github.com/settings/profile'
14 |
15 | def start_requests(self):
16 | """
17 | Request the login page
18 | """
19 | return [scrapy.Request(url=self.login_url,
20 | callback=self.login,
21 | headers=self.settings.get('DEFAULT_REQUEST_HEADERS'))]
22 |
23 | def login(self, response):
24 | """
25 | Log in to Github with FormRequest
26 | """
27 | # Extract the authenticity_token needed by the POST form
28 | authcode = response.xpath('//*[@id="login"]/form/input[2]/@value').extract_first()
29 | if authcode:
30 | self.logger.debug("Auth Token: %s" %authcode)
31 | post_data = {
32 | 'commit': 'Sign in',
33 | 'utf8': '✓',
34 | 'authenticity_token': authcode,
35 | 'login': self.settings.get('ACCOUNT'),
36 | 'password': self.settings.get('PASSWORD')
37 | }
38 | return [scrapy.FormRequest(url=self.post_url,
39 | formdata=post_data,
40 | headers=self.settings.get('DEFAULT_REQUEST_HEADERS'),
41 | callback=self.check)]
42 | else:
43 | return [scrapy.Request(url=self.login_url, callback=self.login)]
44 |
45 | def check(self, response):
46 | """
47 | Check whether the login succeeded
48 | """
49 | avatar = response.css('#user-links > li:nth-child(3) > details > summary > img::attr(src)').extract_first()
50 | if avatar:
51 | content = requests.get(url=avatar.split('?')[0]).content
52 | with open('./utils/acatar.jpg', 'wb') as f:
53 | f.write(content)
54 | print('Successfully Login!')
55 | pass
56 |
57 |
58 | def parse(self, response):
59 | pass
--------------------------------------------------------------------------------
/23-GithubLogin/github/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/utils/__init__.py
--------------------------------------------------------------------------------
/23-GithubLogin/github/utils/acatar.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/23-GithubLogin/github/utils/acatar.jpg
--------------------------------------------------------------------------------
/23-GithubLogin/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = github.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = github
12 |
--------------------------------------------------------------------------------
/24-Dianping/README.md:
--------------------------------------------------------------------------------
1 | ## Dianping font anti-scraping
2 | A workaround for Dianping's CSS/SVG-positioned font anti-scraping.
3 |
4 | ## Approach
5 | - Fetch the CSS file and regex-match the background coordinates of each class
6 | - Fetch the SVG file and regex-match the glyph width (used as the divisor) and the digit rows
7 | - Pick the row by the y offset, index into it by x, and join the digits (see the sketch below)
8 |
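A condensed sketch of that lookup with made-up coordinate and SVG values (the real regexes live in demo.py):

```Python
# Made-up values standing in for what the CSS and SVG requests return
x, y = 84, 50            # background offset parsed from one class's CSS rule
divisor = 12             # x step per glyph, parsed from the SVG's x="..." attribute
rows = {46: '5642238335', 83: '1085268889', 129: '7078573562'}  # y threshold -> digit row

# The y offset picks the row, and x // divisor is the glyph's index inside that row
for threshold, digits in sorted(rows.items()):
    if y < threshold:
        print(digits[x // divisor])   # -> '8' (84 // 12 == 7)
        break
```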
9 | ## Example
10 | Screenshot of the corresponding text on the page (https://www.dianping.com/xian/ch0):
11 |
12 | 
13 |
14 | Screenshot of the script output:
15 |
16 | 
17 |
--------------------------------------------------------------------------------
/24-Dianping/demo.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import re
4 | import requests
5 | import lxml.html
6 |
7 | def get_css_text(class_):
8 | """
9 | Fetch the background coordinates for a class
10 | """
11 | css_html = requests.get('https://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/1595b8f4917c831efb53461c8d9b86cb.css').text
12 | info_css = re.findall(r'%s{background:-(\d+).0px -(\d+).0px' % class_, css_html, re.S)[0]
13 | return info_css
14 |
15 | def get_completed_nums(completed_nums=''):
16 | """
17 | Assemble the hidden number from the CSS coordinates and the SVG digit rows
18 | """
19 | result_svgtext = requests.get('http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/7226aa7d9b89866aecb63ab0f06ca037.svg').text
20 | a, b, c = re.findall('y=.*?>(.*?)<', result_svgtext, re.S)  # e.g. a: 56422383356911691085268889707857...
21 | y1, y2, y3 = re.findall('y="(.*?)">', result_svgtext, re.S)  # e.g.: 46, 83, 129
22 | divisor = int(re.search('x="(\d{2}) ', result_svgtext, re.S).group(1))  # e.g. x = 12, ...
23 | for class_ in class_list:
24 | x, y = get_css_text(class_)
25 | x, y = int(x), int(y)
26 | if y < int(y1):
27 | completed_nums += a[x // divisor]
28 | elif y < int(y2):
29 | completed_nums += b[x // divisor]
30 | elif y < int(y3):
31 | completed_nums += c[x // divisor]
32 | print("Total review count:", completed_nums)
33 | return completed_nums
34 |
35 | if __name__ == '__main__':
36 | class_list = ['ovr2h', 'ovjpg', 'ovra6', 'ovzs7']
37 | get_completed_nums()
--------------------------------------------------------------------------------
/24-Dianping/utils/prtsc1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc1.png
--------------------------------------------------------------------------------
/24-Dianping/utils/prtsc2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc2.png
--------------------------------------------------------------------------------
/24-Dianping/utils/prtsc3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc3.png
--------------------------------------------------------------------------------
/24-Dianping/utils/prtsc4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc4.png
--------------------------------------------------------------------------------
/24-Dianping/utils/prtsc5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Northxw/Python3_WebSpider/87cbae60f7a5b033851b0056dff741a3d5980d06/24-Dianping/utils/prtsc5.png
--------------------------------------------------------------------------------
/25-DouYin/README.md:
--------------------------------------------------------------------------------
1 | ## DouYin
2 |
3 | Build crawl URLs from the share links of DouYin profile cards and collect each user's nickname, DouYin ID, signature, avatar, following and follower counts, total likes received, number of works, and number of works liked.
4 |
5 | ## Sort
6 |
7 | Font anti-scraping - DouYin (a toy decoding sketch follows the Run section)
8 |
9 | ## Run
10 |
11 | ```shell
12 | python douyin.py
13 | ```
14 |
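For reference, a toy sketch of the decoding idea in font.py: the profile page renders numbers with private-use codepoints from a custom woff/ttf font, and a hand-built table maps each codepoint to a "num_*" token and then back to the real digit. The codepoints and rows below are made up; the real tables live in font.py:

```Python
# Made-up codepoints standing in for the custom-font glyphs
GLYPH_TO_TOKEN = {'\ue602': 'num_1', '\ue603': 'num_2', '\ue604': 'num_3'}
# Same shape as font_num_map in font.py: rendered digit -> token
DIGIT_TO_TOKEN = {'0': 'num_1', '3': 'num_2', '2': 'num_3'}
TOKEN_TO_DIGIT = {v: k for k, v in DIGIT_TO_TOKEN.items()}

def decode(text):
    # Replace every obfuscated glyph with the digit it really stands for
    return ''.join(TOKEN_TO_DIGIT.get(GLYPH_TO_TOKEN.get(ch, ''), ch) for ch in text)

print(decode('\ue602\ue603\ue604'))  # -> '032'
```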
15 |
--------------------------------------------------------------------------------
/25-DouYin/font.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | # import requests
4 | # import re
5 | # import time
6 | #
7 | # headers = {
8 | # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
9 | # }
10 | #
11 | # def crack_font():
12 | # """Work around the font anti-scraping"""
13 | # url = "https://www.iesdouyin.com/share/user/59498826860"
14 | # response = requests.get(url, headers=headers)
15 | # ttf_url = "https://%s" % re.findall("format\('woff'\),url\(//(.*?\.ttf)\)", response.text, re.S)[0] # match the font file link
16 | # print(ttf_url)
17 | # # get_mapping_table(ttf_url)
18 |
19 | def get_mapping_table(codeNum):
20 | """Map an obfuscated glyph code to its real digit"""
21 | font_code_map = {
22 | "": "num_",
23 | "": "num_1",
24 | "": "num_2",
25 | "": "num_3",
26 | "": "num_4",
27 | "": "num_5",
28 | "": "num_6",
29 | "": "num_7",
30 | "": "num_8",
31 | "": "num_9",
32 | "": "num_4",
33 | "": "num_1",
34 | "": "num_",
35 | "": "num_5",
36 | "": "num_3",
37 | "": "num_2",
38 | "": "num_6",
39 | "": "num_8",
40 | "": "num_9",
41 | "": "num_7",
42 | "": "num_1",
43 | "": "num_3",
44 | "": "num_",
45 | "": "num_4",
46 | "": "num_2",
47 | "": "num_5",
48 | "": "num_8",
49 | "": "num_9",
50 | "": "num_7",
51 | "": "num_6",
52 | }
53 |
54 | font_num_map = {
55 | "1": "num_",
56 | "0": "num_1",
57 | "3": "num_2",
58 | "2": "num_3",
59 | "4": "num_4",
60 | "5": "num_5",
61 | "6": "num_6",
62 | "9": "num_7",
63 | "7": "num_8",
64 | "8": "num_9",
65 | }
66 | codeNumMap = font_code_map[codeNum]
67 | decodeNum = ''
68 | if codeNumMap in font_num_map.values():
69 | decodeNum = ''.join([k for k, v in font_num_map.items() if codeNumMap == v])
70 | return decodeNum
71 |
72 |
73 | if __name__ == '__main__':
74 | print(get_mapping_table(""))
--------------------------------------------------------------------------------
/25-DouYin/shareid.txt:
--------------------------------------------------------------------------------
1 | 98524936524
2 | 96467876974
3 | 97836647912
4 | 72051219546
5 | 88445518961
6 | 59498826860
7 | 76055758243
8 | 58944980339
9 | 93584412487
10 | 62427282029
11 | 98985522288
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Python3 WebSpider
2 | Python3 web-crawler practice code, for learning and exchange only.
3 |
4 | ## Tip
5 | Unless otherwise noted, the environment is **Python 3.6.5 and PyCharm 2018.3.5**.
6 |
7 | ## Notice
8 | No updates are planned in the short term; later updates depend on circumstances. The code is quite old and many of the anti-crawler workarounds may be outdated, so treat it as beginner crawling practice only.
9 |
--------------------------------------------------------------------------------