├── .gitignore
├── 10、数据分析案例:某婚恋网站交友情况分析
├── 代码
│ ├── 10_1.py
│ ├── render.html
│ ├── wzly.csv
│ └── wzly.ipynb
└── 勘误.md
├── 1、Python爬虫概念与Web基础
└── 勘误.md
├── 2、Python爬虫基本库的使用
├── 代码
│ ├── 2_1.py
│ ├── 2_10.py
│ ├── 2_11.py
│ ├── 2_12.py
│ ├── 2_13.py
│ ├── 2_14.py
│ ├── 2_2.py
│ ├── 2_3.py
│ ├── 2_4.py
│ ├── 2_5.py
│ ├── 2_6.py
│ ├── 2_7.py
│ ├── 2_8.py
│ ├── 2_9.py
│ ├── cookie.txt
│ └── proxy_ips.txt
└── 勘误.md
├── 3、Python爬虫抓包与数据解析
├── 代码
│ ├── 3_1.py
│ ├── 3_2.py
│ ├── 3_3.py
│ └── 3_4.py
└── 勘误.md
├── 4、用CSV 和 Excel 存储数据
├── 代码
│ ├── 4_1.py
│ ├── 4_10.py
│ ├── 4_2.py
│ ├── 4_3.py
│ ├── 4_4.py
│ ├── 4_5.py
│ ├── 4_6.py
│ ├── 4_7.py
│ ├── 4_8.py
│ └── 4_9.py
└── 勘误.md
├── 5、用数据库存储数据
├── 代码
│ ├── 5_1.py
│ ├── 5_10.py
│ ├── 5_2.py
│ ├── 5_3.py
│ ├── 5_4.py
│ ├── 5_5.py
│ ├── 5_6.py
│ ├── 5_7.py
│ ├── 5_8.py
│ └── 5_9.py
└── 勘误.md
├── 6、Python应对反爬虫策略
├── 代码
│ ├── 6_1.py
│ ├── 6_2.py
│ ├── 6_3.py
│ ├── 6_4.py
│ ├── 6_5.py
│ └── 6_6.py
└── 勘误.md
├── 7、Python爬虫框架Scrapy(上)
├── 代码
│ ├── FirstSpider
│ │ ├── FirstSpider
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-37.pyc
│ │ │ │ ├── items.cpython-37.pyc
│ │ │ │ ├── middlewares.cpython-37.pyc
│ │ │ │ ├── pipelines.cpython-37.pyc
│ │ │ │ └── settings.cpython-37.pyc
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── proxy_ip.txt
│ │ │ ├── run.py
│ │ │ ├── settings.py
│ │ │ └── spiders
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-37.pyc
│ │ │ │ └── bcy.cpython-37.pyc
│ │ │ │ └── bcy.py
│ │ └── scrapy.cfg
│ └── bing
│ │ ├── Dockerfile
│ │ ├── bing.json
│ │ ├── bing
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-37.pyc
│ │ │ ├── items.cpython-37.pyc
│ │ │ └── settings.cpython-37.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── BingWallpaper.py
│ │ │ ├── Test.py
│ │ │ ├── __init__.py
│ │ │ └── __pycache__
│ │ │ ├── BingWallpaper.cpython-37.pyc
│ │ │ ├── Test.cpython-37.pyc
│ │ │ └── __init__.cpython-37.pyc
│ │ ├── logs
│ │ └── BingWallpaper
│ │ │ ├── 2018-10-15T104228.709049.log
│ │ │ ├── 2018-10-15T104303.655633.log
│ │ │ ├── 2018-10-15T104348.228406.log
│ │ │ ├── 2018-10-15T104841.872511.log
│ │ │ ├── 2018-10-15T104922.591600.log
│ │ │ ├── 2018-10-15T105002.320386.log
│ │ │ ├── 2018-10-15T105902.809743.log
│ │ │ ├── 2018-10-15T113038.987323.log
│ │ │ └── 2018-10-15T120654.496911.log
│ │ ├── out
│ │ └── res
│ │ │ └── pic
│ │ │ └── full
│ │ │ ├── 033317f07b809f0cd06487b30b29eccb26d063b8.jpg
│ │ │ ├── 0698af79b195349b838bdfeebbd11409f82f0f38.jpg
│ │ │ ├── 092235104f84cb2f4de8808c10f655298313f65c.jpg
│ │ │ ├── 2efd29b32c481136507115a3ee2e6181c122aa0b.jpg
│ │ │ ├── 3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg
│ │ │ ├── 4099096a19a0eaad0aef6782a206881d948ad775.jpg
│ │ │ ├── 486c568e353051efd0959cc4a424ff9093cfceb9.jpg
│ │ │ ├── 5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg
│ │ │ ├── 599f27e7835da59b495c44297cce0553ee4a0b51.jpg
│ │ │ ├── 86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg
│ │ │ ├── 885648740905a26703e18c1ae24f23c480ecc822.jpg
│ │ │ ├── 97e86cde9a308e626f537c107303537ec598903c.jpg
│ │ │ ├── b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg
│ │ │ ├── bca701f1923e317aa8a9be18125c2a894fc80780.jpg
│ │ │ ├── bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg
│ │ │ ├── c14461fb44425865b9afe6695ab5926e2001411c.jpg
│ │ │ ├── cbba4b16b644659920ad93e10a6d3478270ce927.jpg
│ │ │ ├── e254600d400f3c54c77171e02b021d46369788ae.jpg
│ │ │ ├── e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg
│ │ │ └── ed989d9c858c5290ca559cf2c462cace68e49362.jpg
│ │ ├── requirements.txt
│ │ ├── run.py
│ │ └── scrapy.cfg
└── 勘误.md
├── 8、Python爬虫框架Scrapy(下)
├── 代码
│ ├── jianshuspider
│ │ ├── jianshuspider
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-37.pyc
│ │ │ │ ├── items.cpython-37.pyc
│ │ │ │ ├── middlewares.cpython-37.pyc
│ │ │ │ ├── pipelines.cpython-37.pyc
│ │ │ │ └── settings.cpython-37.pyc
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __pycache__
│ │ │ │ ├── __init__.cpython-37.pyc
│ │ │ │ └── jianshu.cpython-37.pyc
│ │ │ │ └── jianshu.py
│ │ ├── requirements.txt
│ │ └── scrapy.cfg
│ └── proxy_ips
│ │ ├── proxy_ip_check.py
│ │ ├── proxy_ips
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-37.pyc
│ │ │ └── settings.cpython-37.pyc
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ ├── __pycache__
│ │ │ ├── __init__.cpython-37.pyc
│ │ │ └── proxy_spider.cpython-37.pyc
│ │ │ └── proxy_spider.py
│ │ ├── proxy_server.py
│ │ ├── run.py
│ │ └── scrapy.cfg
└── 勘误.md
├── 9、数据分析案例:Python岗位行情
├── 代码
│ ├── 9_1.py
│ └── 9_2.py
└── 勘误.md
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea/
--------------------------------------------------------------------------------
/10、数据分析案例:某婚恋网站交友情况分析/代码/10_1.py:
--------------------------------------------------------------------------------
1 | """
2 | 抓取我主良缘妹子交友信息做数据分析
3 | """
4 |
5 | import requests as rq
6 | import pandas as pd
7 | import time
8 | import random
9 | import os
10 |
11 | # 结果写入文件
12 | result_save_file = 'wzly.csv'
13 |
14 | # Ajax加载url
15 | ajax_url = "http://www.lovewzly.com/api/user/pc/list/search?"
16 |
17 | # 模拟请求头
18 | ajax_headers = {
19 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
20 | 'Accept-Encoding': 'gzip, deflate, br',
21 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
22 | 'Connection': 'keep-alive',
23 | 'Host': 'www.lovewzly.com',
24 | 'Referer': 'http://www.lovewzly.com/jiaoyou.html',
25 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 '
26 | 'Safari/537.36',
27 | 'X-Requested-With': 'XMLHttpRequest',
28 | }
29 |
30 | # get请求参数
31 | form_data = {'gender': '2', 'marry': '1', 'page': '1'}
32 |
33 | # csv表头
34 | csv_headers = [
35 | '昵称', '用户id', '头像', '身高', '学历', '省份',
36 | '城市', '出生年份', '性别', '交友宣言'
37 | ]
38 |
39 | height_interval = ['140', '150', '160', '170', '180'] # 身高范围
40 | edu_interval = ['本科', '大专', '高中', '中专', '初中', '硕士', '博士', '院士'] # 学历范围
41 | age_interval = [
42 |     ('18-25', 8000), ('26-30', 8000), ('31-40', 8000),
43 | ('41-50', 8000), ('50以上', 8000),
44 | ] # 年龄范围
45 |
46 |
47 | # 获取每页交友信息
48 | def fetch_data(page):
49 | while True:
50 | try:
51 | form_data['page'] = page
52 | print("抓取第:" + str(page) + "页!")
53 | resp = rq.get(url=ajax_url, params=form_data, headers=ajax_headers)
54 | if resp.status_code == 200:
55 | data_json = resp.json()['data']['list']
56 | if len(data_json) > 0:
57 | data_list = []
58 | for data in data_json:
59 | data_list.append((
60 | data['username'], data['userid'], data['avatar'],
61 | data['height'], data['education'], data['province'],
62 | data['city'], data['birthdayyear'], data['gender'], data['monolog']))
63 | result = pd.DataFrame(data_list)
64 | if page == 1:
65 | result.to_csv(result_save_file, header=csv_headers, index=False, mode='a+', encoding='utf-8')
66 | else:
67 | result.to_csv(result_save_file, header=False, index=False, mode='a+', encoding='utf-8')
68 | return None
69 | except Exception as e:
70 | print(e)
71 |
72 |
73 | if __name__ == '__main__':
74 | if not os.path.exists(result_save_file):
75 | for i in range(1, 718):
76 | time.sleep(random.randint(2, 10))
77 | fetch_data(i)
78 |
--------------------------------------------------------------------------------
/10、数据分析案例:某婚恋网站交友情况分析/勘误.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/10、数据分析案例:某婚恋网站交友情况分析/勘误.md
--------------------------------------------------------------------------------
/1、Python爬虫概念与Web基础/勘误.md:
--------------------------------------------------------------------------------
1 | ## 第1章 Python 爬虫概念与Web基础
2 |
3 | ### 1.1.7 爬虫的学习路线
4 |
5 | 学习路线图部分:
6 |
7 | 1、原文:利用urllib、**requestsy**库
8 |
9 | > 改为:利用urllib、**requests** 库
10 |
11 | 2、原文:利用文件、**CVS**、Excel
12 |
13 | > 改为:利用文件、**CSV**、Excel
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_1.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib.request使用示例
3 | """
4 |
5 | import urllib.request
6 |
7 | resp = urllib.request.urlopen("http://www.baidu.com")
8 | print("resp.geturl:", resp.geturl())
9 | print("resp.msg:", resp.msg)
10 | print("resp.status:", resp.status)
11 | print("resp.version:", resp.version)
12 | print("resp.reason:", resp.reason)
13 | print("resp.debuglevel:", resp.debuglevel)
14 | print("resp.getheaders:", resp.getheaders()[0:2])
15 | print(resp.read().decode('utf-8'))
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_10.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib.parse.urlencode函数使用代码示例
3 | """
4 | from urllib import parse
5 |
6 | params = {
7 | 'q': 'parse',
8 | 'check_keywords': 'yes',
9 | 'area': 'default'
10 | }
11 | url = 'https://docs.python.org/3/search.html?' + parse.urlencode(params)
12 | print("拼接后的URL:", url)
13 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_11.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib.parse.parse_qs和parse_qsl函数使用代码示例
3 | """
4 | from urllib import parse
5 |
6 | params_str = 'q=parse&check_keywords=yes&area=default'
7 |
8 | print("parse_qs 反序列化结果:", parse.parse_qs(params_str))
9 | print("parse_qsl 反序列化结果:", parse.parse_qsl(params_str))
10 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_12.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib.robotparser使用示例
3 | """
4 |
5 | from urllib import robotparser
6 | import ssl
7 | ssl._create_default_https_context = ssl._create_unverified_context
8 |
9 | rp = robotparser.RobotFileParser()
10 | # 设置robots.txt文件的链接
11 | rp.set_url('http://www.taobao.com/robots.txt')
12 | # 读取robots.txt文件并进行分析
13 | rp.read()
14 |
15 | url = 'https://www.taobao.com'
16 | user_agent = 'Baiduspider'
17 | bdp_info = rp.can_fetch(user_agent, url)
18 | print("Baiduspider 代理用户访问情况:", bdp_info)
19 |
20 | user_agent = 'Elsespider'
21 | op_info = rp.can_fetch(user_agent, url)
22 | print("Elsespider 代理用户访问情况:", op_info)
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_13.py:
--------------------------------------------------------------------------------
1 | """
2 | 刷CSDN博客文章访问量的脚本
3 | """
4 | import random
5 | import urllib.request
6 | import threading as t
7 | import os
8 | import ssl
9 |
10 | # 全局取消证书验证
11 | ssl._create_default_https_context = ssl._create_unverified_context
12 |
13 | # 代理ip文件
14 | proxy_ips_file = 'proxy_ips.txt'
15 |
16 | # 代理ip列表
17 | proxy_ips = []
18 |
19 | # 文章地址
20 | article_url = 'https://blog.csdn.net/l1028386804/article/details/116191713'
21 |
22 | # 请求头
23 | headers = {
24 | 'Host': 'blog.csdn.net',
25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
26 | }
27 |
28 | read_count = 0
29 |
30 |
31 | # 读取文件里的代理ip,返回一个列表
32 | def load_ips(file_path):
33 | if os.path.exists(file_path):
34 | data_list = []
35 | with open(file_path, "r+", encoding='utf-8') as f:
36 | for ip in f:
37 | data_list.append(ip.replace("\n", ""))
38 | return data_list
39 |
40 |
41 | # 访问网页
42 | def read_article():
43 | # 随机取出一枚代理ip
44 | proxy_ip = proxy_ips[random.randint(0, len(proxy_ips) - 1)]
45 | proxy_support = urllib.request.ProxyHandler({'http': proxy_ip})
46 | opener = urllib.request.build_opener(proxy_support)
47 | urllib.request.install_opener(opener)
48 | try:
49 | req = urllib.request.Request(article_url, headers=headers)
50 | resp = urllib.request.urlopen(req, timeout=20)
51 | # 如果返回码是200代表访问成功
52 | if resp is not None and resp.status == 200:
53 | global read_count
54 | read_count += 1
55 | print("累计访问成功次数: %d" % read_count)
56 | return None
57 | except Exception as e:
58 | print(e)
59 |
60 |
61 | if __name__ == '__main__':
62 | # 读取代理ip列表
63 | proxy_ips = load_ips(proxy_ips_file)
64 |     # 代理ip列表非空时才发起访问
65 |     if proxy_ips:
66 | for i in range(100):
67 | read_article()
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_14.py:
--------------------------------------------------------------------------------
1 | """
2 | 爬取笔趣看的小说脚本示例
3 | """
4 |
5 | import urllib
6 | import urllib.request
7 | import urllib.parse
8 | from lxml import etree
9 | from urllib import error
10 | import lxml.html
11 | import os
12 | import time
13 |
14 | # 小说站点的URL
15 | novel_base_url = 'http://www.biqukan.com'
16 |
17 | # 拉取小说的URL
18 | novel_url = urllib.parse.urljoin(novel_base_url, '/0_790/')
19 |
20 | # 每章小说的链接
21 | chapter_url_list = []
22 |
23 | # 小说的保存文件夹
24 | novel_save_dir = os.path.join(os.getcwd(), 'novel_cache/')
25 |
26 | # 请求头
27 | headers = {
28 | 'Host': 'www.biqukan.com',
29 | 'Referer': 'http://www.biqukan.com/',
30 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
31 | }
32 |
33 | # 获取章节链接列表
34 | def fetch_chapter_urls():
35 | req = urllib.request.Request(url=novel_url, headers=headers)
36 | html = lxml.html.parse(urllib.request.urlopen(req))
37 | hrefs = html.xpath('//dd/a/@href')
38 | # 过滤前面的最新章节列表和无用章节
39 | for href in hrefs[16:]:
40 | chapter_url_list.append(urllib.parse.urljoin(novel_base_url, href))
41 |
42 | # 解析每个页面获得章节正文
43 | def parsing_chapter(url):
44 | req = urllib.request.Request(url=url, headers=headers)
45 | html = lxml.html.parse(urllib.request.urlopen(req))
46 | title = html.xpath('//h1/text()')[0]
47 | contents = html.xpath('//*[@id="content"]/text()')
48 | content = ''
49 | for i in contents:
50 | content += i.strip()
51 | save_novel(title, content)
52 |
53 | # 把章节正文写到本地
54 | def save_novel(name, content):
55 | try:
56 | with open(novel_save_dir + name + '.txt', "w+") as f:
57 | f.write(content.strip())
58 | except (error.HTTPError, OSError) as reason:
59 | print(str(reason))
60 | else:
61 | print("下载完成:" + name)
62 |
63 |
64 | if __name__ == '__main__':
65 | # 判断存储的文件夹是否存在,不存在新建
66 | if not os.path.exists(novel_save_dir):
67 | os.mkdir(novel_save_dir)
68 | # 爬取小说文章链接列表
69 | fetch_chapter_urls()
70 | # 遍历抓取所有的小说内容
71 | for chapter in chapter_url_list:
72 | # 定时休眠1s防止ip被封
73 | time.sleep(1)
74 | parsing_chapter(chapter)
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_2.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib下载图片
3 | """
4 | import urllib.request
5 | import ssl
6 |
7 | ssl._create_default_https_context = ssl._create_unverified_context
8 |
9 | # pic_url = "https://www.baidu.com/img/bd_logo1.png"
10 | # pic_resp = urllib.request.urlopen(pic_url,context=context)
11 | # pic = pic_resp.read()
12 | # with open("bg_logo.png", "wb") as f:
13 | # f.write(pic)
14 |
15 | urllib.request.urlretrieve('https://www.baidu.com/img/bd_logo1.png', 'bd_logo.png')
16 |
17 |
18 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_3.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib模拟Get请求示例
3 | """
4 |
5 | import urllib.request
6 | import json
7 | import ssl
8 |
9 | ssl._create_default_https_context = ssl._create_unverified_context
10 |
11 | get_url = "http://gank.io/api/data/" + urllib.request.quote("福利") + "/1/1"
12 | get_resp = urllib.request.urlopen(get_url)
13 | get_result = json.loads(get_resp.read().decode('utf-8'))
14 | # 这里后面的参数用于格式化Json输出格式
15 | get_result_format = json.dumps(get_result, indent=2,
16 | sort_keys=True, ensure_ascii=False)
17 | print(get_result_format)
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_4.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib模拟Post请求示例(伪代码,不能直接请求)
3 | """
4 | import urllib.request
5 | import urllib.parse
6 | import json
7 |
8 | post_url = "http://xxx.xxx.login"
9 | phone = "13555555555"
10 | password = "111111"
11 | values = {
12 | 'phone': phone,
13 | 'password': password
14 | }
15 | data = urllib.parse.urlencode(values).encode(encoding='utf-8')
16 | req = urllib.request.Request(post_url, data)
17 | resp = urllib.request.urlopen(req)
18 | result = json.loads(resp.read()) # Byte结果转Json
19 | print(json.dumps(result, sort_keys=True,
20 | indent=2, ensure_ascii=False)) # 格式化输出Json
21 |
22 |
23 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_5.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib修改请求头代码示例
3 | """
4 | import urllib.request
5 |
6 | # 修改头信息
7 | novel_url = "http://www.biqukxs.com/book/1.html"
8 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
9 | 'AppleWebKit/537.36 (KHTML, like Gecko)'
10 | ' Chrome/63.0.3239.84 Safari/537.36',
11 | 'Host': 'www.biqukxs.com',
12 | 'Referer': 'http://www.biqukxs.com/',
13 | 'Connection': 'keep-alive'}
14 | novel_req = urllib.request.Request(novel_url, headers=headers)
15 | novel_resp = urllib.request.urlopen(novel_req)
16 | print(novel_resp.read().decode('gbk'))
17 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_6.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib配置代理示例
3 | """
4 |
5 | import urllib.request
6 |
7 | # 使用ip代理
8 | ip_query_url = "http://ip.chinaz.com/"
9 |
10 | # 1.创建代理处理器,ProxyHandler参数是一个字典{类型:代理ip:端口},下述代理IP失效的话替换成可用的代理ip即可
11 | proxy_support = urllib.request.ProxyHandler({'http': '60.187.118.246:9000'})
12 |
13 | # 2.定制,创建一个opener
14 | opener = urllib.request.build_opener(proxy_support)
15 |
16 | # 3.安装opener
17 | urllib.request.install_opener(opener)
18 |
19 | # 请求头
20 | headers = {
21 |     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'
22 | ' AppleWebKit/537.36 (KHTML, like Gecko)'
23 | ' Chrome/63.0.3239.84 Safari/537.36',
24 | 'Host': 'ip.chinaz.com'
25 | }
26 |
27 | req = urllib.request.Request(ip_query_url, headers=headers)
28 | resp = urllib.request.urlopen(req, timeout=20)
29 | html = resp.read().decode('utf-8')
30 | print(html)
31 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_7.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib使用cookie代码示例
3 | """
4 |
5 | import urllib.request
6 | from http import cookiejar
7 |
8 | # ============ 获得Cookie ============
9 |
10 | # 1.实例化CookieJar对象
11 |
12 |
13 | cookie = cookiejar.CookieJar()
14 |
15 | # 2.创建Cookie处理器
16 | handler = urllib.request.HTTPCookieProcessor(cookie)
17 |
18 | # 3.通过CookieHandler创建opener
19 | opener = urllib.request.build_opener(handler)
20 |
21 | # 4.打开网页
22 | resp = opener.open("http://www.baidu.com")
23 |
24 | for i in cookie:
25 | print("Name = %s" % i.name)
26 |     print("Value = %s" % i.value)
27 |
28 | # ============ 保存Cookie到文件 ============
29 | # 1.用于保存cookie的文件
30 | cookie_file = "cookie.txt"
31 |
32 | # 2.创建MozillaCookieJar对象保存Cookie
33 | cookie = cookiejar.MozillaCookieJar(cookie_file)
34 |
35 | # 3.创建Cookie处理器
36 | handler = urllib.request.HTTPCookieProcessor(cookie)
37 |
38 | # 4.通过CookieHandler创建opener
39 | opener = urllib.request.build_opener(handler)
40 |
41 | # 5.打开网页
42 | resp = opener.open("http://www.baidu.com")
43 |
44 | # 6.保存Cookie到文件中,参数依次是:
45 | # ignore_discard:即使cookies将被丢弃也将它保存下来
46 | # ignore_expires:如果在该文件中cookies已存在,覆盖原文件写入
47 | cookie.save(ignore_discard=True, ignore_expires=True)
48 |
49 | # ============ 读取Cookie文件 ============
50 |
51 | cookie_file = "cookie.txt"
52 |
53 | # 1.创建MozillaCookieJar对象保存Cookie
54 | cookie = cookiejar.MozillaCookieJar(cookie_file)
55 |
56 | # 2.从文件中读取cookie内容
57 | cookie.load(cookie_file, ignore_expires=True, ignore_discard=True)
58 |
59 | handler = urllib.request.HTTPCookieProcessor(cookie)
60 | opener = urllib.request.build_opener(handler)
61 | resp = opener.open("http://www.baidu.com")
62 | print(resp.read().decode('utf-8'))
63 |
64 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_8.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib.parse.urlparse和urlsplit函数使用示例
3 | """
4 | import urllib.parse
5 |
6 | urp = urllib.parse.urlparse('https://docs.python.org/3/search.html?q=parse&check_keywords=yes&area=default')
7 | print('urlparse执行结果:', urp)
8 | # 可以通过.的方式获取某个部分
9 | print('urp.scheme:', urp.scheme)
10 | print('urp.netloc:', urp.netloc)
11 |
12 | urp = urllib.parse.urlsplit('https://docs.python.org/3/search.html?q=parse&check_keywords=yes&area=default')
13 | print('urlsplit执行结果:', urp)
14 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/2_9.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib.parse.urlunparse,urlunsplit和urljoin函数使用示例
3 | """
4 | import urllib.parse
5 |
6 | url = urllib.parse.urlunparse(['https','docs.python.org', '/3/search.html', 'q=parse&check_keywords=yes&area=default' , '', ''])
7 | print('urlunparse函数拼接的URL:',url)
8 |
9 | url = urllib.parse.urlunsplit(['https','docs.python.org', '/3/search.html', 'q=parse&check_keywords=yes&area=default',''])
10 | print('urlunsplit函数拼接的URL:',url)
11 |
12 | url = urllib.parse.urljoin('https://docs.python.org','/3/search.html')
13 | url = urllib.parse.urljoin(url,'?q=parse&check_keywords=yes&area=default')
14 | print('urljoin函数拼接的URL:',url)
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/cookie.txt:
--------------------------------------------------------------------------------
1 | # Netscape HTTP Cookie File
2 | # http://curl.haxx.se/rfc/cookie_spec.html
3 | # This is a generated file! Do not edit.
4 |
5 | .baidu.com TRUE / FALSE 3681539028 BAIDUID F16617940595A8E3EF9BB50E63AC0954:FG=1
6 | .baidu.com TRUE / FALSE 3681539028 BIDUPSID F16617940595A8E3EF9BB50E63AC0954
7 | .baidu.com TRUE / FALSE H_PS_PSSID 1442_21106_22074
8 | .baidu.com TRUE / FALSE 3681539028 PSTM 1534055381
9 | www.baidu.com FALSE / FALSE BDSVRTM 0
10 | www.baidu.com FALSE / FALSE BD_HOME 0
11 | www.baidu.com FALSE / FALSE 2480135321 delPer 0
12 |
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/代码/proxy_ips.txt:
--------------------------------------------------------------------------------
1 | 183.129.244.16:10080
2 | 219.141.153.39:80
3 | 119.180.140.9:8060
4 | 111.3.154.196:8060
5 | 123.117.250.127:8060
6 | 222.182.56.120:8118
7 | 123.114.200.43:8118
8 | 117.28.96.103:808
9 | 120.92.174.37:1080
10 | 39.137.69.10:80
11 | 106.56.102.78:8070
12 | 218.88.177.155:8908
13 | 221.2.175.238:8060
14 | 120.198.224.5:8000
15 | 119.180.131.39:8060
16 | 112.67.34.99:8118
17 | 123.114.200.72:8118
18 | 39.137.69.6:8080
19 | 163.125.235.73:8118
20 | 219.141.153.11:80
21 | 180.119.65.150:1133
22 | 221.14.140.66:80
23 | 119.180.142.175:8060
24 | 113.78.255.243:8118
25 | 119.180.172.222:8060
26 | 39.137.77.66:8080
27 | 61.171.0.40:9999
28 | 221.2.155.35:8060
29 | 118.190.94.254:9001
30 | 219.141.153.43:80
31 | 112.24.107.109:8908
32 | 222.186.45.139:65309
33 | 219.141.153.5:80
34 | 219.141.153.35:80
35 | 221.14.140.130:80
36 | 101.96.11.5:80
37 | 119.179.131.245:8060
38 | 121.14.159.150:9001
39 | 114.250.25.19:80
40 | 120.198.224.6:8088
41 | 223.96.95.229:3128
42 | 121.17.18.219:8060
43 | 117.190.90.20:8060
44 | 219.141.153.6:80
45 | 113.239.240.152:80
46 | 101.96.10.5:80
47 | 219.141.153.10:80
48 | 117.44.247.37:8908
49 | 115.213.103.150:8010
50 | 113.3.210.60:80
51 | 106.56.102.252:8070
52 | 183.246.84.229:8060
53 | 118.190.95.35:9001
54 | 219.141.153.41:80
55 | 58.247.46.123:8088
56 | 112.24.107.102:8908
57 | 223.93.145.186:8060
58 | 218.244.44.194:8060
59 | 120.198.224.7:8080
60 | 117.28.97.169:808
61 | 222.88.147.104:8060
62 | 218.88.177.149:8908
63 | 39.137.69.8:8080
64 | 119.179.147.68:8060
65 | 113.105.202.51:1133
66 | 219.141.153.12:8080
67 | 114.95.61.165:8118
68 | 222.186.34.212:65309
69 | 113.128.198.50:8060
70 | 219.141.153.2:8080
71 | 219.141.153.34:80
72 | 222.175.200.58:8060
73 | 117.131.235.198:8060
74 | 219.141.153.44:80
75 | 60.14.125.246:8908
76 | 119.180.137.134:8060
77 | 39.137.77.67:80
78 | 120.131.9.254:1080
79 | 106.56.102.17:8070
80 | 119.180.168.33:8060
81 | 221.2.174.99:8060
82 | 118.190.200.139:8080
83 | 222.88.149.32:8060
84 | 118.190.145.138:9001
85 | 221.2.174.6:8060
86 | 219.141.153.38:80
87 | 119.180.140.140:8060
88 | 123.158.175.102:1080
89 | 219.141.153.7:80
90 | 117.44.247.53:8908
91 | 124.128.76.142:8060
92 | 112.80.93.76:8118
93 | 119.180.131.16:8060
94 | 39.135.24.11:8080
95 | 222.222.236.207:8060
96 | 218.88.177.161:8908
97 | 119.179.132.101:8060
98 | 39.137.69.7:80
99 | 119.180.171.89:8060
100 | 118.190.95.43:9001
--------------------------------------------------------------------------------
/2、Python爬虫基本库的使用/勘误.md:
--------------------------------------------------------------------------------
1 | 2021.4.29 更新内容:
2 |
3 | - 2_4.py → 新增注释:伪代码,不能直接请求,只是用于演示用法;
4 | - 2_5.py → 将百度地址替换为小说地址,请求头内容替换;
5 | - 2_6.py → 新增注释:请求失败时,将失效代理IP替换为可用代理IP(可参考文末的检测示意);
6 | - 2_12.py → 新增全局取消https证书验证;
7 | - 2_13.py → 替换失效博客地址;
8 |
9 |
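10 | 补充示意(非书中原始代码,函数名 is_proxy_alive 与默认参数均为示例假设):可以先用 urllib 简单检测某个代理IP是否仍然可用,确认可用后再替换进 2_6.py:
11 |
12 | ```python
13 | import ssl
14 | import urllib.request
15 |
16 | # 与 2_12.py、2_13.py 一致:全局取消 https 证书验证
17 | ssl._create_default_https_context = ssl._create_unverified_context
18 |
19 |
20 | def is_proxy_alive(proxy_ip, test_url='http://ip.chinaz.com/', timeout=5):
21 |     """能在超时时间内通过该代理打开测试页面即视为可用(示意实现)"""
22 |     opener = urllib.request.build_opener(
23 |         urllib.request.ProxyHandler({'http': proxy_ip}))
24 |     try:
25 |         return opener.open(test_url, timeout=timeout).status == 200
26 |     except Exception:
27 |         return False
28 |
29 |
30 | if __name__ == '__main__':
31 |     # 以 2_6.py 里的示例代理为例
32 |     print(is_proxy_alive('60.187.118.246:9000'))
33 | ```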
--------------------------------------------------------------------------------
/3、Python爬虫抓包与数据解析/代码/3_1.py:
--------------------------------------------------------------------------------
1 | """
2 | requests抓取微信公众号文章的图片,音视频
3 | """
4 | import requests
5 | from lxml import etree
6 | import time
7 | import os
8 |
9 | # 资源的保存文件夹
10 | save_dir = os.path.join(os.getcwd(), 'tmp')
11 |
12 | # 测试文章的URL
13 | test_url = 'https://mp.weixin.qq.com/s/4oLnJvfGCZneoErkrh0sHw'
14 |
15 | # 语音获取的基URL
16 | music_res_url = 'http://res.wx.qq.com/voice/getvoice'
17 |
18 | # 视频获取的接口URL
19 | video_parse_url = 'http://v.ranks.xin/video-parse.php'
20 |
21 | # 微信公众号文章请求头
22 | headers = {
23 | 'Host': 'mp.weixin.qq.com',
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 '
25 | '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
26 | }
27 |
28 | # 视频获取接口的请求头
29 | video_parse_headers = {
30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
31 | 'Host': 'v.ranks.xin',
32 | 'Referer': 'http://v.ranks.xin/',
33 | 'X-Requested-With': 'XMLHttpRequest'
34 | }
35 |
36 |
37 | # 获取标题
38 | def get_title(content):
39 | return content.xpath("//h2[@class='rich_media_title']/text()")[0].strip()
40 |
41 |
42 | # 解析下载图片
43 | def get_pic(content, path):
44 | img_list = content.xpath("//img/@data-src")
45 | for img in img_list:
46 | download_pic(img, path)
47 |
48 |
49 | # 解析获得音频
50 | def get_sound(content, path):
51 | sound_list = content.xpath("//mpvoice/@voice_encode_fileid")
52 | for sound in sound_list:
53 | download_sound(sound, path)
54 |
55 |
56 | # 解析获得视频
57 | def get_video(content, path):
58 | video_list = content.xpath("//iframe/@data-src")
59 | for video in video_list:
60 | download_video(video, path)
61 |
62 |
63 | # 下载图片的方法
64 | def download_pic(url, path):
65 | print("下载图片:" + url)
66 | try:
67 | pic_name = str(int(time.time())) # 使用当前时间戳作为图片名字
68 | fmt = url.split('=')[-1] # 图片格式
69 | img_resp = requests.get(url).content
70 | with open(path + pic_name + "." + fmt, "wb+") as f:
71 | f.write(img_resp)
72 | except Exception as reason:
73 | print(str(reason))
74 |
75 |
76 | # 下载音频的方法
77 | def download_sound(file_id, path):
78 | try:
79 | sound_resp = requests.get(music_res_url, params={'mediaid': file_id, 'voice_type': '1'})
80 | if sound_resp is not None:
81 | music_name = str(int(time.time())) + '.mp3' # 使用当前时间戳作为音频名字
82 | print("开始下载音频: " + sound_resp.url)
83 | with open(path + music_name, "wb+") as f:
84 | f.write(sound_resp.content)
85 | print("音频下载完成:" + music_name)
86 | except Exception as reason:
87 | print(str(reason))
88 |
89 |
90 | # 下载视频的方法
91 | def download_video(url, path):
92 | print("开始解析视频链接:" + url)
93 | video_resp = requests.get(video_parse_url, headers=video_parse_headers, params={'url': url})
94 | if video_resp is not None:
95 | video_url = video_resp.json()['data'][0]['url']
96 | print("解析完成,开始下载视频:" + video_url)
97 | try:
98 | video_name = str(int(time.time())) + '.mp4' # 使用当前时间戳作为视频名字
99 | video_resp = requests.get(video_url).content
100 | if video_resp is not None:
101 | with open(path + video_name, "wb+") as f:
102 | f.write(video_resp)
103 | print("视频下载完成:" + video_name)
104 | except Exception as reason:
105 | print(str(reason))
106 |
107 |
108 | if __name__ == '__main__':
109 | while True:
110 |         print("请输入你要抓取的微信文章链接:(输入Q回车或者按Ctrl+C可以退出~)")
111 | input_url = input()
112 | if input_url == 'Q':
113 | exit()
114 | else:
115 | resp = requests.get(url=input_url.strip(), headers=headers).text
116 | html = etree.HTML(resp)
117 | title = get_title(html)
118 | res_save_dir = os.path.join(save_dir, title)
119 | if not os.path.exists(res_save_dir):
120 | os.makedirs(res_save_dir)
121 | get_pic(html,res_save_dir)
122 | get_sound(html,res_save_dir)
123 | get_video(html,res_save_dir)
124 | print("所有资源下载完成!")
--------------------------------------------------------------------------------
/3、Python爬虫抓包与数据解析/代码/3_2.py:
--------------------------------------------------------------------------------
1 | """
2 | Beautiful Soup使用示例,抓取壁纸网站的壁纸
3 | """
4 | import requests as r
5 | from bs4 import BeautifulSoup
6 | import os
7 | import time
8 |
9 | base_url = "http://www.win4000.com"
10 | theme_base_url = "http://www.win4000.com/zt/xiaoqingxin_"
11 |
12 | # 利用列表表达式生成每页链接列表
13 | theme_url_list = [theme_base_url + str(x) + '.html' for x in range(1, 6)]
14 |
15 | # 套图链接列表
16 | series_url_lists = []
17 |
18 | # 保存的根目录
19 | save_root_dir = os.path.join(os.getcwd(), 'tmp/')
20 |
21 |
22 | # 获取所有套图的链接列表
23 | def get_series_url_lists(url):
24 | resp = r.get(url)
25 | if resp is not None:
26 | result = resp.text
27 | bs = BeautifulSoup(result, 'html.parser')
28 | ul = bs.find('div', attrs={'class': 'tab_tj'})
29 | a_s = ul.find_all('a')
30 | for a in a_s:
31 | series_url_lists.append(a.get('href'))
32 |
33 |
34 | # 获取某个套图里的所有图片
35 | def fetch_all_series_pic(url):
36 | cur_page = 1
37 | while True:
38 | current_url = url
39 | if cur_page > 1:
40 | current_url = url.replace('.html', '_' + str(cur_page) + '.html')
41 | resp = r.get(current_url)
42 | if resp.status_code == 404:
43 | break
44 | else:
45 | if resp is not None:
46 | result = resp.text
47 | bs = BeautifulSoup(result, 'lxml')
48 | # 使用lxml来获取标题,用作文件夹名
49 | title_name = bs.find('div', attrs={'class': 'ptitle'}).h1.text
50 | save_dir = os.path.join(save_root_dir, title_name)
51 | if not os.path.exists(save_dir):
52 | os.makedirs(save_dir)
53 | # 使用CSS选择器选择图片结点
54 | imgs = bs.select('img.pic-large')
55 | for img in imgs:
56 | download_pic(img.attrs.get('src'), save_dir)
57 | cur_page += 1
58 |
59 |
60 | # 下载图片的方法
61 | def download_pic(url, path):
62 | print("下载图片:" + url)
63 | try:
64 | pic_name = url.split('/')[-1]
65 | img_resp = r.get(url).content
66 | with open(path + '/' +pic_name, "wb+") as f:
67 | f.write(img_resp)
68 | except Exception as reason:
69 | print(str(reason))
70 |
71 |
72 | if __name__ == '__main__':
73 | for url in theme_url_list:
74 | get_series_url_lists(url)
75 | for url in series_url_lists:
76 | fetch_all_series_pic(url)
77 |
--------------------------------------------------------------------------------
/3、Python爬虫抓包与数据解析/代码/3_3.py:
--------------------------------------------------------------------------------
1 | """
2 | 正则使用示例
3 | """
4 |
5 | import re
6 |
7 | ret = re.match(r'^(\d{4})-(\d{3,8})$', '0756-3890993')
8 | print(ret.group())
9 | print(ret.group(0))
10 | print(ret.group(1))
11 | print(ret.group(2))
12 |
13 | str_count = "您的网站被访问了10000次"
14 | match = re.match(r"^您的网站被访问了(\d{1,6})次$", str_count)
15 | print(match.group(1))
16 |
17 |
--------------------------------------------------------------------------------
/3、Python爬虫抓包与数据解析/代码/3_4.py:
--------------------------------------------------------------------------------
1 | """
2 | 正则表达式实战示例:采集所有城市编码
3 | """
4 | import requests as r
5 | from bs4 import BeautifulSoup
6 | import re
7 | import os
8 |
9 | base_url = 'http://www.weather.com.cn'
10 | city_referer_url = 'http://www.weather.com.cn/textFC/hb.shtml'
11 |
12 | # 获取城市编码的正则
13 | code_regex = re.compile(r'^.*?weather/(.*?)\.shtml$', re.S)
14 | # 城市编码的保存文件
15 | save_file_name = os.path.join(os.getcwd(), 'city_codes.txt')
16 | # 城市编码列表
17 | city_code_list = []
18 |
19 |
20 | # 获取所有的城市列表
21 | def fetch_city_url_list():
22 | city_url_list = []
23 | resp = r.get(city_referer_url)
24 | resp.encoding = 'utf-8'
25 | bs = BeautifulSoup(resp.text, 'lxml')
26 | content = bs.find('div', attrs={'class': 'lqcontentBoxheader'})
27 | if content is not None:
28 | a_s = content.find_all('a')
29 | if a_s is not None:
30 | for a in a_s:
31 | city_url_list.append(base_url + a.get('href'))
32 | return city_url_list
33 |
34 |
35 | # 获取城市天气跳转链接列表
36 | def fetch_city_weather_url_list(url):
37 | resp = r.get(url)
38 | resp.encoding = 'utf-8'
39 | bs = BeautifulSoup(resp.text, 'lxml')
40 | a_s = bs.select('div.conMidtab a')
41 | for a in a_s:
42 | if a.get("href") is not None and a.text != '详情' and a.text != '返回顶部':
43 | # 提取城市编码
44 | result = code_regex.match(a.get("href"))
45 | if result is not None:
46 | city_code_list.append(a.text + ":" + result.group(1))
47 |
48 |
49 | # 把列表写入到文件中的方法
50 | def write_list_to_file(data):
51 | try:
52 | with open(save_file_name, "w+", encoding='utf-8') as f:
53 | for content in data:
54 | f.write(content + "\n")
55 | except OSError as reason:
56 | print(str(reason))
57 |
58 |
59 | if __name__ == '__main__':
60 | city_list = fetch_city_url_list()
61 | for city in city_list:
62 | print("解析:", city)
63 | fetch_city_weather_url_list(city)
64 | write_list_to_file(city_code_list)
65 |
--------------------------------------------------------------------------------
/3、Python爬虫抓包与数据解析/勘误.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/3、Python爬虫抓包与数据解析/勘误.md
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/代码/4_1.py:
--------------------------------------------------------------------------------
1 | """
2 | csv库使用代码示例
3 | """
4 |
5 | import csv
6 | import os
7 |
8 | save_file_name_1 = os.path.join(os.getcwd(), '1.csv')
9 | save_file_name_2 = os.path.join(os.getcwd(), '2.csv')
10 | save_file_name_3 = os.path.join(os.getcwd(), '3.csv')
11 |
12 | data_1 = [['id', '姓名', '性别', '年龄', '工作'],
13 | [1, '小明', '男', '18', '学生'],
14 | [2, '小红', '女', '24', '老师'],
15 | [3, '小光', '男', '25', 'Python工程师']]
16 |
17 | headers = ['id', '姓名', '性别', '年龄', '工作']
18 | data_2 = [{'id': 1, '姓名': '小明', '性别': '男', '年龄': '18', '工作': '学生'},
19 | {'id': 2, '姓名': '小红', '性别': '女', '年龄': '24', '工作': '老师'},
20 | {'id': 3, '姓名': '小光', '性别': '男', '年龄': '25', '工作': 'Python工程师'}]
21 |
22 | # 单行写入示例
23 | with open(save_file_name_1, 'w', newline='') as f:
24 | writer = csv.writer(f)
25 | for row in data_1:
26 | writer.writerow(row)
27 |
28 | # 多行写入
29 | with open(save_file_name_2, 'w', newline='') as f:
30 | writer = csv.writer(f)
31 | writer.writerows(data_1)
32 |
33 | # 字典写入
34 | with open(save_file_name_3, 'w', newline='') as f:
35 | # 标头在这里传入,作为第一行数据
36 | writer = csv.DictWriter(f, headers)
37 | writer.writeheader()
38 | for row in data_2:
39 | writer.writerow(row)
40 |
41 |
42 |
43 | if __name__ == '__main__':
44 | with open(save_file_name_1) as f:
45 | reader = csv.DictReader(f)
46 | for row in reader:
47 | print(row['姓名'])
48 | # reader = csv.reader(f)
49 | # print(list(reader)[0][1])
50 | # for row in reader:
51 | # print(reader.line_num, row)
52 |
53 |
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/代码/4_10.py:
--------------------------------------------------------------------------------
1 | """
2 | PyMongo库实战示例:爬取一号店关键词搜索结果保存到MongoDB中
3 | """
4 | import pymongo
5 | import requests as r
6 | from lxml import etree
7 |
8 | search_word = "羽毛球"
9 | search_base_url = 'https://search.yhd.com/c0-0/k'
10 |
11 |
12 | def search_goods(key):
13 | data_list = []
14 | resp = r.get(search_base_url + key)
15 | print(resp.url)
16 | resp.encoding = 'utf-8'
17 | html = etree.HTML(resp.text)
18 | ul_list = html.xpath('//div[@id="itemSearchList"]/div')
19 | for ul in ul_list:
20 | # 商品名称
21 | title = ul.xpath('div//p[@class="proName clearfix"]/a/@title')[0]
22 | # 商品链接
23 | link = ul.xpath('div//p[@class="proName clearfix"]/a/@href')[0]
24 | # 商品价格
25 | price = ul.xpath('div//p[@class="proPrice"]/em/@yhdprice')[0]
26 | # 店铺名称
27 | store = ul.xpath('div//p[@class="storeName limit_width"]/a/@title')
28 | store_name = store[0] if len(store) > 0 else ''
29 | # 评论数
30 | comment_count = ul.xpath('div//p[@class="proPrice"]/span[@class="comment"]/a/text()')[1]
31 | # 好评率
32 | favorable_rate = ul.xpath('div//span[@class="positiveRatio"]/text()')[0]
33 | data_list.append({'title': title, 'link': 'https:' + link, 'price': price, 'store_name': store_name, 'comment_count': comment_count,
34 | 'favorable_rate': favorable_rate})
35 | return data_list
36 |
37 |
38 | if __name__ == '__main__':
39 | conn = pymongo.MongoClient(host='localhost', port=27017)
40 |     # 选择数据库与集合
41 | db = conn['yhd']
42 | collection = db['羽毛球']
43 | search_result_list = search_goods(search_word)
44 | collection.insert_many(search_result_list)
45 | conn.close()
46 |
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/代码/4_2.py:
--------------------------------------------------------------------------------
1 | """
2 | csv库实战示例:爬取星座运势
3 | """
4 | import csv
5 | import requests as r
6 | from bs4 import BeautifulSoup
7 | import re
8 | import os
9 |
10 | # 抓取站点
11 | constellation_url = 'http://www.xzw.com/fortune/'
12 |
13 | # 提取信息的正则
14 | fetch_regex = re.compile(r'^.*?(.*?)(.*?).*?width:(\d*)%.*?p>(.*)\[ 15')
83 | db.close()
84 |
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/代码/4_6.py:
--------------------------------------------------------------------------------
1 | """
2 | 爬取Gank.io API接口的数据到MySQL
3 | """
4 | import requests as r
5 | from bs4 import BeautifulSoup
6 | import pymysql
7 |
8 | # 接口地址
9 | search_api_base_url = 'https://gank.io/api/data/'
10 |
11 | # 各种分类的表名:Android,iOS,休息视频,福利,拓展资源,前端,瞎推荐,App
12 | category_list = ["android", "ios", "video", "meizi", "other", "fed", "random", "app"]
13 |
14 | # 图片表名
15 | pic_table_name = 'pics'
16 |
17 | # 请求分类字段列表
18 | type_list = ["Android", "iOS", "休息视频", "福利", "拓展资源", "前端", "瞎推荐", "App"]
19 |
20 | # 表字段名
21 | column_list = ('_id', 'createdAt', 'dsec', 'publishedAt', 'source', 'type', 'url', 'used', 'who')
22 |
23 | # 图片表字段名
24 | pic_column_list = ('_id', 'url')
25 |
26 |
27 | # 创建数据库
28 | def create_db():
29 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306)
30 | cursor = conn.cursor()
31 | cursor.execute("Create Database If Not Exists gank Character Set UTF8MB4")
32 | conn.close()
33 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306, db='gank')
34 | return conn
35 |
36 |
37 | # 创建数据库表
38 | def init_tables(c, table):
39 | c.execute(
40 | ("CREATE TABLE IF Not Exists {table}"
41 | "(_id CHAR(24) PRIMARY KEY,"
42 | "createdAt TEXT NOT NULL,"
43 | "dsec TEXT NOT NULL,"
44 | "publishedAt TEXT NOT NULL,"
45 | "source TEXT NOT NULL,"
46 | "type TEXT NOT NULL,"
47 | "url TEXT NOT NULL,"
48 | "used TEXT NOT NULL,"
49 | "who TEXT NOT NULL)").format(table=table))
50 |
51 |
52 | # 创建图表
53 | def init_pic_table(c, table):
54 | c.execute(
55 | ("CREATE TABLE IF Not Exists {table} "
56 | "(id INT AUTO_INCREMENT PRIMARY KEY,"
57 | "_id CHAR(24),"
58 | "url TEXT NOT NULL)").format(table=table))
59 |
60 |
61 | # 把数据插入到数据库中
62 | def insert_data(c, table, column, data):
63 | try:
64 | keys = ', '.join(column)
65 | values = ', '.join(['%s'] * len(data))
66 | sql = 'INSERT INTO {table} ({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
67 | c.execute(sql, tuple(data))
68 | db.commit()
69 | except Exception as e:
70 | print(e)
71 | db.rollback()
72 |
73 |
74 | # 查询数据库表的方法
75 | def query_data(c, table):
76 | try:
77 | sql = 'SELECT * FROM {table}'.format(table=table)
78 | c.execute(sql)
79 | print('共有 %d 行数据' % c.rowcount)
80 | row = c.fetchone()
81 | while row:
82 | print(row)
83 | row = c.fetchone()
84 | except Exception as e:
85 | print(e)
86 |
87 |
88 | # 爬取接口数据的方法
89 | def fetch_data(c, pos):
90 | page_count = 1
91 | while True:
92 | resp = r.get(search_api_base_url + type_list[pos] + '/50/' + str(page_count))
93 | result_json = resp.json()
94 | print("抓取:", resp.url)
95 | if len(result_json['results']) > 0:
96 | for result in result_json['results']:
97 | data_list = [result['_id'],
98 | result['createdAt'],
99 | result['desc'],
100 | result['publishedAt'],
101 | result.get('source', ''),
102 | result['type'],
103 | result['url'],
104 | 1 if result['used'] else 0,
105 | result.get('who', '') if result.get('who', '') is not None else '']
106 | insert_data(c, category_list[pos], column_list, data_list)
107 | if 'images' in result:
108 | for image in result['images']:
109 | insert_data(c, pic_table_name, pic_column_list, [result['_id'], image])
110 | page_count += 1
111 | else:
112 | break
113 |
114 |
115 | if __name__ == '__main__':
116 | db = create_db()
117 | cursor = db.cursor()
118 | # for category in category_list:
119 | # init_tables(cursor, category)
120 | # init_pic_table(cursor, pic_table_name)
121 | # for i in range(0, len(category_list)):
122 | # fetch_data(cursor, i)
123 |     query_data(cursor, 'android')
124 | cursor.close()
125 |
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/代码/4_7.py:
--------------------------------------------------------------------------------
1 | """
2 | redis-py库的基本操作示例
3 | """
4 | import redis
5 |
6 | # ====================== 连接Redis ============================
7 |
8 | # 1.普通连接
9 | r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
10 |
11 | # 2.连接池(一般)
12 | # redis-py使用connection pool来管理对一个redis server的所有连接,避免每次建立、
13 | # 释放连接的开销。这种方式实现多个Redis实例共享一个连接池
14 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345')
15 | r = redis.StrictRedis(connection_pool=pool)
16 |
17 | # 3.管道
18 | # redis-py,默认情况下,每次都会进行连接池的连接和断开。若是想一次执行多条命令,进行
19 | # 事务性操作,就要用管道。(虽然有这个功能,但是不建议使用,慢而且没什么必要。)
20 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
21 | r = redis.StrictRedis(connection_pool=pool)
22 | pipe = r.pipeline(transaction=True)
23 | # 先把多条命令加入管道,再一次性执行
24 | pipe.set('name', 'xxx').expire('name', 60).execute()
25 |
26 | # ====================== 通用操作 ============================
27 |
28 | r.delete('name') # 根据键删除redis中的任意数据类型
29 | r.exists('name') # 检测redis的键是否存在
30 | r.keys(pattern='*') # 根据* ?等通配符匹配获取redis的键
31 | r.expire('name', time=3000) # 为某个键设置超时时间
32 | r.rename('name', 'name1') # 重命名键
33 | r.move('name', 'db1') # 将redis的某个值移动到指定的db下
34 | r.randomkey() # 随机获取一个redis的键(不删除)
35 | r.type('name') # 获取键对应值的类型
36 | r.dbsize() # 获得当前数据库中键的数目
37 | r.ttl('name') # 获得键的过期时间
38 | r.flushdb() # 删除当前选择数据库中所有的键
39 | r.flushall() # 删除所有数据库中的所有键
40 |
41 |
42 | # ====================== String操作 ============================
43 |
44 | # 设置键值对,默认不存在则创建,存在则修改
45 | # set(name, value, ex=None, px=None, nx=False, xx=False)
46 | # ex,过期时间(秒)
47 | # px,过期时间(毫秒)
48 | # nx,如果设置为True,则只有name不存在时,当前set操作才执行,同setnx(name, value)
49 | # xx,如果设置为True,则只有name存在时,当前set操作才执行
50 |
51 | r.set('name', value) #设置值
52 | r.setnx('name',value) #如果name这个键不存在,把这个键对应的值设置为value
53 | r.setex('name', time, value) #设置值,并指定此键值的有效期(参数顺序:键名、有效期秒数、值)
54 | r.setrange(name, offset, value) #修改字符串内容,从指定字符串索引开始向后替换
55 | r.mset({"name3":'xxx', "name4":'xxx'}) #批量设置值
56 | r.msetnx({"name3":'xxx', "name4":'xxx'}) #键都不存在时才批量赋值
57 |
58 | r.get('name') # 获取值
59 | r.getset('name', 'yyy') # 为键为name的值赋值为yyy,并返回上次的值xxx
60 | r.mget(['name1','name2']) # 返回多个键对应的值
61 | r.getrange(key, start, end) # 返回键为name的值的字符串,截取索引为start到end的字符
62 | r.strlen("name") #返回name对应值的字节长度(一个汉字3个字节)
63 |
64 | r.append('name',value) # 为键为name的值后追加value
65 | r.incr('name',amount) # 字符串转化为整型,再自增属性name对应的值,当属性name不存在时,
66 | # 则创建name=amount,否则,则自增,amount为自增数(整数)
67 | r.decr('name',amount) #自减name对应的值,当name不存在时,则创建name=amount,
68 |                        #否则,则自减,amount为自减数(整数)
69 | r.substr('name',start, end) # 返回键为name的值的字符串截取索引为start到end的字符
70 |
71 |
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/代码/4_8.py:
--------------------------------------------------------------------------------
1 | """
2 | 利用redis保存bilibili弹幕
3 | """
4 | import requests as r
5 | from bs4 import BeautifulSoup
6 | import re
7 | import redis
8 |
9 | video_url = 'https://www.bilibili.com/video/av28989880'
10 | cid_regex = re.compile(r'.*?cid=(\d*?)\&.*', re.S)
11 | xml_base_url = 'http://comment.bilibili.com/'
12 |
13 |
14 | # 获取弹幕的cid
15 | def get_cid():
16 | resp = r.get(video_url).text
17 | bs = BeautifulSoup(resp, 'lxml')
18 | src = bs.select('div.share-address ul li')[1].input
19 | cid = cid_regex.match(str(src)).group(1)
20 |     print("获取到的cid:", cid)
21 |     return cid
22 |
23 | # 解析获取弹幕
24 | def analysis_d(cid):
25 | count = 1
26 | url = xml_base_url + cid + '.xml'
27 | resp = r.get(url)
28 | resp.encoding = 'utf-8'
29 | bs = BeautifulSoup(resp.text, 'lxml')
30 | d_s = bs.find_all('d')
31 | for d in d_s:
32 | dan_redis.set(str(count), d.text)
33 | count += 1
34 |
35 |
36 | if __name__ == '__main__':
37 | # 连接redis
38 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db = 0)
39 | dan_redis = redis.StrictRedis(connection_pool=pool)
40 | # analysis_d('50280136')
41 | results = dan_redis.mget(dan_redis.keys())
42 | print("总共有%d条数据" % len(results))
43 | for result in results:
44 | print(result.decode('utf-8'))
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/代码/4_9.py:
--------------------------------------------------------------------------------
1 | """
2 | PyMongo库的基本操作示例
3 | """
4 |
5 | import pymongo
6 |
7 | # 1.连接MongoDB数据库(默认没有密码,如果设置了密码要调用db.auth("用户名","密码"))
8 | conn = pymongo.MongoClient(host='localhost', port=27017)
9 | # 或者采用MongoDB连接字符串的形式也可以:
10 | # conn = pymongo.MongoClient('mongodb://localhost:27017')
11 |
12 | # 2.选择数据库,也可以使用conn['test']这样的方式选择,等价
13 | # db = conn.test
14 | #
15 | # # 3.选择collection
16 | # collection = db.user
17 | # print(collection)
18 |
19 |
20 | # # 4.创建数据库
21 | # db = conn['test_db']
22 | #
23 | # # 5.创建collection
24 | # collection = db['test_collection']
25 |
26 | # 6.插入一条数据
27 | # db = conn['test_db']
28 | # collection = db['test_collection']
29 | # dic = {'id': '1', 'name': 'Jay'}
30 | # collection.insert_one(dic)
31 |
32 | db = conn.test_db
33 | collection = db.test_collection
34 |
35 | # 7.插入多条数据(传入一个字典的列表)
36 | # data_list = [{'id': '2', 'name': 'Tom'},{'id': '3', 'name': 'Jack'}]
37 | # collection.insert_many(data_list)
38 |
39 |
40 | # 8.查找数据
41 |
42 | # 查找一条
43 | # print(collection.find_one({'name': 'Tom'}))
44 |
45 |
46 | # 查找多条
47 | # data_list = [{'id': '4', 'name': 'Mary'},{'id': '4', 'name': 'Lucy'}]
48 | # collection.insert_many(data_list)
49 | # results = collection.find({'id':'4'})
50 | # for result in results:
51 | # print(result)
52 |
53 | # 正则匹配
54 | # for result in collection.find({'name':{'$regex':'^J.*'}}):
55 | # print(result)
56 |
57 | # 9.修改数据
58 |
59 | # 方法一:需要整条记录参与
60 | # person = collection.find_one({'name':'Jack'})
61 | # person['name'] = 'Jacky'
62 | # collection.update({'name':'Jack'}, person)
63 |
64 | # 方法二:部分修改字段内容的方式
65 | # result = collection.update_one({'name': 'Tom'}, {'$set': {"name": "Tony"}})
66 | # print(result)
67 | # print("匹配的数据条数:",result.matched_count, "受影响的数据条数:",result.modified_count)
68 |
69 | # 10.删除数据
70 | # result = collection.delete_many({'id': {'$lte': 3}})
71 | # print("删除的数据条数:", result.deleted_count)
72 |
73 | # 11.计数
74 | # print("数据库中有%d条记录。" % collection.find().count())
75 |
76 | # 12.排序
77 | # data_list = [{'id': 2, 'name': 'Tom'},{'id': 3, 'name': 'Jack'},{'id': 5, 'name': 'Daisy'}]
78 | # collection.insert_many(data_list)
79 | # # 降序排列,升序可以传入pymongo.ASCENDING
80 | # results = collection.find().sort('id', pymongo.DESCENDING)
81 | # for result in results:
82 | # print(result)
83 |
84 | # 13.偏移
85 | results = collection.find().sort('id', pymongo.ASCENDING).skip(1)
86 | for result in results:
87 | print(result)
88 |
89 |
--------------------------------------------------------------------------------
/4、用CSV 和 Excel 存储数据/勘误.md:
--------------------------------------------------------------------------------
1 | 2021.4.29 更新内容:
2 |
3 | 4_10.py → 1号店已不提供H5版本,目前无解
--------------------------------------------------------------------------------
/5、用数据库存储数据/代码/5_1.py:
--------------------------------------------------------------------------------
1 | """
2 | csv库使用代码示例
3 | """
4 |
5 | import csv
6 | import os
7 |
8 | save_file_name_1 = os.path.join(os.getcwd(), '1.csv')
9 | save_file_name_2 = os.path.join(os.getcwd(), '2.csv')
10 | save_file_name_3 = os.path.join(os.getcwd(), '3.csv')
11 |
12 | data_1 = [['id', '姓名', '性别', '年龄', '工作'],
13 | [1, '小明', '男', '18', '学生'],
14 | [2, '小红', '女', '24', '老师'],
15 | [3, '小光', '男', '25', 'Python工程师']]
16 |
17 | headers = ['id', '姓名', '性别', '年龄', '工作']
18 | data_2 = [{'id': 1, '姓名': '小明', '性别': '男', '年龄': '18', '工作': '学生'},
19 | {'id': 2, '姓名': '小红', '性别': '女', '年龄': '24', '工作': '老师'},
20 | {'id': 3, '姓名': '小光', '性别': '男', '年龄': '25', '工作': 'Python工程师'}]
21 |
22 | # 单行写入示例
23 | with open(save_file_name_1, 'w', newline='') as f:
24 | writer = csv.writer(f)
25 | for row in data_1:
26 | writer.writerow(row)
27 |
28 | # 多行写入
29 | with open(save_file_name_2, 'w', newline='') as f:
30 | writer = csv.writer(f)
31 | writer.writerows(data_1)
32 |
33 | # 字典写入
34 | with open(save_file_name_3, 'w', newline='') as f:
35 | # 标头在这里传入,作为第一行数据
36 | writer = csv.DictWriter(f, headers)
37 | writer.writeheader()
38 | for row in data_2:
39 | writer.writerow(row)
40 |
41 |
42 |
43 | if __name__ == '__main__':
44 | with open(save_file_name_1) as f:
45 | reader = csv.DictReader(f)
46 | for row in reader:
47 | print(row['姓名'])
48 | # reader = csv.reader(f)
49 | # print(list(reader)[0][1])
50 | # for row in reader:
51 | # print(reader.line_num, row)
52 |
53 |
--------------------------------------------------------------------------------
/5、用数据库存储数据/代码/5_10.py:
--------------------------------------------------------------------------------
1 | """
2 | PyMongo库实战示例:爬取一号店关键词搜索结果保存到MongoDB中
3 | """
4 | import pymongo
5 | import requests as r
6 | from lxml import etree
7 |
8 | search_word = "羽毛球"
9 | search_base_url = 'https://search.yhd.com/c0-0/k'
10 |
11 |
12 | def search_goods(key):
13 | data_list = []
14 | resp = r.get(search_base_url + key)
15 | resp.encoding = 'utf-8'
16 | html = etree.HTML(resp.text)
17 | ul_list = html.xpath('//div[@id="itemSearchList"]/div')
18 | for ul in ul_list:
19 | # 商品名称
20 | title = ul.xpath('div//p[@class="proName clearfix"]/a/@title')[0]
21 | # 商品链接
22 | link = ul.xpath('div//p[@class="proName clearfix"]/a/@href')[0]
23 | # 商品价格
24 | price = ul.xpath('div//p[@class="proPrice"]/em/@yhdprice')[0]
25 | # 店铺名称
26 | store = ul.xpath('div//p[@class="storeName limit_width"]/a/@title')
27 | store_name = store[0] if len(store) > 0 else ''
28 | # 评论数
29 | comment_count = ul.xpath('div//p[@class="proPrice"]/span[@class="comment"]/a/text()')[1]
30 | # 好评率
31 | favorable_rate = ul.xpath('div//span[@class="positiveRatio"]/text()')[0]
32 | data_list.append({'title': title, 'link': 'https:' + link, 'price': price, 'store_name': store_name, 'comment_count': comment_count,
33 | 'favorable_rate': favorable_rate})
34 | return data_list
35 |
36 |
37 | if __name__ == '__main__':
38 | conn = pymongo.MongoClient(host='localhost', port=27017)
39 |     # 选择数据库与集合
40 | db = conn['yhd']
41 | collection = db['羽毛球']
42 | search_result_list = search_goods(search_word)
43 | collection.insert_many(search_result_list)
44 | conn.close()
45 |
--------------------------------------------------------------------------------
/5、用数据库存储数据/代码/5_2.py:
--------------------------------------------------------------------------------
1 | """
2 | csv库实战示例:爬取星座运势
3 | """
4 | import csv
5 | import requests as r
6 | from bs4 import BeautifulSoup
7 | import re
8 | import os
9 |
10 | # 抓取站点
11 | constellation_url = 'http://www.xzw.com/fortune/'
12 |
13 | # 提取信息的正则
14 | fetch_regex = re.compile(r'^.*?(.*?)(.*?).*?width:(\d*)%.*?p>(.*)\[ 15')
83 | db.close()
84 |
--------------------------------------------------------------------------------
/5、用数据库存储数据/代码/5_6.py:
--------------------------------------------------------------------------------
1 | """
2 | 爬取Gank.io API接口的数据到MySQL
3 | """
4 | import requests as r
5 | from bs4 import BeautifulSoup
6 | import pymysql
7 |
8 | # 接口地址
9 | search_api_base_url = 'https://gank.io/api/v2/data/'
10 |
11 | # 各种分类的表名:Android,iOS,休息视频,福利,拓展资源,前端,瞎推荐,App
12 | category_list = ["android", "ios", "video", "meizi", "other", "fed", "random", "app"]
13 |
14 | # 图片表名
15 | pic_table_name = 'pics'
16 |
17 | # 请求分类字段列表
18 | type_list = ["Android", "iOS", "休息视频", "福利", "拓展资源", "前端", "瞎推荐", "App"]
19 |
20 | # 表字段名
21 | column_list = ('_id', 'createdAt', 'dsec', 'publishedAt', 'source', 'type', 'url', 'used', 'who')
22 |
23 | # 图片表字段名
24 | pic_column_list = ('_id', 'url')
25 |
26 |
27 | # 创建数据库
28 | def create_db():
29 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306)
30 | cursor = conn.cursor()
31 | cursor.execute("Create Database If Not Exists gank Character Set UTF8MB4")
32 | conn.close()
33 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306, db='gank')
34 | return conn
35 |
36 |
37 | # 创建数据库表
38 | def init_tables(c, table):
39 | c.execute(
40 | ("CREATE TABLE IF Not Exists {table}"
41 | "(_id CHAR(24) PRIMARY KEY,"
42 | "createdAt TEXT NOT NULL,"
43 | "dsec TEXT NOT NULL,"
44 | "publishedAt TEXT NOT NULL,"
45 | "source TEXT NOT NULL,"
46 | "type TEXT NOT NULL,"
47 | "url TEXT NOT NULL,"
48 | "used TEXT NOT NULL,"
49 | "who TEXT NOT NULL)").format(table=table))
50 |
51 |
52 | # 创建图表
53 | def init_pic_table(c, table):
54 | c.execute(
55 | ("CREATE TABLE IF Not Exists {table} "
56 | "(id INT AUTO_INCREMENT PRIMARY KEY,"
57 | "_id CHAR(24),"
58 | "url TEXT NOT NULL)").format(table=table))
59 |
60 |
61 | # 把数据插入到数据库中
62 | def insert_data(c, table, column, data):
63 | try:
64 | keys = ', '.join(column)
65 | values = ', '.join(['%s'] * len(data))
66 | sql = 'INSERT INTO {table} ({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
67 | c.execute(sql, tuple(data))
68 | db.commit()
69 | except Exception as e:
70 | print(e)
71 | db.rollback()
72 |
73 |
74 | # 查询数据库表的方法
75 | def query_data(c, table):
76 | try:
77 | sql = 'SELECT * FROM {table}'.format(table=table)
78 | c.execute(sql)
79 | print('共有 %d 行数据' % c.rowcount)
80 | row = c.fetchone()
81 | while row:
82 | print(row)
83 | row = c.fetchone()
84 | except Exception as e:
85 | print(e)
86 |
87 |
88 | # 爬取接口数据的方法
89 | def fetch_data(c, pos):
90 | page_count = 1
91 | while True:
92 | resp = r.get(search_api_base_url + type_list[pos] + '/50/' + str(page_count))
93 | result_json = resp.json()
94 | print("抓取:", resp.url)
95 | if len(result_json['results']) > 0:
96 | for result in result_json['results']:
97 | data_list = [result['_id'],
98 | result['createdAt'],
99 | result['desc'],
100 | result['publishedAt'],
101 | result.get('source', ''),
102 | result['type'],
103 | result['url'],
104 | 1 if result['used'] else 0,
105 | result.get('who', '') if result.get('who', '') is not None else '']
106 | insert_data(c, category_list[pos], column_list, data_list)
107 | if 'images' in result:
108 | for image in result['images']:
109 | insert_data(c, pic_table_name, pic_column_list, [result['_id'], image])
110 | page_count += 1
111 | else:
112 | break
113 |
114 |
115 | if __name__ == '__main__':
116 | db = create_db()
117 | cursor = db.cursor()
118 | # for category in category_list:
119 | # init_tables(cursor, category)
120 | # init_pic_table(cursor, pic_table_name)
121 | # for i in range(0, len(category_list)):
122 | # fetch_data(cursor, i)
123 |     query_data(cursor, 'android')  # the tables created from category_list are lower-case
124 | cursor.close()
125 |
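Note: because _id is the PRIMARY KEY of each category table, re-running the crawl in 5_6.py makes insert_data() fail with duplicate-key errors on every row that already exists. A minimal sketch of an idempotent variant is shown below; it reuses the same pymysql objects as the listing, and the helper name upsert_data is ours, not the book's.

def upsert_data(cursor, connection, table, columns, data):
    """Insert a row, silently skipping rows whose _id already exists."""
    keys = ', '.join(columns)
    values = ', '.join(['%s'] * len(data))
    sql = ('INSERT INTO {table} ({keys}) VALUES ({values}) '
           'ON DUPLICATE KEY UPDATE _id = _id').format(table=table, keys=keys, values=values)
    try:
        cursor.execute(sql, tuple(data))
        connection.commit()
    except Exception as e:
        print(e)
        connection.rollback()

# Usage inside fetch_data(): upsert_data(c, db, category_list[pos], column_list, data_list)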
--------------------------------------------------------------------------------
/5、用数据库存储数据/代码/5_7.py:
--------------------------------------------------------------------------------
1 | """
2 | redis-py库的基本操作示例
3 | """
4 | import redis
5 |
6 | # ====================== 连接Redis ============================
7 |
8 | # 1.普通连接
9 | r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
10 |
11 | # 2.连接池(一般)
12 | # redis-py使用connection pool来管理对一个redis server的所有连接,避免每次建立、
13 | # 释放连接的开销。这种方式实现多个Redis实例共享一个连接池
14 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345')
15 | r = redis.StrictRedis(connection_pool=pool)
16 |
17 | # 3.管道
18 | # By default redis-py sends each command as its own request. To batch several commands
19 | # into a single round trip (optionally as a transaction), use a pipeline and call execute().
20 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379)
21 | r = redis.StrictRedis(connection_pool=pool)
22 | pipe = r.pipeline(transaction=True)
23 | # 执行多条命令
24 | pipe.execute()
25 |
26 | # ====================== 通用操作 ============================
27 |
28 | r.delete('name') # 根据键删除redis中的任意数据类型
29 | r.exists('name') # 检测redis的键是否存在
30 | r.keys(pattern='*') # 根据* ?等通配符匹配获取redis的键
31 | r.expire('name', time=3000) # 为某个键设置超时时间
32 | r.rename('name', 'name1') # 重命名键
33 | r.move('name', 1)  # Move a key to another database, identified by its integer index
34 | r.randomkey() # 随机获取一个redis的键(不删除)
35 | r.type('name') # 获取键对应值的类型
36 | r.dbsize() # 获得当前数据库中键的数目
37 | r.ttl('name') # 获得键的过期时间
38 | r.flushdb() # 删除当前选择数据库中所有的键
39 | r.flushall() # 删除所有数据库中的所有键
40 |
41 |
42 | # ====================== String操作 ============================
43 |
44 | # 设置键值对,默认不存在则创建,存在则修改
45 | # set(name, value, ex=None, px=None, nx=False, xx=False)
46 | # ex,过期时间(秒)
47 | # px,过期时间(毫秒)
48 | # nx,如果设置为True,则只有name不存在时,当前set操作才执行,同setnx(name, value)
49 | # xx,如果设置为True,则只有name存在时,当前set操作才执行
50 |
51 | r.set('name', value) #设置值
52 | r.setnx('name',value) #如果name这个键不存在,把这个键对应的值设置为value
53 | r.setex('name', value, time) # Set a value with an expiry; note redis-py 3.x changed the order to setex(name, time, value)
54 | r.setrange(name, offset, value) #修改字符串内容,从指定字符串索引开始向后替换
55 | r.mset({"name3":'xxx', "name4":'xxx'}) #批量设置值
56 | r.msetnx({"name3":'xxx', "name4":'xxx'}) # Set multiple values only if none of the keys exist yet
57 |
58 | r.get('name') # 获取值
59 | r.getset('name', 'yyy') # 为键为name的值赋值为yyy,并返回上次的值xxx
60 | r.mget(['name1','name2']) # 返回多个键对应的值
61 | r.getrange(key, start, end) # 返回键为name的值的字符串,截取索引为start到end的字符
62 | r.strlen("name") #返回name对应值的字节长度(一个汉字3个字节)
63 |
64 | r.append('name',value) # 为键为name的值后追加value
65 | r.incr('name',amount) # 字符串转化为整型,再自增属性name对应的值,当属性name不存在时,
66 | # 则创建name=amount,否则,则自增,amount为自增数(整数)
67 | r.decr('name',amount) # Decrement the integer value of name by amount (an integer); if name
68 |                       # does not exist it is treated as 0, so the result is -amount
69 | r.substr('name',start, end) # 返回键为name的值的字符串截取索引为start到end的字符
70 |
71 |
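The pipeline block in 5_7.py builds a pipeline but calls execute() without queuing any commands. A short, self-contained sketch of how commands are actually batched (assuming a local Redis on the default port with no password):

import redis

pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
r = redis.StrictRedis(connection_pool=pool)

pipe = r.pipeline(transaction=True)
pipe.set('counter', 0)      # commands are buffered on the client...
pipe.incr('counter')
pipe.incr('counter')
print(pipe.execute())       # ...and sent in one round trip, e.g. [True, 1, 2]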
--------------------------------------------------------------------------------
/5、用数据库存储数据/代码/5_8.py:
--------------------------------------------------------------------------------
1 | """
2 | 利用redis保存bilibili弹幕
3 | """
4 | import requests as r
5 | from bs4 import BeautifulSoup
6 | import re
7 | import redis
8 |
9 | video_url = 'https://www.bilibili.com/video/av28989880'
10 | cid_regex = re.compile(r'cid=(\d{8})', re.S)
11 | xml_base_url = 'http://comment.bilibili.com/'
12 |
13 |
14 | # 获取弹幕的cid
15 | def get_cid():
16 | resp = r.get(video_url).text
17 | cid = cid_regex.search(str(resp)).group(1).strip()
18 | print("获取到的cid:", cid)
19 | return cid
20 |
21 |
22 | # 解析获取弹幕
23 | def analysis_d(cid):
24 | count = 1
25 | url = xml_base_url + cid + '.xml'
26 | resp = r.get(url)
27 | resp.encoding = 'utf-8'
28 | bs = BeautifulSoup(resp.text, 'lxml')
29 | d_s = bs.find_all('d')
30 | for d in d_s:
31 | print(d.text)
32 | # dan_redis.set(str(count), d.text)
33 | count += 1
34 |
35 |
36 | if __name__ == '__main__':
37 |     # Connect to redis first, so that un-commenting dan_redis.set() in analysis_d() works
38 |     pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=0)
39 |     dan_redis = redis.StrictRedis(connection_pool=pool)
40 |     analysis_d(get_cid())
41 |     results = dan_redis.mget(dan_redis.keys())
42 | print("总共有%d条数据" % len(results))
43 | for result in results:
44 | print(result.decode('utf-8'))
45 |
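Instead of one Redis key per danmaku line (the commented-out dan_redis.set(str(count), d.text) above), the lines could be pushed onto a single Redis list, which removes the later keys()/mget() step. A rough sketch with the same connection settings as 5_8.py; the key name bilibili:danmaku is our own choice:

import redis

pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=0)
dan_redis = redis.StrictRedis(connection_pool=pool)

def save_danmaku(lines):
    # RPUSH appends all lines to one list key, preserving their order
    if lines:
        dan_redis.rpush('bilibili:danmaku', *lines)

def load_danmaku():
    # LRANGE 0 -1 returns the whole list
    return [d.decode('utf-8') for d in dan_redis.lrange('bilibili:danmaku', 0, -1)]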
--------------------------------------------------------------------------------
/5、用数据库存储数据/代码/5_9.py:
--------------------------------------------------------------------------------
1 | """
2 | PyMongo库的基本操作示例
3 | """
4 |
5 | import pymongo
6 |
7 | # 1.连接MongoDB数据库(默认没有密码,如果设置了密码要调用db.auth("用户名","密码"))
8 | conn = pymongo.MongoClient(host='localhost', port=27017)
9 | # 或者采用MongoDB连接字符串的形式也可以:
10 | # conn = pymongo.MongoClient('mongodb://localhost:27017')
11 |
12 | # 2. Select a database; conn['test'] is an equivalent way to do it
13 | # db = conn.test
14 | #
15 | # # 3.选择collection
16 | # collection = db.user
17 | # print(collection)
18 |
19 |
20 | # # 4.创建数据库
21 | # db = conn['test_db']
22 | #
23 | # # 5.创建collection
24 | # collection = db['test_collection']
25 |
26 | # 6.插入一条数据
27 | # db = conn['test_db']
28 | # collection = db['test_collection']
29 | # dic = {'id': '1', 'name': 'Jay'}
30 | # collection.insert_one(dic)
31 |
32 | db = conn.test_db
33 | collection = db.test_collection
34 |
35 | # 7.插入多条数据(传入一个字典的列表)
36 | # data_list = [{'id': '2', 'name': 'Tom'},{'id': '3', 'name': 'Jack'}]
37 | # collection.insert_many(data_list)
38 |
39 |
40 | # 8.查找数据
41 |
42 | # 查找一条
43 | # print(collection.find_one({'name': 'Tom'}))
44 |
45 |
46 | # 查找多条
47 | # data_list = [{'id': '4', 'name': 'Mary'},{'id': '4', 'name': 'Lucy'}]
48 | # collection.insert_many(data_list)
49 | # results = collection.find({'id':'4'})
50 | # for result in results:
51 | # print(result)
52 |
53 | # 正则匹配
54 | # for result in collection.find({'name':{'$regex':'^J.*'}}):
55 | # print(result)
56 |
57 | # 9.修改数据
58 |
59 | # 方法一:需要整条记录参与
60 | # person = collection.find_one({'name':'Jack'})
61 | # person['name'] = 'Jacky'
62 | # collection.update({'name':'Jack'}, person)
63 |
64 | # 方法二:部分修改字段内容的方式
65 | # result = collection.update_one({'name': 'Tom'}, {'$set': {"name": "Tony"}})
66 | # print(result)
67 | # print("匹配的数据条数:",result.matched_count, "受影响的数据条数:",result.modified_count)
68 |
69 | # 10.删除数据
70 | # result = collection.delete_many({'id': {'$lte': 3}})
71 | # print("删除的数据条数:", result.deleted_count)
72 |
73 | # 11.计数
74 | # print("数据库中有%d条记录。" % collection.find().count())
75 |
76 | # 12.排序
77 | # data_list = [{'id': 2, 'name': 'Tom'},{'id': 3, 'name': 'Jack'},{'id': 5, 'name': 'Daisy'}]
78 | # collection.insert_many(data_list)
79 | # # 降序排列,升序可以传入pymongo.ASCENDING
80 | # results = collection.find().sort('id', pymongo.DESCENDING)
81 | # for result in results:
82 | # print(result)
83 |
84 | # 13.偏移
85 | results = collection.find().sort('id', pymongo.ASCENDING).skip(1)
86 | for result in results:
87 | print(result)
88 |
89 |
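In current PyMongo releases the collection.update() call shown in step 9 and the find().count() call in step 11 are deprecated (and removed in PyMongo 4.x); replace_one()/update_one() and count_documents() are the supported replacements. A brief sketch against the same test_db/test_collection:

import pymongo

conn = pymongo.MongoClient(host='localhost', port=27017)
collection = conn.test_db.test_collection

# Replace a whole document (modern equivalent of the old collection.update())
collection.replace_one({'name': 'Jack'}, {'id': '3', 'name': 'Jacky'})

# Count matching documents (modern equivalent of find().count())
print("The collection holds %d documents." % collection.count_documents({}))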
--------------------------------------------------------------------------------
/5、用数据库存储数据/勘误.md:
--------------------------------------------------------------------------------
1 | Updates as of 2021-04-29:
2 | 
3 | 5_6.py → Updated for the new Gank.io API
4 | 
5 | 5_8.py → Updated the regex that extracts the cid and adjusted the danmaku extraction rules
6 | 
7 | 5_10.py → Yhd (1号店) no longer offers an H5 version, so there is currently no workaround
--------------------------------------------------------------------------------
/6、Python应对反爬虫策略/代码/6_1.py:
--------------------------------------------------------------------------------
1 | """
2 | fake_useragent库使用示例
3 | """
4 |
5 | from fake_useragent import UserAgent
6 | import random
7 |
8 | if __name__ == '__main__':
9 | ua = UserAgent(use_cache_server=False)
10 | print("Chrome浏览器:", ua.chrome)
11 | print("FireFox浏览器:", ua.firefox)
12 | print("Ubuntu FireFox浏览器:", ua.ff)
13 | print("IE浏览器:", ua.ie)
14 | print("Safari浏览器:", ua.safari)
15 | print("Mac Chrome:", ua.google)
16 | print("Opera浏览器:", ua.opera)
17 | print("随机:",ua.random)
18 |
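fake_useragent downloads its data from a remote source, so 6_1.py can fail with a network error on machines that cannot reach it. A defensive sketch that falls back to a small hand-picked pool (the fallback strings and the helper name are ours):

import random
from fake_useragent import UserAgent

FALLBACK_UA = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/68.0.3440.106 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) '
    'Version/12.0 Safari/605.1.15',
]

def random_user_agent():
    try:
        return UserAgent().random
    except Exception:   # network errors or fake_useragent's own FakeUserAgentError
        return random.choice(FALLBACK_UA)

print(random_user_agent())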
--------------------------------------------------------------------------------
/6、Python应对反爬虫策略/代码/6_2.py:
--------------------------------------------------------------------------------
1 | """
2 | Ajax动态加载数据应对策略例子:爬取花瓣网某个画板的所有风景图
3 | """
4 | import requests as r
5 | import os
6 | import re
7 | import json
8 |
9 | # 图片URL拼接的前缀和后缀
10 | img_start_url = 'http://img.hb.aicdn.com/'
11 | img_end = '_fw658'
12 |
13 | # 图片key的保存文件
14 | pic_key_file = 'pin_ids.txt'
15 |
16 | # 获取pins的正则
17 | boards_pattern = re.compile(r'pins":(.*)};')
18 |
19 | # 修改pin_id的正则
20 | max_pattern = re.compile(r'(?<=max=)\d*(?=&limit)')
21 |
22 | # 图片保存路径
23 | pic_download_dir = os.path.join(os.getcwd(), 'HuaBan/')
24 |
25 | # Ajax模拟的请求头
26 | ajax_headers = {
27 | 'Host': 'huaban.com',
28 | 'Accept': 'application/json',
29 | 'X-Request': 'JSON',
30 | 'X-Requested-With': 'XMLHttpRequest'
31 | }
32 |
33 |
34 | # 以追加的形式往文件中写入内容
35 | def write_str_data(content, file_path):
36 | try:
37 | with open(file_path, 'a+', encoding='utf-8') as f:
38 | f.write(content + "\n", )
39 | except OSError as reason:
40 | print(str(reason))
41 |
42 |
43 | # 按行读取文件里的内容添加到列表中返回
44 | def load_data(file_path):
45 | if os.path.exists(file_path):
46 | data_list = []
47 | with open(file_path, "r+", encoding='utf-8') as f:
48 | for ip in f:
49 | data_list.append(ip.replace("\n", ""))
50 | return data_list
51 |
52 |
53 | # 获得borads页数据,提取key列表写入到文件里,并返回最后一个pid用于后续查询
54 | def get_boards_index_data(url):
55 | print("请求:" + url)
56 | resp = r.get(url).text
57 | result = boards_pattern.search(resp)
58 | json_dict = json.loads(result.group(1))
59 | for item in json_dict:
60 | write_str_data(item['file']['key'], pic_key_file)
61 | # 返回最后一个pin_id
62 | pin_id = json_dict[-1]['pin_id']
63 | return pin_id
64 |
65 |
66 | # 模拟Ajax请求更多数据
67 | def get_json_list(url):
68 | print("请求:" + url)
69 | resp = r.get(url, headers=ajax_headers)
70 | if resp is None:
71 | return None
72 | else:
73 | json_dict = json.loads(resp.text)
74 | pins = json_dict['board']['pins']
75 | if len(pins) == 0:
76 | return None
77 | else:
78 | for item in pins:
79 | write_str_data(item['file']['key'], pic_key_file)
80 | return pins[-1]['pin_id']
81 |
82 |
83 | # 下载图片的方法
84 | def download_pic(key):
85 | url = img_start_url + key + img_end
86 | resp = r.get(url).content
87 | try:
88 | print("下载图片:" + url)
89 | pic_name = key + ".jpg"
90 | with open(pic_download_dir + pic_name, "wb+") as f:
91 | f.write(resp)
92 | except (OSError, r.HTTPError, r.ConnectionError, Exception) as reason:
93 | print(str(reason))
94 |
95 |
96 | if __name__ == '__main__':
97 | if not os.path.exists(pic_download_dir):
98 | os.makedirs(pic_download_dir)
99 | # 判断图片key的保存文件是否存在,存在的话删除
100 | if os.path.exists(pic_key_file):
101 | os.remove(pic_key_file)
102 | # 一个画板链接,可自行替换
103 | boards_url = 'http://huaban.com/boards/279523/'
104 | board_last_pin_id = get_boards_index_data(boards_url)
105 | board_json_url = boards_url + '?jl58nz3i&max=43131274&limit=20&wfl=1'
106 | while True:
107 | board_last_pin_id = get_json_list(max_pattern.sub(str(board_last_pin_id), board_json_url))
108 | if board_last_pin_id is None:
109 | break
110 | pic_url_list = load_data(pic_key_file)
111 | for key in pic_url_list:
112 | download_pic(key)
113 | print("所有图片下载完成~")
114 |
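The heart of 6_2.py is a loop of "request a page, remember the last pin_id, substitute it into max=, repeat until the response is empty". Stripped of the file I/O, the pattern looks roughly like this (the URL layout and the board/pins JSON structure are the same assumptions the listing already makes):

import requests

def iter_pins(boards_url, first_pin_id, headers):
    """Yield pin dicts page by page until the board has no more pins."""
    last_pin_id = first_pin_id
    while True:
        url = '{0}?max={1}&limit=20&wfl=1'.format(boards_url, last_pin_id)
        pins = requests.get(url, headers=headers).json()['board']['pins']
        if not pins:
            break
        for pin in pins:
            yield pin
        last_pin_id = pins[-1]['pin_id']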
--------------------------------------------------------------------------------
/6、Python应对反爬虫策略/代码/6_3.py:
--------------------------------------------------------------------------------
1 | """
2 | selenium使用示例
3 | """
4 | from selenium import webdriver
5 |
6 | browser = webdriver.Chrome() # 调用本地的Chrome浏览器
7 | browser.get('http://www.baidu.com') # 请求页面,会打开一个浏览器窗口
8 | html_text = browser.page_source # 获得页面代码
9 | # browser.quit() # 关闭浏览器
10 | print(html_text)
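6_3.py opens a visible browser window. On a server, or whenever no UI is wanted, Chrome can be run headless; a small sketch that assumes chromedriver is on the PATH and a Selenium version new enough to accept the options= keyword:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')      # run without opening a window
options.add_argument('--disable-gpu')   # recommended on some Windows setups

browser = webdriver.Chrome(options=options)
browser.get('http://www.baidu.com')
print(browser.title)
browser.quit()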
--------------------------------------------------------------------------------
/6、Python应对反爬虫策略/代码/6_4.py:
--------------------------------------------------------------------------------
1 | """
2 | Example: crawling the "boring pictures" (无聊图) board on jandan.net with Selenium
3 | """
4 | import os
5 | from selenium import webdriver
6 | import redis
7 | import requests as r
8 | from bs4 import BeautifulSoup
9 |
10 | # 请求基地址
11 | base_url = 'http://jandan.net/pic'
12 | # 图片的保存路径
13 | pic_save_path = os.path.join(os.getcwd(), 'JianDan/')
14 | # Picture counter, used as the Redis key
15 | pic_count = 0
16 |
17 | # 下载图片用headers
18 | pic_headers = {
19 | 'Host': 'wx2.sinaimg.cn',
20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
21 | 'Chrome/61.0.3163.100 Safari/537.36 '
22 | }
23 |
24 |
25 | # 打开浏览器模拟请求
26 | def browser_get():
27 | browser = webdriver.Chrome()
28 | browser.get(base_url)
29 | html_text = browser.page_source
30 | page_count = get_page_count(html_text)
31 | # 循环拼接URL访问
32 | for page in range(page_count, 0, -1):
33 | page_url = base_url + '/page-' + str(page)
34 | print('解析:' + page_url)
35 | browser.get(page_url)
36 | html = browser.page_source
37 | get_meizi_url(html)
38 | # 没有更多了关闭浏览器
39 | browser.quit()
40 |
41 |
42 | # 获取总页码
43 | def get_page_count(html):
44 | bs = BeautifulSoup(html, 'lxml')
45 | page_count = bs.find('span', attrs={'class': 'current-comment-page'})
46 | return int(page_count.get_text()[1:-1]) - 1
47 |
48 |
49 | # 获取每页的图片
50 | def get_meizi_url(html):
51 | soup = BeautifulSoup(html, 'html.parser')
52 | ol = soup.find('ol', attrs={'class': 'commentlist'})
53 | href = ol.findAll('a', attrs={'class': 'view_img_link'})
54 | global pic_count
55 | for a in href:
56 | dan_redis.set(str(pic_count), a['href'])
57 | pic_count += 1
58 |
59 |
60 | # 下载图片
61 | def download_pic(url):
62 | correct_url = url
63 | if url.startswith('//'):
64 | correct_url = url[2:]
65 | if not url.startswith('http'):
66 | correct_url = 'http://' + correct_url
67 | print("下载:", correct_url)
68 | try:
69 | resp = r.get(correct_url, headers=pic_headers).content
70 | pic_name = correct_url.split("/")[-1]
71 | with open(pic_save_path + pic_name, "wb+") as f:
72 | f.write(resp)
73 | except (OSError, r.ConnectionError, r.HTTPError, Exception) as reason:
74 | print(str(reason))
75 |
76 |
77 | if __name__ == '__main__':
78 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=1)
79 | dan_redis = redis.StrictRedis(connection_pool=pool)
80 | if not os.path.exists(pic_save_path):
81 | os.makedirs(pic_save_path)
82 | browser_get()
83 | results = dan_redis.mget(dan_redis.keys())
84 | for result in results:
85 | download_pic(result.decode('utf-8'))
86 | print("图片下载完毕!")
87 |
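dan_redis.keys() fetches every key in one blocking call, which gets slow as the database grows. redis-py also offers scan_iter(), which walks the keyspace incrementally via SCAN; a sketch with the same connection settings as 6_4.py:

import redis

pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=1)
dan_redis = redis.StrictRedis(connection_pool=pool)

# Iterate keys in batches instead of a single blocking KEYS call
for key in dan_redis.scan_iter(count=100):
    print(key.decode('utf-8'), dan_redis.get(key).decode('utf-8'))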
--------------------------------------------------------------------------------
/6、Python应对反爬虫策略/代码/6_5.py:
--------------------------------------------------------------------------------
1 | """
2 | Automatic Zhihu login with Selenium, using Baidu OCR / Chaojiying for captcha recognition
3 | """
4 | import os
5 | from selenium import webdriver
6 | import requests as r
7 | import time
8 | from PIL import Image
9 | from aip import AipOcr
10 | from hashlib import md5
11 | import base64
12 |
13 | zhihu_login_url = 'https://www.zhihu.com/signup'
14 |
15 | config = {
16 | 'appId': 'd4ed8d211abd4f20b3xxe0f55xxx173f',
17 | 'apiKey': 'Nk3RSGAh0gFEGdoFC7GxxaCQ',
18 | 'secretKey': '63TyYDkI5R0x21tDsCxxBoF8EEmiDfEd'
19 | }
20 | client = AipOcr(**config)
21 |
22 | # 超级鹰参数
23 | cjy_params = {
24 | 'user': 'CoderPig',
25 | 'pass2': md5('zpj12345'.encode('utf8')).hexdigest(),
26 | 'softid': '897137',
27 | }
28 |
29 | # 超级鹰请求头
30 | cjy_headers = {
31 | 'Connection': 'Keep-Alive',
32 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
33 | }
34 |
35 |
36 | # 打开浏览器模拟请求
37 | def auto_login():
38 | browser = webdriver.Chrome()
39 | while True:
40 | browser.get(zhihu_login_url)
41 | # 判断是否处于注册页(底部有登录字样,是的话点击跳转)
42 | signup_switch_bt = browser.find_element_by_xpath('//div[@class="SignContainer-switch"]/span')
43 | if signup_switch_bt.text == '登录':
44 | signup_switch_bt.click()
45 | # 输入用户名
46 | username_input = browser.find_element_by_xpath('//input[@name="username"]')
47 | username_input.send_keys('xx@qq.com')
48 | # 输入密码
49 | password_input = browser.find_element_by_xpath('//input[@name="password"]')
50 | password_input.send_keys('xxx')
51 | # 等待一会儿,等验证码刷出来
52 | time.sleep(5)
53 | # 判断是否包含英文字符验证码,是的话处理,否则跳出
54 | if is_elements_existed(browser, "//div[@class='Captcha-englishContainer']"):
55 | if len(browser.find_element_by_xpath("//img[@class='Captcha-englishImg']").get_attribute('src')) > 30:
56 | code_img = browser.find_element_by_xpath('//img[@alt="图形验证码"]')
57 | code = cjy_fetch_code(base64.b64decode(code_img.get_attribute('src')[22:].replace("%0A", "")), 1902)
58 | # 输入验证码
59 | code_input = browser.find_element_by_xpath('//input[@name="captcha"]')
60 | code_input.send_keys(code)
61 | time.sleep(2)
62 | # 点击登录按钮
63 | login_bt = browser.find_element_by_xpath('//button[@type="submit"]')
64 | login_bt.click()
65 | time.sleep(3)
66 | break
67 | else:
68 | continue
69 | time.sleep(10)
70 | # 打印当前的网页链接,以此判断是否跳转成功
71 | print(browser.current_url)
72 |
73 |
74 | # 判断xpath定位的元素是否存在
75 | def is_elements_existed(browser, element):
76 | flag = True
77 | try:
78 | browser.find_element_by_xpath(element)
79 | return flag
80 | except:
81 | flag = False
82 | return flag
83 |
84 |
85 | # 读取图片
86 | def get_file_content(file_path):
87 | with open(file_path, 'rb') as fp:
88 | return fp.read()
89 |
90 |
91 | # 百度OCR文字识别
92 | def baidu_ocr(file):
93 | image = get_file_content(file)
94 | # 调用通用文字识别, 图片参数为本地图片
95 | result = client.basicAccurate(image)
96 | print(result)
97 | if 'words_result' in result:
98 | return '\n'.join([w['words'] for w in result['words_result']])
99 |
100 |
101 | # 重置图片大小,并进行灰度和二值化处理
102 | def resize_pic(file, width=1200, height=480):
103 | img = Image.open(file)
104 | try:
105 | new_img = img.resize((width, height), Image.BILINEAR)
106 | # 转灰度处理
107 | new_img = new_img.convert('L')
108 | # 二值化处理
109 | table = []
110 | for i in range(256):
111 | if i < 150:
112 | table.append(0)
113 | else:
114 | table.append(1)
115 | # 通过表格转换为二进制图片
116 | new_img = new_img.point(table, "1")
117 | new_img.save(os.path.join(os.getcwd(), os.path.basename(file)))
118 | except Exception as e:
119 | print(e)
120 |
121 |
122 | # 超级鹰识别验证码
123 | def cjy_fetch_code(im, codetype):
124 | cjy_params.update({'codetype': codetype})
125 | files = {'userfile': ('ccc.jpg', im)}
126 | resp = r.post('http://upload.chaojiying.net/Upload/Processing.php', data=cjy_params, files=files,
127 | headers=cjy_headers).json()
128 | print(resp)
129 | if resp.get('err_no', 0) == 0:
130 | return resp.get('pic_str')
131 |
132 |
133 | if __name__ == '__main__':
134 | # resize_pic('code.png')
135 | # baidu_ocr('code.png')
136 | # im = open('code.png', 'rb').read()
137 | # print(cjy_fetch_code(im, 1902))
138 | auto_login()
139 |
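Rather than base64-decoding the src of the captcha <img> (which only works while Zhihu inlines the image as a data URI), the element itself can be screenshotted; WebElement exposes screenshot_as_png in reasonably recent Selenium/driver combinations. A hedged sketch of that alternative, reusing the XPath from the listing:

def capture_captcha(browser, xpath='//img[@alt="图形验证码"]', file_name='code.png'):
    """Save the captcha element as a PNG and return the raw bytes."""
    element = browser.find_element_by_xpath(xpath)
    png_bytes = element.screenshot_as_png   # screenshot of just this element
    with open(file_name, 'wb') as f:
        f.write(png_bytes)
    return png_bytes

# Usage inside auto_login(): code = cjy_fetch_code(capture_captcha(browser), 1902)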
--------------------------------------------------------------------------------
/6、Python应对反爬虫策略/代码/6_6.py:
--------------------------------------------------------------------------------
1 | """
2 | 破解极验滑动验证码示例
3 | """
4 | import time
5 | from selenium import webdriver
6 | from selenium.webdriver.support.wait import WebDriverWait
7 | from selenium.webdriver import ActionChains
8 | from lxml import etree
9 | import requests as r
10 | import re
11 | import PIL.Image as image
12 |
13 | full_image_file = 'full.jpg'
14 | cut_image_file = 'cut.jpg'
15 | bilibili_login_url = 'https://passport.bilibili.com/login'
16 | url_fetch_regex = re.compile(r'url\("(.*?)"\);')
17 | bg_postion_regex = re.compile('position: (.*?)px (.*?)px;')
18 |
19 |
20 | def auto_login():
21 | # 输入账号密码
22 | input_user = browser.find_element_by_xpath('//input[@id="login-username"]')
23 | input_user.send_keys("xxx")
24 | input_passwd = browser.find_element_by_xpath('//input[@id="login-passwd"]')
25 | input_passwd.send_keys("xxx")
26 | # 验证码自动验证
27 | location_lists = fetch_images()
28 | offset = (get_offset(restore_images(cut_image_file, location_lists[0]),
29 | restore_images(full_image_file, location_lists[1])))
30 | print("滑块偏移量:", offset)
31 | b_track = get_track(offset - 6)
32 | b_slider = get_slider()
33 | move_slider(b_slider, b_track)
34 | time.sleep(1)
35 | # 点击登录
36 | login_bt = browser.find_element_by_xpath('//a[@class="btn btn-login"]')
37 | login_bt.click()
38 |
39 |
40 | # 下载缺失的图片,每个小方块的坐标
41 | def fetch_images():
42 | html = etree.HTML(browser.page_source)
43 | cut_bg = html.xpath('//div[@class="gt_cut_bg gt_show"]/div')
44 | full_bg = html.xpath('//div[@class="gt_cut_fullbg gt_show"]/div')
45 | # 提取两个打乱后顺序的webp图片URL替换为jpg
46 | cut_bg_url = url_fetch_regex.search((cut_bg[0].get('style'))).group(1).replace('webp', 'jpg')
47 | full_bg_url = url_fetch_regex.search((full_bg[0].get('style'))).group(1).replace('webp', 'jpg')
48 | with open(cut_image_file, 'wb+') as f: f.write(r.get(cut_bg_url).content)
49 | with open(full_image_file, 'wb+') as f: f.write(r.get(full_bg_url).content)
50 | # 采集图片定位坐标
51 | cut_bg_location_list = []
52 | full_bg_location_list = []
53 |     for cut, full in zip(cut_bg, full_bg):
54 |         cut_result = bg_postion_regex.search(cut.get('style'))
55 |         full_result = bg_postion_regex.search(full.get('style'))
56 |         cut_bg_location_list.append({'x': int(cut_result.group(1)), 'y': int(cut_result.group(2))})
57 |         full_bg_location_list.append({'x': int(full_result.group(1)), 'y': int(full_result.group(2))})
58 | return cut_bg_location_list, full_bg_location_list
59 |
60 |
61 | # 合并还原图片
62 | def restore_images(file, location_list):
63 | im = image.open(file)
64 | # 分段分成上面的图和下面的图列表
65 | below_list = []
66 | above_list = []
67 | for location in location_list:
68 | if location['y'] == -58:
69 | above_list.append(im.crop((abs(location['x']), 58, abs(location['x']) + 10, 116)))
70 | if location['y'] == 0:
71 | below_list.append(im.crop((abs(location['x']), 0, abs(location['x']) + 10, 58)))
72 |
73 | # 创建一个一样大的图片
74 | new_im = image.new('RGB', (260, 116))
75 | # 遍历坐标粘贴上面的图片
76 | x_offset = 0
77 | for im in above_list:
78 | new_im.paste(im, (x_offset, 0))
79 | x_offset += im.size[0]
80 | # 遍历坐标粘贴下面的图片
81 | x_offset = 0
82 | for im in below_list:
83 | new_im.paste(im, (x_offset, 58))
84 | x_offset += im.size[0]
85 | # 保存图片
86 | new_im.save(file)
87 | return new_im
88 |
89 |
90 | # 判断两个像素点是否相同
91 | def is_pixel_equal(img1, img2, x, y):
92 | pix1 = img1.load()[x, y]
93 | pix2 = img2.load()[x, y]
94 |     scope = 20  # per-channel pixel difference threshold
95 |     return abs(pix1[0] - pix2[0]) < scope and abs(pix1[1] - pix2[1]) < scope and abs(pix1[2] - pix2[2]) < scope
96 |
97 |
98 | # 获得缺口偏移量
99 | def get_offset(img1, img2):
100 | left = 60
101 | for x in range(left, img1.size[0]):
102 | for y in range(img1.size[1]):
103 | if not is_pixel_equal(img1, img2, x, y):
104 | return x
105 | return left
106 |
107 |
108 | # 获取滑块
109 | def get_slider():
110 | while True:
111 | try:
112 | slider = browser.find_element_by_xpath("//div[@class='gt_slider_knob gt_show']")
113 | break
114 | except:
115 | time.sleep(0.5)
116 | return slider
117 |
118 |
119 | # 滑块匀速滑动轨迹构造
120 | def get_track(distance):
121 | track = []
122 | current = 0
123 | while current < distance:
124 | move = distance / 4
125 | current += move
126 | track.append(round(move))
127 | return track
128 |
129 |
130 | # 先加速后减速滑动轨迹构造
131 | def get_person_track(distance):
132 | track = []
133 | current = 0
134 | mid = distance * 4 / 5 # 减速阈值
135 | t = 0.2 # 计算间隔
136 | v = 0 # 初速度
137 | while current < distance:
138 | a = 2 if current < mid else -3
139 | v0 = v # 初速度v0
140 | v = v0 + a * t # 当前速度
141 | move = v0 * t + 1 / 2 * a * t * t # 移动距离
142 | current += move
143 | track.append(round(move))
144 | return track
145 |
146 |
147 | # 滑块滑动的方法
148 | def move_slider(slider, track):
149 | ActionChains(browser).click_and_hold(slider).perform()
150 | for x in track:
151 | ActionChains(browser).move_by_offset(xoffset=x, yoffset=0).perform()
152 | time.sleep(0.05)
153 | ActionChains(browser).release().perform()
154 |
155 |
156 | if __name__ == '__main__':
157 | browser = webdriver.Chrome()
158 | wait = WebDriverWait(browser, 20)
159 | browser.get(bilibili_login_url)
161 |     # Sleep for a second so the login page can finish loading
161 | time.sleep(1)
162 | auto_login()
163 | time.sleep(5)
164 | print(browser.current_url)
165 | browser.quit()
166 |
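Because the steps returned by get_track()/get_person_track() are rounded, their sum can miss the measured offset by a few pixels, leaving the slider slightly short of the gap. A small wrapper (ours, not the book's) that assumes the track builders defined above and appends a correcting step:

def build_track_exact(distance, step_builder=get_person_track):
    """Build a track and append a final correcting step so it sums exactly to distance."""
    track = step_builder(distance)
    gap = distance - sum(track)
    if gap:
        track.append(gap)   # may be negative, i.e. a small backwards nudge
    return track

# Usage inside auto_login(): move_slider(get_slider(), build_track_exact(offset - 6))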
--------------------------------------------------------------------------------
/6、Python应对反爬虫策略/勘误.md:
--------------------------------------------------------------------------------
1 | Updates as of 2021-04-29:
2 | 
3 | 6_5.py → Zhihu login no longer uses a text captcha; it now uses slide verification
4 | 
5 | 6_6.py → Bilibili login no longer uses slide verification; it now uses a character-recognition captcha
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__init__.py
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/items.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/items.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/middlewares.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/middlewares.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/pipelines.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/pipelines.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/settings.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | class BcyItem(scrapy.Item):
11 | author = scrapy.Field()
12 | pic_url = scrapy.Field()
13 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | import os
10 | import random
11 | import logging
12 |
13 |
14 | class FirstspiderSpiderMiddleware(object):
15 | # Not all methods need to be defined. If a method is not defined,
16 | # scrapy acts as if the spider middleware does not modify the
17 | # passed objects.
18 |
19 | @classmethod
20 | def from_crawler(cls, crawler):
21 | # This method is used by Scrapy to create your spiders.
22 | s = cls()
23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
24 | return s
25 |
26 | def process_spider_input(self, response, spider):
27 | # Called for each response that goes through the spider
28 | # middleware and into the spider.
29 |
30 | # Should return None or raise an exception.
31 | return None
32 |
33 | def process_spider_output(self, response, result, spider):
34 | # Called with the results returned from the Spider, after
35 | # it has processed the response.
36 |
37 | # Must return an iterable of Request, dict or Item objects.
38 | for i in result:
39 | yield i
40 |
41 | def process_spider_exception(self, response, exception, spider):
42 | # Called when a spider or process_spider_input() method
43 | # (from other spider middleware) raises an exception.
44 |
45 | # Should return either None or an iterable of Response, dict
46 | # or Item objects.
47 | pass
48 |
49 | def process_start_requests(self, start_requests, spider):
50 | # Called with the start requests of the spider, and works
51 | # similarly to the process_spider_output() method, except
52 | # that it doesn’t have a response associated.
53 |
54 | # Must return only requests (not items).
55 | for r in start_requests:
56 | yield r
57 |
58 | def spider_opened(self, spider):
59 | spider.logger.info('Spider opened: %s' % spider.name)
60 |
61 |
62 | class FirstspiderDownloaderMiddleware(object):
63 | # Not all methods need to be defined. If a method is not defined,
64 | # scrapy acts as if the downloader middleware does not modify the
65 | # passed objects.
66 |
67 | @classmethod
68 | def from_crawler(cls, crawler):
69 | # This method is used by Scrapy to create your spiders.
70 | s = cls()
71 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
72 | return s
73 |
74 | def process_request(self, request, spider):
75 | # Called for each request that goes through the downloader
76 | # middleware.
77 |
78 | # Must either:
79 | # - return None: continue processing this request
80 | # - or return a Response object
81 | # - or return a Request object
82 | # - or raise IgnoreRequest: process_exception() methods of
83 | # installed downloader middleware will be called
84 | return None
85 |
86 | def process_response(self, request, response, spider):
87 | # Called with the response returned from the downloader.
88 |
89 | # Must either;
90 | # - return a Response object
91 | # - return a Request object
92 | # - or raise IgnoreRequest
93 | return response
94 |
95 | def process_exception(self, request, exception, spider):
96 | # Called when a download handler or a process_request()
97 | # (from other downloader middleware) raises an exception.
98 |
99 | # Must either:
100 | # - return None: continue processing this exception
101 | # - return a Response object: stops process_exception() chain
102 | # - return a Request object: stops process_exception() chain
103 | pass
104 |
105 | def spider_opened(self, spider):
106 | spider.logger.info('Spider opened: %s' % spider.name)
107 |
108 |
109 | class ProxyMiddleware(object):
110 | def __init__(self):
111 | self.proxy_ip_list = self.load_list_from_file()
112 |
113 | @staticmethod
114 | def load_list_from_file():
115 | data_list = []
116 | with open(os.path.join(os.getcwd(), 'proxy_ip.txt'), "r+", encoding='utf-8') as f:
117 | for ip in f:
118 | data_list.append(ip.replace("\n", ""))
119 | return data_list
120 |
121 | def process_request(self, request, spider):
122 | if request.meta.get('retry_times'):
123 |             proxy = random.choice(self.proxy_ip_list)  # avoid hard-coding the list length
124 | if proxy:
125 | proxy_ip = 'https://{proxy}'.format(proxy=proxy)
126 |                 logging.debug("Using proxy: %s", proxy_ip)
127 | request.meta['proxy'] = proxy_ip
128 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymysql
8 |
9 |
10 | class FirstspiderPipeline(object):
11 | def process_item(self, item, spider):
12 | return item
13 |
14 |
15 | class MySQLPipeline():
16 | def __init__(self):
17 | self.host = 'localhost'
18 | self.database = 'bcy'
19 | self.user = 'root'
20 | self.password = 'Jay12345'
21 | self.port = 3306
22 |
23 | def open_spider(self, spider):
24 |         self.db = pymysql.connect(host=self.host, user=self.user, password=self.password, database=self.database, charset='utf8', port=self.port)
25 | self.cursor = self.db.cursor()
26 |
27 | def close_spider(self, spider):
28 | self.db.close()
29 |
30 | def process_item(self, item, spider):
31 | data = dict(item)
32 | keys = ', '.join(data.keys())
33 | values = ', '.join(["%s"] * len(data))
34 | sql = "INSERT INTO draw (%s) VALUES (%s)" % (keys, values)
35 | self.cursor.execute(sql, tuple(data.values()))
36 | self.db.commit()
37 | return item
38 |
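MySQLPipeline hard-codes its connection details. The usual Scrapy pattern is to read them from settings.py via from_crawler(); a sketch in which the MYSQL_* setting names are our own choice rather than something the project defines, and open_spider()/process_item() would stay as in the class above:

# Hypothetical entries in settings.py:
# MYSQL_HOST = 'localhost'
# MYSQL_DATABASE = 'bcy'
# MYSQL_USER = 'root'
# MYSQL_PASSWORD = 'Jay12345'
# MYSQL_PORT = 3306

class SettingsMySQLPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host, self.database = host, database
        self.user, self.password, self.port = user, password, port

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(host=s.get('MYSQL_HOST'), database=s.get('MYSQL_DATABASE'),
                   user=s.get('MYSQL_USER'), password=s.get('MYSQL_PASSWORD'),
                   port=s.getint('MYSQL_PORT', 3306))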
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/proxy_ip.txt:
--------------------------------------------------------------------------------
1 | 125.39.9.34:9000
2 | 183.129.244.17:21776
3 | 120.131.9.254:1080
4 | 117.28.97.64:808
5 | 120.92.174.37:1080
6 | 119.254.103.43:8000
7 | 219.141.153.4:80
8 | 103.205.14.254:53281
9 | 222.175.200.58:8060
10 | 60.13.187.162:63000
11 | 119.180.136.223:8060
12 | 111.47.192.141:8888
13 | 219.141.153.40:80
14 | 171.11.77.4:45592
15 | 221.2.174.28:8060
16 | 14.149.68.120:1080
17 | 61.150.113.74:8908
18 | 119.179.135.114:8060
19 | 39.135.24.12:80
20 | 183.2.203.24:9000
21 | 123.7.177.20:9999
22 | 125.72.70.46:8060
23 | 114.250.25.19:80
24 | 101.248.64.74:80
25 | 60.8.42.132:8908
26 | 119.179.133.58:8060
27 | 140.207.95.94:8060
28 | 123.249.88.153:9000
29 | 219.141.153.2:8080
30 | 119.179.175.60:8060
31 | 61.135.180.27:9000
32 | 112.24.107.102:8908
33 | 121.8.98.196:80
34 | 222.88.149.32:8060
35 | 121.8.98.198:80
36 | 183.234.38.213:63000
37 | 27.154.240.222:8060
38 | 123.161.62.150:9000
39 | 118.190.200.139:8080
40 | 219.150.189.212:9999
41 | 219.145.197.203:8908
42 | 183.15.121.120:3128
43 | 219.141.153.44:80
44 | 221.14.140.130:80
45 | 121.8.98.197:80
46 | 221.2.175.214:8060
47 | 113.87.202.97:53281
48 | 113.128.198.50:8060
49 | 111.3.154.196:8060
50 | 60.13.156.45:8060
51 | 39.137.77.67:8080
52 | 222.222.243.124:8060
53 | 120.194.61.62:8060
54 | 221.1.205.74:8060
55 | 118.190.94.254:9001
56 | 123.161.62.151:9000
57 | 119.52.116.114:80
58 | 61.150.109.70:8908
59 | 101.81.48.234:1028
60 | 117.158.174.164:8060
61 | 222.208.208.33:8060
62 | 106.56.102.219:8070
63 | 124.118.27.3:8060
64 | 39.137.69.8:80
65 | 117.141.99.38:53281
66 | 183.63.101.62:55555
67 | 123.117.166.166:8060
68 | 163.125.114.218:8118
69 | 171.10.31.67:8080
70 | 223.93.145.186:8060
71 | 223.96.95.229:3128
72 | 61.150.113.27:8908
73 | 219.141.153.3:80
74 | 222.88.147.121:8060
75 | 120.236.128.201:8060
76 | 221.234.192.220:8010
77 | 61.150.113.75:8908
78 | 183.163.41.62:41766
79 | 221.2.174.99:8060
80 | 218.60.8.83:3129
81 | 125.39.9.35:9000
82 | 180.168.113.204:1080
83 | 111.205.6.206:8088
84 | 60.8.42.134:8908
85 | 219.141.153.35:80
86 | 61.135.18.206:8888
87 | 218.201.55.74:63000
88 | 183.246.84.229:8060
89 | 116.228.236.219:8080
90 | 121.17.18.218:8060
91 | 112.16.28.103:8060
92 | 61.149.137.110:80
93 | 175.10.87.16:8060
94 | 60.30.19.131:10010
95 | 39.137.69.10:8080
96 | 117.28.96.109:808
97 | 125.46.245.93:53281
98 | 211.136.127.125:80
99 | 219.141.153.41:80
100 | 180.119.141.11:8118
101 | 124.238.248.4:80
102 | 175.174.85.171:80
103 | 123.122.225.134:8888
104 | 221.194.108.8:8060
105 | 119.180.173.64:8060
106 | 119.179.135.132:8060
107 | 101.227.5.36:9000
108 | 61.150.113.28:8908
109 | 111.43.139.151:80
110 | 124.128.76.142:8060
111 | 112.24.107.109:8908
112 | 119.180.178.70:8060
113 | 106.12.3.84:80
114 | 111.3.122.245:8060
115 | 39.135.24.11:80
116 | 42.236.123.17:80
117 | 222.222.236.207:8060
118 | 113.231.247.131:80
119 | 39.137.69.7:80
120 | 120.92.142.64:8080
121 | 114.225.169.226:53128
122 | 112.24.107.101:8908
123 | 106.58.252.76:80
124 | 58.49.73.141:8888
125 | 116.196.105.136:80
126 | 221.193.177.45:8060
127 | 117.44.247.53:8908
128 | 221.2.174.6:8060
129 | 118.190.95.35:9001
130 | 39.137.69.9:8080
131 | 119.180.138.69:8060
132 | 221.2.174.3:8060
133 | 222.223.203.109:8060
134 | 117.66.167.30:8118
135 | 1.197.117.27:8060
136 | 221.176.206.29:8060
137 | 219.141.153.39:80
138 | 39.137.77.68:8080
139 | 58.49.72.141:8888
140 | 222.88.154.56:8060
141 | 39.137.77.66:80
142 | 59.48.237.6:8060
143 | 119.48.189.100:80
144 | 222.89.85.130:8060
145 | 106.12.22.41:8118
146 | 202.103.215.23:80
147 | 60.8.42.36:8908
148 | 117.177.243.6:80
149 | 218.244.44.194:8060
150 | 118.190.95.43:9001
151 | 219.141.153.34:80
152 | 106.56.102.35:8070
153 | 103.205.26.57:21776
154 | 117.131.235.198:8060
155 | 183.129.207.74:11493
156 | 58.247.46.123:8088
157 | 60.8.42.137:8908
158 | 117.156.234.3:8060
159 | 223.68.190.130:8181
160 | 222.88.147.104:8060
161 | 183.220.43.78:8080
162 | 123.146.216.14:80
163 | 60.8.42.15:8908
164 | 221.14.140.66:80
165 | 175.155.24.10:1133
166 | 119.180.161.173:8060
167 | 175.9.177.63:8060
168 | 182.254.145.163:1080
169 | 119.187.120.118:8060
170 | 202.100.83.139:80
171 | 183.129.207.73:13846
172 | 120.236.168.19:8060
173 | 219.141.153.6:80
174 | 211.159.171.58:80
175 | 221.1.84.241:8197
176 | 60.14.125.246:8908
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/run.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute(["scrapy", "crawl", "bcy"])
4 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for FirstSpider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'FirstSpider'
13 |
14 | SPIDER_MODULES = ['FirstSpider.spiders']
15 | NEWSPIDER_MODULE = 'FirstSpider.spiders'
16 |
17 | ROBOTSTXT_OBEY = False
18 |
19 |
20 | DEFAULT_REQUEST_HEADERS = {
21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
22 | 'Chrome/68.0.3440.106 Safari/537.36',
23 | 'Host': 'bcy.net',
24 | 'Origin': 'https://bcy.net',
25 | }
26 |
27 | DOWNLOADER_MIDDLEWARES = {
28 | 'FirstSpider.middlewares.ProxyMiddleware': 555
29 | }
30 |
31 | ITEM_PIPELINES = {
32 | 'FirstSpider.pipelines.MySQLPipeline': 300,
33 | }
34 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/bcy.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/bcy.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/bcy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Request, Spider, Selector
3 | import datetime
4 |
5 | from FirstSpider.items import *
6 |
7 |
8 | def parse_index(response):
9 | items = response.xpath('//li[@class="js-smallCards _box"]')
10 | for item in items:
11 | bcy_item = BcyItem()
12 | bcy_item['author'] = item.xpath('a[@class="db posr ovf"]/@title').extract_first()
13 | bcy_item['pic_url'] = item.xpath('a/img/@src').extract_first().replace('/2X3', '')
14 | yield bcy_item
15 |
16 |
17 | class BcySpider(Spider):
18 | name = 'bcy'
19 | allowed_domains = ['bcy.net']
20 |
21 | index_url = 'https://bcy.net/illust/toppost100?type=lastday&date={d}'
22 |
23 | ajax_url = 'https://bcy.net/illust/index/ajaxloadtoppost?p=1&type=lastday&date={d}'
24 |
25 | date_list = [] # 日期范围列表
26 |
27 | ajax_headers = {
28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
29 | 'Chrome/68.0.3440.106 Safari/537.36',
30 | 'Host': 'bcy.net',
31 | 'Origin': 'https://bcy.net',
32 | 'X-Requested-With': 'XMLHttpRequest'
33 | }
34 |
35 | def start_requests(self):
36 | self.init_date_list()
37 | for date in self.date_list:
38 | yield Request(self.index_url.format(d=date), callback=parse_index)
39 | for date in self.date_list:
40 | yield Request(self.ajax_url.format(d=date), callback=parse_index)
41 |
42 | # 构造一个日期列表
43 | def init_date_list(self):
44 | begin_date = datetime.datetime.strptime("20150918", "%Y%m%d")
45 | end_date = datetime.datetime.strptime("20180827", "%Y%m%d")
46 | while begin_date <= end_date:
47 | date_str = begin_date.strftime("%Y%m%d")
48 | self.date_list.append(date_str)
49 | begin_date += datetime.timedelta(days=1)
50 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = FirstSpider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = FirstSpider
12 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6
2 | ENV PATH /usr/local/bin:$PATH
3 | ADD . /code
4 | WORKDIR /code
5 | RUN pip3 install -r requirements.txt
6 | CMD scrapy crawl BingWallpaper
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing.json
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__init__.py
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/items.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/items.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/settings.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BingItem(scrapy.Item):
12 | image_urls = scrapy.Field()
13 | images = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class BingSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class BingDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BingPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for bing project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'bing'
13 |
14 | SPIDER_MODULES = ['bing.spiders']
15 | NEWSPIDER_MODULE = 'bing.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'bing (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | # DOWNLOAD_DELAY = 31
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | # DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | # }
45 |
46 | # Enable or disable spider middlewares
47 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
48 | # SPIDER_MIDDLEWARES = {
49 | # 'bing.middlewares.BingSpiderMiddleware': 543,
50 | # }
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
54 | # DOWNLOADER_MIDDLEWARES = {
55 | # 'bing.middlewares.BingDownloaderMiddleware': 543,
56 | # }
57 |
58 | # Enable or disable extensions
59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
60 | # EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | # }
63 |
64 | # Configure item pipelines
65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
66 | # ITEM_PIPELINES = {
67 | # 'bing.pipelines.BingPipeline': 300,
68 | # }
69 |
70 | ITEM_PIPELINES = {
71 | # 引入Scrapy提供的ImagesPipeline组件
72 | 'scrapy.pipelines.images.ImagesPipeline': 300,
73 | }
74 |
75 | # ImagesPipeline辅助配置项
76 | # 图片存储路径(绝对路径或相对路径)
77 | IMAGES_STORE = 'out/res/pic/'
78 | # BingItem中定义的存储图片链接的image_urls字段
79 | IMAGES_URLS_FIELD = 'image_urls'
80 | # The images field defined in BingItem (where download results are stored)
81 | IMAGES_RESULT_FIELD = 'images'
82 | # 过期时间,单位:天(可选)
83 | IMAGES_EXPIRES = 120
84 | # 过滤小图片(可选)
85 | # IMAGES_MIN_HEIGHT = 110
86 | # IMAGES_MIN_WIDTH = 110
87 | # 是否允许重定向(可选)
88 | # MEDIA_ALLOW_REDIRECTS = True
89 | # 生成缩略图(可选)
90 | # IMAGES_THUMBS = {
91 | # 'small': (50, 50),
92 | # 'big': (270, 270),
93 | # }
94 |
95 | # Enable and configure the AutoThrottle extension (disabled by default)
96 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
97 | # AUTOTHROTTLE_ENABLED = True
98 | # The initial download delay
99 | # AUTOTHROTTLE_START_DELAY = 5
100 | # The maximum download delay to be set in case of high latencies
101 | # AUTOTHROTTLE_MAX_DELAY = 60
102 | # The average number of requests Scrapy should be sending in parallel to
103 | # each remote server
104 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
105 | # Enable showing throttling stats for every response received:
106 | # AUTOTHROTTLE_DEBUG = False
107 |
108 | # Enable and configure HTTP caching (disabled by default)
109 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
110 | # HTTPCACHE_ENABLED = True
111 | # HTTPCACHE_EXPIRATION_SECS = 0
112 | # HTTPCACHE_DIR = 'httpcache'
113 | # HTTPCACHE_IGNORE_HTTP_CODES = []
114 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
115 |
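By default ImagesPipeline names each file after the SHA1 hash of its URL, which is why the files under out/res/pic/full/ have hash names. If readable names are preferred, the pipeline can be subclassed and file_path() overridden; a sketch (the class is ours and would live in bing/pipelines.py, with 'bing.pipelines.NamedImagesPipeline' replacing the ImagesPipeline entry in ITEM_PIPELINES above):

import os
from urllib.parse import urlparse

from scrapy.pipelines.images import ImagesPipeline


class NamedImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Keep the last path segment of the wallpaper URL as the file name
        return 'full/%s' % os.path.basename(urlparse(request.url).path)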
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/BingWallpaper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider, Request
3 | import time
4 | import json
5 |
6 | from bing.items import BingItem
7 |
8 |
9 | class BingWallpaperSpider(Spider):
10 | name = 'BingWallpaper'
11 | allowed_domains = ['cn.bing.com']
12 |
13 | def start_requests(self):
14 | yield Request(
15 | 'https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc={ts}&pid=hp'.format(ts=int(time.time())),
16 | callback=self.parse)
17 |
18 | def parse(self, response):
19 | json_result = json.loads(response.body.decode('utf8'))
20 | images = json_result['images']
21 | if images is not None:
22 | item = BingItem()
23 | url_list = []
24 | for image in images:
25 | url_list.append('https://cn.bing.com' + image['url'])
26 | item['image_urls'] = url_list
27 | yield item
28 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/Test.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | print(int(time.time()))
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/BingWallpaper.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/BingWallpaper.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/Test.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/Test.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104228.709049.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 10:42:29 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 10:42:29 [scrapy] ERROR: Spider error processing (referer: None)
3 | Traceback (most recent call last):
4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
5 | yield next(it)
6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output
7 | for x in result:
8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in
9 | return (_set_referer(r) for r in result or ())
10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in
11 | return (r for r in result or () if _filter(r))
12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in
13 | return (r for r in result or () if _filter(r))
14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse
15 | json_result = json.loads(response.body.decode('utf8'))
16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads
17 | return _default_decoder.decode(s)
18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode
19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end())
20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode
21 | raise JSONDecodeError("Expecting value", s, err.value) from None
22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
23 | 2018-10-15 10:42:29 [scrapy] INFO: Closing spider (finished)
24 | 2018-10-15 10:42:29 [scrapy] INFO: Dumping Scrapy stats:
25 | {'downloader/request_bytes': 210,
26 | 'downloader/request_count': 1,
27 | 'downloader/request_method_count/GET': 1,
28 | 'downloader/response_bytes': 48356,
29 | 'downloader/response_count': 1,
30 | 'downloader/response_status_count/200': 1,
31 | 'finish_reason': 'finished',
32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 42, 29, 916818),
33 | 'log_count/DEBUG': 1,
34 | 'log_count/ERROR': 1,
35 | 'log_count/INFO': 6,
36 | 'response_received_count': 1,
37 | 'scheduler/dequeued': 1,
38 | 'scheduler/dequeued/memory': 1,
39 | 'scheduler/enqueued': 1,
40 | 'scheduler/enqueued/memory': 1,
41 | 'spider_exceptions/JSONDecodeError': 1,
42 | 'start_time': datetime.datetime(2018, 10, 15, 2, 42, 29, 334376)}
43 | 2018-10-15 10:42:29 [scrapy] INFO: Spider closed (finished)
44 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104303.655633.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 10:43:03 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 10:43:04 [scrapy] ERROR: Spider error processing (referer: None)
3 | Traceback (most recent call last):
4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
5 | yield next(it)
6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output
7 | for x in result:
8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in
9 | return (_set_referer(r) for r in result or ())
10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in
11 | return (r for r in result or () if _filter(r))
12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in
13 | return (r for r in result or () if _filter(r))
14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse
15 | json_result = json.loads(response.body.decode('utf8'))
16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads
17 | return _default_decoder.decode(s)
18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode
19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end())
20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode
21 | raise JSONDecodeError("Expecting value", s, err.value) from None
22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
23 | 2018-10-15 10:43:04 [scrapy] INFO: Closing spider (finished)
24 | 2018-10-15 10:43:04 [scrapy] INFO: Dumping Scrapy stats:
25 | {'downloader/request_bytes': 210,
26 | 'downloader/request_count': 1,
27 | 'downloader/request_method_count/GET': 1,
28 | 'downloader/response_bytes': 48361,
29 | 'downloader/response_count': 1,
30 | 'downloader/response_status_count/200': 1,
31 | 'finish_reason': 'finished',
32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 43, 4, 73509),
33 | 'log_count/DEBUG': 1,
34 | 'log_count/ERROR': 1,
35 | 'log_count/INFO': 6,
36 | 'response_received_count': 1,
37 | 'scheduler/dequeued': 1,
38 | 'scheduler/dequeued/memory': 1,
39 | 'scheduler/enqueued': 1,
40 | 'scheduler/enqueued/memory': 1,
41 | 'spider_exceptions/JSONDecodeError': 1,
42 | 'start_time': datetime.datetime(2018, 10, 15, 2, 43, 3, 665598)}
43 | 2018-10-15 10:43:04 [scrapy] INFO: Spider closed (finished)
44 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104348.228406.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 10:43:48 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 10:43:48 [scrapy] ERROR: Spider error processing (referer: None)
3 | Traceback (most recent call last):
4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
5 | yield next(it)
6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output
7 | for x in result:
8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in
9 | return (_set_referer(r) for r in result or ())
10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in
11 | return (r for r in result or () if _filter(r))
12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in
13 | return (r for r in result or () if _filter(r))
14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse
15 | json_result = json.loads(response.body.decode('utf8'))
16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads
17 | return _default_decoder.decode(s)
18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode
19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end())
20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode
21 | raise JSONDecodeError("Expecting value", s, err.value) from None
22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
23 | 2018-10-15 10:43:48 [scrapy] INFO: Closing spider (finished)
24 | 2018-10-15 10:43:48 [scrapy] INFO: Dumping Scrapy stats:
25 | {'downloader/request_bytes': 210,
26 | 'downloader/request_count': 1,
27 | 'downloader/request_method_count/GET': 1,
28 | 'downloader/response_bytes': 48360,
29 | 'downloader/response_count': 1,
30 | 'downloader/response_status_count/200': 1,
31 | 'finish_reason': 'finished',
32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 43, 48, 681197),
33 | 'log_count/DEBUG': 1,
34 | 'log_count/ERROR': 1,
35 | 'log_count/INFO': 6,
36 | 'response_received_count': 1,
37 | 'scheduler/dequeued': 1,
38 | 'scheduler/dequeued/memory': 1,
39 | 'scheduler/enqueued': 1,
40 | 'scheduler/enqueued/memory': 1,
41 | 'spider_exceptions/JSONDecodeError': 1,
42 | 'start_time': datetime.datetime(2018, 10, 15, 2, 43, 48, 238379)}
43 | 2018-10-15 10:43:48 [scrapy] INFO: Spider closed (finished)
44 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104841.872511.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 10:48:42 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 10:48:42 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
3 | 2018-10-15 10:48:42 [scrapy] DEBUG: Crawled (200) (referer: None)
4 | 2018-10-15 10:48:42 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in
5 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None)
6 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in
7 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None)
8 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in
9 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None)
10 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in
11 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None)
12 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in
13 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None)
14 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in
15 | 2018-10-15 10:48:43 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp>
16 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg',
17 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg',
18 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg',
19 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg',
20 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg',
21 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg',
22 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'],
23 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f',
24 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg',
25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'},
26 | {'checksum': 'adbcc3f3fa26188db600654137117e2a',
27 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg',
28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'},
29 | {'checksum': '092f09cdb791bedf29913ad3d1940960',
30 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg',
31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'},
32 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7',
33 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg',
34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'},
35 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1',
36 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg',
37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'},
38 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99',
39 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg',
40 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'},
41 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af',
42 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg',
43 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]}
44 | 2018-10-15 10:48:43 [scrapy] INFO: Closing spider (finished)
45 | 2018-10-15 10:48:43 [scrapy] INFO: Dumping Scrapy stats:
46 | {'downloader/request_bytes': 3614,
47 | 'downloader/request_count': 7,
48 | 'downloader/request_method_count/GET': 7,
49 | 'downloader/response_bytes': 1842731,
50 | 'downloader/response_count': 7,
51 | 'downloader/response_status_count/200': 7,
52 | 'file_count': 7,
53 | 'file_status_count/downloaded': 6,
54 | 'file_status_count/uptodate': 1,
55 | 'finish_reason': 'finished',
56 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 48, 43, 629811),
57 | 'item_scraped_count': 1,
58 | 'log_count/DEBUG': 15,
59 | 'log_count/INFO': 6,
60 | 'response_received_count': 7,
61 | 'scheduler/dequeued': 1,
62 | 'scheduler/dequeued/memory': 1,
63 | 'scheduler/enqueued': 1,
64 | 'scheduler/enqueued/memory': 1,
65 | 'start_time': datetime.datetime(2018, 10, 15, 2, 48, 41, 884479)}
66 | 2018-10-15 10:48:43 [scrapy] INFO: Spider closed (finished)
67 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104922.591600.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 10:49:22 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
3 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
4 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
5 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
6 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
7 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
8 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
9 | 2018-10-15 10:49:23 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp>
10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg',
11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg',
12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg',
13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg',
14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg',
15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg',
16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'],
17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f',
18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg',
19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'},
20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a',
21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg',
22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'},
23 | {'checksum': '092f09cdb791bedf29913ad3d1940960',
24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg',
25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'},
26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7',
27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg',
28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'},
29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1',
30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg',
31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'},
32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99',
33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg',
34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'},
35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af',
36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg',
37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]}
38 | 2018-10-15 10:49:23 [scrapy] INFO: Closing spider (finished)
39 | 2018-10-15 10:49:23 [scrapy] INFO: Dumping Scrapy stats:
40 | {'downloader/request_bytes': 270,
41 | 'downloader/request_count': 1,
42 | 'downloader/request_method_count/GET': 1,
43 | 'downloader/response_bytes': 2711,
44 | 'downloader/response_count': 1,
45 | 'downloader/response_status_count/200': 1,
46 | 'file_count': 7,
47 | 'file_status_count/uptodate': 7,
48 | 'finish_reason': 'finished',
49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 49, 23, 63339),
50 | 'item_scraped_count': 1,
51 | 'log_count/DEBUG': 9,
52 | 'log_count/INFO': 6,
53 | 'response_received_count': 1,
54 | 'scheduler/dequeued': 1,
55 | 'scheduler/dequeued/memory': 1,
56 | 'scheduler/enqueued': 1,
57 | 'scheduler/enqueued/memory': 1,
58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 49, 22, 600576)}
59 | 2018-10-15 10:49:23 [scrapy] INFO: Spider closed (finished)
60 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T105002.320386.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 10:50:02 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
3 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
4 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
5 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
6 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
7 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
8 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
9 | 2018-10-15 10:50:02 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp>
10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg',
11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg',
12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg',
13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg',
14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg',
15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg',
16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'],
17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f',
18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg',
19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'},
20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a',
21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg',
22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'},
23 | {'checksum': '092f09cdb791bedf29913ad3d1940960',
24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg',
25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'},
26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7',
27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg',
28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'},
29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1',
30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg',
31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'},
32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99',
33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg',
34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'},
35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af',
36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg',
37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]}
38 | 2018-10-15 10:50:02 [scrapy] INFO: Closing spider (finished)
39 | 2018-10-15 10:50:02 [scrapy] INFO: Dumping Scrapy stats:
40 | {'downloader/request_bytes': 270,
41 | 'downloader/request_count': 1,
42 | 'downloader/request_method_count/GET': 1,
43 | 'downloader/response_bytes': 2711,
44 | 'downloader/response_count': 1,
45 | 'downloader/response_status_count/200': 1,
46 | 'file_count': 7,
47 | 'file_status_count/uptodate': 7,
48 | 'finish_reason': 'finished',
49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 50, 2, 820051),
50 | 'item_scraped_count': 1,
51 | 'log_count/DEBUG': 9,
52 | 'log_count/INFO': 6,
53 | 'response_received_count': 1,
54 | 'scheduler/dequeued': 1,
55 | 'scheduler/dequeued/memory': 1,
56 | 'scheduler/enqueued': 1,
57 | 'scheduler/enqueued/memory': 1,
58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 50, 2, 331356)}
59 | 2018-10-15 10:50:02 [scrapy] INFO: Spider closed (finished)
60 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T105902.809743.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 10:59:03 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
3 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
4 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
5 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
6 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
7 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
8 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
9 | 2018-10-15 10:59:03 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539572342&pid=hp>
10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg',
11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg',
12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg',
13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg',
14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg',
15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg',
16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'],
17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f',
18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg',
19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'},
20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a',
21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg',
22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'},
23 | {'checksum': '092f09cdb791bedf29913ad3d1940960',
24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg',
25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'},
26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7',
27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg',
28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'},
29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1',
30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg',
31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'},
32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99',
33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg',
34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'},
35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af',
36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg',
37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]}
38 | 2018-10-15 10:59:03 [scrapy] INFO: Closing spider (finished)
39 | 2018-10-15 10:59:03 [scrapy] INFO: Dumping Scrapy stats:
40 | {'downloader/request_bytes': 270,
41 | 'downloader/request_count': 1,
42 | 'downloader/request_method_count/GET': 1,
43 | 'downloader/response_bytes': 2711,
44 | 'downloader/response_count': 1,
45 | 'downloader/response_status_count/200': 1,
46 | 'file_count': 7,
47 | 'file_status_count/uptodate': 7,
48 | 'finish_reason': 'finished',
49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 59, 3, 536799),
50 | 'item_scraped_count': 1,
51 | 'log_count/DEBUG': 9,
52 | 'log_count/INFO': 6,
53 | 'response_received_count': 1,
54 | 'scheduler/dequeued': 1,
55 | 'scheduler/dequeued/memory': 1,
56 | 'scheduler/enqueued': 1,
57 | 'scheduler/enqueued/memory': 1,
58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 59, 3, 51096)}
59 | 2018-10-15 10:59:03 [scrapy] INFO: Spider closed (finished)
60 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T113038.987323.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 11:30:39 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
3 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
4 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
5 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
6 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
7 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
8 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
9 | 2018-10-15 11:30:39 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539574239&pid=hp>
10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg',
11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg',
12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg',
13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg',
14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg',
15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg',
16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'],
17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f',
18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg',
19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'},
20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a',
21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg',
22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'},
23 | {'checksum': '092f09cdb791bedf29913ad3d1940960',
24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg',
25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'},
26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7',
27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg',
28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'},
29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1',
30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg',
31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'},
32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99',
33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg',
34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'},
35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af',
36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg',
37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]}
38 | 2018-10-15 11:30:39 [scrapy] INFO: Closing spider (finished)
39 | 2018-10-15 11:30:39 [scrapy] INFO: Dumping Scrapy stats:
40 | {'downloader/request_bytes': 270,
41 | 'downloader/request_count': 1,
42 | 'downloader/request_method_count/GET': 1,
43 | 'downloader/response_bytes': 2711,
44 | 'downloader/response_count': 1,
45 | 'downloader/response_status_count/200': 1,
46 | 'file_count': 7,
47 | 'file_status_count/uptodate': 7,
48 | 'finish_reason': 'finished',
49 | 'finish_time': datetime.datetime(2018, 10, 15, 3, 30, 39, 713384),
50 | 'item_scraped_count': 1,
51 | 'log_count/DEBUG': 9,
52 | 'log_count/INFO': 6,
53 | 'response_received_count': 1,
54 | 'scheduler/dequeued': 1,
55 | 'scheduler/dequeued/memory': 1,
56 | 'scheduler/enqueued': 1,
57 | 'scheduler/enqueued/memory': 1,
58 | 'start_time': datetime.datetime(2018, 10, 15, 3, 30, 39, 227680)}
59 | 2018-10-15 11:30:39 [scrapy] INFO: Spider closed (finished)
60 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T120654.496911.log:
--------------------------------------------------------------------------------
1 | 2018-10-15 12:06:54 [scrapy] DEBUG: Crawled (200) (referer: None)
2 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
3 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
4 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
5 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
6 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
7 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
8 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in
9 | 2018-10-15 12:06:55 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539575848&pid=hp>
10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg',
11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg',
12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg',
13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg',
14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg',
15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg',
16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'],
17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f',
18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg',
19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'},
20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a',
21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg',
22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'},
23 | {'checksum': '092f09cdb791bedf29913ad3d1940960',
24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg',
25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'},
26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7',
27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg',
28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'},
29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1',
30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg',
31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'},
32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99',
33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg',
34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'},
35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af',
36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg',
37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]}
38 | 2018-10-15 12:06:55 [scrapy] INFO: Closing spider (finished)
39 | 2018-10-15 12:06:55 [scrapy] INFO: Dumping Scrapy stats:
40 | {'downloader/request_bytes': 270,
41 | 'downloader/request_count': 1,
42 | 'downloader/request_method_count/GET': 1,
43 | 'downloader/response_bytes': 2711,
44 | 'downloader/response_count': 1,
45 | 'downloader/response_status_count/200': 1,
46 | 'file_count': 7,
47 | 'file_status_count/uptodate': 7,
48 | 'finish_reason': 'finished',
49 | 'finish_time': datetime.datetime(2018, 10, 15, 4, 6, 55, 222970),
50 | 'item_scraped_count': 1,
51 | 'log_count/DEBUG': 9,
52 | 'log_count/INFO': 6,
53 | 'response_received_count': 1,
54 | 'scheduler/dequeued': 1,
55 | 'scheduler/dequeued/memory': 1,
56 | 'scheduler/enqueued': 1,
57 | 'scheduler/enqueued/memory': 1,
58 | 'start_time': datetime.datetime(2018, 10, 15, 4, 6, 54, 748238)}
59 | 2018-10-15 12:06:55 [scrapy] INFO: Spider closed (finished)
60 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/033317f07b809f0cd06487b30b29eccb26d063b8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/033317f07b809f0cd06487b30b29eccb26d063b8.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/0698af79b195349b838bdfeebbd11409f82f0f38.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/0698af79b195349b838bdfeebbd11409f82f0f38.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/092235104f84cb2f4de8808c10f655298313f65c.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/092235104f84cb2f4de8808c10f655298313f65c.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/2efd29b32c481136507115a3ee2e6181c122aa0b.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/2efd29b32c481136507115a3ee2e6181c122aa0b.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/4099096a19a0eaad0aef6782a206881d948ad775.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/4099096a19a0eaad0aef6782a206881d948ad775.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/486c568e353051efd0959cc4a424ff9093cfceb9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/486c568e353051efd0959cc4a424ff9093cfceb9.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/599f27e7835da59b495c44297cce0553ee4a0b51.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/599f27e7835da59b495c44297cce0553ee4a0b51.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/885648740905a26703e18c1ae24f23c480ecc822.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/885648740905a26703e18c1ae24f23c480ecc822.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/97e86cde9a308e626f537c107303537ec598903c.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/97e86cde9a308e626f537c107303537ec598903c.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/c14461fb44425865b9afe6695ab5926e2001411c.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/c14461fb44425865b9afe6695ab5926e2001411c.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/cbba4b16b644659920ad93e10a6d3478270ce927.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/cbba4b16b644659920ad93e10a6d3478270ce927.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e254600d400f3c54c77171e02b021d46369788ae.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e254600d400f3c54c77171e02b021d46369788ae.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/ed989d9c858c5290ca559cf2c462cace68e49362.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/ed989d9c858c5290ca559cf2c462cace68e49362.jpg
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/requirements.txt:
--------------------------------------------------------------------------------
1 | Scrapy==1.5.1
2 | Pillow==5.2.0
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/run.py:
--------------------------------------------------------------------------------
1 | from scrapy import cmdline
2 |
3 | cmdline.execute(["scrapy", "crawl", "BingWallpaper"])
4 |
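run.py hands control to Scrapy's command-line entry point, which is convenient but ends the process when the crawl finishes. If the crawl has to be started from inside a longer-running Python program, Scrapy's CrawlerProcess API is an alternative; a minimal sketch, assuming it is run from the project root so that bing.settings is importable:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('BingWallpaper')   # spider name as declared in BingWallpaper.py
process.start()                  # blocks until the crawl is finished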
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/代码/bing/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = bing.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = bing
12 |
--------------------------------------------------------------------------------
/7、Python爬虫框架Scrapy(上)/勘误.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/勘误.md
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__init__.py
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/items.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/items.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/middlewares.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/middlewares.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/pipelines.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/pipelines.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/settings.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | from scrapy import Field, Item
9 |
10 |
11 | class JianshuspiderItem(Item):
12 | title = Field()
13 | content = Field()
14 | url = Field()
15 | nickname = Field()
16 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | from selenium import webdriver
8 | from scrapy import signals
9 | from scrapy.http import HtmlResponse
10 |
11 |
12 | class JianshuspiderSpiderMiddleware(object):
13 | # Not all methods need to be defined. If a method is not defined,
14 | # scrapy acts as if the spider middleware does not modify the
15 | # passed objects.
16 |
17 | @classmethod
18 | def from_crawler(cls, crawler):
19 | # This method is used by Scrapy to create your spiders.
20 | s = cls()
21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
22 | return s
23 |
24 | def process_spider_input(self, response, spider):
25 | # Called for each response that goes through the spider
26 | # middleware and into the spider.
27 |
28 | # Should return None or raise an exception.
29 | return None
30 |
31 | def process_spider_output(self, response, result, spider):
32 | # Called with the results returned from the Spider, after
33 | # it has processed the response.
34 |
35 | # Must return an iterable of Request, dict or Item objects.
36 | for i in result:
37 | yield i
38 |
39 | def process_spider_exception(self, response, exception, spider):
40 | # Called when a spider or process_spider_input() method
41 | # (from other spider middleware) raises an exception.
42 |
43 | # Should return either None or an iterable of Response, dict
44 | # or Item objects.
45 | pass
46 |
47 | def process_start_requests(self, start_requests, spider):
48 | # Called with the start requests of the spider, and works
49 | # similarly to the process_spider_output() method, except
50 | # that it doesn’t have a response associated.
51 |
52 | # Must return only requests (not items).
53 | for r in start_requests:
54 | yield r
55 |
56 | def spider_opened(self, spider):
57 | spider.logger.info('Spider opened: %s' % spider.name)
58 |
59 |
60 | class JianshuspiderDownloaderMiddleware(object):
61 | # Not all methods need to be defined. If a method is not defined,
62 | # scrapy acts as if the downloader middleware does not modify the
63 | # passed objects.
64 |
65 | @classmethod
66 | def from_crawler(cls, crawler):
67 | # This method is used by Scrapy to create your spiders.
68 | s = cls()
69 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
70 | return s
71 |
72 | def process_request(self, request, spider):
73 | # Called for each request that goes through the downloader
74 | # middleware.
75 |
76 | # Must either:
77 | # - return None: continue processing this request
78 | # - or return a Response object
79 | # - or return a Request object
80 | # - or raise IgnoreRequest: process_exception() methods of
81 | # installed downloader middleware will be called
82 | return None
83 |
84 | def process_response(self, request, response, spider):
85 | # Called with the response returned from the downloader.
86 |
87 |         # Must either:
88 | # - return a Response object
89 | # - return a Request object
90 | # - or raise IgnoreRequest
91 | return response
92 |
93 | def process_exception(self, request, exception, spider):
94 | # Called when a download handler or a process_request()
95 | # (from other downloader middleware) raises an exception.
96 |
97 | # Must either:
98 | # - return None: continue processing this exception
99 | # - return a Response object: stops process_exception() chain
100 | # - return a Request object: stops process_exception() chain
101 | pass
102 |
103 | def spider_opened(self, spider):
104 | spider.logger.info('Spider opened: %s' % spider.name)
105 |
106 |
107 | class JSSeleniumMiddleware:
108 | def __init__(self):
109 | self.browser = webdriver.Chrome()
110 |
111 | def __del__(self):
112 | self.browser.close()
113 |
114 | def process_request(self, request, spider):
115 | self.browser.get("https://www.jianshu.com/")
116 | return HtmlResponse(url='https://www.jianshu.com/', body=self.browser.page_source, request=request,
117 | encoding='utf-8', status=200)
118 |
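JSSeleniumMiddleware above always loads the Jianshu front page and returns the rendered HTML, which is enough for the single-page spider in this chapter but ties the middleware to one URL, opens a visible Chrome window, and relies on __del__ for cleanup. A hedged sketch of a more general variant — it navigates to request.url, runs Chrome headless, and quits the driver on spider_closed; the headless flag and the signal wiring are assumptions, not taken from the project:

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class HeadlessSeleniumMiddleware:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')          # no visible browser window
        self.browser = webdriver.Chrome(options=options)

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self, spider):
        # quit() ends the whole WebDriver session; close() only closes the current window.
        self.browser.quit()

    def process_request(self, request, spider):
        self.browser.get(request.url)               # render whichever page was requested
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            request=request, encoding='utf-8', status=200)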
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 |
9 |
10 | class JianshuspiderPipeline(object):
11 | def process_item(self, item, spider):
12 | return item
13 |
14 |
15 | class MongoPipeline(object):
16 | def open_spider(self, spider):
17 | self.client = pymongo.MongoClient(host='localhost', port=27017)
18 | self.db = self.client['js']
19 |
20 | def process_item(self, item, spider):
21 |         # insert_one() is the current PyMongo insert call (insert() is deprecated);
22 |         # returning the item keeps it flowing to any later pipelines.
23 |         self.db['index_article'].insert_one(dict(item))
24 |         return item
25 | 
26 |     def close_spider(self, spider):
27 |         self.client.close()
28 | 
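MongoPipeline stores every item in the index_article collection of the js database on the local MongoDB server. A quick, illustrative way to check what ended up there, reusing the same connection parameters:

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
for doc in client['js']['index_article'].find().limit(10):
    # title, content, url and nickname are the fields declared in items.py
    print(doc.get('title'), doc.get('nickname'), doc.get('url'))
client.close()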
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for jianshuspider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'jianshuspider'
13 |
14 | SPIDER_MODULES = ['jianshuspider.spiders']
15 | NEWSPIDER_MODULE = 'jianshuspider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'jianshuspider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'jianshuspider.middlewares.JianshuspiderSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | 'jianshuspider.middlewares.JSSeleniumMiddleware': 543,
57 | }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'jianshuspider.pipelines.MongoPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/jianshu.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/jianshu.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/jianshu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider, Request
3 |
4 | from jianshuspider.items import JianshuspiderItem
5 |
6 |
7 | class JianshuSpider(Spider):
8 | name = 'jianshu'
9 | allowed_domains = ['www.jianshu.com']
10 | start_urls = ['http://www.jianshu.com/']
11 |
12 | def start_requests(self):
13 | yield Request('https://www.jianshu.com', callback=self.parse)
14 |
15 | def parse(self, response):
16 | li_s = response.xpath('//ul[@class="note-list"]/li')
17 | for li in li_s:
18 | item = JianshuspiderItem()
19 | item['title'] = li.xpath('.//div/a[@class="title"]/text()').extract_first()
20 | item['content'] = str(li.xpath('.//div/p[@class="abstract"]/text()').extract_first()).replace(
21 | " ", "").replace(
22 | "\n", "")
23 | item['url'] = 'https://www.jianshu.com/p/' + str(
24 | li.xpath('.//div/a[@class="title"]/@href').extract_first())
25 | item['nickname'] = li.xpath('.//div/a[@class="nickname"]/text()').extract_first()
26 | yield item
27 |
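This project does not ship a launcher script of its own; a minimal run.py (the file name is an assumption, mirroring the run.py included with the FirstSpider project) could start the spider from code instead of the command line:

    from scrapy import cmdline

    # Equivalent to running `scrapy crawl jianshu` from the directory containing scrapy.cfg.
    cmdline.execute('scrapy crawl jianshu'.split())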
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.4.4
2 | APScheduler==3.5.3
3 | asn1crypto==0.24.0
4 | async-timeout==3.0.1
5 | attrs==18.2.0
6 | Automat==0.7.0
7 | certifi==2018.8.24
8 | cffi==1.11.5
9 | chardet==3.0.4
10 | Click==7.0
11 | constantly==15.1.0
12 | cryptography==2.3.1
13 | cssselect==1.0.3
14 | demjson==2.2.4
15 | docopt==0.6.2
16 | Flask==1.0.2
17 | hyperlink==18.0.0
18 | idna==2.7
19 | incremental==17.5.0
20 | itsdangerous==0.24
21 | Jinja2==2.10
22 | lxml==4.2.5
23 | MarkupSafe==1.0
24 | multidict==4.4.2
25 | parsel==1.5.0
26 | Pillow==5.2.0
27 | pipreqs==0.4.9
28 | pyasn1==0.4.4
29 | pyasn1-modules==0.2.2
30 | pycparser==2.19
31 | PyDispatcher==2.0.5
32 | PyHamcrest==1.9.0
33 | pymongo==3.7.2
34 | PyMySQL==0.9.2
35 | pyOpenSSL==18.0.0
36 | pytz==2018.5
37 | pywin32==223
38 | queuelib==1.5.0
39 | redis==2.10.6
40 | requests==2.19.1
41 | Scrapy==1.5.1
42 | scrapyrt==0.10
43 | selenium==3.14.1
44 | service-identity==17.0.0
45 | six==1.11.0
46 | Twisted==18.7.0
47 | tzlocal==1.5.1
48 | urllib3==1.23
49 | w3lib==1.19.0
50 | Werkzeug==0.14.1
51 | yarg==0.1.9
52 | yarl==1.2.6
53 | zope.interface==4.5.0
54 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = jianshuspider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jianshuspider
12 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ip_check.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | import aiohttp
4 | from aiohttp import ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError
5 | from redis import StrictRedis
6 |
7 | test_url = 'https://ip.cn/'
8 |
9 |
10 | class ProxyCheck:
11 | def __init__(self):
12 | self.redis_db = StrictRedis(
13 | host="127.0.0.1",
14 | port=6379,
15 | password="Jay12345",
16 | db=0
17 | )
18 |
19 | # Check whether a proxy IP is still usable
20 | async def check_ip(self, proxy_ip):
21 | conn = aiohttp.TCPConnector(ssl=False)
22 | async with aiohttp.ClientSession(connector=conn) as session:
23 | try:
24 | async with session.get(test_url, proxy=proxy_ip.replace("https", "http"), headers={
25 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
26 | 'Chrome/68.0.3440.106 Safari/537.36'
27 | }) as resp:
28 | if resp.status in [200]:
29 | print("代理可用:", proxy_ip)
30 | else:
31 | print("移除不可用代理ip:", proxy_ip)
32 | self.redis_db.srem('proxy_ips:proxy_pool', proxy_ip)
33 | except (ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError, TimeoutError,
34 | AttributeError):
35 | print("代理请求失败移除代理ip:", proxy_ip)
36 | self.redis_db.srem('proxy_ips:proxy_pool', proxy_ip)
37 |
38 | def check_all_ip(self):
39 | print("开始检测代理ip是否可用")
40 | loop = asyncio.get_event_loop()
41 | tasks = []
42 | for ip in self.redis_db.smembers('proxy_ips:proxy_pool'):
43 | tasks.append(self.check_ip(ip.decode()))
44 | loop.run_until_complete(asyncio.wait(tasks))
45 |
46 |
47 | if __name__ == '__main__':
48 | ProxyCheck().check_all_ip()
49 |
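Note that check_ip catches TimeoutError but never sets a request timeout, so a dead proxy can make a probe hang until the connection gives up on its own. A hedged variation of the probe, assuming aiohttp 3.3+ (which provides ClientTimeout), bounds each check explicitly:

    import asyncio
    import aiohttp

    async def probe(proxy_ip, url='https://ip.cn/'):
        # Limit the whole request to 10 seconds; asyncio.TimeoutError is raised on expiry,
        # which the caller can treat like any other failed proxy.
        conn = aiohttp.TCPConnector(ssl=False)
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
            async with session.get(url, proxy=proxy_ip.replace('https', 'http')) as resp:
                return resp.status == 200

    # Example: asyncio.get_event_loop().run_until_complete(probe('http://1.2.3.4:8080'))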
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__init__.py
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/settings.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class ProxyIpsItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ProxyIpsSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class ProxyIpsDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class ProxyIpsPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for proxy_ips project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'proxy_ips'
13 |
14 | SPIDER_MODULES = ['proxy_ips.spiders']
15 | NEWSPIDER_MODULE = 'proxy_ips.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'proxy_ips (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | # DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
45 | 'Chrome/68.0.3440.106 Safari/537.36',
46 |
47 | }
48 |
49 | # Enable or disable spider middlewares
50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
51 | # SPIDER_MIDDLEWARES = {
52 | # 'proxy_ips.middlewares.ProxyIpsSpiderMiddleware': 543,
53 | # }
54 |
55 | # Enable or disable downloader middlewares
56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
57 | # DOWNLOADER_MIDDLEWARES = {
58 | # 'proxy_ips.middlewares.ProxyIpsDownloaderMiddleware': 543,
59 | # }
60 |
61 | # Enable or disable extensions
62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
63 | # EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | # }
66 |
67 | # Configure item pipelines
68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
69 | # ITEM_PIPELINES = {
70 | # 'proxy_ips.pipelines.ProxyIpsPipeline': 300,
71 | # }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | # AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | # AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | # AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | # AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | # HTTPCACHE_ENABLED = True
89 | # HTTPCACHE_EXPIRATION_SECS = 0
90 | # HTTPCACHE_DIR = 'httpcache'
91 | # HTTPCACHE_IGNORE_HTTP_CODES = []
92 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/proxy_spider.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/proxy_spider.cpython-37.pyc
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/proxy_spider.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | import aiohttp
4 | from aiohttp import ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError
5 | from redis import StrictRedis
6 | from scrapy import Spider, Request
7 | import time
8 |
9 | test_url = 'https://ip.cn/'
10 |
11 |
12 | # Spider that harvests free proxy IPs
13 | class FetchIpSpider(Spider):
14 | name = "fetch_ip"
15 |
16 | def __init__(self, **kwargs):
17 | super().__init__(**kwargs)
18 | self.redis_db = StrictRedis(
19 | host="127.0.0.1",
20 | port=6379,
21 | password="Jay12345",
22 | db=0
23 | )
24 |
25 | def start_requests(self):
26 | # for i in range(1, 5):
27 | # yield Request(url="http://www.xicidaili.com/nn/" + str(i), callback=self.parse_xici, headers={
28 | # 'Host': 'www.xicidaili.com',
29 | # 'Referer': 'http://www.xicidaili.com/',
30 | # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
31 | # 'Chrome/68.0.3440.106 Safari/537.36'
32 | # })
33 |
34 | for i in range(1, 5):
35 | time.sleep(3)
36 | yield Request(url='https://www.kuaidaili.com/free/inha/' + str(i) + '/', callback=self.parse_kuaidaili,
37 | headers={
38 | 'Host': 'www.kuaidaili.com',
39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
40 | 'like Gecko) '
41 | 'Chrome/68.0.3440.106 Safari/537.36'
42 | })
43 |
44 | def parse_xici(self, response):
45 | loop = asyncio.get_event_loop()
46 | proxy_ips = []
47 | for tr in response.css('#ip_list tr'):
48 | td_list = tr.css('td::text')
49 | if len(td_list) < 3:
50 | continue
51 | ip_address = td_list[0].extract() # IP
52 | port = td_list[1].extract() # port
53 | if len(td_list) == 11:
54 | proto = td_list[4].extract()
55 | else:
56 | proto = td_list[5].extract() # protocol type
57 | proxy_ip = '%s://%s:%s' % (proto.lower(), ip_address, port)
58 | # Get the response latency; drop proxies slower than 2 seconds
59 | latency = tr.css('div.bar::attr(title)').re_first(r'(\d+\.\d+)')
60 | if float(latency) > 2:
61 | self.logger.info("跳过慢速代理:%s 响应时间:%s" % (proxy_ip, latency))
62 | else:
63 | self.logger.info("可用代理加入队列:%s 响应时间:%s" % (proxy_ip, latency))
64 | proxy_ips.append(proxy_ip)
65 | tasks = []
66 | for ip in proxy_ips:
67 | tasks.append(self.check_ip(ip))
68 | loop.run_until_complete(asyncio.wait(tasks))
69 |
70 |
71 | def parse_kuaidaili(self, response):
72 | loop = asyncio.get_event_loop()
73 | proxy_ips = []
74 | for tr in response.css('tbody tr'):
75 | td_list = tr.css('td::text')
76 | ip_address = td_list[0].extract() # IP
77 | port = td_list[1].extract() # port
78 | proto = td_list[3].extract() # protocol
79 | proxy_ip = '%s://%s:%s' % (proto.lower(), ip_address, port)
80 | # Get the response latency; drop proxies slower than 2 seconds
81 | latency = float((td_list[5].extract())[:-1])
82 | if float(latency) > 2:
83 | self.logger.info("跳过慢速代理:%s 响应时间:%s" % (proxy_ip, latency))
84 | else:
85 | self.logger.info("可用代理加入队列:%s 响应时间:%s" % (proxy_ip, latency))
86 | proxy_ips.append(proxy_ip)
87 | tasks = []
88 | for ip in proxy_ips:
89 | tasks.append(self.check_ip(ip))
90 | loop.run_until_complete(asyncio.wait(tasks))
91 |
92 | # Check whether a proxy IP is still usable
93 | async def check_ip(self, proxy_ip):
94 | conn = aiohttp.TCPConnector(ssl=False)
95 | async with aiohttp.ClientSession(connector=conn) as session:
96 | try:
97 | async with session.get(test_url, proxy=proxy_ip.replace("https", "http")) as resp:
98 | if resp.status in [200]:
99 | print("代理可用:", proxy_ip)
100 | self.redis_db.sadd('proxy_ips:proxy_pool', proxy_ip)
101 | else:
102 | print("代理不可用:", proxy_ip)
103 | except (ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError, TimeoutError,
104 | AttributeError):
105 | print("代理请求失败:", proxy_ip)
106 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from flask import Flask
3 | from redis import StrictRedis
4 | import random
5 |
6 | app = Flask(__name__)
7 |
8 |
9 | @app.route("/")
10 | def fetch_ip():
11 | ip_list = list(redis_db.smembers("proxy_ips:proxy_pool"))
12 | return random.choice(ip_list).decode()
13 |
14 |
15 | if __name__ == '__main__':
16 | redis_db = StrictRedis(
17 | host="127.0.0.1",
18 | port=6379,
19 | password="Jay12345",
20 | db=0
21 | )
22 | app.run()
23 |
24 |
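A consumer of this service only needs a GET to the root URL, which returns one random proxy string from the Redis pool. A minimal client sketch (assuming Flask's default port 5000 and the http://host:port proxy format stored by the spider):

    import requests

    proxy_ip = requests.get('http://127.0.0.1:5000/').text       # e.g. 'http://1.2.3.4:8080'
    proxies = {'http': proxy_ip, 'https': proxy_ip}
    resp = requests.get('https://ip.cn/', proxies=proxies, timeout=10)
    print(resp.status_code)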
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from apscheduler.schedulers.blocking import BlockingScheduler
5 | from redis import StrictRedis
6 |
7 | fetch_ip_time = 0
8 |
9 | redis_db = StrictRedis(
10 | host="127.0.0.1",
11 | port=6379,
12 | password="Jay12345",
13 | db=0
14 | )
15 |
16 |
17 | def check_ip():
18 | global fetch_ip_time
19 | proxy_pool = redis_db.smembers("proxy_ips:proxy_pool")
20 | if len(proxy_pool) == 0:
21 | print("可用代理IP数目为0,激活爬虫...")
22 | os.system("scrapy crawl fetch_ip")
23 | fetch_ip_time = int(time.time())
24 | else:
25 | if len(proxy_pool) < 5:
26 | if int(time.time() - fetch_ip_time) < 600:
27 | if len(proxy_pool) == 0:
28 | print("虽然处于保护状态,但是可用代理IP数目为0,激活爬虫...")
29 | os.system("scrapy crawl fetch_ip")
30 | fetch_ip_time = int(time.time())
31 | else:
32 | print("当前可用代理IP少于5,但是还处于保护状态,不激活爬虫")
33 | else:
34 | print("当前可用代理IP少于5,且处于非保护状态,激活爬虫...")
35 | os.system("scrapy crawl fetch_ip")
36 | fetch_ip_time = int(time.time())
37 | else:
38 | print("日常自检...")
39 | os.system("python proxy_ip_check.py")
40 |
41 |
42 | if __name__ == '__main__':
43 | check_ip()
44 | scheduler = BlockingScheduler()
45 | # Run the check every 20 seconds
46 | scheduler.add_job(check_ip, 'interval', max_instances=10, seconds=20)
47 | scheduler.start()
48 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = proxy_ips.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = proxy_ips
12 |
--------------------------------------------------------------------------------
/8、Python爬虫框架Scrapy(下)/勘误.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/勘误.md
--------------------------------------------------------------------------------
/9、数据分析案例:Python岗位行情/代码/9_1.py:
--------------------------------------------------------------------------------
1 | """
2 | NumPy usage examples
3 | """
4 |
5 | import numpy as np
6 |
7 | print("1.生成一个一维数组:\n %s" % np.array([1, 2]))
8 | print("2.生成一个二维数组:\n %s" % np.array([[1, 2], [3, 4]]))
9 | print("3.生成一个元素初始值都为0的,4行3列矩阵:\n %s" % np.zeros((4, 3)))
10 | print("4.生成一个元素初始值都为1的,3行4列矩阵:\n %s" % np.ones((3, 4)))
11 | print("5.创建一个空数组,元素为随机值:\n %s" % np.empty([2, 3], dtype=int))
12 | a1 = np.arange(0, 30, 2)
13 | print("6.生成一个等间隔数字的数组:\n %s" % a1)
14 | a2 = a1.reshape(3, 5)
15 | print("7.转换数组的维度,比如把一维的转为3行5列的数组:\n %s" % a2)
16 |
17 | # Common ndarray attributes
18 | print("8.a1的维度: %d \t a2的维度:%d" % (a1.ndim, a2.ndim))
19 | print("9.a1的行列数:%s \t a2的行列数:%s" % (a1.shape, a2.shape))
20 | print("10.a1的元素个数:%d \t a2的元素个数:%d" % (a1.size, a2.size))
21 | print("11.a1的元素数据类型:%s 数据类型大小:%s" % (a1.dtype, a1.itemsize))
22 |
--------------------------------------------------------------------------------
/9、数据分析案例:Python岗位行情/代码/9_2.py:
--------------------------------------------------------------------------------
1 | # Data analysis of Android job postings on Lagou.com
2 | import html
3 | import random
4 | import re
5 | import time
6 | import urllib.parse
7 | from collections import Counter
8 |
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | import pandas as pd
12 | import requests
13 | from scipy.misc import imread
14 | from wordcloud import WordCloud, ImageColorGenerator
15 |
16 | import config as c
17 | import tools as t
18 |
19 | max_page = 1
20 | result_save_file = c.outputs_logs_path + 'result.csv'
21 | pic_save_path = c.outputs_pictures_path + 'LaGou/'
22 | default_font = c.res_documents + 'wryh.ttf' # default font used for the word clouds
23 | default_mask = c.res_pictures + 'default_mask.jpg' # default mask image
24 |
25 | # URL of the Ajax endpoint
26 | ajax_url = "https://www.lagou.com/jobs/positionAjax.json?"
27 |
28 | # URL query-string parameters
29 | request_params = {'needAddtionalResult': 'false'}
30 |
31 | # POST form data
32 | form_data = {'first': 'false', 'pn': '1', 'kd': 'Python'}
33 |
34 | # Regex for extracting the total result count (used to compute the page count)
35 | page_pattern = re.compile(r'"totalCount":(\d*),', re.S)
36 |
37 | # CSV header row
38 | csv_headers = [
39 | '公司id', '城市', '职位名称', '工作年限', '学历', '职位性质', '薪资',
40 | '融资状态', '行业领域', '招聘岗位id', '公司优势', '公司规模',
41 | '公司标签', '所在区域', '技能标签', '公司经度', '公司纬度', '公司全名'
42 | ]
43 |
44 | # Request headers that mimic a browser
45 | ajax_headers = {
46 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
47 | 'Accept-Encoding': 'gzip, deflate, br',
48 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
49 | 'Connection': 'keep-alive',
50 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
51 | 'Host': 'www.lagou.com',
52 | 'Origin': 'https://www.lagou.com',
53 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 '
54 | 'Safari/537.36',
55 | 'X-Anit-Forge-Code': '0',
56 | 'X-Anit-Forge-Token': 'None',
57 | 'X-Requested-With': 'XMLHttpRequest',
58 | 'Referer': 'https://www.lagou.com/jobs/list_android?labelWords=&fromSearch=true&suginput='
59 | }
60 |
61 |
62 | # Fetch one page of job postings
63 | def fetch_data(page):
64 | fetch_url = ajax_url + urllib.parse.urlencode(request_params)
65 | global max_page
66 | while True:
67 | try:
68 | form_data['pn'] = page
69 | print("抓取第:" + str(page) + "页!")
70 | time.sleep(random.randint(5, 15)) # sleep a random 5-15s so the IP does not get banned for requesting too often
71 | resp = requests.post(url=fetch_url, data=form_data, headers=ajax_headers)
72 | if resp.status_code == 200:
73 | if page == 1:
74 | max_page = int(int(page_pattern.search(resp.text).group(1)) / 15)
75 | print("总共有:" + str(max_page) + "页")
76 | data_json = resp.json()['content']['positionResult']['result']
77 | data_list = []
78 | for data in data_json:
79 | data_list.append((data['companyId'],
80 | data['city'],
81 | html.unescape(data['positionName']),
82 | data['workYear'],
83 | data['education'],
84 | data['jobNature'],
85 | data['salary'],
86 | data['financeStage'],
87 | data['industryField'],
88 | data['positionId'],
89 | html.unescape(data['positionAdvantage']),
90 | data['companySize'],
91 | data['companyLabelList'],
92 | data['district'],
93 | html.unescape(data['positionLables']),
94 | data['longitude'],
95 | data['latitude'],
96 | html.unescape(data['companyFullName'])))
97 | result = pd.DataFrame(data_list)
98 | if page == 1:
99 | result.to_csv(result_save_file, header=csv_headers, index=False, mode='a+')
100 | else:
101 | result.to_csv(result_save_file, header=False, index=False, mode='a+')
102 | return None
103 | except Exception as e:
104 | print(e)
105 |
106 |
107 | # Generate a word-cloud image file
108 | def make_wc(content, file_name, mask_pic=default_mask, font=default_font):
109 | bg_pic = imread(mask_pic)
110 | pic_colors = ImageColorGenerator(bg_pic)
111 | wc = WordCloud(font_path=font, background_color='white', margin=2, max_font_size=250,
112 | width=2000, height=2000,
113 | min_font_size=30, max_words=1000)
114 | wc.generate_from_frequencies(content)
115 | wc.to_file(file_name)
116 |
117 |
118 | # Data analysis (generates the charts and word-cloud files)
119 | def data_analysis(data):
120 | # 1. Analyze the hiring companies
121 | # industry field
122 | industry_field_list = []
123 | for industry_field in data['行业领域']:
124 | for field in industry_field.strip().replace(" ", ",").replace("、", ",").split(','):
125 | industry_field_list.append(field)
126 | counter = dict(Counter(industry_field_list))
127 | counter.pop('')
128 | make_wc(counter, pic_save_path + "wc_1.jpg")
129 |
130 | # company size
131 | plt.figure(1)
132 | data['公司规模'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=np.linspace(0, 0.5, 6))
133 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20,
134 | bottom=0.17, top=0.84)
135 | plt.savefig(pic_save_path + 'result_1.jpg')
136 | plt.close(1)
137 | # financing stage
138 | plt.figure(2)
139 | data['融资状态'].value_counts().plot(kind='pie', autopct='%1.1f%%')
140 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20,
141 | bottom=0.17, top=0.84)
142 | plt.savefig(pic_save_path + 'result_2.jpg')
143 | plt.close(2)
144 | # district
145 | plt.figure(3)
146 | data['所在区域'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=[0, 0, 0, 0, 0, 0, 0, 1, 1.5])
147 | plt.subplots_adjust(left=0.31, right=0.74, wspace=0.20, hspace=0.20,
148 | bottom=0.26, top=0.84)
149 | plt.savefig(pic_save_path + 'result_3.jpg')
150 | plt.close(3)
151 | # company tags
152 | tags_list = []
153 | for tags in data['公司标签']:
154 | for tag in tags.strip().replace("[", "").replace("]", "").replace("'", "").split(','):
155 | tags_list.append(tag)
156 | counter = dict(Counter(tags_list))
157 | counter.pop('')
158 | make_wc(counter, pic_save_path + "wc_2.jpg")
159 | # company advantages
160 | advantage_list = []
161 | for advantage_field in data['公司优势']:
162 | for field in advantage_field.strip().replace(" ", ",").replace("、", ",").replace(",", ",").replace("+", ",") \
163 | .split(','):
164 | advantage_list.append(field)
165 | counter = dict(Counter(advantage_list))
166 | counter.pop('', None)
167 | counter.pop('移动互联网', None)
168 | make_wc(counter, pic_save_path + "wc_3.jpg")
169 |
170 | # 2. Analyze the job requirements
171 | # required years of experience
172 | # horizontal bar chart
173 | plt.figure(4)
174 | data['工作年限'].value_counts().plot(kind='barh', rot=0)
175 | plt.title("工作经验直方图")
176 | plt.xlabel("年限/年")
177 | plt.ylabel("公司/个")
178 | plt.savefig(pic_save_path + 'result_4.jpg')
179 | plt.close(4)
180 | # pie chart
181 | plt.figure(5)
182 | data['工作年限'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=np.linspace(0, 0.75, 6))
183 | plt.title("工作经验饼图")
184 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20,
185 | bottom=0.17, top=0.84)
186 | plt.savefig(pic_save_path + 'result_5.jpg')
187 | plt.close(5)
188 | # education requirement
189 | plt.figure(6)
190 | data['学历'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=(0, 0.1, 0.2))
191 | plt.title("学历饼图")
192 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20,
193 | bottom=0.17, top=0.84)
194 | plt.savefig(pic_save_path + 'result_6.jpg')
195 | plt.close(6)
196 |
197 | # Salary (keep only the lower bound before '-', strip the k/K/"以上" suffixes, then sort the buckets by integer value in ascending order)
198 | plt.figure(7)
199 | salary = data['薪资'].str.split('-').str.get(0).str.replace('k|K|以上', "").value_counts()
200 | salary_index = list(salary.index)
201 | salary_index.sort(key=lambda x: int(x))
202 | final_salary = salary.reindex(salary_index)
203 | plt.title("薪资条形图")
204 | final_salary.plot(kind='bar', rot=0)
205 | plt.xlabel("薪资/K")
206 | plt.ylabel("公司/个")
207 | plt.savefig(pic_save_path + 'result_7.jpg')
208 | plt.close(7)
209 |
210 | # skill tags
211 | skill_list = []
212 | for skills in data['技能标签']:
213 | for skill in skills.strip().replace("[", "").replace("]", "").replace("'", "").split(','):
214 | skill_list.append(skill)
215 | counter = dict(Counter(skill_list))
216 | counter.pop('')
217 | counter.pop('Android')
218 | make_wc(counter, pic_save_path + "wc_4.jpg")
219 |
220 |
221 | # Process the data
222 | if __name__ == '__main__':
223 | t.is_dir_existed(pic_save_path)
224 | if not t.is_dir_existed(result_save_file, mkdir=False):
225 | fetch_data(1)
226 | for cur_page in range(2, max_page + 1):
227 | fetch_data(cur_page)
228 | else:
229 | raw_data = pd.read_csv(result_save_file)
230 | data_analysis(raw_data)
231 | # Filter e-commerce companies
232 | dzsw_result = raw_data.loc[raw_data["行业领域"].str.find("电子商务") != -1, ["行业领域", "公司全名"]]
233 | dzsw_result.to_csv(c.outputs_logs_path + "dzsw.csv", header=False, index=False, mode='a+')
234 | # Filter companies located in 龙华新区 (Longhua New District)
235 | p_num_result = raw_data.loc[raw_data["所在区域"] == "龙华新区", ["所在区域", "公司全名"]]
236 | p_num_result.to_csv(c.outputs_logs_path + "lhxq.csv", header=False, index=False, mode='a+')
237 |
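The salary handling above (keep only the lower bound before '-', strip the k/K/以上 suffixes, sort the buckets numerically) is easier to follow on a tiny standalone sample. The values below are made up purely for illustration, and regex=True is passed explicitly (assuming pandas 0.23+):

    import pandas as pd

    sample = pd.Series(['10k-20k', '15K-25K', '30k以上', '8k-12k', '10k-18k'])
    low = sample.str.split('-').str.get(0).str.replace('k|K|以上', '', regex=True)  # lower bound, digits only
    counts = low.value_counts()
    counts = counts.reindex(sorted(counts.index, key=int))  # '8' before '10' numerically, not lexically
    print(counts)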
--------------------------------------------------------------------------------
/9、数据分析案例:Python岗位行情/勘误.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/9、数据分析案例:Python岗位行情/勘误.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 《Python网络爬虫从入门到实践》 Errata and Companion Source Code
2 |
3 |
4 | ---
5 |
6 | ![][1]
7 |
8 | It seems the publisher forgot to include the companion source code in the book...
9 |
10 | And since my computer was reinstalled, only a fairly old backup of the code survived...
11 |
12 | Please make do with it...
13 |
14 | If you spot problems in the book (typos, code errors, printing mistakes, etc.), please **open an issue**; it is much appreciated.
15 |
16 | The book was originally planned as two parts: **Python basics** and **Python web scraping**; this volume is the scraping part.
17 | The basics part was not published for various practical reasons, so the author has released that content on the WeChat official account, **for free**!!!
18 | Interested readers can find it there. Thanks!
19 |
20 | ![][2]
21 |
22 | Finally, for anything else (joining the reader group, business cooperation, etc.), leave a message on the official account or add my personal WeChat.
23 |
24 | ![][3]
25 |
26 |
27 | [1]: http://static.zybuluo.com/coder-pig/ionx6je52iwlhxbgba3t1x51/12121.png
28 | [2]: http://static.zybuluo.com/coder-pig/1jpu7nalyfp3kvaxfm4q0h8y/20190524181102821.jpg
29 | [3]: http://static.zybuluo.com/coder-pig/whqf2oblwvzqempi2eec32xy/1111.png
--------------------------------------------------------------------------------