├── .gitignore ├── 10、数据分析案例:某婚恋网站交友情况分析 ├── 代码 │ ├── 10_1.py │ ├── render.html │ ├── wzly.csv │ └── wzly.ipynb └── 勘误.md ├── 1、Python爬虫概念与Web基础 └── 勘误.md ├── 2、Python爬虫基本库的使用 ├── 代码 │ ├── 2_1.py │ ├── 2_10.py │ ├── 2_11.py │ ├── 2_12.py │ ├── 2_13.py │ ├── 2_14.py │ ├── 2_2.py │ ├── 2_3.py │ ├── 2_4.py │ ├── 2_5.py │ ├── 2_6.py │ ├── 2_7.py │ ├── 2_8.py │ ├── 2_9.py │ ├── cookie.txt │ └── proxy_ips.txt └── 勘误.md ├── 3、Python爬虫抓包与数据解析 ├── 代码 │ ├── 3_1.py │ ├── 3_2.py │ ├── 3_3.py │ └── 3_4.py └── 勘误.md ├── 4、用CSV 和 Excel 存储数据 ├── 代码 │ ├── 4_1.py │ ├── 4_10.py │ ├── 4_2.py │ ├── 4_3.py │ ├── 4_4.py │ ├── 4_5.py │ ├── 4_6.py │ ├── 4_7.py │ ├── 4_8.py │ └── 4_9.py └── 勘误.md ├── 5、用数据库存储数据 ├── 代码 │ ├── 5_1.py │ ├── 5_10.py │ ├── 5_2.py │ ├── 5_3.py │ ├── 5_4.py │ ├── 5_5.py │ ├── 5_6.py │ ├── 5_7.py │ ├── 5_8.py │ └── 5_9.py └── 勘误.md ├── 6、Python应对反爬虫策略 ├── 代码 │ ├── 6_1.py │ ├── 6_2.py │ ├── 6_3.py │ ├── 6_4.py │ ├── 6_5.py │ └── 6_6.py └── 勘误.md ├── 7、Python爬虫框架Scrapy(上) ├── 代码 │ ├── FirstSpider │ │ ├── FirstSpider │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── items.cpython-37.pyc │ │ │ │ ├── middlewares.cpython-37.pyc │ │ │ │ ├── pipelines.cpython-37.pyc │ │ │ │ └── settings.cpython-37.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── proxy_ip.txt │ │ │ ├── run.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ └── bcy.cpython-37.pyc │ │ │ │ └── bcy.py │ │ └── scrapy.cfg │ └── bing │ │ ├── Dockerfile │ │ ├── bing.json │ │ ├── bing │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── BingWallpaper.py │ │ │ ├── Test.py │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ ├── BingWallpaper.cpython-37.pyc │ │ │ ├── Test.cpython-37.pyc │ │ │ └── __init__.cpython-37.pyc │ │ ├── logs │ │ └── BingWallpaper │ │ │ ├── 2018-10-15T104228.709049.log │ │ │ ├── 2018-10-15T104303.655633.log │ │ │ ├── 2018-10-15T104348.228406.log │ │ │ ├── 2018-10-15T104841.872511.log │ │ │ ├── 2018-10-15T104922.591600.log │ │ │ ├── 2018-10-15T105002.320386.log │ │ │ ├── 2018-10-15T105902.809743.log │ │ │ ├── 2018-10-15T113038.987323.log │ │ │ └── 2018-10-15T120654.496911.log │ │ ├── out │ │ └── res │ │ │ └── pic │ │ │ └── full │ │ │ ├── 033317f07b809f0cd06487b30b29eccb26d063b8.jpg │ │ │ ├── 0698af79b195349b838bdfeebbd11409f82f0f38.jpg │ │ │ ├── 092235104f84cb2f4de8808c10f655298313f65c.jpg │ │ │ ├── 2efd29b32c481136507115a3ee2e6181c122aa0b.jpg │ │ │ ├── 3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg │ │ │ ├── 4099096a19a0eaad0aef6782a206881d948ad775.jpg │ │ │ ├── 486c568e353051efd0959cc4a424ff9093cfceb9.jpg │ │ │ ├── 5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg │ │ │ ├── 599f27e7835da59b495c44297cce0553ee4a0b51.jpg │ │ │ ├── 86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg │ │ │ ├── 885648740905a26703e18c1ae24f23c480ecc822.jpg │ │ │ ├── 97e86cde9a308e626f537c107303537ec598903c.jpg │ │ │ ├── b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg │ │ │ ├── bca701f1923e317aa8a9be18125c2a894fc80780.jpg │ │ │ ├── bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg │ │ │ ├── c14461fb44425865b9afe6695ab5926e2001411c.jpg │ │ │ ├── cbba4b16b644659920ad93e10a6d3478270ce927.jpg │ │ │ ├── e254600d400f3c54c77171e02b021d46369788ae.jpg │ │ │ ├── e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg │ │ │ └── 
ed989d9c858c5290ca559cf2c462cace68e49362.jpg │ │ ├── requirements.txt │ │ ├── run.py │ │ └── scrapy.cfg └── 勘误.md ├── 8、Python爬虫框架Scrapy(下) ├── 代码 │ ├── jianshuspider │ │ ├── jianshuspider │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── items.cpython-37.pyc │ │ │ │ ├── middlewares.cpython-37.pyc │ │ │ │ ├── pipelines.cpython-37.pyc │ │ │ │ └── settings.cpython-37.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ └── jianshu.cpython-37.pyc │ │ │ │ └── jianshu.py │ │ ├── requirements.txt │ │ └── scrapy.cfg │ └── proxy_ips │ │ ├── proxy_ip_check.py │ │ ├── proxy_ips │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── proxy_spider.cpython-37.pyc │ │ │ └── proxy_spider.py │ │ ├── proxy_server.py │ │ ├── run.py │ │ └── scrapy.cfg └── 勘误.md ├── 9、数据分析案例:Python岗位行情 ├── 代码 │ ├── 9_1.py │ └── 9_2.py └── 勘误.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ -------------------------------------------------------------------------------- /10、数据分析案例:某婚恋网站交友情况分析/代码/10_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 抓取我主良缘妹子交友信息做数据分析 3 | """ 4 | 5 | import requests as rq 6 | import pandas as pd 7 | import time 8 | import random 9 | import os 10 | 11 | # 结果写入文件 12 | result_save_file = 'wzly.csv' 13 | 14 | # Ajax加载url 15 | ajax_url = "http://www.lovewzly.com/api/user/pc/list/search?" 
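# Hedged usage sketch (not from the original script; parameter meanings are inferred,
# not from any documented API): fetch_data() below sends form_data as GET query
# parameters, so a single page is equivalent to requesting
#   http://www.lovewzly.com/api/user/pc/list/search?gender=2&marry=1&page=1
#   resp = rq.get(ajax_url, params={'gender': '2', 'marry': '1', 'page': '1'})
#   profiles = resp.json()['data']['list']   # list of profile dicts for that page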
16 | 17 | # 模拟请求头 18 | ajax_headers = { 19 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 20 | 'Accept-Encoding': 'gzip, deflate, br', 21 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 22 | 'Connection': 'keep-alive', 23 | 'Host': 'www.lovewzly.com', 24 | 'Referer': 'http://www.lovewzly.com/jiaoyou.html', 25 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 ' 26 | 'Safari/537.36', 27 | 'X-Requested-With': 'XMLHttpRequest', 28 | } 29 | 30 | # post请求参数 31 | form_data = {'gender': '2', 'marry': '1', 'page': '1'} 32 | 33 | # csv表头 34 | csv_headers = [ 35 | '昵称', '用户id', '头像', '身高', '学历', '省份', 36 | '城市', '出生年份', '性别', '交友宣言' 37 | ] 38 | 39 | height_interval = ['140', '150', '160', '170', '180'] # 身高范围 40 | edu_interval = ['本科', '大专', '高中', '中专', '初中', '硕士', '博士', '院士'] # 学历范围 41 | age_interval = [ 42 | ('18-30', 8000), ('26-30', 8000), ('31-40', 8000), 43 | ('41-50', 8000), ('50以上', 8000), 44 | ] # 年龄范围 45 | 46 | 47 | # 获取每页交友信息 48 | def fetch_data(page): 49 | while True: 50 | try: 51 | form_data['page'] = page 52 | print("抓取第:" + str(page) + "页!") 53 | resp = rq.get(url=ajax_url, params=form_data, headers=ajax_headers) 54 | if resp.status_code == 200: 55 | data_json = resp.json()['data']['list'] 56 | if len(data_json) > 0: 57 | data_list = [] 58 | for data in data_json: 59 | data_list.append(( 60 | data['username'], data['userid'], data['avatar'], 61 | data['height'], data['education'], data['province'], 62 | data['city'], data['birthdayyear'], data['gender'], data['monolog'])) 63 | result = pd.DataFrame(data_list) 64 | if page == 1: 65 | result.to_csv(result_save_file, header=csv_headers, index=False, mode='a+', encoding='utf-8') 66 | else: 67 | result.to_csv(result_save_file, header=False, index=False, mode='a+', encoding='utf-8') 68 | return None 69 | except Exception as e: 70 | print(e) 71 | 72 | 73 | if __name__ == '__main__': 74 | if not os.path.exists(result_save_file): 75 | for i in range(1, 718): 76 | time.sleep(random.randint(2, 10)) 77 | fetch_data(i) 78 | -------------------------------------------------------------------------------- /10、数据分析案例:某婚恋网站交友情况分析/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/10、数据分析案例:某婚恋网站交友情况分析/勘误.md -------------------------------------------------------------------------------- /1、Python爬虫概念与Web基础/勘误.md: -------------------------------------------------------------------------------- 1 | ## 第1章 Python 爬虫概念与Web基础 2 | 3 | ### 1.1.7 爬虫的学习路线 4 | 5 | 学习路线图部分: 6 | 7 | 1、原文:利用urllib、**requestsy**库 8 | 9 | > 改为:利用urllib、**requests** 库 10 | 11 | 2、原文:利用文件、**CVS**、Excel 12 | 13 | > 改为:利用文件、**CSV**、Excel -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.request使用示例 3 | """ 4 | 5 | import urllib.request 6 | 7 | resp = urllib.request.urlopen("http://www.baidu.com") 8 | print("resp.geturl:", resp.geturl()) 9 | print("resp.msg:", resp.msg) 10 | print("resp.status:", resp.status) 11 | print("resp.version:", resp.version) 12 | print("resp.reason:", resp.reason) 13 | print("resp.debuglevel:", resp.debuglevel) 14 | print("resp.getheaders:", resp.getheaders()[0:2]) 15 | print(resp.read().decode('utf-8')) 16 | 17 | 18 | 19 | 
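# Hedged supplement to 2_1.py above (not part of the original file): urlopen() raises
# urllib.error.HTTPError / URLError on failure, so a more defensive variant of the same
# request can be written as:
import urllib.error
import urllib.request

try:
    resp = urllib.request.urlopen("http://www.baidu.com", timeout=10)
    print("resp.status:", resp.status)
except urllib.error.HTTPError as e:   # the server answered with a 4xx/5xx status
    print("HTTP error:", e.code, e.reason)
except urllib.error.URLError as e:    # DNS failure, refused connection, timeout, ...
    print("URL error:", e.reason)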
-------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_10.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.urlencode函数使用代码示例 3 | """ 4 | from urllib import parse 5 | 6 | params = { 7 | 'q': 'parse', 8 | 'check_keywords': 'yes', 9 | 'area': 'default' 10 | } 11 | url = 'https://docs.python.org/3/search.html?' + parse.urlencode(params) 12 | print("拼接后的URL:", url) 13 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_11.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.parse_qs和parse_qsl函数使用代码示例 3 | """ 4 | from urllib import parse 5 | 6 | params_str = 'q=parse&check_keywords=yes&area=default' 7 | 8 | print("parse_qs 反序列化结果:", parse.parse_qs(params_str)) 9 | print("parse_qsl 反序列化结果:", parse.parse_qsl(params_str)) 10 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_12.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.robotparser使用示例 3 | """ 4 | 5 | from urllib import robotparser 6 | import ssl 7 | ssl._create_default_https_context = ssl._create_unverified_context 8 | 9 | rp = robotparser.RobotFileParser() 10 | # 设置rebots.txt文件的链接 11 | rp.set_url('http://www.taobao.com/robots.txt') 12 | # 读取rebots.txt文件并进行分析 13 | rp.read() 14 | 15 | url = 'https://www.douban.com' 16 | user_agent = 'Baiduspider' 17 | op_info = rp.can_fetch(user_agent, url) 18 | print("Elsespider 代理用户访问情况:",op_info) 19 | 20 | bdp_info = rp.can_fetch(user_agent, url) 21 | print("Baiduspider 代理用户访问情况:",bdp_info) 22 | user_agent = 'Elsespider' -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_13.py: -------------------------------------------------------------------------------- 1 | """ 2 | 刷CSDN博客文章访问量的脚本 3 | """ 4 | import random 5 | import urllib.request 6 | import threading as t 7 | import os 8 | import ssl 9 | 10 | # 全局取消证书验证 11 | ssl._create_default_https_context = ssl._create_unverified_context 12 | 13 | # 代理ip文件 14 | proxy_ips_file = 'proxy_ips.txt' 15 | 16 | # 代理ip列表 17 | proxy_ips = [] 18 | 19 | # 文章地址 20 | article_url = 'https://blog.csdn.net/l1028386804/article/details/116191713' 21 | 22 | # 请求头 23 | headers = { 24 | 'Host': 'blog.csdn.net', 25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' 26 | } 27 | 28 | read_count = 0 29 | 30 | 31 | # 读取文件里的代理ip,返回一个列表 32 | def load_ips(file_path): 33 | if os.path.exists(file_path): 34 | data_list = [] 35 | with open(file_path, "r+", encoding='utf-8') as f: 36 | for ip in f: 37 | data_list.append(ip.replace("\n", "")) 38 | return data_list 39 | 40 | 41 | # 访问网页 42 | def read_article(): 43 | # 随机取出一枚代理ip 44 | proxy_ip = proxy_ips[random.randint(0, len(proxy_ips) - 1)] 45 | proxy_support = urllib.request.ProxyHandler({'http': proxy_ip}) 46 | opener = urllib.request.build_opener(proxy_support) 47 | urllib.request.install_opener(opener) 48 | try: 49 | req = urllib.request.Request(article_url, headers=headers) 50 | resp = urllib.request.urlopen(req, timeout=20) 51 | # 如果返回码是200代表访问成功 52 | if resp is not None and resp.status == 200: 53 | global read_count 54 | read_count += 1 55 | print("累计访问成功次数: %d" % read_count) 56 | return None 57 | except Exception as e: 58 | print(e) 59 | 60 | 61 | if __name__ == '__main__': 62 
| # 读取代理ip列表 63 | proxy_ips = load_ips(proxy_ips_file) 64 | read_article() 65 | if len(proxy_ips) > 0: 66 | for i in range(100): 67 | read_article() -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_14.py: -------------------------------------------------------------------------------- 1 | """ 2 | 爬取笔趣看的小说脚本示例 3 | """ 4 | 5 | import urllib 6 | import urllib.request 7 | import urllib.parse 8 | from lxml import etree 9 | from urllib import error 10 | import lxml.html 11 | import os 12 | import time 13 | 14 | # 小说站点的URL 15 | novel_base_url = 'http://www.biqukan.com' 16 | 17 | # 拉取小说的URL 18 | novel_url = urllib.parse.urljoin(novel_base_url, '/0_790/') 19 | 20 | # 每章小说的链接 21 | chapter_url_list = [] 22 | 23 | # 小说的保存文件夹 24 | novel_save_dir = os.path.join(os.getcwd(), 'novel_cache/') 25 | 26 | # 请求头 27 | headers = { 28 | 'Host': 'www.biqukan.com', 29 | 'Referer': 'http://www.biqukan.com/', 30 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' 31 | } 32 | 33 | # 获取章节链接列表 34 | def fetch_chapter_urls(): 35 | req = urllib.request.Request(url=novel_url, headers=headers) 36 | html = lxml.html.parse(urllib.request.urlopen(req)) 37 | hrefs = html.xpath('//dd/a/@href') 38 | # 过滤前面的最新章节列表和无用章节 39 | for href in hrefs[16:]: 40 | chapter_url_list.append(urllib.parse.urljoin(novel_base_url, href)) 41 | 42 | # 解析每个页面获得章节正文 43 | def parsing_chapter(url): 44 | req = urllib.request.Request(url=url, headers=headers) 45 | html = lxml.html.parse(urllib.request.urlopen(req)) 46 | title = html.xpath('//h1/text()')[0] 47 | contents = html.xpath('//*[@id="content"]/text()') 48 | content = '' 49 | for i in contents: 50 | content += i.strip() 51 | save_novel(title, content) 52 | 53 | # 把章节正文写到本地 54 | def save_novel(name, content): 55 | try: 56 | with open(novel_save_dir + name + '.txt', "w+") as f: 57 | f.write(content.strip()) 58 | except (error.HTTPError, OSError) as reason: 59 | print(str(reason)) 60 | else: 61 | print("下载完成:" + name) 62 | 63 | 64 | if __name__ == '__main__': 65 | # 判断存储的文件夹是否存在,不存在新建 66 | if not os.path.exists(novel_save_dir): 67 | os.mkdir(novel_save_dir) 68 | # 爬取小说文章链接列表 69 | fetch_chapter_urls() 70 | # 遍历抓取所有的小说内容 71 | for chapter in chapter_url_list: 72 | # 定时休眠1s防止ip被封 73 | time.sleep(1) 74 | parsing_chapter(chapter) -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib下载图片 3 | """ 4 | import urllib.request 5 | import ssl 6 | 7 | ssl._create_default_https_context = ssl._create_unverified_context 8 | 9 | # pic_url = "https://www.baidu.com/img/bd_logo1.png" 10 | # pic_resp = urllib.request.urlopen(pic_url,context=context) 11 | # pic = pic_resp.read() 12 | # with open("bg_logo.png", "wb") as f: 13 | # f.write(pic) 14 | 15 | urllib.request.urlretrieve('https://www.baidu.com/img/bd_logo1.png', 'bd_logo.png') 16 | 17 | 18 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | itchat模拟Get请求 3 | """ 4 | 5 | import urllib.request 6 | import json 7 | import ssl 8 | 9 | ssl._create_default_https_context = ssl._create_unverified_context 10 | 11 | get_url = "http://gank.io/api/data/" + urllib.request.quote("福利") + "/1/1" 12 | get_resp = urllib.request.urlopen(get_url) 
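# Illustrative note (not in the original file): urllib.request.quote() percent-encodes
# the Chinese keyword so the URL stays ASCII-safe, e.g.
#   urllib.request.quote("福利")  ->  '%E7%A6%8F%E5%88%A9'
# The response body is then decoded as UTF-8 and parsed with json.loads() below.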
13 | get_result = json.loads(get_resp.read().decode('utf-8')) 14 | # 这里后面的参数用于格式化Json输出格式 15 | get_result_format = json.dumps(get_result, indent=2, 16 | sort_keys=True, ensure_ascii=False) 17 | print(get_result_format) -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib模拟Post请求示例(伪代码,不能直接请求) 3 | """ 4 | import urllib.request 5 | import urllib.parse 6 | import json 7 | 8 | post_url = "http://xxx.xxx.login" 9 | phone = "13555555555" 10 | password = "111111" 11 | values = { 12 | 'phone': phone, 13 | 'password': password 14 | } 15 | data = urllib.parse.urlencode(values).encode(encoding='utf-8') 16 | req = urllib.request.Request(post_url, data) 17 | resp = urllib.request.urlopen(req) 18 | result = json.loads(resp.read()) # Byte结果转Json 19 | print(json.dumps(result, sort_keys=True, 20 | indent=2, ensure_ascii=False)) # 格式化输出Json 21 | 22 | 23 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_5.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib修改请求头代码示例 3 | """ 4 | import urllib.request 5 | 6 | # 修改头信息 7 | novel_url = "http://www.biqukxs.com/book/1.html" 8 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' 9 | 'AppleWebKit/537.36 (KHTML, like Gecko)' 10 | ' Chrome/63.0.3239.84 Safari/537.36', 11 | 'Host': 'www.biqukxs.com', 12 | 'Referer': 'http://www.biqukxs.com/', 13 | 'Connection': 'keep-alive'} 14 | novel_req = urllib.request.Request(novel_url, headers=headers) 15 | novel_resp = urllib.request.urlopen(novel_req) 16 | print(novel_resp.read().decode('gbk')) 17 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib配置代理示例 3 | """ 4 | 5 | import urllib.request 6 | 7 | # 使用ip代理 8 | ip_query_url = "http://ip.chinaz.com/" 9 | 10 | # 1.创建代理处理器,ProxyHandler参数是一个字典{类型:代理ip:端口},下述代理IP失效的话替换成可用的代理ip即可 11 | proxy_support = urllib.request.ProxyHandler({'http': '60.187.118.246:9000'}) 12 | 13 | # 2.定制,创建一个opener 14 | opener = urllib.request.build_opener(proxy_support) 15 | 16 | # 3.安装opener 17 | urllib.request.install_opener(opener) 18 | 19 | # 请求头 20 | headers = { 21 | 'User-Agent': 'User-Agent:Mozilla/5.0 (X11; Linux x86_64)' 22 | ' AppleWebKit/537.36 (KHTML, like Gecko)' 23 | ' Chrome/63.0.3239.84 Safari/537.36', 24 | 'Host': 'ip.chinaz.com' 25 | } 26 | 27 | req = urllib.request.Request(ip_query_url, headers=headers) 28 | resp = urllib.request.urlopen(req, timeout=20) 29 | html = resp.read().decode('utf-8') 30 | print(html) 31 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_7.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib使用cookie代码示例 3 | """ 4 | 5 | import urllib.request 6 | from http import cookiejar 7 | 8 | # ============ 获得Cookie ============ 9 | 10 | # 1.实例化CookieJar对象 11 | 12 | 13 | cookie = cookiejar.CookieJar() 14 | 15 | # 2.创建Cookie处理器 16 | handler = urllib.request.HTTPCookieProcessor(cookie) 17 | 18 | # 3.通过CookieHandler创建opener 19 | opener = urllib.request.build_opener(handler) 20 | 21 | # 4.打开网页 22 | resp = opener.open("http://www.baidu.com") 23 | 24 | for i in cookie: 25 | print("Name = %s" % i.name) 26 | print("Name = %s" % i.value) 27 | 28 | # 
============ 保存Cookie到文件 ============ 29 | # 1.用于保存cookie的文件 30 | cookie_file = "cookie.txt" 31 | 32 | # 2.创建MozillaCookieJar对象保存Cookie 33 | cookie = cookiejar.MozillaCookieJar(cookie_file) 34 | 35 | # 3.创建Cookie处理器 36 | handler = urllib.request.HTTPCookieProcessor(cookie) 37 | 38 | # 4.通过CookieHandler创建opener 39 | opener = urllib.request.build_opener(handler) 40 | 41 | # 5.打开网页 42 | resp = opener.open("http://www.baidu.com") 43 | 44 | # 6.保存Cookie到文件中,参数依次是: 45 | # ignore_discard:即使cookies将被丢弃也将它保存下来 46 | # ignore_expires:如果在该文件中cookies已存在,覆盖原文件写入 47 | cookie.save(ignore_discard=True, ignore_expires=True) 48 | 49 | # ============ 读取Cookie文件 ============ 50 | 51 | cookie_file = "cookie.txt" 52 | 53 | # 1.创建MozillaCookieJar对象保存Cookie 54 | cookie = cookiejar.MozillaCookieJar(cookie_file) 55 | 56 | # 2.从文件中读取cookie内容 57 | cookie.load(cookie_file, ignore_expires=True, ignore_discard=True) 58 | 59 | handler = urllib.request.HTTPCookieProcessor(cookie) 60 | opener = urllib.request.build_opener(handler) 61 | resp = opener.open("http://www.baidu.com") 62 | print(resp.read().decode('utf-8')) 63 | 64 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_8.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.urlparse和urlsplit函数使用示例 3 | """ 4 | import urllib.parse 5 | 6 | urp = urllib.parse.urlparse('https://docs.python.org/3/search.html?q=parse&check_keywords=yes&area=default') 7 | print('urlparse执行结果:', urp) 8 | # 可以通过.的方式获取某个部分 9 | print('urp.scheme:', urp.scheme) 10 | print('urp.netloc:', urp.netloc) 11 | 12 | urp = urllib.parse.urlsplit('https://docs.python.org/3/search.html?q=parse&check_keywords=yes&area=default') 13 | print('urlsplit执行结果:', urp) 14 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_9.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.urlunparse,urlunsplit和urljoin函数使用示例 3 | """ 4 | import urllib.parse 5 | 6 | url = urllib.parse.urlunparse(['https','docs.python.org', '/3/search.html', 'q=parse&check_keywords=yes&area=default' , '', '']) 7 | print('urlunparse函数拼接的URL:',url) 8 | 9 | url = urllib.parse.urlunsplit(['https','docs.python.org', '/3/search.html', 'q=parse&check_keywords=yes&area=default','']) 10 | print('urlunsplit函数拼接的URL:',url) 11 | 12 | url = urllib.parse.urljoin('https://docs.python.org','/3/search.html') 13 | url = urllib.parse.urljoin(url,'?q=parse&check_keywords=yes&area=default') 14 | print('urljoin函数拼接的URL:',url) -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/cookie.txt: -------------------------------------------------------------------------------- 1 | # Netscape HTTP Cookie File 2 | # http://curl.haxx.se/rfc/cookie_spec.html 3 | # This is a generated file! Do not edit. 
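# Field order per entry: domain, include-subdomains flag (TRUE/FALSE), path,
# secure flag (TRUE/FALSE), expiry timestamp, cookie name, cookie value.
# Entries with an empty expiry are session cookies kept because of ignore_discard=True.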
4 | 5 | .baidu.com TRUE / FALSE 3681539028 BAIDUID F16617940595A8E3EF9BB50E63AC0954:FG=1 6 | .baidu.com TRUE / FALSE 3681539028 BIDUPSID F16617940595A8E3EF9BB50E63AC0954 7 | .baidu.com TRUE / FALSE H_PS_PSSID 1442_21106_22074 8 | .baidu.com TRUE / FALSE 3681539028 PSTM 1534055381 9 | www.baidu.com FALSE / FALSE BDSVRTM 0 10 | www.baidu.com FALSE / FALSE BD_HOME 0 11 | www.baidu.com FALSE / FALSE 2480135321 delPer 0 12 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/proxy_ips.txt: -------------------------------------------------------------------------------- 1 | 183.129.244.16:10080 2 | 219.141.153.39:80 3 | 119.180.140.9:8060 4 | 111.3.154.196:8060 5 | 123.117.250.127:8060 6 | 222.182.56.120:8118 7 | 123.114.200.43:8118 8 | 117.28.96.103:808 9 | 120.92.174.37:1080 10 | 39.137.69.10:80 11 | 106.56.102.78:8070 12 | 218.88.177.155:8908 13 | 221.2.175.238:8060 14 | 120.198.224.5:8000 15 | 119.180.131.39:8060 16 | 112.67.34.99:8118 17 | 123.114.200.72:8118 18 | 39.137.69.6:8080 19 | 163.125.235.73:8118 20 | 219.141.153.11:80 21 | 180.119.65.150:1133 22 | 221.14.140.66:80 23 | 119.180.142.175:8060 24 | 113.78.255.243:8118 25 | 119.180.172.222:8060 26 | 39.137.77.66:8080 27 | 61.171.0.40:9999 28 | 221.2.155.35:8060 29 | 118.190.94.254:9001 30 | 219.141.153.43:80 31 | 112.24.107.109:8908 32 | 222.186.45.139:65309 33 | 219.141.153.5:80 34 | 219.141.153.35:80 35 | 221.14.140.130:80 36 | 101.96.11.5:80 37 | 119.179.131.245:8060 38 | 121.14.159.150:9001 39 | 114.250.25.19:80 40 | 120.198.224.6:8088 41 | 223.96.95.229:3128 42 | 121.17.18.219:8060 43 | 117.190.90.20:8060 44 | 219.141.153.6:80 45 | 113.239.240.152:80 46 | 101.96.10.5:80 47 | 219.141.153.10:80 48 | 117.44.247.37:8908 49 | 115.213.103.150:8010 50 | 113.3.210.60:80 51 | 106.56.102.252:8070 52 | 183.246.84.229:8060 53 | 118.190.95.35:9001 54 | 219.141.153.41:80 55 | 58.247.46.123:8088 56 | 112.24.107.102:8908 57 | 223.93.145.186:8060 58 | 218.244.44.194:8060 59 | 120.198.224.7:8080 60 | 117.28.97.169:808 61 | 222.88.147.104:8060 62 | 218.88.177.149:8908 63 | 39.137.69.8:8080 64 | 119.179.147.68:8060 65 | 113.105.202.51:1133 66 | 219.141.153.12:8080 67 | 114.95.61.165:8118 68 | 222.186.34.212:65309 69 | 113.128.198.50:8060 70 | 219.141.153.2:8080 71 | 219.141.153.34:80 72 | 222.175.200.58:8060 73 | 117.131.235.198:8060 74 | 219.141.153.44:80 75 | 60.14.125.246:8908 76 | 119.180.137.134:8060 77 | 39.137.77.67:80 78 | 120.131.9.254:1080 79 | 106.56.102.17:8070 80 | 119.180.168.33:8060 81 | 221.2.174.99:8060 82 | 118.190.200.139:8080 83 | 222.88.149.32:8060 84 | 118.190.145.138:9001 85 | 221.2.174.6:8060 86 | 219.141.153.38:80 87 | 119.180.140.140:8060 88 | 123.158.175.102:1080 89 | 219.141.153.7:80 90 | 117.44.247.53:8908 91 | 124.128.76.142:8060 92 | 112.80.93.76:8118 93 | 119.180.131.16:8060 94 | 39.135.24.11:8080 95 | 222.222.236.207:8060 96 | 218.88.177.161:8908 97 | 119.179.132.101:8060 98 | 39.137.69.7:80 99 | 119.180.171.89:8060 100 | 118.190.95.43:9001 -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | - 2_4.py → 新增注释:伪代码,不能直接请求,只是用于演示用法; 4 | - 2_5.py → 将百度地址替换为小说地址,请求头内容替换; 5 | - 2_6.py → 新增注释:请求失败时,将时效代理IP替换为可用代理IP; 6 | - 2_12.py → 新增全局取消https证书验证; 7 | - 2_13.py → 替换失效博客地址; 8 | 9 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_1.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | requests抓取微信公众号文章的图片,音视频 3 | """ 4 | import requests 5 | from lxml import etree 6 | import time 7 | import os 8 | 9 | # 资源的保存文件夹 10 | save_dir = os.path.join(os.getcwd(), 'tmp') 11 | 12 | # 测试文章的URL 13 | test_url = 'https://mp.weixin.qq.com/s/4oLnJvfGCZneoErkrh0sHw' 14 | 15 | # 语音获取的基URL 16 | music_res_url = 'http://res.wx.qq.com/voice/getvoice' 17 | 18 | # 视频获取的接口URL 19 | video_parse_url = 'http://v.ranks.xin/video-parse.php' 20 | 21 | # 微信公众号文章请求头 22 | headers = { 23 | 'Host': 'mp.weixin.qq.com', 24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 ' 25 | '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' 26 | } 27 | 28 | # 视频获取接口的请求头 29 | video_parse_headers = { 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)', 31 | 'Host': 'v.ranks.xin', 32 | 'Referer': 'http://v.ranks.xin/', 33 | 'X-Requested-With': 'XMLHttpRequest' 34 | } 35 | 36 | 37 | # 获取标题 38 | def get_title(content): 39 | return content.xpath("//h2[@class='rich_media_title']/text()")[0].strip() 40 | 41 | 42 | # 解析下载图片 43 | def get_pic(content, path): 44 | img_list = content.xpath("//img/@data-src") 45 | for img in img_list: 46 | download_pic(img, path) 47 | 48 | 49 | # 解析获得音频 50 | def get_sound(content, path): 51 | sound_list = content.xpath("//mpvoice/@voice_encode_fileid") 52 | for sound in sound_list: 53 | download_sound(sound, path) 54 | 55 | 56 | # 解析获得视频 57 | def get_video(content, path): 58 | video_list = content.xpath("//iframe/@data-src") 59 | for video in video_list: 60 | download_video(video, path) 61 | 62 | 63 | # 下载图片的方法 64 | def download_pic(url, path): 65 | print("下载图片:" + url) 66 | try: 67 | pic_name = str(int(time.time())) # 使用当前时间戳作为图片名字 68 | fmt = url.split('=')[-1] # 图片格式 69 | img_resp = requests.get(url).content 70 | with open(path + pic_name + "." 
+ fmt, "wb+") as f: 71 | f.write(img_resp) 72 | except Exception as reason: 73 | print(str(reason)) 74 | 75 | 76 | # 下载音频的方法 77 | def download_sound(file_id, path): 78 | try: 79 | sound_resp = requests.get(music_res_url, params={'mediaid': file_id, 'voice_type': '1'}) 80 | if sound_resp is not None: 81 | music_name = str(int(time.time())) + '.mp3' # 使用当前时间戳作为音频名字 82 | print("开始下载音频: " + sound_resp.url) 83 | with open(path + music_name, "wb+") as f: 84 | f.write(sound_resp.content) 85 | print("音频下载完成:" + music_name) 86 | except Exception as reason: 87 | print(str(reason)) 88 | 89 | 90 | # 下载视频的方法 91 | def download_video(url, path): 92 | print("开始解析视频链接:" + url) 93 | video_resp = requests.get(video_parse_url, headers=video_parse_headers, params={'url': url}) 94 | if video_resp is not None: 95 | video_url = video_resp.json()['data'][0]['url'] 96 | print("解析完成,开始下载视频:" + video_url) 97 | try: 98 | video_name = str(int(time.time())) + '.mp4' # 使用当前时间戳作为视频名字 99 | video_resp = requests.get(video_url).content 100 | if video_resp is not None: 101 | with open(path + video_name, "wb+") as f: 102 | f.write(video_resp) 103 | print("视频下载完成:" + video_name) 104 | except Exception as reason: 105 | print(str(reason)) 106 | 107 | 108 | if __name__ == '__main__': 109 | while True: 110 | print("请输入你要抓取的微信文章链接:(输出Q回车或者按Ctrl+C可以退出~)") 111 | input_url = input() 112 | if input_url == 'Q': 113 | exit() 114 | else: 115 | resp = requests.get(url=input_url.strip(), headers=headers).text 116 | html = etree.HTML(resp) 117 | title = get_title(html) 118 | res_save_dir = os.path.join(save_dir, title) 119 | if not os.path.exists(res_save_dir): 120 | os.makedirs(res_save_dir) 121 | get_pic(html,res_save_dir) 122 | get_sound(html,res_save_dir) 123 | get_video(html,res_save_dir) 124 | print("所有资源下载完成!") -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Beautiful Soup使用示例,抓取壁纸网站的壁纸 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import os 7 | import time 8 | 9 | base_url = "http://www.win4000.com" 10 | theme_base_url = "http://www.win4000.com/zt/xiaoqingxin_" 11 | 12 | # 利用列表表达式生成每页链接列表 13 | theme_url_list = [theme_base_url + str(x) + '.html' for x in range(1, 6)] 14 | 15 | # 套图链接列表 16 | series_url_lists = [] 17 | 18 | # 保存文件名 19 | save_root_dir = os.path.join(os.getcwd(), 'tmp/') 20 | 21 | 22 | # 获取所有套图的链接列表 23 | def get_series_url_lists(url): 24 | resp = r.get(url) 25 | if resp is not None: 26 | result = resp.text 27 | bs = BeautifulSoup(result, 'html.parser') 28 | ul = bs.find('div', attrs={'class': 'tab_tj'}) 29 | a_s = ul.find_all('a') 30 | for a in a_s: 31 | series_url_lists.append(a.get('href')) 32 | 33 | 34 | # 获取某个套图里的所有图片 35 | def fetch_all_series_pic(url): 36 | cur_page = 1 37 | while True: 38 | current_url = url 39 | if cur_page > 1: 40 | current_url = url.replace('.html', '_' + str(cur_page) + '.html') 41 | resp = r.get(current_url) 42 | if resp.status_code == 404: 43 | break 44 | else: 45 | if resp is not None: 46 | result = resp.text 47 | bs = BeautifulSoup(result, 'lxml') 48 | # 使用lxml来获取标题,用作文件夹名 49 | title_name = bs.find('div', attrs={'class': 'ptitle'}).h1.text 50 | save_dir = os.path.join(save_root_dir, title_name) 51 | if not os.path.exists(save_dir): 52 | os.makedirs(save_dir) 53 | # 使用CSS选择器选择图片结点 54 | imgs = bs.select('img.pic-large') 55 | for img in imgs: 56 | download_pic(img.attrs.get('src'), save_dir) 57 | cur_page += 1 58 | 59 
| 60 | # 下载图片的方法 61 | def download_pic(url, path): 62 | print("下载图片:" + url) 63 | try: 64 | pic_name = url.split('/')[-1] 65 | img_resp = r.get(url).content 66 | with open(path + '/' +pic_name, "wb+") as f: 67 | f.write(img_resp) 68 | except Exception as reason: 69 | print(str(reason)) 70 | 71 | 72 | if __name__ == '__main__': 73 | for url in theme_url_list: 74 | get_series_url_lists(url) 75 | for url in series_url_lists: 76 | fetch_all_series_pic(url) 77 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | 正则使用示例 3 | """ 4 | 5 | import re 6 | 7 | ret = re.match(r'^(\d{4})-(\d{3,8})$', '0756-3890993') 8 | print(ret.group()) 9 | print(ret.group(0)) 10 | print(ret.group(1)) 11 | print(ret.group(2)) 12 | 13 | str_count = "您的网站被访问了10000次" 14 | match = re.match(r"^您的网站被访问了(\d{1,6})次$", str_count) 15 | print(match.group(1)) 16 | 17 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | 正则表达式实战示例:采集所有城市编码 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import re 7 | import os 8 | 9 | base_url = 'http://www.weather.com.cn' 10 | city_referer_url = 'http://www.weather.com.cn/textFC/hb.shtml' 11 | 12 | # 获取城市编码的正则 13 | code_regex = re.compile('^.*?weather/(.*?).shtml$', re.S) 14 | # 城市编码的保存文件 15 | save_file_name = os.path.join(os.getcwd(), 'city_codes.txt') 16 | # 城市编码列表 17 | city_code_list = [] 18 | 19 | 20 | # 获取所有的城市列表 21 | def fetch_city_url_list(): 22 | city_url_list = [] 23 | resp = r.get(city_referer_url) 24 | resp.encoding = 'utf-8' 25 | bs = BeautifulSoup(resp.text, 'lxml') 26 | content = bs.find('div', attrs={'class': 'lqcontentBoxheader'}) 27 | if content is not None: 28 | a_s = content.find_all('a') 29 | if a_s is not None: 30 | for a in a_s: 31 | city_url_list.append(base_url + a.get('href')) 32 | return city_url_list 33 | 34 | 35 | # 获取城市天气跳转链接列表 36 | def fetch_city_weather_url_list(url): 37 | resp = r.get(url) 38 | resp.encoding = 'utf-8' 39 | bs = BeautifulSoup(resp.text, 'lxml') 40 | a_s = bs.select('div.conMidtab a') 41 | for a in a_s: 42 | if a.get("href") is not None and a.text != '详情' and a.text != '返回顶部': 43 | # 提取城市编码 44 | result = code_regex.match(a.get("href")) 45 | if result is not None: 46 | city_code_list.append(a.text + ":" + result.group(1)) 47 | 48 | 49 | # 把列表写入到文件中的方法 50 | def write_list_to_file(data): 51 | try: 52 | with open(save_file_name, "w+", encoding='utf-8') as f: 53 | for content in data: 54 | f.write(content + "\n") 55 | except OSError as reason: 56 | print(str(reason)) 57 | 58 | 59 | if __name__ == '__main__': 60 | city_list = fetch_city_url_list() 61 | for city in city_list: 62 | print("解析:", city) 63 | fetch_city_weather_url_list(city) 64 | write_list_to_file(city_code_list) 65 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/3、Python爬虫抓包与数据解析/勘误.md -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库使用代码示例 3 | """ 4 | 5 | import csv 6 | 
import os 7 | 8 | save_file_name_1 = os.path.join(os.getcwd(), '1.csv') 9 | save_file_name_2 = os.path.join(os.getcwd(), '2.csv') 10 | save_file_name_3 = os.path.join(os.getcwd(), '3.csv') 11 | 12 | data_1 = [['id', '姓名', '性别', '年龄', '工作'], 13 | [1, '小明', '男', '18', '学生'], 14 | [2, '小红', '女', '24', '老师'], 15 | [3, '小光', '男', '25', 'Python工程师']] 16 | 17 | headers = ['id', '姓名', '性别', '年龄', '工作'] 18 | data_2 = [{'id': 1, '姓名': '小明', '性别': '男', '年龄': '18', '工作': '学生'}, 19 | {'id': 2, '姓名': '小红', '性别': '女', '年龄': '24', '工作': '老师'}, 20 | {'id': 3, '姓名': '小光', '性别': '男', '年龄': '25', '工作': 'Python工程师'}] 21 | 22 | # 单行写入示例 23 | with open(save_file_name_1, 'w', newline='') as f: 24 | writer = csv.writer(f) 25 | for row in data_1: 26 | writer.writerow(row) 27 | 28 | # 多行写入 29 | with open(save_file_name_2, 'w', newline='') as f: 30 | writer = csv.writer(f) 31 | writer.writerows(data_1) 32 | 33 | # 字典写入 34 | with open(save_file_name_3, 'w', newline='') as f: 35 | # 标头在这里传入,作为第一行数据 36 | writer = csv.DictWriter(f, headers) 37 | writer.writeheader() 38 | for row in data_2: 39 | writer.writerow(row) 40 | 41 | 42 | 43 | if __name__ == '__main__': 44 | with open(save_file_name_1) as f: 45 | reader = csv.DictReader(f) 46 | for row in reader: 47 | print(row['姓名']) 48 | # reader = csv.reader(f) 49 | # print(list(reader)[0][1]) 50 | # for row in reader: 51 | # print(reader.line_num, row) 52 | 53 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_10.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库实战示例:爬取一号店关键词搜索结果保存到MongoDB中 3 | """ 4 | import pymongo 5 | import requests as r 6 | from lxml import etree 7 | 8 | search_word = "羽毛球" 9 | search_base_url = 'https://search.yhd.com/c0-0/k' 10 | 11 | 12 | def search_goods(key): 13 | data_list = [] 14 | resp = r.get(search_base_url + key) 15 | print(resp.url) 16 | resp.encoding = 'utf-8' 17 | html = etree.HTML(resp.text) 18 | ul_list = html.xpath('//div[@id="itemSearchList"]/div') 19 | for ul in ul_list: 20 | # 商品名称 21 | title = ul.xpath('div//p[@class="proName clearfix"]/a/@title')[0] 22 | # 商品链接 23 | link = ul.xpath('div//p[@class="proName clearfix"]/a/@href')[0] 24 | # 商品价格 25 | price = ul.xpath('div//p[@class="proPrice"]/em/@yhdprice')[0] 26 | # 店铺名称 27 | store = ul.xpath('div//p[@class="storeName limit_width"]/a/@title') 28 | store_name = store[0] if len(store) > 0 else '' 29 | # 评论数 30 | comment_count = ul.xpath('div//p[@class="proPrice"]/span[@class="comment"]/a/text()')[1] 31 | # 好评率 32 | favorable_rate = ul.xpath('div//span[@class="positiveRatio"]/text()')[0] 33 | data_list.append({'title': title, 'link': 'https:' + link, 'price': price, 'store_name': store_name, 'comment_count': comment_count, 34 | 'favorable_rate': favorable_rate}) 35 | return data_list 36 | 37 | 38 | if __name__ == '__main__': 39 | conn = pymongo.MongoClient(host='localhost', port=27017) 40 | search_goods(search_word) 41 | db = conn['yhd'] 42 | collection = db['羽毛球'] 43 | search_result_list = search_goods(search_word) 44 | collection.insert_many(search_result_list) 45 | conn.close() 46 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库实战示例:爬取星座运势 3 | """ 4 | import csv 5 | import requests as r 6 | from bs4 import BeautifulSoup 7 | import re 8 | import os 9 | 10 | # 抓取站点 11 | constellation_url = 
'http://www.xzw.com/fortune/' 12 | 13 | # 提取信息的正则 14 | fetch_regex = re.compile(r'^.*?(.*?)(.*?).*?width:(\d*)%.*?p>(.*)\[ 15') 83 | db.close() 84 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 爬取Gank.io API接口的数据到MySQL 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import pymysql 7 | 8 | # 接口地址 9 | search_api_base_url = 'https://gank.io/api/data/' 10 | 11 | # 各种分类的表名:Android,iOS,休息视频,福利,拓展资源,前端,瞎推荐,App 12 | category_list = ["android", "ios", "video", "meizi", "other", "fed", "random", "app"] 13 | 14 | # 图片表名 15 | pic_table_name = 'pics' 16 | 17 | # 请求分类字段列表 18 | type_list = ["Android", "iOS", "休息视频", "福利", "拓展资源", "前端", "瞎推荐", "App"] 19 | 20 | # 表字段名 21 | column_list = ('_id', 'createdAt', 'dsec', 'publishedAt', 'source', 'type', 'url', 'used', 'who') 22 | 23 | # 图片表字段名 24 | pic_column_list = ('_id', 'url') 25 | 26 | 27 | # 创建数据库 28 | def create_db(): 29 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306) 30 | cursor = conn.cursor() 31 | cursor.execute("Create Database If Not Exists gank Character Set UTF8MB4") 32 | conn.close() 33 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306, db='gank') 34 | return conn 35 | 36 | 37 | # 创建数据库表 38 | def init_tables(c, table): 39 | c.execute( 40 | ("CREATE TABLE IF Not Exists {table}" 41 | "(_id CHAR(24) PRIMARY KEY," 42 | "createdAt TEXT NOT NULL," 43 | "dsec TEXT NOT NULL," 44 | "publishedAt TEXT NOT NULL," 45 | "source TEXT NOT NULL," 46 | "type TEXT NOT NULL," 47 | "url TEXT NOT NULL," 48 | "used TEXT NOT NULL," 49 | "who TEXT NOT NULL)").format(table=table)) 50 | 51 | 52 | # 创建图表 53 | def init_pic_table(c, table): 54 | c.execute( 55 | ("CREATE TABLE IF Not Exists {table} " 56 | "(id INT AUTO_INCREMENT PRIMARY KEY," 57 | "_id CHAR(24)," 58 | "url TEXT NOT NULL)").format(table=table)) 59 | 60 | 61 | # 把数据插入到数据库中 62 | def insert_data(c, table, column, data): 63 | try: 64 | keys = ', '.join(column) 65 | values = ', '.join(['%s'] * len(data)) 66 | sql = 'INSERT INTO {table} ({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values) 67 | c.execute(sql, tuple(data)) 68 | db.commit() 69 | except Exception as e: 70 | print(e) 71 | db.rollback() 72 | 73 | 74 | # 查询数据库表的方法 75 | def query_data(c, table): 76 | try: 77 | sql = 'SELECT * FROM {table}'.format(table=table) 78 | c.execute(sql) 79 | print('共有 %d 行数据' % c.rowcount) 80 | row = c.fetchone() 81 | while row: 82 | print(row) 83 | row = c.fetchone() 84 | except Exception as e: 85 | print(e) 86 | 87 | 88 | # 爬取接口数据的方法 89 | def fetch_data(c, pos): 90 | page_count = 1 91 | while True: 92 | resp = r.get(search_api_base_url + type_list[pos] + '/50/' + str(page_count)) 93 | result_json = resp.json() 94 | print("抓取:", resp.url) 95 | if len(result_json['results']) > 0: 96 | for result in result_json['results']: 97 | data_list = [result['_id'], 98 | result['createdAt'], 99 | result['desc'], 100 | result['publishedAt'], 101 | result.get('source', ''), 102 | result['type'], 103 | result['url'], 104 | 1 if result['used'] else 0, 105 | result.get('who', '') if result.get('who', '') is not None else ''] 106 | insert_data(c, category_list[pos], column_list, data_list) 107 | if 'images' in result: 108 | for image in result['images']: 109 | insert_data(c, pic_table_name, pic_column_list, [result['_id'], image]) 110 | page_count += 1 111 | else: 112 | 
break 113 | 114 | 115 | if __name__ == '__main__': 116 | db = create_db() 117 | cursor = db.cursor() 118 | # for category in category_list: 119 | # init_tables(cursor, category) 120 | # init_pic_table(cursor, pic_table_name) 121 | # for i in range(0, len(category_list)): 122 | # fetch_data(cursor, i) 123 | query_data(cursor, 'Android') 124 | cursor.close() 125 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_7.py: -------------------------------------------------------------------------------- 1 | """ 2 | redis-py库的基本操作示例 3 | """ 4 | import redis 5 | 6 | # ====================== 连接Redis ============================ 7 | 8 | # 1.普通连接 9 | r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0) 10 | 11 | # 2.连接池(一般) 12 | # redis-py使用connection pool来管理对一个redis server的所有连接,避免每次建立、 13 | # 释放连接的开销。这种方式实现多个Redis实例共享一个连接池 14 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345') 15 | r = redis.StrictRedis(connection_pool=pool) 16 | 17 | # 3.管道 18 | # redis-py,默认情况下,每次都会进行连接池的连接和断开。若是想一次执行多条命令,进行 19 | # 事务性操作,就要用管道。(虽然有这个功能,但是不建议使用,慢而且没什么必要。) 20 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379) 21 | r = redis.StrictRedis(connection_pool=pool) 22 | pipe = r.pipeline(transaction=True) 23 | # 执行多条命令 24 | pipe.execute() 25 | 26 | # ====================== 通用操作 ============================ 27 | 28 | r.delete('name') # 根据键删除redis中的任意数据类型 29 | r.exists('name') # 检测redis的键是否存在 30 | r.keys(pattern='*') # 根据* ?等通配符匹配获取redis的键 31 | r.expire('name', time=3000) # 为某个键设置超时时间 32 | r.rename('name', 'name1') # 重命名键 33 | r.move('name', 'db1') # 将redis的某个值移动到指定的db下 34 | r.randomkey() # 随机获取一个redis的键(不删除) 35 | r.type('name') # 获取键对应值的类型 36 | r.dbsize() # 获得当前数据库中键的数目 37 | r.ttl('name') # 获得键的过期时间 38 | r.flushdb() # 删除当前选择数据库中所有的键 39 | r.flushall() # 删除所有数据库中的所有键 40 | 41 | 42 | # ====================== String操作 ============================ 43 | 44 | # 设置键值对,默认不存在则创建,存在则修改 45 | # set(name, value, ex=None, px=None, nx=False, xx=False) 46 | # ex,过期时间(秒) 47 | # px,过期时间(毫秒) 48 | # nx,如果设置为True,则只有name不存在时,当前set操作才执行,同setnx(name, value) 49 | # xx,如果设置为True,则只有name存在时,当前set操作才执行 50 | 51 | r.set('name', value) #设置值 52 | r.setnx('name',value) #如果name这个键不存在,把这个键对应的值设置为value 53 | r.setex('name', value, time) #设置值,并指定此键值的有效期 54 | r.setrange(name, offset, value) #修改字符串内容,从指定字符串索引开始向后替换 55 | r.mset({"name3":'xxx', "name4":'xxx'}) #批量设置值 56 | r.msetnx({"name3":'xxx', "name4":'xxx'}) #键都不存在是才批量赋值 57 | 58 | r.get('name') # 获取值 59 | r.getset('name', 'yyy') # 为键为name的值赋值为yyy,并返回上次的值xxx 60 | r.mget(['name1','name2']) # 返回多个键对应的值 61 | r.getrange(key, start, end) # 返回键为name的值的字符串,截取索引为start到end的字符 62 | r.strlen("name") #返回name对应值的字节长度(一个汉字3个字节) 63 | 64 | r.append('name',value) # 为键为name的值后追加value 65 | r.incr('name',amount) # 字符串转化为整型,再自增属性name对应的值,当属性name不存在时, 66 | # 则创建name=amount,否则,则自增,amount为自增数(整数) 67 | r.decr('name',amount) #自减name对应的值,当name不存在时,则创建name=amount, 68 | #否则,则自减,amount为自增数(整数) 69 | r.substr('name',start, end) # 返回键为name的值的字符串截取索引为start到end的字符 70 | 71 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_8.py: -------------------------------------------------------------------------------- 1 | """ 2 | 利用redis保存bilibili弹幕 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import re 7 | import redis 8 | 9 | video_url = 'https://www.bilibili.com/video/av28989880' 10 | cid_regex = re.compile(r'.*?cid=(\d*?)\&.*', re.S) 11 | xml_base_url = 
'http://comment.bilibili.com/' 12 | 13 | 14 | # 获取弹幕的cid 15 | def get_cid(): 16 | resp = r.get(video_url).text 17 | bs = BeautifulSoup(resp, 'lxml') 18 | src = bs.select('div.share-address ul li')[1].input 19 | cid = cid_regex.match(str(src)).group(1) 20 | print("获取到的cid:", cid) 21 | 22 | 23 | # 解析获取弹幕 24 | def analysis_d(cid): 25 | count = 1 26 | url = xml_base_url + cid + '.xml' 27 | resp = r.get(url) 28 | resp.encoding = 'utf-8' 29 | bs = BeautifulSoup(resp.text, 'lxml') 30 | d_s = bs.find_all('d') 31 | for d in d_s: 32 | dan_redis.set(str(count), d.text) 33 | count += 1 34 | 35 | 36 | if __name__ == '__main__': 37 | # 连接redis 38 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db = 0) 39 | dan_redis = redis.StrictRedis(connection_pool=pool) 40 | # analysis_d('50280136') 41 | results = dan_redis.mget(dan_redis.keys()) 42 | print("总共有%d条数据" % len(results)) 43 | for result in results: 44 | print(result.decode('utf-8')) -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_9.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库的基本操作示例 3 | """ 4 | 5 | import pymongo 6 | 7 | # 1.连接MongoDB数据库(默认没有密码,如果设置了密码要调用db.auth("用户名","密码")) 8 | conn = pymongo.MongoClient(host='localhost', port=27017) 9 | # 或者采用MongoDB连接字符串的形式也可以: 10 | # conn = pymongo.MongoClient('mongodb://localhost:27017') 11 | 12 | # 2.选择数据库,也可以使用conn['test']这一的方式选择,等价 13 | # db = conn.test 14 | # 15 | # # 3.选择collection 16 | # collection = db.user 17 | # print(collection) 18 | 19 | 20 | # # 4.创建数据库 21 | # db = conn['test_db'] 22 | # 23 | # # 5.创建collection 24 | # collection = db['test_collection'] 25 | 26 | # 6.插入一条数据 27 | # db = conn['test_db'] 28 | # collection = db['test_collection'] 29 | # dic = {'id': '1', 'name': 'Jay'} 30 | # collection.insert_one(dic) 31 | 32 | db = conn.test_db 33 | collection = db.test_collection 34 | 35 | # 7.插入多条数据(传入一个字典的列表) 36 | # data_list = [{'id': '2', 'name': 'Tom'},{'id': '3', 'name': 'Jack'}] 37 | # collection.insert_many(data_list) 38 | 39 | 40 | # 8.查找数据 41 | 42 | # 查找一条 43 | # print(collection.find_one({'name': 'Tom'})) 44 | 45 | 46 | # 查找多条 47 | # data_list = [{'id': '4', 'name': 'Mary'},{'id': '4', 'name': 'Lucy'}] 48 | # collection.insert_many(data_list) 49 | # results = collection.find({'id':'4'}) 50 | # for result in results: 51 | # print(result) 52 | 53 | # 正则匹配 54 | # for result in collection.find({'name':{'$regex':'^J.*'}}): 55 | # print(result) 56 | 57 | # 9.修改数据 58 | 59 | # 方法一:需要整条记录参与 60 | # person = collection.find_one({'name':'Jack'}) 61 | # person['name'] = 'Jacky' 62 | # collection.update({'name':'Jack'}, person) 63 | 64 | # 方法二:部分修改字段内容的方式 65 | # result = collection.update_one({'name': 'Tom'}, {'$set': {"name": "Tony"}}) 66 | # print(result) 67 | # print("匹配的数据条数:",result.matched_count, "受影响的数据条数:",result.modified_count) 68 | 69 | # 10.删除数据 70 | # result = collection.delete_many({'id': {'$lte': 3}}) 71 | # print("删除的数据条数:", result.deleted_count) 72 | 73 | # 11.计数 74 | # print("数据库中有%d条记录。" % collection.find().count()) 75 | 76 | # 12.排序 77 | # data_list = [{'id': 2, 'name': 'Tom'},{'id': 3, 'name': 'Jack'},{'id': 5, 'name': 'Daisy'}] 78 | # collection.insert_many(data_list) 79 | # # 降序排列,升序可以传入pymongo.ASCENDING 80 | # results = collection.find().sort('id', pymongo.DESCENDING) 81 | # for result in results: 82 | # print(result) 83 | 84 | # 13.偏移 85 | results = collection.find().sort('id', pymongo.ASCENDING).skip(1) 86 | for result in results: 87 
| print(result) 88 | 89 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | 4_10.py → 1号店已不提供H5版本,目前无解 -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库使用代码示例 3 | """ 4 | 5 | import csv 6 | import os 7 | 8 | save_file_name_1 = os.path.join(os.getcwd(), '1.csv') 9 | save_file_name_2 = os.path.join(os.getcwd(), '2.csv') 10 | save_file_name_3 = os.path.join(os.getcwd(), '3.csv') 11 | 12 | data_1 = [['id', '姓名', '性别', '年龄', '工作'], 13 | [1, '小明', '男', '18', '学生'], 14 | [2, '小红', '女', '24', '老师'], 15 | [3, '小光', '男', '25', 'Python工程师']] 16 | 17 | headers = ['id', '姓名', '性别', '年龄', '工作'] 18 | data_2 = [{'id': 1, '姓名': '小明', '性别': '男', '年龄': '18', '工作': '学生'}, 19 | {'id': 2, '姓名': '小红', '性别': '女', '年龄': '24', '工作': '老师'}, 20 | {'id': 3, '姓名': '小光', '性别': '男', '年龄': '25', '工作': 'Python工程师'}] 21 | 22 | # 单行写入示例 23 | with open(save_file_name_1, 'w', newline='') as f: 24 | writer = csv.writer(f) 25 | for row in data_1: 26 | writer.writerow(row) 27 | 28 | # 多行写入 29 | with open(save_file_name_2, 'w', newline='') as f: 30 | writer = csv.writer(f) 31 | writer.writerows(data_1) 32 | 33 | # 字典写入 34 | with open(save_file_name_3, 'w', newline='') as f: 35 | # 标头在这里传入,作为第一行数据 36 | writer = csv.DictWriter(f, headers) 37 | writer.writeheader() 38 | for row in data_2: 39 | writer.writerow(row) 40 | 41 | 42 | 43 | if __name__ == '__main__': 44 | with open(save_file_name_1) as f: 45 | reader = csv.DictReader(f) 46 | for row in reader: 47 | print(row['姓名']) 48 | # reader = csv.reader(f) 49 | # print(list(reader)[0][1]) 50 | # for row in reader: 51 | # print(reader.line_num, row) 52 | 53 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_10.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库实战示例:爬取一号店关键词搜索结果保存到MongoDB中 3 | """ 4 | import pymongo 5 | import requests as r 6 | from lxml import etree 7 | 8 | search_word = "羽毛球" 9 | search_base_url = 'https://search.yhd.com/c0-0/k' 10 | 11 | 12 | def search_goods(key): 13 | data_list = [] 14 | resp = r.get(search_base_url + key) 15 | resp.encoding = 'utf-8' 16 | html = etree.HTML(resp.text) 17 | ul_list = html.xpath('//div[@id="itemSearchList"]/div') 18 | for ul in ul_list: 19 | # 商品名称 20 | title = ul.xpath('div//p[@class="proName clearfix"]/a/@title')[0] 21 | # 商品链接 22 | link = ul.xpath('div//p[@class="proName clearfix"]/a/@href')[0] 23 | # 商品价格 24 | price = ul.xpath('div//p[@class="proPrice"]/em/@yhdprice')[0] 25 | # 店铺名称 26 | store = ul.xpath('div//p[@class="storeName limit_width"]/a/@title') 27 | store_name = store[0] if len(store) > 0 else '' 28 | # 评论数 29 | comment_count = ul.xpath('div//p[@class="proPrice"]/span[@class="comment"]/a/text()')[1] 30 | # 好评率 31 | favorable_rate = ul.xpath('div//span[@class="positiveRatio"]/text()')[0] 32 | data_list.append({'title': title, 'link': 'https:' + link, 'price': price, 'store_name': store_name, 'comment_count': comment_count, 33 | 'favorable_rate': favorable_rate}) 34 | return data_list 35 | 36 | 37 | if __name__ == '__main__': 38 | conn = pymongo.MongoClient(host='localhost', port=27017) 39 | search_goods(search_word) 40 | db = conn['yhd'] 41 | collection = db['羽毛球'] 42 | search_result_list = 
search_goods(search_word) 43 | collection.insert_many(search_result_list) 44 | conn.close() 45 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库实战示例:爬取星座运势 3 | """ 4 | import csv 5 | import requests as r 6 | from bs4 import BeautifulSoup 7 | import re 8 | import os 9 | 10 | # 抓取站点 11 | constellation_url = 'http://www.xzw.com/fortune/' 12 | 13 | # 提取信息的正则 14 | fetch_regex = re.compile(r'^.*?(.*?)(.*?).*?width:(\d*)%.*?p>(.*)\[ 15') 83 | db.close() 84 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 爬取Gank.io API接口的数据到MySQL 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import pymysql 7 | 8 | # 接口地址 9 | search_api_base_url = 'https://gank.io/api/v2/data/' 10 | 11 | # 各种分类的表名:Android,iOS,休息视频,福利,拓展资源,前端,瞎推荐,App 12 | category_list = ["android", "ios", "video", "meizi", "other", "fed", "random", "app"] 13 | 14 | # 图片表名 15 | pic_table_name = 'pics' 16 | 17 | # 请求分类字段列表 18 | type_list = ["Android", "iOS", "休息视频", "福利", "拓展资源", "前端", "瞎推荐", "App"] 19 | 20 | # 表字段名 21 | column_list = ('_id', 'createdAt', 'dsec', 'publishedAt', 'source', 'type', 'url', 'used', 'who') 22 | 23 | # 图片表字段名 24 | pic_column_list = ('_id', 'url') 25 | 26 | 27 | # 创建数据库 28 | def create_db(): 29 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306) 30 | cursor = conn.cursor() 31 | cursor.execute("Create Database If Not Exists gank Character Set UTF8MB4") 32 | conn.close() 33 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306, db='gank') 34 | return conn 35 | 36 | 37 | # 创建数据库表 38 | def init_tables(c, table): 39 | c.execute( 40 | ("CREATE TABLE IF Not Exists {table}" 41 | "(_id CHAR(24) PRIMARY KEY," 42 | "createdAt TEXT NOT NULL," 43 | "dsec TEXT NOT NULL," 44 | "publishedAt TEXT NOT NULL," 45 | "source TEXT NOT NULL," 46 | "type TEXT NOT NULL," 47 | "url TEXT NOT NULL," 48 | "used TEXT NOT NULL," 49 | "who TEXT NOT NULL)").format(table=table)) 50 | 51 | 52 | # 创建图表 53 | def init_pic_table(c, table): 54 | c.execute( 55 | ("CREATE TABLE IF Not Exists {table} " 56 | "(id INT AUTO_INCREMENT PRIMARY KEY," 57 | "_id CHAR(24)," 58 | "url TEXT NOT NULL)").format(table=table)) 59 | 60 | 61 | # 把数据插入到数据库中 62 | def insert_data(c, table, column, data): 63 | try: 64 | keys = ', '.join(column) 65 | values = ', '.join(['%s'] * len(data)) 66 | sql = 'INSERT INTO {table} ({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values) 67 | c.execute(sql, tuple(data)) 68 | db.commit() 69 | except Exception as e: 70 | print(e) 71 | db.rollback() 72 | 73 | 74 | # 查询数据库表的方法 75 | def query_data(c, table): 76 | try: 77 | sql = 'SELECT * FROM {table}'.format(table=table) 78 | c.execute(sql) 79 | print('共有 %d 行数据' % c.rowcount) 80 | row = c.fetchone() 81 | while row: 82 | print(row) 83 | row = c.fetchone() 84 | except Exception as e: 85 | print(e) 86 | 87 | 88 | # 爬取接口数据的方法 89 | def fetch_data(c, pos): 90 | page_count = 1 91 | while True: 92 | resp = r.get(search_api_base_url + type_list[pos] + '/50/' + str(page_count)) 93 | result_json = resp.json() 94 | print("抓取:", resp.url) 95 | if len(result_json['results']) > 0: 96 | for result in result_json['results']: 97 | data_list = [result['_id'], 98 | result['createdAt'], 99 | result['desc'], 100 | 
result['publishedAt'], 101 | result.get('source', ''), 102 | result['type'], 103 | result['url'], 104 | 1 if result['used'] else 0, 105 | result.get('who', '') if result.get('who', '') is not None else ''] 106 | insert_data(c, category_list[pos], column_list, data_list) 107 | if 'images' in result: 108 | for image in result['images']: 109 | insert_data(c, pic_table_name, pic_column_list, [result['_id'], image]) 110 | page_count += 1 111 | else: 112 | break 113 | 114 | 115 | if __name__ == '__main__': 116 | db = create_db() 117 | cursor = db.cursor() 118 | # for category in category_list: 119 | # init_tables(cursor, category) 120 | # init_pic_table(cursor, pic_table_name) 121 | # for i in range(0, len(category_list)): 122 | # fetch_data(cursor, i) 123 | query_data(cursor, 'Android') 124 | cursor.close() 125 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_7.py: -------------------------------------------------------------------------------- 1 | """ 2 | redis-py库的基本操作示例 3 | """ 4 | import redis 5 | 6 | # ====================== 连接Redis ============================ 7 | 8 | # 1.普通连接 9 | r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0) 10 | 11 | # 2.连接池(一般) 12 | # redis-py使用connection pool来管理对一个redis server的所有连接,避免每次建立、 13 | # 释放连接的开销。这种方式实现多个Redis实例共享一个连接池 14 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345') 15 | r = redis.StrictRedis(connection_pool=pool) 16 | 17 | # 3.管道 18 | # redis-py,默认情况下,每次都会进行连接池的连接和断开。若是想一次执行多条命令,进行 19 | # 事务性操作,就要用管道。(虽然有这个功能,但是不建议使用,慢而且没什么必要。) 20 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379) 21 | r = redis.StrictRedis(connection_pool=pool) 22 | pipe = r.pipeline(transaction=True) 23 | # 执行多条命令 24 | pipe.execute() 25 | 26 | # ====================== 通用操作 ============================ 27 | 28 | r.delete('name') # 根据键删除redis中的任意数据类型 29 | r.exists('name') # 检测redis的键是否存在 30 | r.keys(pattern='*') # 根据* ?等通配符匹配获取redis的键 31 | r.expire('name', time=3000) # 为某个键设置超时时间 32 | r.rename('name', 'name1') # 重命名键 33 | r.move('name', 'db1') # 将redis的某个值移动到指定的db下 34 | r.randomkey() # 随机获取一个redis的键(不删除) 35 | r.type('name') # 获取键对应值的类型 36 | r.dbsize() # 获得当前数据库中键的数目 37 | r.ttl('name') # 获得键的过期时间 38 | r.flushdb() # 删除当前选择数据库中所有的键 39 | r.flushall() # 删除所有数据库中的所有键 40 | 41 | 42 | # ====================== String操作 ============================ 43 | 44 | # 设置键值对,默认不存在则创建,存在则修改 45 | # set(name, value, ex=None, px=None, nx=False, xx=False) 46 | # ex,过期时间(秒) 47 | # px,过期时间(毫秒) 48 | # nx,如果设置为True,则只有name不存在时,当前set操作才执行,同setnx(name, value) 49 | # xx,如果设置为True,则只有name存在时,当前set操作才执行 50 | 51 | r.set('name', value) #设置值 52 | r.setnx('name',value) #如果name这个键不存在,把这个键对应的值设置为value 53 | r.setex('name', value, time) #设置值,并指定此键值的有效期 54 | r.setrange(name, offset, value) #修改字符串内容,从指定字符串索引开始向后替换 55 | r.mset({"name3":'xxx', "name4":'xxx'}) #批量设置值 56 | r.msetnx({"name3":'xxx', "name4":'xxx'}) #键都不存在是才批量赋值 57 | 58 | r.get('name') # 获取值 59 | r.getset('name', 'yyy') # 为键为name的值赋值为yyy,并返回上次的值xxx 60 | r.mget(['name1','name2']) # 返回多个键对应的值 61 | r.getrange(key, start, end) # 返回键为name的值的字符串,截取索引为start到end的字符 62 | r.strlen("name") #返回name对应值的字节长度(一个汉字3个字节) 63 | 64 | r.append('name',value) # 为键为name的值后追加value 65 | r.incr('name',amount) # 字符串转化为整型,再自增属性name对应的值,当属性name不存在时, 66 | # 则创建name=amount,否则,则自增,amount为自增数(整数) 67 | r.decr('name',amount) #自减name对应的值,当name不存在时,则创建name=amount, 68 | #否则,则自减,amount为自增数(整数) 69 | r.substr('name',start, end) # 返回键为name的值的字符串截取索引为start到end的字符 70 | 71 | 
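# Hedged, runnable supplement to the reference listing above (5_7.py uses placeholder
# names such as value/start/end and is not meant to run as-is; host/port/db below are
# assumptions matching the other examples in this chapter):
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
r.set('name', 'coderpig', ex=60)          # create the key with a 60-second TTL
print(r.get('name').decode('utf-8'))      # redis-py returns bytes -> 'coderpig'
print(r.ttl('name'))                      # seconds left before the key expires
r.delete('name')                          # clean up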
-------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_8.py: -------------------------------------------------------------------------------- 1 | """ 2 | 利用redis保存bilibili弹幕 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import re 7 | import redis 8 | 9 | video_url = 'https://www.bilibili.com/video/av28989880' 10 | cid_regex = re.compile(r'cid=(\d{8})', re.S) 11 | xml_base_url = 'http://comment.bilibili.com/' 12 | 13 | 14 | # 获取弹幕的cid 15 | def get_cid(): 16 | resp = r.get(video_url).text 17 | cid = cid_regex.search(str(resp)).group(1).strip() 18 | print("获取到的cid:", cid) 19 | return cid 20 | 21 | 22 | # 解析获取弹幕 23 | def analysis_d(cid): 24 | count = 1 25 | url = xml_base_url + cid + '.xml' 26 | resp = r.get(url) 27 | resp.encoding = 'utf-8' 28 | bs = BeautifulSoup(resp.text, 'lxml') 29 | d_s = bs.find_all('d') 30 | for d in d_s: 31 | print(d.text) 32 | # dan_redis.set(str(count), d.text) 33 | count += 1 34 | 35 | 36 | if __name__ == '__main__': 37 | analysis_d(get_cid()) 38 | # 连接redis 39 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=0) 40 | dan_redis = redis.StrictRedis(connection_pool=pool) 41 | results = dan_redis.mget(dan_redis.keys()) 42 | print("总共有%d条数据" % len(results)) 43 | for result in results: 44 | print(result.decode('utf-8')) 45 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_9.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库的基本操作示例 3 | """ 4 | 5 | import pymongo 6 | 7 | # 1.连接MongoDB数据库(默认没有密码,如果设置了密码要调用db.auth("用户名","密码")) 8 | conn = pymongo.MongoClient(host='localhost', port=27017) 9 | # 或者采用MongoDB连接字符串的形式也可以: 10 | # conn = pymongo.MongoClient('mongodb://localhost:27017') 11 | 12 | # 2.选择数据库,也可以使用conn['test']这一的方式选择,等价 13 | # db = conn.test 14 | # 15 | # # 3.选择collection 16 | # collection = db.user 17 | # print(collection) 18 | 19 | 20 | # # 4.创建数据库 21 | # db = conn['test_db'] 22 | # 23 | # # 5.创建collection 24 | # collection = db['test_collection'] 25 | 26 | # 6.插入一条数据 27 | # db = conn['test_db'] 28 | # collection = db['test_collection'] 29 | # dic = {'id': '1', 'name': 'Jay'} 30 | # collection.insert_one(dic) 31 | 32 | db = conn.test_db 33 | collection = db.test_collection 34 | 35 | # 7.插入多条数据(传入一个字典的列表) 36 | # data_list = [{'id': '2', 'name': 'Tom'},{'id': '3', 'name': 'Jack'}] 37 | # collection.insert_many(data_list) 38 | 39 | 40 | # 8.查找数据 41 | 42 | # 查找一条 43 | # print(collection.find_one({'name': 'Tom'})) 44 | 45 | 46 | # 查找多条 47 | # data_list = [{'id': '4', 'name': 'Mary'},{'id': '4', 'name': 'Lucy'}] 48 | # collection.insert_many(data_list) 49 | # results = collection.find({'id':'4'}) 50 | # for result in results: 51 | # print(result) 52 | 53 | # 正则匹配 54 | # for result in collection.find({'name':{'$regex':'^J.*'}}): 55 | # print(result) 56 | 57 | # 9.修改数据 58 | 59 | # 方法一:需要整条记录参与 60 | # person = collection.find_one({'name':'Jack'}) 61 | # person['name'] = 'Jacky' 62 | # collection.update({'name':'Jack'}, person) 63 | 64 | # 方法二:部分修改字段内容的方式 65 | # result = collection.update_one({'name': 'Tom'}, {'$set': {"name": "Tony"}}) 66 | # print(result) 67 | # print("匹配的数据条数:",result.matched_count, "受影响的数据条数:",result.modified_count) 68 | 69 | # 10.删除数据 70 | # result = collection.delete_many({'id': {'$lte': 3}}) 71 | # print("删除的数据条数:", result.deleted_count) 72 | 73 | # 11.计数 74 | # print("数据库中有%d条记录。" % collection.find().count()) 75 | 76 | # 12.排序 77 | # data_list = 
[{'id': 2, 'name': 'Tom'},{'id': 3, 'name': 'Jack'},{'id': 5, 'name': 'Daisy'}] 78 | # collection.insert_many(data_list) 79 | # # 降序排列,升序可以传入pymongo.ASCENDING 80 | # results = collection.find().sort('id', pymongo.DESCENDING) 81 | # for result in results: 82 | # print(result) 83 | 84 | # 13.偏移 85 | results = collection.find().sort('id', pymongo.ASCENDING).skip(1) 86 | for result in results: 87 | print(result) 88 | 89 | -------------------------------------------------------------------------------- /5、用数据库存储数据/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | 5_6.py → 更新Gank.io的新API 4 | 5 | 5_8.py → 更新获取cid的正则,修改提取弹幕规则 6 | 7 | 5_10.py → 1号店已不提供H5版本,目前无解 -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | fake_useragent库使用示例 3 | """ 4 | 5 | from fake_useragent import UserAgent 6 | import random 7 | 8 | if __name__ == '__main__': 9 | ua = UserAgent(use_cache_server=False) 10 | print("Chrome浏览器:", ua.chrome) 11 | print("FireFox浏览器:", ua.firefox) 12 | print("Ubuntu FireFox浏览器:", ua.ff) 13 | print("IE浏览器:", ua.ie) 14 | print("Safari浏览器:", ua.safari) 15 | print("Mac Chrome:", ua.google) 16 | print("Opera浏览器:", ua.opera) 17 | print("随机:",ua.random) 18 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ajax动态加载数据应对策略例子:爬取花瓣网某个画板的所有风景图 3 | """ 4 | import requests as r 5 | import os 6 | import re 7 | import json 8 | 9 | # 图片URL拼接的前缀和后缀 10 | img_start_url = 'http://img.hb.aicdn.com/' 11 | img_end = '_fw658' 12 | 13 | # 图片key的保存文件 14 | pic_key_file = 'pin_ids.txt' 15 | 16 | # 获取pins的正则 17 | boards_pattern = re.compile(r'pins":(.*)};') 18 | 19 | # 修改pin_id的正则 20 | max_pattern = re.compile(r'(?<=max=)\d*(?=&limit)') 21 | 22 | # 图片保存路径 23 | pic_download_dir = os.path.join(os.getcwd(), 'HuaBan/') 24 | 25 | # Ajax模拟的请求头 26 | ajax_headers = { 27 | 'Host': 'huaban.com', 28 | 'Accept': 'application/json', 29 | 'X-Request': 'JSON', 30 | 'X-Requested-With': 'XMLHttpRequest' 31 | } 32 | 33 | 34 | # 以追加的形式往文件中写入内容 35 | def write_str_data(content, file_path): 36 | try: 37 | with open(file_path, 'a+', encoding='utf-8') as f: 38 | f.write(content + "\n", ) 39 | except OSError as reason: 40 | print(str(reason)) 41 | 42 | 43 | # 按行读取文件里的内容添加到列表中返回 44 | def load_data(file_path): 45 | if os.path.exists(file_path): 46 | data_list = [] 47 | with open(file_path, "r+", encoding='utf-8') as f: 48 | for ip in f: 49 | data_list.append(ip.replace("\n", "")) 50 | return data_list 51 | 52 | 53 | # 获得borads页数据,提取key列表写入到文件里,并返回最后一个pid用于后续查询 54 | def get_boards_index_data(url): 55 | print("请求:" + url) 56 | resp = r.get(url).text 57 | result = boards_pattern.search(resp) 58 | json_dict = json.loads(result.group(1)) 59 | for item in json_dict: 60 | write_str_data(item['file']['key'], pic_key_file) 61 | # 返回最后一个pin_id 62 | pin_id = json_dict[-1]['pin_id'] 63 | return pin_id 64 | 65 | 66 | # 模拟Ajax请求更多数据 67 | def get_json_list(url): 68 | print("请求:" + url) 69 | resp = r.get(url, headers=ajax_headers) 70 | if resp is None: 71 | return None 72 | else: 73 | json_dict = json.loads(resp.text) 74 | pins = json_dict['board']['pins'] 75 | if len(pins) == 0: 76 | return None 77 | else: 78 | for item in pins: 79 | write_str_data(item['file']['key'], pic_key_file) 80 | return 
pins[-1]['pin_id'] 81 | 82 | 83 | # 下载图片的方法 84 | def download_pic(key): 85 | url = img_start_url + key + img_end 86 | resp = r.get(url).content 87 | try: 88 | print("下载图片:" + url) 89 | pic_name = key + ".jpg" 90 | with open(pic_download_dir + pic_name, "wb+") as f: 91 | f.write(resp) 92 | except (OSError, r.HTTPError, r.ConnectionError, Exception) as reason: 93 | print(str(reason)) 94 | 95 | 96 | if __name__ == '__main__': 97 | if not os.path.exists(pic_download_dir): 98 | os.makedirs(pic_download_dir) 99 | # 判断图片key的保存文件是否存在,存在的话删除 100 | if os.path.exists(pic_key_file): 101 | os.remove(pic_key_file) 102 | # 一个画板链接,可自行替换 103 | boards_url = 'http://huaban.com/boards/279523/' 104 | board_last_pin_id = get_boards_index_data(boards_url) 105 | board_json_url = boards_url + '?jl58nz3i&max=43131274&limit=20&wfl=1' 106 | while True: 107 | board_last_pin_id = get_json_list(max_pattern.sub(str(board_last_pin_id), board_json_url)) 108 | if board_last_pin_id is None: 109 | break 110 | pic_url_list = load_data(pic_key_file) 111 | for key in pic_url_list: 112 | download_pic(key) 113 | print("所有图片下载完成~") 114 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | selenium使用示例 3 | """ 4 | from selenium import webdriver 5 | 6 | browser = webdriver.Chrome() # 调用本地的Chrome浏览器 7 | browser.get('http://www.baidu.com') # 请求页面,会打开一个浏览器窗口 8 | html_text = browser.page_source # 获得页面代码 9 | # browser.quit() # 关闭浏览器 10 | print(html_text) -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | selenium爬取简单网无聊图示例 3 | """ 4 | import os 5 | from selenium import webdriver 6 | import redis 7 | import requests as r 8 | from bs4 import BeautifulSoup 9 | 10 | # 请求基地址 11 | base_url = 'http://jandan.net/pic' 12 | # 图片的保存路径 13 | pic_save_path = os.path.join(os.getcwd(), 'JianDan/') 14 | # 图片需要,作为Reids键用 15 | pic_count = 0 16 | 17 | # 下载图片用headers 18 | pic_headers = { 19 | 'Host': 'wx2.sinaimg.cn', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 21 | 'Chrome/61.0.3163.100 Safari/537.36 ' 22 | } 23 | 24 | 25 | # 打开浏览器模拟请求 26 | def browser_get(): 27 | browser = webdriver.Chrome() 28 | browser.get(base_url) 29 | html_text = browser.page_source 30 | page_count = get_page_count(html_text) 31 | # 循环拼接URL访问 32 | for page in range(page_count, 0, -1): 33 | page_url = base_url + '/page-' + str(page) 34 | print('解析:' + page_url) 35 | browser.get(page_url) 36 | html = browser.page_source 37 | get_meizi_url(html) 38 | # 没有更多了关闭浏览器 39 | browser.quit() 40 | 41 | 42 | # 获取总页码 43 | def get_page_count(html): 44 | bs = BeautifulSoup(html, 'lxml') 45 | page_count = bs.find('span', attrs={'class': 'current-comment-page'}) 46 | return int(page_count.get_text()[1:-1]) - 1 47 | 48 | 49 | # 获取每页的图片 50 | def get_meizi_url(html): 51 | soup = BeautifulSoup(html, 'html.parser') 52 | ol = soup.find('ol', attrs={'class': 'commentlist'}) 53 | href = ol.findAll('a', attrs={'class': 'view_img_link'}) 54 | global pic_count 55 | for a in href: 56 | dan_redis.set(str(pic_count), a['href']) 57 | pic_count += 1 58 | 59 | 60 | # 下载图片 61 | def download_pic(url): 62 | correct_url = url 63 | if url.startswith('//'): 64 | correct_url = url[2:] 65 | if not url.startswith('http'): 66 | correct_url = 'http://' + correct_url 67 | 
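# at this point correct_url always carries an http(s):// scheme (note that both checks above test the original url, not correct_url)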
print("下载:", correct_url) 68 | try: 69 | resp = r.get(correct_url, headers=pic_headers).content 70 | pic_name = correct_url.split("/")[-1] 71 | with open(pic_save_path + pic_name, "wb+") as f: 72 | f.write(resp) 73 | except (OSError, r.ConnectionError, r.HTTPError, Exception) as reason: 74 | print(str(reason)) 75 | 76 | 77 | if __name__ == '__main__': 78 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=1) 79 | dan_redis = redis.StrictRedis(connection_pool=pool) 80 | if not os.path.exists(pic_save_path): 81 | os.makedirs(pic_save_path) 82 | browser_get() 83 | results = dan_redis.mget(dan_redis.keys()) 84 | for result in results: 85 | download_pic(result.decode('utf-8')) 86 | print("图片下载完毕!") 87 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_5.py: -------------------------------------------------------------------------------- 1 | """ 2 | Selenium+Tesserocr实现自动登陆知乎 3 | """ 4 | import os 5 | from selenium import webdriver 6 | import requests as r 7 | import time 8 | from PIL import Image 9 | from aip import AipOcr 10 | from hashlib import md5 11 | import base64 12 | 13 | zhihu_login_url = 'https://www.zhihu.com/signup' 14 | 15 | config = { 16 | 'appId': 'd4ed8d211abd4f20b3xxe0f55xxx173f', 17 | 'apiKey': 'Nk3RSGAh0gFEGdoFC7GxxaCQ', 18 | 'secretKey': '63TyYDkI5R0x21tDsCxxBoF8EEmiDfEd' 19 | } 20 | client = AipOcr(**config) 21 | 22 | # 超级鹰参数 23 | cjy_params = { 24 | 'user': 'CoderPig', 25 | 'pass2': md5('zpj12345'.encode('utf8')).hexdigest(), 26 | 'softid': '897137', 27 | } 28 | 29 | # 超级鹰请求头 30 | cjy_headers = { 31 | 'Connection': 'Keep-Alive', 32 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)', 33 | } 34 | 35 | 36 | # 打开浏览器模拟请求 37 | def auto_login(): 38 | browser = webdriver.Chrome() 39 | while True: 40 | browser.get(zhihu_login_url) 41 | # 判断是否处于注册页(底部有登录字样,是的话点击跳转) 42 | signup_switch_bt = browser.find_element_by_xpath('//div[@class="SignContainer-switch"]/span') 43 | if signup_switch_bt.text == '登录': 44 | signup_switch_bt.click() 45 | # 输入用户名 46 | username_input = browser.find_element_by_xpath('//input[@name="username"]') 47 | username_input.send_keys('xx@qq.com') 48 | # 输入密码 49 | password_input = browser.find_element_by_xpath('//input[@name="password"]') 50 | password_input.send_keys('xxx') 51 | # 等待一会儿,等验证码刷出来 52 | time.sleep(5) 53 | # 判断是否包含英文字符验证码,是的话处理,否则跳出 54 | if is_elements_existed(browser, "//div[@class='Captcha-englishContainer']"): 55 | if len(browser.find_element_by_xpath("//img[@class='Captcha-englishImg']").get_attribute('src')) > 30: 56 | code_img = browser.find_element_by_xpath('//img[@alt="图形验证码"]') 57 | code = cjy_fetch_code(base64.b64decode(code_img.get_attribute('src')[22:].replace("%0A", "")), 1902) 58 | # 输入验证码 59 | code_input = browser.find_element_by_xpath('//input[@name="captcha"]') 60 | code_input.send_keys(code) 61 | time.sleep(2) 62 | # 点击登录按钮 63 | login_bt = browser.find_element_by_xpath('//button[@type="submit"]') 64 | login_bt.click() 65 | time.sleep(3) 66 | break 67 | else: 68 | continue 69 | time.sleep(10) 70 | # 打印当前的网页链接,以此判断是否跳转成功 71 | print(browser.current_url) 72 | 73 | 74 | # 判断xpath定位的元素是否存在 75 | def is_elements_existed(browser, element): 76 | flag = True 77 | try: 78 | browser.find_element_by_xpath(element) 79 | return flag 80 | except: 81 | flag = False 82 | return flag 83 | 84 | 85 | # 读取图片 86 | def get_file_content(file_path): 87 | with open(file_path, 'rb') as fp: 88 | return fp.read() 89 | 90 | 91 | # 百度OCR文字识别 92 | def 
baidu_ocr(file): 93 | image = get_file_content(file) 94 | # 调用通用文字识别, 图片参数为本地图片 95 | result = client.basicAccurate(image) 96 | print(result) 97 | if 'words_result' in result: 98 | return '\n'.join([w['words'] for w in result['words_result']]) 99 | 100 | 101 | # 重置图片大小,并进行灰度和二值化处理 102 | def resize_pic(file, width=1200, height=480): 103 | img = Image.open(file) 104 | try: 105 | new_img = img.resize((width, height), Image.BILINEAR) 106 | # 转灰度处理 107 | new_img = new_img.convert('L') 108 | # 二值化处理 109 | table = [] 110 | for i in range(256): 111 | if i < 150: 112 | table.append(0) 113 | else: 114 | table.append(1) 115 | # 通过表格转换为二进制图片 116 | new_img = new_img.point(table, "1") 117 | new_img.save(os.path.join(os.getcwd(), os.path.basename(file))) 118 | except Exception as e: 119 | print(e) 120 | 121 | 122 | # 超级鹰识别验证码 123 | def cjy_fetch_code(im, codetype): 124 | cjy_params.update({'codetype': codetype}) 125 | files = {'userfile': ('ccc.jpg', im)} 126 | resp = r.post('http://upload.chaojiying.net/Upload/Processing.php', data=cjy_params, files=files, 127 | headers=cjy_headers).json() 128 | print(resp) 129 | if resp.get('err_no', 0) == 0: 130 | return resp.get('pic_str') 131 | 132 | 133 | if __name__ == '__main__': 134 | # resize_pic('code.png') 135 | # baidu_ocr('code.png') 136 | # im = open('code.png', 'rb').read() 137 | # print(cjy_fetch_code(im, 1902)) 138 | auto_login() 139 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 破解极验滑动验证码示例 3 | """ 4 | import time 5 | from selenium import webdriver 6 | from selenium.webdriver.support.wait import WebDriverWait 7 | from selenium.webdriver import ActionChains 8 | from lxml import etree 9 | import requests as r 10 | import re 11 | import PIL.Image as image 12 | 13 | full_image_file = 'full.jpg' 14 | cut_image_file = 'cut.jpg' 15 | bilibili_login_url = 'https://passport.bilibili.com/login' 16 | url_fetch_regex = re.compile('url\(\"(.*?)\"\);') 17 | bg_postion_regex = re.compile('position: (.*?)px (.*?)px;') 18 | 19 | 20 | def auto_login(): 21 | # 输入账号密码 22 | input_user = browser.find_element_by_xpath('//input[@id="login-username"]') 23 | input_user.send_keys("xxx") 24 | input_passwd = browser.find_element_by_xpath('//input[@id="login-passwd"]') 25 | input_passwd.send_keys("xxx") 26 | # 验证码自动验证 27 | location_lists = fetch_images() 28 | offset = (get_offset(restore_images(cut_image_file, location_lists[0]), 29 | restore_images(full_image_file, location_lists[1]))) 30 | print("滑块偏移量:", offset) 31 | b_track = get_track(offset - 6) 32 | b_slider = get_slider() 33 | move_slider(b_slider, b_track) 34 | time.sleep(1) 35 | # 点击登录 36 | login_bt = browser.find_element_by_xpath('//a[@class="btn btn-login"]') 37 | login_bt.click() 38 | 39 | 40 | # 下载缺失的图片,每个小方块的坐标 41 | def fetch_images(): 42 | html = etree.HTML(browser.page_source) 43 | cut_bg = html.xpath('//div[@class="gt_cut_bg gt_show"]/div') 44 | full_bg = html.xpath('//div[@class="gt_cut_fullbg gt_show"]/div') 45 | # 提取两个打乱后顺序的webp图片URL替换为jpg 46 | cut_bg_url = url_fetch_regex.search((cut_bg[0].get('style'))).group(1).replace('webp', 'jpg') 47 | full_bg_url = url_fetch_regex.search((full_bg[0].get('style'))).group(1).replace('webp', 'jpg') 48 | with open(cut_image_file, 'wb+') as f: f.write(r.get(cut_bg_url).content) 49 | with open(full_image_file, 'wb+') as f: f.write(r.get(full_bg_url).content) 50 | # 采集图片定位坐标 51 | cut_bg_location_list = [] 52 | 
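# both backgrounds are delivered as shuffled 10px-wide slices; the position offsets parsed below record where each slice really belongs, so restore_images() can stitch the two 260x116 originals back together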
full_bg_location_list = [] 53 | for cut in cut_bg: 54 | cut_result = bg_postion_regex.search(cut.get('style')) 55 | full_result = bg_postion_regex.search(cut.get('style')) 56 | cut_bg_location_list.append({'x': int(cut_result.group(1)), 'y': int(cut_result.group(2))}) 57 | full_bg_location_list.append({'x': int(full_result.group(1)), 'y': int(full_result.group(2))}) 58 | return cut_bg_location_list, full_bg_location_list 59 | 60 | 61 | # 合并还原图片 62 | def restore_images(file, location_list): 63 | im = image.open(file) 64 | # 分段分成上面的图和下面的图列表 65 | below_list = [] 66 | above_list = [] 67 | for location in location_list: 68 | if location['y'] == -58: 69 | above_list.append(im.crop((abs(location['x']), 58, abs(location['x']) + 10, 116))) 70 | if location['y'] == 0: 71 | below_list.append(im.crop((abs(location['x']), 0, abs(location['x']) + 10, 58))) 72 | 73 | # 创建一个一样大的图片 74 | new_im = image.new('RGB', (260, 116)) 75 | # 遍历坐标粘贴上面的图片 76 | x_offset = 0 77 | for im in above_list: 78 | new_im.paste(im, (x_offset, 0)) 79 | x_offset += im.size[0] 80 | # 遍历坐标粘贴下面的图片 81 | x_offset = 0 82 | for im in below_list: 83 | new_im.paste(im, (x_offset, 58)) 84 | x_offset += im.size[0] 85 | # 保存图片 86 | new_im.save(file) 87 | return new_im 88 | 89 | 90 | # 判断两个像素点是否相同 91 | def is_pixel_equal(img1, img2, x, y): 92 | pix1 = img1.load()[x, y] 93 | pix2 = img2.load()[x, y] 94 | scope = 20 # 像素阀值 95 | return abs(pix1[0] - pix2[0] < scope) and abs(pix1[1] - pix2[1] < scope) and abs(pix1[2] - pix2[2] < scope) 96 | 97 | 98 | # 获得缺口偏移量 99 | def get_offset(img1, img2): 100 | left = 60 101 | for x in range(left, img1.size[0]): 102 | for y in range(img1.size[1]): 103 | if not is_pixel_equal(img1, img2, x, y): 104 | return x 105 | return left 106 | 107 | 108 | # 获取滑块 109 | def get_slider(): 110 | while True: 111 | try: 112 | slider = browser.find_element_by_xpath("//div[@class='gt_slider_knob gt_show']") 113 | break 114 | except: 115 | time.sleep(0.5) 116 | return slider 117 | 118 | 119 | # 滑块匀速滑动轨迹构造 120 | def get_track(distance): 121 | track = [] 122 | current = 0 123 | while current < distance: 124 | move = distance / 4 125 | current += move 126 | track.append(round(move)) 127 | return track 128 | 129 | 130 | # 先加速后减速滑动轨迹构造 131 | def get_person_track(distance): 132 | track = [] 133 | current = 0 134 | mid = distance * 4 / 5 # 减速阈值 135 | t = 0.2 # 计算间隔 136 | v = 0 # 初速度 137 | while current < distance: 138 | a = 2 if current < mid else -3 139 | v0 = v # 初速度v0 140 | v = v0 + a * t # 当前速度 141 | move = v0 * t + 1 / 2 * a * t * t # 移动距离 142 | current += move 143 | track.append(round(move)) 144 | return track 145 | 146 | 147 | # 滑块滑动的方法 148 | def move_slider(slider, track): 149 | ActionChains(browser).click_and_hold(slider).perform() 150 | for x in track: 151 | ActionChains(browser).move_by_offset(xoffset=x, yoffset=0).perform() 152 | time.sleep(0.05) 153 | ActionChains(browser).release().perform() 154 | 155 | 156 | if __name__ == '__main__': 157 | browser = webdriver.Chrome() 158 | wait = WebDriverWait(browser, 20) 159 | browser.get(bilibili_login_url) 160 | # 休眠2秒等待登录页加载完毕 161 | time.sleep(1) 162 | auto_login() 163 | time.sleep(5) 164 | print(browser.current_url) 165 | browser.quit() 166 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | 6_5.py → 知乎登录不再使用文字验证,而是使用滑动验证 4 | 5 | 6_6.py → B站登录不再使用验证,而是使用文件识别验证 
-------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__init__.py -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/middlewares.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/middlewares.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class BcyItem(scrapy.Item): 11 | author = scrapy.Field() 12 | pic_url = scrapy.Field() 13 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import 
os 10 | import random 11 | import logging 12 | 13 | 14 | class FirstspiderSpiderMiddleware(object): 15 | # Not all methods need to be defined. If a method is not defined, 16 | # scrapy acts as if the spider middleware does not modify the 17 | # passed objects. 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # This method is used by Scrapy to create your spiders. 22 | s = cls() 23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 24 | return s 25 | 26 | def process_spider_input(self, response, spider): 27 | # Called for each response that goes through the spider 28 | # middleware and into the spider. 29 | 30 | # Should return None or raise an exception. 31 | return None 32 | 33 | def process_spider_output(self, response, result, spider): 34 | # Called with the results returned from the Spider, after 35 | # it has processed the response. 36 | 37 | # Must return an iterable of Request, dict or Item objects. 38 | for i in result: 39 | yield i 40 | 41 | def process_spider_exception(self, response, exception, spider): 42 | # Called when a spider or process_spider_input() method 43 | # (from other spider middleware) raises an exception. 44 | 45 | # Should return either None or an iterable of Response, dict 46 | # or Item objects. 47 | pass 48 | 49 | def process_start_requests(self, start_requests, spider): 50 | # Called with the start requests of the spider, and works 51 | # similarly to the process_spider_output() method, except 52 | # that it doesn’t have a response associated. 53 | 54 | # Must return only requests (not items). 55 | for r in start_requests: 56 | yield r 57 | 58 | def spider_opened(self, spider): 59 | spider.logger.info('Spider opened: %s' % spider.name) 60 | 61 | 62 | class FirstspiderDownloaderMiddleware(object): 63 | # Not all methods need to be defined. If a method is not defined, 64 | # scrapy acts as if the downloader middleware does not modify the 65 | # passed objects. 66 | 67 | @classmethod 68 | def from_crawler(cls, crawler): 69 | # This method is used by Scrapy to create your spiders. 70 | s = cls() 71 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 72 | return s 73 | 74 | def process_request(self, request, spider): 75 | # Called for each request that goes through the downloader 76 | # middleware. 77 | 78 | # Must either: 79 | # - return None: continue processing this request 80 | # - or return a Response object 81 | # - or return a Request object 82 | # - or raise IgnoreRequest: process_exception() methods of 83 | # installed downloader middleware will be called 84 | return None 85 | 86 | def process_response(self, request, response, spider): 87 | # Called with the response returned from the downloader. 88 | 89 | # Must either; 90 | # - return a Response object 91 | # - return a Request object 92 | # - or raise IgnoreRequest 93 | return response 94 | 95 | def process_exception(self, request, exception, spider): 96 | # Called when a download handler or a process_request() 97 | # (from other downloader middleware) raises an exception. 
98 | 99 | # Must either: 100 | # - return None: continue processing this exception 101 | # - return a Response object: stops process_exception() chain 102 | # - return a Request object: stops process_exception() chain 103 | pass 104 | 105 | def spider_opened(self, spider): 106 | spider.logger.info('Spider opened: %s' % spider.name) 107 | 108 | 109 | class ProxyMiddleware(object): 110 | def __init__(self): 111 | self.proxy_ip_list = self.load_list_from_file() 112 | 113 | @staticmethod 114 | def load_list_from_file(): 115 | data_list = [] 116 | with open(os.path.join(os.getcwd(), 'proxy_ip.txt'), "r+", encoding='utf-8') as f: 117 | for ip in f: 118 | data_list.append(ip.replace("\n", "")) 119 | return data_list 120 | 121 | def process_request(self, request, spider): 122 | if request.meta.get('retry_times'): 123 | proxy = self.proxy_ip_list[random.randint(0, 175)] 124 | if proxy: 125 | proxy_ip = 'https://{proxy}'.format(proxy=proxy) 126 | logging.debug("使用了代理:", proxy_ip) 127 | request.meta['proxy'] = proxy_ip 128 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | 9 | 10 | class FirstspiderPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MySQLPipeline(): 16 | def __init__(self): 17 | self.host = 'localhost' 18 | self.database = 'bcy' 19 | self.user = 'root' 20 | self.password = 'Jay12345' 21 | self.port = 3306 22 | 23 | def open_spider(self, spider): 24 | self.db = pymysql.connect(self.host, self.user, self.password, self.database, charset='utf8', port=self.port) 25 | self.cursor = self.db.cursor() 26 | 27 | def close_spider(self, spider): 28 | self.db.close() 29 | 30 | def process_item(self, item, spider): 31 | data = dict(item) 32 | keys = ', '.join(data.keys()) 33 | values = ', '.join(["%s"] * len(data)) 34 | sql = "INSERT INTO draw (%s) VALUES (%s)" % (keys, values) 35 | self.cursor.execute(sql, tuple(data.values())) 36 | self.db.commit() 37 | return item 38 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/proxy_ip.txt: -------------------------------------------------------------------------------- 1 | 125.39.9.34:9000 2 | 183.129.244.17:21776 3 | 120.131.9.254:1080 4 | 117.28.97.64:808 5 | 120.92.174.37:1080 6 | 119.254.103.43:8000 7 | 219.141.153.4:80 8 | 103.205.14.254:53281 9 | 222.175.200.58:8060 10 | 60.13.187.162:63000 11 | 119.180.136.223:8060 12 | 111.47.192.141:8888 13 | 219.141.153.40:80 14 | 171.11.77.4:45592 15 | 221.2.174.28:8060 16 | 14.149.68.120:1080 17 | 61.150.113.74:8908 18 | 119.179.135.114:8060 19 | 39.135.24.12:80 20 | 183.2.203.24:9000 21 | 123.7.177.20:9999 22 | 125.72.70.46:8060 23 | 114.250.25.19:80 24 | 101.248.64.74:80 25 | 60.8.42.132:8908 26 | 119.179.133.58:8060 27 | 140.207.95.94:8060 28 | 123.249.88.153:9000 29 | 219.141.153.2:8080 30 | 119.179.175.60:8060 31 | 61.135.180.27:9000 32 | 112.24.107.102:8908 33 | 121.8.98.196:80 34 | 222.88.149.32:8060 35 | 121.8.98.198:80 36 | 183.234.38.213:63000 37 | 27.154.240.222:8060 38 | 123.161.62.150:9000 39 | 118.190.200.139:8080 40 | 219.150.189.212:9999 41 | 
219.145.197.203:8908 42 | 183.15.121.120:3128 43 | 219.141.153.44:80 44 | 221.14.140.130:80 45 | 121.8.98.197:80 46 | 221.2.175.214:8060 47 | 113.87.202.97:53281 48 | 113.128.198.50:8060 49 | 111.3.154.196:8060 50 | 60.13.156.45:8060 51 | 39.137.77.67:8080 52 | 222.222.243.124:8060 53 | 120.194.61.62:8060 54 | 221.1.205.74:8060 55 | 118.190.94.254:9001 56 | 123.161.62.151:9000 57 | 119.52.116.114:80 58 | 61.150.109.70:8908 59 | 101.81.48.234:1028 60 | 117.158.174.164:8060 61 | 222.208.208.33:8060 62 | 106.56.102.219:8070 63 | 124.118.27.3:8060 64 | 39.137.69.8:80 65 | 117.141.99.38:53281 66 | 183.63.101.62:55555 67 | 123.117.166.166:8060 68 | 163.125.114.218:8118 69 | 171.10.31.67:8080 70 | 223.93.145.186:8060 71 | 223.96.95.229:3128 72 | 61.150.113.27:8908 73 | 219.141.153.3:80 74 | 222.88.147.121:8060 75 | 120.236.128.201:8060 76 | 221.234.192.220:8010 77 | 61.150.113.75:8908 78 | 183.163.41.62:41766 79 | 221.2.174.99:8060 80 | 218.60.8.83:3129 81 | 125.39.9.35:9000 82 | 180.168.113.204:1080 83 | 111.205.6.206:8088 84 | 60.8.42.134:8908 85 | 219.141.153.35:80 86 | 61.135.18.206:8888 87 | 218.201.55.74:63000 88 | 183.246.84.229:8060 89 | 116.228.236.219:8080 90 | 121.17.18.218:8060 91 | 112.16.28.103:8060 92 | 61.149.137.110:80 93 | 175.10.87.16:8060 94 | 60.30.19.131:10010 95 | 39.137.69.10:8080 96 | 117.28.96.109:808 97 | 125.46.245.93:53281 98 | 211.136.127.125:80 99 | 219.141.153.41:80 100 | 180.119.141.11:8118 101 | 124.238.248.4:80 102 | 175.174.85.171:80 103 | 123.122.225.134:8888 104 | 221.194.108.8:8060 105 | 119.180.173.64:8060 106 | 119.179.135.132:8060 107 | 101.227.5.36:9000 108 | 61.150.113.28:8908 109 | 111.43.139.151:80 110 | 124.128.76.142:8060 111 | 112.24.107.109:8908 112 | 119.180.178.70:8060 113 | 106.12.3.84:80 114 | 111.3.122.245:8060 115 | 39.135.24.11:80 116 | 42.236.123.17:80 117 | 222.222.236.207:8060 118 | 113.231.247.131:80 119 | 39.137.69.7:80 120 | 120.92.142.64:8080 121 | 114.225.169.226:53128 122 | 112.24.107.101:8908 123 | 106.58.252.76:80 124 | 58.49.73.141:8888 125 | 116.196.105.136:80 126 | 221.193.177.45:8060 127 | 117.44.247.53:8908 128 | 221.2.174.6:8060 129 | 118.190.95.35:9001 130 | 39.137.69.9:8080 131 | 119.180.138.69:8060 132 | 221.2.174.3:8060 133 | 222.223.203.109:8060 134 | 117.66.167.30:8118 135 | 1.197.117.27:8060 136 | 221.176.206.29:8060 137 | 219.141.153.39:80 138 | 39.137.77.68:8080 139 | 58.49.72.141:8888 140 | 222.88.154.56:8060 141 | 39.137.77.66:80 142 | 59.48.237.6:8060 143 | 119.48.189.100:80 144 | 222.89.85.130:8060 145 | 106.12.22.41:8118 146 | 202.103.215.23:80 147 | 60.8.42.36:8908 148 | 117.177.243.6:80 149 | 218.244.44.194:8060 150 | 118.190.95.43:9001 151 | 219.141.153.34:80 152 | 106.56.102.35:8070 153 | 103.205.26.57:21776 154 | 117.131.235.198:8060 155 | 183.129.207.74:11493 156 | 58.247.46.123:8088 157 | 60.8.42.137:8908 158 | 117.156.234.3:8060 159 | 223.68.190.130:8181 160 | 222.88.147.104:8060 161 | 183.220.43.78:8080 162 | 123.146.216.14:80 163 | 60.8.42.15:8908 164 | 221.14.140.66:80 165 | 175.155.24.10:1133 166 | 119.180.161.173:8060 167 | 175.9.177.63:8060 168 | 182.254.145.163:1080 169 | 119.187.120.118:8060 170 | 202.100.83.139:80 171 | 183.129.207.73:13846 172 | 120.236.168.19:8060 173 | 219.141.153.6:80 174 | 211.159.171.58:80 175 | 221.1.84.241:8197 176 | 60.14.125.246:8908 -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/run.py: -------------------------------------------------------------------------------- 1 | from scrapy 
import cmdline 2 | 3 | cmdline.execute(["scrapy", "crawl", "bcy"]) 4 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for FirstSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'FirstSpider' 13 | 14 | SPIDER_MODULES = ['FirstSpider.spiders'] 15 | NEWSPIDER_MODULE = 'FirstSpider.spiders' 16 | 17 | ROBOTSTXT_OBEY = False 18 | 19 | 20 | DEFAULT_REQUEST_HEADERS = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 22 | 'Chrome/68.0.3440.106 Safari/537.36', 23 | 'Host': 'bcy.net', 24 | 'Origin': 'https://bcy.net', 25 | } 26 | 27 | DOWNLOADER_MIDDLEWARES = { 28 | 'FirstSpider.middlewares.ProxyMiddleware': 555 29 | } 30 | 31 | ITEM_PIPELINES = { 32 | 'FirstSpider.pipelines.MySQLPipeline': 300, 33 | } 34 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/bcy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/bcy.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/bcy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Request, Spider, Selector 3 | import datetime 4 | 5 | from FirstSpider.items import * 6 | 7 | 8 | def parse_index(response): 9 | items = response.xpath('//li[@class="js-smallCards _box"]') 10 | for item in items: 11 | bcy_item = BcyItem() 12 | bcy_item['author'] = item.xpath('a[@class="db posr ovf"]/@title').extract_first() 13 | bcy_item['pic_url'] = item.xpath('a/img/@src').extract_first().replace('/2X3', '') 14 | yield bcy_item 15 | 16 | 17 | class BcySpider(Spider): 18 | name = 'bcy' 19 | allowed_domains = ['bcy.net'] 20 | 21 | index_url = 'https://bcy.net/illust/toppost100?type=lastday&date={d}' 22 | 23 | 
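# both URL templates take a {d} date placeholder; start_requests() below fills it with every date in date_list (built by init_date_list)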
ajax_url = 'https://bcy.net/illust/index/ajaxloadtoppost?p=1&type=lastday&date={d}' 24 | 25 | date_list = [] # 日期范围列表 26 | 27 | ajax_headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 29 | 'Chrome/68.0.3440.106 Safari/537.36', 30 | 'Host': 'bcy.net', 31 | 'Origin': 'https://bcy.net', 32 | 'X-Requested-With': 'XMLHttpRequest' 33 | } 34 | 35 | def start_requests(self): 36 | self.init_date_list() 37 | for date in self.date_list: 38 | yield Request(self.index_url.format(d=date), callback=parse_index) 39 | for date in self.date_list: 40 | yield Request(self.ajax_url.format(d=date), callback=parse_index) 41 | 42 | # 构造一个日期列表 43 | def init_date_list(self): 44 | begin_date = datetime.datetime.strptime("20150918", "%Y%m%d") 45 | end_date = datetime.datetime.strptime("20180827", "%Y%m%d") 46 | while begin_date <= end_date: 47 | date_str = begin_date.strftime("%Y%m%d") 48 | self.date_list.append(date_str) 49 | begin_date += datetime.timedelta(days=1) 50 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = FirstSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = FirstSpider 12 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | ENV PATH /usr/local/bin:$PATH 3 | ADD . 
/code 4 | WORKDIR /code 5 | RUN pip3 install -r requirements.txt 6 | CMD scrapy crawl BingWallpaper -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing.json -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__init__.py -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BingItem(scrapy.Item): 12 | image_urls = scrapy.Field() 13 | images = scrapy.Field() 14 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BingSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class BingDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BingPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for bing project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'bing' 13 | 14 | SPIDER_MODULES = ['bing.spiders'] 15 | NEWSPIDER_MODULE = 'bing.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'bing (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 31 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | # DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | # } 45 | 46 | # Enable or disable spider middlewares 47 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'bing.middlewares.BingSpiderMiddleware': 543, 50 | # } 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 54 | # DOWNLOADER_MIDDLEWARES = { 55 | # 'bing.middlewares.BingDownloaderMiddleware': 543, 56 | # } 57 | 58 | # Enable or disable extensions 59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 60 | # EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | # } 63 | 64 | # Configure item pipelines 65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 66 | # ITEM_PIPELINES = { 67 | # 'bing.pipelines.BingPipeline': 300, 68 | # } 69 | 70 | 
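# Added note (not from the original settings): scrapy.pipelines.images.ImagesPipeline
# requires Pillow; it names each downloaded file after the SHA1 hash of its URL and
# stores it under a full/ sub-directory of IMAGES_STORE, which is why the saved
# wallpapers end up as out/res/pic/full/<sha1>.jpg.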
ITEM_PIPELINES = { 71 | # 引入Scrapy提供的ImagesPipeline组件 72 | 'scrapy.pipelines.images.ImagesPipeline': 300, 73 | } 74 | 75 | # ImagesPipeline辅助配置项 76 | # 图片存储路径(绝对路径或相对路径) 77 | IMAGES_STORE = 'out/res/pic/' 78 | # BingItem中定义的存储图片链接的image_urls字段 79 | IMAGES_URLS_FIELD = 'image_urls' 80 | # BingItem中定义的的images字段 81 | IMAGES_RESULT_FIELD='images' 82 | # 过期时间,单位:天(可选) 83 | IMAGES_EXPIRES = 120 84 | # 过滤小图片(可选) 85 | # IMAGES_MIN_HEIGHT = 110 86 | # IMAGES_MIN_WIDTH = 110 87 | # 是否允许重定向(可选) 88 | # MEDIA_ALLOW_REDIRECTS = True 89 | # 生成缩略图(可选) 90 | # IMAGES_THUMBS = { 91 | # 'small': (50, 50), 92 | # 'big': (270, 270), 93 | # } 94 | 95 | # Enable and configure the AutoThrottle extension (disabled by default) 96 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 97 | # AUTOTHROTTLE_ENABLED = True 98 | # The initial download delay 99 | # AUTOTHROTTLE_START_DELAY = 5 100 | # The maximum download delay to be set in case of high latencies 101 | # AUTOTHROTTLE_MAX_DELAY = 60 102 | # The average number of requests Scrapy should be sending in parallel to 103 | # each remote server 104 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 105 | # Enable showing throttling stats for every response received: 106 | # AUTOTHROTTLE_DEBUG = False 107 | 108 | # Enable and configure HTTP caching (disabled by default) 109 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 110 | # HTTPCACHE_ENABLED = True 111 | # HTTPCACHE_EXPIRATION_SECS = 0 112 | # HTTPCACHE_DIR = 'httpcache' 113 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 114 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 115 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/BingWallpaper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | import time 4 | import json 5 | 6 | from bing.items import BingItem 7 | 8 | 9 | class BingWallpaperSpider(Spider): 10 | name = 'BingWallpaper' 11 | allowed_domains = ['cn.bing.com'] 12 | 13 | def start_requests(self): 14 | yield Request( 15 | 'https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc={ts}&pid=hp'.format(ts=int(time.time())), 16 | callback=self.parse) 17 | 18 | def parse(self, response): 19 | json_result = json.loads(response.body.decode('utf8')) 20 | images = json_result['images'] 21 | if images is not None: 22 | item = BingItem() 23 | url_list = [] 24 | for image in images: 25 | url_list.append('https://cn.bing.com' + image['url']) 26 | item['image_urls'] = url_list 27 | yield item 28 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/Test.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | print(int(time.time())) -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/BingWallpaper.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/BingWallpaper.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/Test.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/Test.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104228.709049.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:42:29 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:42:29 [scrapy] ERROR: Spider error processing (referer: None) 3 | Traceback (most recent call last): 4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback 5 | yield next(it) 6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output 7 | for x in result: 8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in 9 | return (_set_referer(r) for r in result or ()) 10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in 11 | return (r for r in result or () if _filter(r)) 12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in 13 | return (r for r in result or () if _filter(r)) 14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse 15 | json_result = json.loads(response.body.decode('utf8')) 16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads 17 | return _default_decoder.decode(s) 18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode 19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode 21 | raise JSONDecodeError("Expecting value", s, err.value) from None 22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 23 | 2018-10-15 10:42:29 [scrapy] INFO: Closing spider (finished) 24 | 2018-10-15 10:42:29 [scrapy] INFO: Dumping Scrapy stats: 25 | {'downloader/request_bytes': 210, 26 | 'downloader/request_count': 1, 27 | 
'downloader/request_method_count/GET': 1, 28 | 'downloader/response_bytes': 48356, 29 | 'downloader/response_count': 1, 30 | 'downloader/response_status_count/200': 1, 31 | 'finish_reason': 'finished', 32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 42, 29, 916818), 33 | 'log_count/DEBUG': 1, 34 | 'log_count/ERROR': 1, 35 | 'log_count/INFO': 6, 36 | 'response_received_count': 1, 37 | 'scheduler/dequeued': 1, 38 | 'scheduler/dequeued/memory': 1, 39 | 'scheduler/enqueued': 1, 40 | 'scheduler/enqueued/memory': 1, 41 | 'spider_exceptions/JSONDecodeError': 1, 42 | 'start_time': datetime.datetime(2018, 10, 15, 2, 42, 29, 334376)} 43 | 2018-10-15 10:42:29 [scrapy] INFO: Spider closed (finished) 44 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104303.655633.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:43:03 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:43:04 [scrapy] ERROR: Spider error processing (referer: None) 3 | Traceback (most recent call last): 4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback 5 | yield next(it) 6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output 7 | for x in result: 8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in 9 | return (_set_referer(r) for r in result or ()) 10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in 11 | return (r for r in result or () if _filter(r)) 12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in 13 | return (r for r in result or () if _filter(r)) 14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse 15 | json_result = json.loads(response.body.decode('utf8')) 16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads 17 | return _default_decoder.decode(s) 18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode 19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode 21 | raise JSONDecodeError("Expecting value", s, err.value) from None 22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 23 | 2018-10-15 10:43:04 [scrapy] INFO: Closing spider (finished) 24 | 2018-10-15 10:43:04 [scrapy] INFO: Dumping Scrapy stats: 25 | {'downloader/request_bytes': 210, 26 | 'downloader/request_count': 1, 27 | 'downloader/request_method_count/GET': 1, 28 | 'downloader/response_bytes': 48361, 29 | 'downloader/response_count': 1, 30 | 'downloader/response_status_count/200': 1, 31 | 'finish_reason': 'finished', 32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 43, 4, 73509), 33 | 'log_count/DEBUG': 1, 34 | 'log_count/ERROR': 1, 35 | 'log_count/INFO': 6, 36 | 'response_received_count': 1, 37 | 'scheduler/dequeued': 1, 38 | 'scheduler/dequeued/memory': 1, 39 | 'scheduler/enqueued': 1, 40 | 'scheduler/enqueued/memory': 1, 41 | 'spider_exceptions/JSONDecodeError': 1, 42 | 
'start_time': datetime.datetime(2018, 10, 15, 2, 43, 3, 665598)} 43 | 2018-10-15 10:43:04 [scrapy] INFO: Spider closed (finished) 44 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104348.228406.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:43:48 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:43:48 [scrapy] ERROR: Spider error processing (referer: None) 3 | Traceback (most recent call last): 4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback 5 | yield next(it) 6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output 7 | for x in result: 8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in 9 | return (_set_referer(r) for r in result or ()) 10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in 11 | return (r for r in result or () if _filter(r)) 12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in 13 | return (r for r in result or () if _filter(r)) 14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse 15 | json_result = json.loads(response.body.decode('utf8')) 16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads 17 | return _default_decoder.decode(s) 18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode 19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode 21 | raise JSONDecodeError("Expecting value", s, err.value) from None 22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 23 | 2018-10-15 10:43:48 [scrapy] INFO: Closing spider (finished) 24 | 2018-10-15 10:43:48 [scrapy] INFO: Dumping Scrapy stats: 25 | {'downloader/request_bytes': 210, 26 | 'downloader/request_count': 1, 27 | 'downloader/request_method_count/GET': 1, 28 | 'downloader/response_bytes': 48360, 29 | 'downloader/response_count': 1, 30 | 'downloader/response_status_count/200': 1, 31 | 'finish_reason': 'finished', 32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 43, 48, 681197), 33 | 'log_count/DEBUG': 1, 34 | 'log_count/ERROR': 1, 35 | 'log_count/INFO': 6, 36 | 'response_received_count': 1, 37 | 'scheduler/dequeued': 1, 38 | 'scheduler/dequeued/memory': 1, 39 | 'scheduler/enqueued': 1, 40 | 'scheduler/enqueued/memory': 1, 41 | 'spider_exceptions/JSONDecodeError': 1, 42 | 'start_time': datetime.datetime(2018, 10, 15, 2, 43, 48, 238379)} 43 | 2018-10-15 10:43:48 [scrapy] INFO: Spider closed (finished) 44 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104841.872511.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:48:42 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:48:42 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:48:42 [scrapy] 
DEBUG: Crawled (200) (referer: None) 4 | 2018-10-15 10:48:42 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 5 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 6 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 7 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 8 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 9 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 10 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 11 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 12 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 13 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 14 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 15 | 2018-10-15 10:48:43 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp> 16 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 17 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 18 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 19 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 20 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 21 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 22 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 23 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 24 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 26 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 27 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 29 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 30 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 32 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 33 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 35 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 36 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 38 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 39 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 40 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 41 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 42 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 43 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 44 | 2018-10-15 10:48:43 [scrapy] INFO: Closing spider (finished) 45 | 2018-10-15 10:48:43 [scrapy] INFO: Dumping Scrapy stats: 46 | {'downloader/request_bytes': 3614, 47 | 'downloader/request_count': 7, 48 | 'downloader/request_method_count/GET': 7, 49 | 'downloader/response_bytes': 1842731, 50 | 
'downloader/response_count': 7, 51 | 'downloader/response_status_count/200': 7, 52 | 'file_count': 7, 53 | 'file_status_count/downloaded': 6, 54 | 'file_status_count/uptodate': 1, 55 | 'finish_reason': 'finished', 56 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 48, 43, 629811), 57 | 'item_scraped_count': 1, 58 | 'log_count/DEBUG': 15, 59 | 'log_count/INFO': 6, 60 | 'response_received_count': 7, 61 | 'scheduler/dequeued': 1, 62 | 'scheduler/dequeued/memory': 1, 63 | 'scheduler/enqueued': 1, 64 | 'scheduler/enqueued/memory': 1, 65 | 'start_time': datetime.datetime(2018, 10, 15, 2, 48, 41, 884479)} 66 | 2018-10-15 10:48:43 [scrapy] INFO: Spider closed (finished) 67 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104922.591600.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:49:22 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 10:49:23 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 
'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 10:49:23 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 10:49:23 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 49, 23, 63339), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 49, 22, 600576)} 59 | 2018-10-15 10:49:23 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T105002.320386.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:50:02 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 10:50:02 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 
'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 10:50:02 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 10:50:02 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 50, 2, 820051), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 50, 2, 331356)} 59 | 2018-10-15 10:50:02 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T105902.809743.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:59:03 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 10:59:03 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539572342&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 
'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 10:59:03 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 10:59:03 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 59, 3, 536799), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 59, 3, 51096)} 59 | 2018-10-15 10:59:03 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T113038.987323.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 11:30:39 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image 
from referred in 9 | 2018-10-15 11:30:39 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539574239&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 11:30:39 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 11:30:39 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 3, 30, 39, 713384), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 3, 30, 39, 227680)} 59 | 2018-10-15 11:30:39 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T120654.496911.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 12:06:54 [scrapy] DEBUG: Crawled (200) (referer: None) 2 
| 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 12:06:55 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539575848&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 12:06:55 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 12:06:55 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 4, 6, 55, 222970), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 
'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 4, 6, 54, 748238)} 59 | 2018-10-15 12:06:55 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/033317f07b809f0cd06487b30b29eccb26d063b8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/033317f07b809f0cd06487b30b29eccb26d063b8.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/0698af79b195349b838bdfeebbd11409f82f0f38.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/0698af79b195349b838bdfeebbd11409f82f0f38.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/092235104f84cb2f4de8808c10f655298313f65c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/092235104f84cb2f4de8808c10f655298313f65c.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/2efd29b32c481136507115a3ee2e6181c122aa0b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/2efd29b32c481136507115a3ee2e6181c122aa0b.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/4099096a19a0eaad0aef6782a206881d948ad775.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/4099096a19a0eaad0aef6782a206881d948ad775.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/486c568e353051efd0959cc4a424ff9093cfceb9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/486c568e353051efd0959cc4a424ff9093cfceb9.jpg -------------------------------------------------------------------------------- 
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/599f27e7835da59b495c44297cce0553ee4a0b51.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/599f27e7835da59b495c44297cce0553ee4a0b51.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/885648740905a26703e18c1ae24f23c480ecc822.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/885648740905a26703e18c1ae24f23c480ecc822.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/97e86cde9a308e626f537c107303537ec598903c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/97e86cde9a308e626f537c107303537ec598903c.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg 
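The 10:42 and 10:43 runs in this chapter's logs/BingWallpaper logs above end with spider_exceptions/JSONDecodeError: line 17 of BingWallpaper.py calls json.loads(response.body.decode('utf8')) and the response body was not JSON, so those runs finish without scraping anything; the later runs (10:48 onwards) parse the HPImageArchive response successfully. The sketch below shows one way such a parse step can guard against a non-JSON body. It is an illustration only, with an assumed class name and assumed JSON field names, and is not the project's actual BingWallpaper.py.

import json

import scrapy


class BingWallpaperSketch(scrapy.Spider):
    """Hypothetical, simplified stand-in for bing/spiders/BingWallpaper.py."""
    name = 'BingWallpaperSketch'
    start_urls = ['https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7']

    def parse(self, response):
        try:
            data = json.loads(response.text)  # response.text decodes the body for us
        except json.JSONDecodeError:
            # This is the failure mode visible in the 10:42/10:43 logs above.
            self.logger.error('Non-JSON response from %s', response.url)
            return
        # Assumed response shape: {"images": [{"url": "/az/hprichbg/rb/....jpg", ...}, ...]}
        yield {
            'image_urls': ['https://cn.bing.com' + img['url'] for img in data.get('images', [])]
        }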
-------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/c14461fb44425865b9afe6695ab5926e2001411c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/c14461fb44425865b9afe6695ab5926e2001411c.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/cbba4b16b644659920ad93e10a6d3478270ce927.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/cbba4b16b644659920ad93e10a6d3478270ce927.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e254600d400f3c54c77171e02b021d46369788ae.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e254600d400f3c54c77171e02b021d46369788ae.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/ed989d9c858c5290ca559cf2c462cace68e49362.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/ed989d9c858c5290ca559cf2c462cace68e49362.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.5.1 2 | Pillow==5.2.0 -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/run.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute(["scrapy", "crawl", "BingWallpaper"]) 4 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bing.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bing 12 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/勘误.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/勘误.md -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__init__.py -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/middlewares.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/middlewares.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class JianshuspiderItem(Item): 12 | title = Field() 13 | content = Field() 14 | url = Field() 15 | nickname = Field() 16 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | from selenium import webdriver 8 | from scrapy import signals 9 | from scrapy.http import HtmlResponse 10 | 11 | 12 | class JianshuspiderSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(self, start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.logger.info('Spider opened: %s' % spider.name) 58 | 59 | 60 | class JianshuspiderDownloaderMiddleware(object): 61 | # Not all methods need to be defined. If a method is not defined, 62 | # scrapy acts as if the downloader middleware does not modify the 63 | # passed objects. 64 | 65 | @classmethod 66 | def from_crawler(cls, crawler): 67 | # This method is used by Scrapy to create your spiders. 68 | s = cls() 69 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 70 | return s 71 | 72 | def process_request(self, request, spider): 73 | # Called for each request that goes through the downloader 74 | # middleware. 75 | 76 | # Must either: 77 | # - return None: continue processing this request 78 | # - or return a Response object 79 | # - or return a Request object 80 | # - or raise IgnoreRequest: process_exception() methods of 81 | # installed downloader middleware will be called 82 | return None 83 | 84 | def process_response(self, request, response, spider): 85 | # Called with the response returned from the downloader. 86 | 87 | # Must either; 88 | # - return a Response object 89 | # - return a Request object 90 | # - or raise IgnoreRequest 91 | return response 92 | 93 | def process_exception(self, request, exception, spider): 94 | # Called when a download handler or a process_request() 95 | # (from other downloader middleware) raises an exception. 
96 | 97 | # Must either: 98 | # - return None: continue processing this exception 99 | # - return a Response object: stops process_exception() chain 100 | # - return a Request object: stops process_exception() chain 101 | pass 102 | 103 | def spider_opened(self, spider): 104 | spider.logger.info('Spider opened: %s' % spider.name) 105 | 106 | 107 | class JSSeleniumMiddleware: 108 | def __init__(self): 109 | self.browser = webdriver.Chrome() 110 | 111 | def __del__(self): 112 | self.browser.close() 113 | 114 | def process_request(self, request, spider): 115 | self.browser.get("https://www.jianshu.com/") 116 | return HtmlResponse(url='https://www.jianshu.com/', body=self.browser.page_source, request=request, 117 | encoding='utf-8', status=200) 118 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | 10 | class JianshuspiderPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MongoPipeline(object): 16 | def open_spider(self, spider): 17 | self.client = pymongo.MongoClient(host='localhost', port=27017) 18 | self.db = self.client['js'] 19 | 20 | def process_item(self, item, spider): 21 | self.db['index_article'].insert(dict(item)) 22 | 23 | def close_spider(self, spider): 24 | self.client.close() 25 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jianshuspider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jianshuspider' 13 | 14 | SPIDER_MODULES = ['jianshuspider.spiders'] 15 | NEWSPIDER_MODULE = 'jianshuspider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'jianshuspider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'jianshuspider.middlewares.JianshuspiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'jianshuspider.middlewares.JSSeleniumMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'jianshuspider.pipelines.MongoPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/jianshu.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/jianshu.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/jianshu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | 4 | from jianshuspider.items import JianshuspiderItem 5 | 6 | 7 | class JianshuSpider(Spider): 8 | name = 'jianshu' 9 | allowed_domains = ['www.jianshu.com'] 10 | start_urls = ['http://www.jianshu.com/'] 11 | 12 | def start_requests(self): 13 | yield Request('https://www.jianshu.com', callback=self.parse) 14 | 15 | def parse(self, response): 16 | li_s = response.xpath('//ul[@class="note-list"]/li') 17 | for li in li_s: 18 | item = JianshuspiderItem() 19 | item['title'] = li.xpath('.//div/a[@class="title"]/text()').extract_first() 20 | item['content'] = str(li.xpath('.//div/p[@class="abstract"]/text()').extract_first()).replace( 21 | " ", "").replace( 22 | "\n", "") 23 | item['url'] = 'https://www.jianshu.com/p/' + str( 24 | li.xpath('.//div/a[@class="title"]/@href').extract_first()) 25 | item['nickname'] = li.xpath('.//div/a[@class="nickname"]/text()').extract_first() 26 | yield item 27 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.4.4 2 | APScheduler==3.5.3 3 | asn1crypto==0.24.0 4 | async-timeout==3.0.1 5 | attrs==18.2.0 6 | Automat==0.7.0 7 | certifi==2018.8.24 8 | cffi==1.11.5 9 | chardet==3.0.4 10 | Click==7.0 11 | constantly==15.1.0 12 | cryptography==2.3.1 13 | cssselect==1.0.3 14 | demjson==2.2.4 15 | docopt==0.6.2 16 | Flask==1.0.2 17 | hyperlink==18.0.0 18 | idna==2.7 19 | incremental==17.5.0 20 | itsdangerous==0.24 21 | Jinja2==2.10 22 | lxml==4.2.5 23 | MarkupSafe==1.0 24 | multidict==4.4.2 25 | parsel==1.5.0 26 | Pillow==5.2.0 27 | pipreqs==0.4.9 28 | pyasn1==0.4.4 29 | pyasn1-modules==0.2.2 30 | pycparser==2.19 31 | PyDispatcher==2.0.5 32 | PyHamcrest==1.9.0 33 | pymongo==3.7.2 34 | PyMySQL==0.9.2 35 | pyOpenSSL==18.0.0 36 | pytz==2018.5 37 | pywin32==223 38 | queuelib==1.5.0 39 | redis==2.10.6 40 | requests==2.19.1 41 | Scrapy==1.5.1 42 | scrapyrt==0.10 43 | selenium==3.14.1 44 | service-identity==17.0.0 45 | six==1.11.0 46 | 
Twisted==18.7.0 47 | tzlocal==1.5.1 48 | urllib3==1.23 49 | w3lib==1.19.0 50 | Werkzeug==0.14.1 51 | yarg==0.1.9 52 | yarl==1.2.6 53 | zope.interface==4.5.0 54 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jianshuspider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jianshuspider 12 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ip_check.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import aiohttp 4 | from aiohttp import ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError 5 | from redis import StrictRedis 6 | 7 | test_url = 'https://ip.cn/' 8 | 9 | 10 | class ProxyCheck: 11 | def __init__(self): 12 | self.redis_db = StrictRedis( 13 | host="127.0.0.1", 14 | port=6379, 15 | password="Jay12345", 16 | db=0 17 | ) 18 | 19 | # 检测代理IP是否可用 20 | async def check_ip(self, proxy_ip): 21 | conn = aiohttp.TCPConnector(ssl=False) 22 | async with aiohttp.ClientSession(connector=conn) as session: 23 | try: 24 | async with session.get(test_url, proxy=proxy_ip.replace("https", "http"), headers={ 25 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 26 | 'Chrome/68.0.3440.106 Safari/537.36' 27 | }) as resp: 28 | if resp.status in [200]: 29 | print("代理可用:", proxy_ip) 30 | else: 31 | print("移除不可用代理ip:", proxy_ip) 32 | self.redis_db.srem('proxy_ips:proxy_pool', proxy_ip) 33 | except (ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError, TimeoutError, 34 | AttributeError): 35 | print("代理请求失败移除代理ip:", proxy_ip) 36 | self.redis_db.srem('proxy_ips:proxy_pool', proxy_ip) 37 | 38 | def check_all_ip(self): 39 | print("开始检测代理ip是否可用") 40 | loop = asyncio.get_event_loop() 41 | tasks = [] 42 | for ip in self.redis_db.smembers('proxy_ips:proxy_pool'): 43 | tasks.append(self.check_ip(ip.decode())) 44 | loop.run_until_complete(asyncio.wait(tasks)) 45 | 46 | 47 | if __name__ == '__main__': 48 | ProxyCheck().check_all_ip() 49 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__init__.py -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ProxyIpsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ProxyIpsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ProxyIpsDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ProxyIpsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for proxy_ips project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'proxy_ips' 13 | 14 | SPIDER_MODULES = ['proxy_ips.spiders'] 15 | NEWSPIDER_MODULE = 'proxy_ips.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'proxy_ips (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 45 | 'Chrome/68.0.3440.106 Safari/537.36', 46 | 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'proxy_ips.middlewares.ProxyIpsSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'proxy_ips.middlewares.ProxyIpsDownloaderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | # EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | # } 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | # ITEM_PIPELINES = { 70 | # 'proxy_ips.pipelines.ProxyIpsPipeline': 300, 71 | # } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | # AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | # AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | # AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | # HTTPCACHE_ENABLED = True 89 | # HTTPCACHE_EXPIRATION_SECS = 0 90 | # HTTPCACHE_DIR = 'httpcache' 91 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
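The DOWNLOADER_MIDDLEWARES block above is left commented out, so nothing in the proxy_ips project itself routes outgoing requests through the proxy pool that the fetch_ip spider builds. Below is a minimal sketch, not part of the original code, of how that could be wired up: it assumes the Flask service in proxy_server.py is running at its default address http://127.0.0.1:5000/, and the class name RandomProxyMiddleware (placed in proxy_ips/middlewares.py) is purely illustrative.

import requests


class RandomProxyMiddleware(object):
    """Downloader middleware that asks the local pool service for a proxy on each request."""

    def process_request(self, request, spider):
        try:
            # proxy_server.py returns one random pool entry such as "http://1.2.3.4:8080"
            proxy = requests.get('http://127.0.0.1:5000/', timeout=3).text.strip()
            if proxy:
                request.meta['proxy'] = proxy
        except requests.RequestException:
            spider.logger.warning("Proxy pool service unreachable; sending the request without a proxy")

It would then be enabled by uncommenting the DOWNLOADER_MIDDLEWARES setting above and pointing it at the class, e.g. 'proxy_ips.middlewares.RandomProxyMiddleware': 543.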
-------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/proxy_spider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/proxy_spider.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/proxy_spider.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import aiohttp 4 | from aiohttp import ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError 5 | from redis import StrictRedis 6 | from scrapy import Spider, Request 7 | import time 8 | 9 | test_url = 'https://ip.cn/' 10 | 11 | 12 | # 获取代理IP的爬虫 13 | class FetchIpSpider(Spider): 14 | name = "fetch_ip" 15 | 16 | def __init__(self, **kwargs): 17 | super().__init__(**kwargs) 18 | self.redis_db = StrictRedis( 19 | host="127.0.0.1", 20 | port=6379, 21 | password="Jay12345", 22 | db=0 23 | ) 24 | 25 | def start_requests(self): 26 | # for i in range(1, 5): 27 | # yield Request(url="http://www.xicidaili.com/nn/" + str(i), callback=self.parse_xici, headers={ 28 | # 'Host': 'www.xicidaili.com', 29 | # 'Referer': 'http://www.xicidaili.com/', 30 | # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 31 | # 'Chrome/68.0.3440.106 Safari/537.36' 32 | # }) 33 | 34 | for i in range(1, 5): 35 | time.sleep(3) 36 | yield Request(url='https://www.kuaidaili.com/free/inha/' + str(i) + '/', callback=self.parse_kuaidaili, 37 | headers={ 38 | 'Host': 'www.kuaidaili.com', 39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ' 40 | 'like Gecko) ' 41 | 'Chrome/68.0.3440.106 Safari/537.36' 42 | }) 43 | 44 | def parse_xici(self, response): 45 | loop = asyncio.get_event_loop() 46 | proxy_ips = [] 47 | for tr in response.css('#ip_list tr'): 48 | td_list = tr.css('td::text') 49 | if len(td_list) < 3: 50 | continue 51 | ip_address = td_list[0].extract() # IP 52 | port = td_list[1].extract() # 端口 53 | if len(td_list) == 11: 54 | proto = td_list[4].extract() 55 | else: 56 | proto = td_list[5].extract() # 协议类型 57 | proxy_ip = '%s://%s:%s' % (proto.lower(), ip_address, port) 58 | # 获取响应时间,超过2s的丢弃 59 | latency = tr.css('div.bar::attr(title)').re_first('(\d+\.\d+)') 60 | if float(latency) > 2: 61 | self.logger.info("跳过慢速代理:%s 响应时间:%s" % (proxy_ip, latency)) 62 | else: 63 | 
self.logger.info("可用代理加入队列:%s 响应时间:%s" % (proxy_ip, latency)) 64 | proxy_ips.append(proxy_ip) 65 | tasks = [] 66 | for ip in proxy_ips: 67 | tasks.append(self.check_ip(ip)) 68 | loop.run_until_complete(asyncio.wait(tasks)) 69 | 70 | 71 | def parse_kuaidaili(self, response): 72 | loop = asyncio.get_event_loop() 73 | proxy_ips = [] 74 | for tr in response.css('tbody tr'): 75 | td_list = tr.css('td::text') 76 | ip_address = td_list[0].extract() # IP 77 | port = td_list[1].extract() # 端口 78 | proto = td_list[3].extract() # 协议 79 | proxy_ip = '%s://%s:%s' % (proto.lower(), ip_address, port) 80 | # 获取响应时间,超过2s的丢弃 81 | latency = float((td_list[5].extract())[:-1]) 82 | if float(latency) > 2: 83 | self.logger.info("跳过慢速代理:%s 响应时间:%s" % (proxy_ip, latency)) 84 | else: 85 | self.logger.info("可用代理加入队列:%s 响应时间:%s" % (proxy_ip, latency)) 86 | proxy_ips.append(proxy_ip) 87 | tasks = [] 88 | for ip in proxy_ips: 89 | tasks.append(self.check_ip(ip)) 90 | loop.run_until_complete(asyncio.wait(tasks)) 91 | 92 | # 检测代理IP是否可用 93 | async def check_ip(self, proxy_ip): 94 | conn = aiohttp.TCPConnector(ssl=False) 95 | async with aiohttp.ClientSession(connector=conn) as session: 96 | try: 97 | async with session.get(test_url, proxy=proxy_ip.replace("https", "http")) as resp: 98 | if resp.status in [200]: 99 | print("代理可用:", proxy_ip) 100 | self.redis_db.sadd('proxy_ips:proxy_pool', proxy_ip) 101 | else: 102 | print("代理不可用:", proxy_ip) 103 | except (ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError, TimeoutError, 104 | AttributeError): 105 | print("代理请求失败:", proxy_ip) 106 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_server.py: -------------------------------------------------------------------------------- 1 | # coding =utf-8 2 | from flask import Flask 3 | from redis import StrictRedis 4 | import random 5 | 6 | app = Flask(__name__) 7 | 8 | 9 | @app.route("/") 10 | def fetch_ip(): 11 | ip_list = list(redis_db.smembers("proxy_ips:proxy_pool")) 12 | return random.choice(ip_list).decode() 13 | 14 | 15 | if __name__ == '__main__': 16 | redis_db = StrictRedis( 17 | host="127.0.0.1", 18 | port=6379, 19 | password="Jay12345", 20 | db=0 21 | ) 22 | app.run() 23 | 24 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from apscheduler.schedulers.blocking import BlockingScheduler 5 | from redis import StrictRedis 6 | 7 | fetch_ip_time = 0 8 | 9 | redis_db = StrictRedis( 10 | host="127.0.0.1", 11 | port=6379, 12 | password="Jay12345", 13 | db=0 14 | ) 15 | 16 | 17 | def check_ip(): 18 | global fetch_ip_time 19 | proxy_poll = redis_db.smembers("proxy_ips:proxy_pool") 20 | if len(proxy_poll) == 0: 21 | print("可用代理IP数目为0,激活爬虫...") 22 | os.system("scrapy crawl fetch_ip") 23 | fetch_ip_time = int(time.time()) 24 | else: 25 | if len(proxy_poll) < 5: 26 | if int(time.time() - fetch_ip_time) < 600: 27 | if len(proxy_poll) == 0: 28 | print("虽然处于保护状态,但是可用代理IP数目为0,激活爬虫...") 29 | os.system("scrapy crawl fetch_ip") 30 | fetch_ip_time = int(time.time()) 31 | else: 32 | print("当前可用代理IP少于5,但是还处于保护状态,不激活爬虫") 33 | else: 34 | print("当前可用代理IP少于5,且处于非保护状态,激活爬虫...") 35 | os.system("scrapy crawl fetch_ip") 36 | fetch_ip_time = int(time.time()) 37 | else: 38 | print("日常自检...") 39 | os.system("python proxy_ip_check.py") 40 | 41 | 42 | if __name__ == 
'__main__': 43 | check_ip() 44 | scheduler = BlockingScheduler() 45 | # 每隔20s执行一次 46 | scheduler.add_job(check_ip, 'interval', max_instances=10, seconds=20) 47 | scheduler.start() 48 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = proxy_ips.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = proxy_ips 12 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/勘误.md -------------------------------------------------------------------------------- /9、数据分析案例:Python岗位行情/代码/9_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | numpy使用代码示例 3 | """ 4 | 5 | import numpy as np 6 | 7 | print("1.生成一个一维数组:\n %s" % np.array([1, 2])) 8 | print("2.生成一个二维数组:\n %s" % np.array([[1, 2], [3, 4]])) 9 | print("3.生成一个元素初始值都为0的,4行3列矩阵:\n %s" % np.zeros((4, 3))) 10 | print("4.生成一个元素初始值都为1的,3行4列矩阵:\n %s" % np.ones((3, 4))) 11 | print("5.创建一个空数组,元素为随机值:\n %s" % np.empty([2, 3], dtype=int)) 12 | a1 = np.arange(0, 30, 2) 13 | print("6.生成一个等间隔数字的数组:\n %s" % a1) 14 | a2 = a1.reshape(3, 5) 15 | print("7.转换数组的维度,比如把一维的转为3行5列的数组:\n %s" % a2) 16 | 17 | # ndarray常用属性 18 | print("8.a1的维度: %d \t a2的维度:%d" % (a1.ndim, a2.ndim)) 19 | print("9.a1的行列数:%s \t a2的行列数:%s" % (a1.shape, a2.shape)) 20 | print("10.a1的元素个数:%d \t a2的元素个数:%d" % (a1.size, a2.size)) 21 | print("11.a1的元素数据类型:%s 数据类型大小:%s" % (a1.dtype, a1.itemsize)) 22 | -------------------------------------------------------------------------------- /9、数据分析案例:Python岗位行情/代码/9_2.py: -------------------------------------------------------------------------------- 1 | # 拉勾网Android招聘数据分析 2 | import html 3 | import random 4 | import re 5 | import time 6 | import urllib.parse 7 | from collections import Counter 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import requests 13 | from scipy.misc import imread 14 | from wordcloud import WordCloud, ImageColorGenerator 15 | 16 | import config as c 17 | import tools as t 18 | 19 | max_page = 1 20 | result_save_file = c.outputs_logs_path + 'result.csv' 21 | pic_save_path = c.outputs_pictures_path + 'LaGou/' 22 | default_font = c.res_documents + 'wryh.ttf' # 生成词云用的默认字体 23 | default_mask = c.res_pictures + 'default_mask.jpg' # 默认遮罩图片 24 | 25 | # Ajax加载url 26 | ajax_url = "https://www.lagou.com/jobs/positionAjax.json?" 
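# --- Added annotation (not part of the original 9_2.py) ---
# The Lagou position list is loaded via Ajax: fetch_data() below POSTs to the URL
# above (plus the query-string parameters in request_params), with the search
# keyword ('kd') and the page number ('pn') carried in form_data. On the first
# page, the total page count is derived from the "totalCount" value matched by
# page_pattern (15 results per page). Note that the comment inside fetch_data()
# mentions a random 5-15 s pause between requests, but no time.sleep() call is
# actually made; adding time.sleep(random.randint(5, 15)) before requests.post()
# would make the behaviour match that comment (both modules are already imported).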
27 | 28 | # url拼接参数 29 | request_params = {'needAddtionalResult': 'false'} 30 | 31 | # post提交参数 32 | form_data = {'first': 'false', 'pn': '1', 'kd': 'Python'} 33 | 34 | # 获得页数的正则 35 | page_pattern = re.compile('"totalCount":(\d*),', re.S) 36 | 37 | # csv表头 38 | csv_headers = [ 39 | '公司id', '城市', '职位名称', '工作年限', '学历', '职位性质', '薪资', 40 | '融资状态', '行业领域', '招聘岗位id', '公司优势', '公司规模', 41 | '公司标签', '所在区域', '技能标签', '公司经度', '公司纬度', '公司全名' 42 | ] 43 | 44 | # 模拟请求头 45 | ajax_headers = { 46 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 47 | 'Accept-Encoding': 'gzip, deflate, br', 48 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 49 | 'Connection': 'keep-alive', 50 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 51 | 'Host': 'www.lagou.com', 52 | 'Origin': 'https://www.lagou.com', 53 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 ' 54 | 'Safari/537.36', 55 | 'X-Anit-Forge-Code': '0', 56 | 'X-Anit-Forge-Token': 'None', 57 | 'X-Requested-With': 'XMLHttpRequest', 58 | 'Referer': 'https://www.lagou.com/jobs/list_android?labelWords=&fromSearch=true&suginput=' 59 | } 60 | 61 | 62 | # 获取每页招聘信息 63 | def fetch_data(page): 64 | fetch_url = ajax_url + urllib.parse.urlencode(request_params) 65 | global max_page 66 | while True: 67 | try: 68 | form_data['pn'] = page 69 | print("抓取第:" + str(page) + "页!") 70 | # 随机休眠5-15s,避免因为访问过于频繁ip被封 71 | resp = requests.post(url=fetch_url, data=form_data, headers=ajax_headers) 72 | if resp.status_code == 200: 73 | if page == 1: 74 | max_page = int(int(page_pattern.search(resp.text).group(1)) / 15) 75 | print("总共有:" + str(max_page) + "页") 76 | data_json = resp.json()['content']['positionResult']['result'] 77 | data_list = [] 78 | for data in data_json: 79 | data_list.append((data['companyId'], 80 | data['city'], 81 | html.unescape(data['positionName']), 82 | data['workYear'], 83 | data['education'], 84 | data['jobNature'], 85 | data['salary'], 86 | data['financeStage'], 87 | data['industryField'], 88 | data['positionId'], 89 | html.unescape(data['positionAdvantage']), 90 | data['companySize'], 91 | data['companyLabelList'], 92 | data['district'], 93 | html.unescape(data['positionLables']), 94 | data['longitude'], 95 | data['latitude'], 96 | html.unescape(data['companyFullName']))) 97 | result = pd.DataFrame(data_list) 98 | if page == 1: 99 | result.to_csv(result_save_file, header=csv_headers, index=False, mode='a+') 100 | else: 101 | result.to_csv(result_save_file, header=False, index=False, mode='a+') 102 | return None 103 | except Exception as e: 104 | print(e) 105 | 106 | 107 | # 生成词云文件 108 | def make_wc(content, file_name, mask_pic=default_mask, font=default_font): 109 | bg_pic = imread(mask_pic) 110 | pic_colors = ImageColorGenerator(bg_pic) 111 | wc = WordCloud(font_path=font, background_color='white', margin=2, max_font_size=250, 112 | width=2000, height=2000, 113 | min_font_size=30, max_words=1000) 114 | wc.generate_from_frequencies(content) 115 | wc.to_file(file_name) 116 | 117 | 118 | # 数据分析方法(生成相关文件) 119 | def data_analysis(data): 120 | # 1.分析招聘公司的相关信息 121 | # 行业领域 122 | industry_field_list = [] 123 | for industry_field in data['行业领域']: 124 | for field in industry_field.strip().replace(" ", ",").replace("、", ",").split(','): 125 | industry_field_list.append(field) 126 | counter = dict(Counter(industry_field_list)) 127 | counter.pop('') 128 | make_wc(counter, pic_save_path + "wc_1.jpg") 129 | 130 | # 公司规模 131 | plt.figure(1) 132 | 
data['公司规模'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=np.linspace(0, 0.5, 6)) 133 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 134 | bottom=0.17, top=0.84) 135 | plt.savefig(pic_save_path + 'result_1.jpg') 136 | plt.close(1) 137 | # 融资状态 138 | plt.figure(2) 139 | data['融资状态'].value_counts().plot(kind='pie', autopct='%1.1f%%') 140 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 141 | bottom=0.17, top=0.84) 142 | plt.savefig(pic_save_path + 'result_2.jpg') 143 | plt.close(2) 144 | # 所在区域 145 | plt.figure(3) 146 | data['所在区域'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=[0, 0, 0, 0, 0, 0, 0, 1, 1.5]) 147 | plt.subplots_adjust(left=0.31, right=0.74, wspace=0.20, hspace=0.20, 148 | bottom=0.26, top=0.84) 149 | plt.savefig(pic_save_path + 'result_3.jpg') 150 | plt.close(3) 151 | # 公司标签 152 | tags_list = [] 153 | for tags in data['公司标签']: 154 | for tag in tags.strip().replace("[", "").replace("]", "").replace("'", "").split(','): 155 | tags_list.append(tag) 156 | counter = dict(Counter(tags_list)) 157 | counter.pop('') 158 | make_wc(counter, pic_save_path + "wc_2.jpg") 159 | # 公司优势 160 | advantage_list = [] 161 | for advantage_field in data['公司优势']: 162 | for field in advantage_field.strip().replace(" ", ",").replace("、", ",").replace(",", ",").replace("+", ",") \ 163 | .split(','): 164 | industry_field_list.append(field) 165 | counter = dict(Counter(industry_field_list)) 166 | counter.pop('') 167 | counter.pop('移动互联网') 168 | make_wc(counter, pic_save_path + "wc_3.jpg") 169 | 170 | # 2.分析招聘需求 171 | # 工作年限要求 172 | # 横向条形图 173 | plt.figure(4) 174 | data['工作年限'].value_counts().plot(kind='barh', rot=0) 175 | plt.title("工作经验直方图") 176 | plt.xlabel("年限/年") 177 | plt.ylabel("公司/个") 178 | plt.savefig(pic_save_path + 'result_4.jpg') 179 | plt.close(4) 180 | # 饼图 181 | plt.figure(5) 182 | data['工作年限'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=np.linspace(0, 0.75, 6)) 183 | plt.title("工作经验饼图") 184 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 185 | bottom=0.17, top=0.84) 186 | plt.savefig(pic_save_path + 'result_5.jpg') 187 | plt.close(5) 188 | # 学历要求 189 | plt.figure(6) 190 | data['学历'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=(0, 0.1, 0.2)) 191 | plt.title("学历饼图") 192 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 193 | bottom=0.17, top=0.84) 194 | plt.savefig(pic_save_path + 'result_6.jpg') 195 | plt.close(6) 196 | 197 | # 薪资(先去掉后部分的最大工资,过滤掉kK以上词汇,获取索引按照整数生序排列) 198 | plt.figure(7) 199 | salary = data['薪资'].str.split('-').str.get(0).str.replace('k|K|以上', "").value_counts() 200 | salary_index = list(salary.index) 201 | salary_index.sort(key=lambda x: int(x)) 202 | final_salary = salary.reindex(salary_index) 203 | plt.title("薪资条形图") 204 | final_salary.plot(kind='bar', rot=0) 205 | plt.xlabel("薪资/K") 206 | plt.ylabel("公司/个") 207 | plt.savefig(pic_save_path + 'result_7.jpg') 208 | plt.close(7) 209 | 210 | # 技能标签 211 | skill_list = [] 212 | for skills in data['技能标签']: 213 | for skill in skills.strip().replace("[", "").replace("]", "").replace("'", "").split(','): 214 | skill_list.append(skill) 215 | counter = dict(Counter(skill_list)) 216 | counter.pop('') 217 | counter.pop('Android') 218 | make_wc(counter, pic_save_path + "wc_4.jpg") 219 | 220 | 221 | # 处理数据 222 | if __name__ == '__main__': 223 | t.is_dir_existed(pic_save_path) 224 | if not t.is_dir_existed(result_save_file, mkdir=False): 225 | fetch_data(1) 226 | for cur_page in range(2, max_page 
+ 1): 227 | fetch_data(cur_page) 228 | else: 229 | raw_data = pd.read_csv(result_save_file) 230 | data_analysis(raw_data) 231 | # 筛选电子商务公司 232 | dzsw_result = raw_data.loc[raw_data["行业领域"].str.find("电子商务") != -1, ["行业领域", "公司全名"]] 233 | dzsw_result.to_csv(c.outputs_logs_path + "dzsw.csv", header=False, index=False, mode='a+') 234 | # 筛选所在区域为龙华新区的公司 235 | p_num_result = raw_data.loc[raw_data["所在区域"] == "龙华新区", ["所在区域", "公司全名"]] 236 | p_num_result.to_csv(c.outputs_logs_path + "lhxq.csv", header=False, index=False, mode='a+') 237 | -------------------------------------------------------------------------------- /9、数据分析案例:Python岗位行情/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/9、数据分析案例:Python岗位行情/勘误.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 《Python网络爬虫从入门到实践》 Errata and Companion Source Code 2 | 3 | 4 | --- 5 | 6 | ![][1] 7 | 8 | It seems the publisher forgot to include the companion source code in the book... 9 | 10 | And since my computer was reinstalled, I could only find a fairly old backup... 11 | 12 | Please make do with it... 13 | 14 | If you spot problems in the book (typos, code errors, printing errors, etc.), you are welcome to **open an issue**; much appreciated. 15 | 16 | Also, the book was originally made up of two parts: **Python basics** and **Python web crawlers**. This volume is the crawler part; 17 | the basics part was not published for various practical reasons, so I have released that content on my WeChat official account, **for free**!!! 18 | Interested readers can find it there, thanks~ 19 | 20 | ![][2] 21 | 22 | Finally, for anything else (joining the reader group, business cooperation, etc.), you can leave a message on the official account or add my personal WeChat~ 23 | 24 | ![][3] 25 | 26 | 27 | [1]: http://static.zybuluo.com/coder-pig/ionx6je52iwlhxbgba3t1x51/12121.png 28 | [2]: http://static.zybuluo.com/coder-pig/1jpu7nalyfp3kvaxfm4q0h8y/20190524181102821.jpg 29 | [3]: http://static.zybuluo.com/coder-pig/whqf2oblwvzqempi2eec32xy/1111.png --------------------------------------------------------------------------------
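A closing note on the salary handling in 9_2.py above: the final_salary block keeps only the lower bound of each salary range, strips the k/K/以上 suffix, and re-sorts the counted values numerically. The snippet below is a small standalone sketch of that same transformation with made-up sample values; it passes regex=True explicitly, which newer pandas releases require for pattern replacement (the original code relies on the older default).

import pandas as pd

# Hypothetical sample of the raw '薪资' (salary) column scraped from Lagou
salary_raw = pd.Series(['10k-20k', '15K-25K', '8k以上', '10k-18k'])

# Keep the lower bound of each range, drop the k/K/以上 suffix, then count occurrences
salary = salary_raw.str.split('-').str.get(0).str.replace('k|K|以上', '', regex=True).value_counts()

# Sort the index numerically rather than lexicographically ('8' would otherwise sort after '10')
salary = salary.reindex(sorted(salary.index, key=int))
print(salary)  # 8 -> 1, 10 -> 2, 15 -> 1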