├── .gitignore ├── 10、数据分析案例:某婚恋网站交友情况分析 ├── 代码 │ ├── 10_1.py │ ├── render.html │ ├── wzly.csv │ └── wzly.ipynb └── 勘误.md ├── 1、Python爬虫概念与Web基础 └── 勘误.md ├── 2、Python爬虫基本库的使用 ├── 代码 │ ├── 2_1.py │ ├── 2_10.py │ ├── 2_11.py │ ├── 2_12.py │ ├── 2_13.py │ ├── 2_14.py │ ├── 2_2.py │ ├── 2_3.py │ ├── 2_4.py │ ├── 2_5.py │ ├── 2_6.py │ ├── 2_7.py │ ├── 2_8.py │ ├── 2_9.py │ ├── cookie.txt │ └── proxy_ips.txt └── 勘误.md ├── 3、Python爬虫抓包与数据解析 ├── 代码 │ ├── 3_1.py │ ├── 3_2.py │ ├── 3_3.py │ └── 3_4.py └── 勘误.md ├── 4、用CSV 和 Excel 存储数据 ├── 代码 │ ├── 4_1.py │ ├── 4_10.py │ ├── 4_2.py │ ├── 4_3.py │ ├── 4_4.py │ ├── 4_5.py │ ├── 4_6.py │ ├── 4_7.py │ ├── 4_8.py │ └── 4_9.py └── 勘误.md ├── 5、用数据库存储数据 ├── 代码 │ ├── 5_1.py │ ├── 5_10.py │ ├── 5_2.py │ ├── 5_3.py │ ├── 5_4.py │ ├── 5_5.py │ ├── 5_6.py │ ├── 5_7.py │ ├── 5_8.py │ └── 5_9.py └── 勘误.md ├── 6、Python应对反爬虫策略 ├── 代码 │ ├── 6_1.py │ ├── 6_2.py │ ├── 6_3.py │ ├── 6_4.py │ ├── 6_5.py │ └── 6_6.py └── 勘误.md ├── 7、Python爬虫框架Scrapy(上) ├── 代码 │ ├── FirstSpider │ │ ├── FirstSpider │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── items.cpython-37.pyc │ │ │ │ ├── middlewares.cpython-37.pyc │ │ │ │ ├── pipelines.cpython-37.pyc │ │ │ │ └── settings.cpython-37.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── proxy_ip.txt │ │ │ ├── run.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ └── bcy.cpython-37.pyc │ │ │ │ └── bcy.py │ │ └── scrapy.cfg │ └── bing │ │ ├── Dockerfile │ │ ├── bing.json │ │ ├── bing │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── items.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── BingWallpaper.py │ │ │ ├── Test.py │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ ├── BingWallpaper.cpython-37.pyc │ │ │ ├── Test.cpython-37.pyc │ │ │ └── __init__.cpython-37.pyc │ │ ├── logs │ │ └── BingWallpaper │ │ │ ├── 2018-10-15T104228.709049.log │ │ │ ├── 2018-10-15T104303.655633.log │ │ │ ├── 2018-10-15T104348.228406.log │ │ │ ├── 2018-10-15T104841.872511.log │ │ │ ├── 2018-10-15T104922.591600.log │ │ │ ├── 2018-10-15T105002.320386.log │ │ │ ├── 2018-10-15T105902.809743.log │ │ │ ├── 2018-10-15T113038.987323.log │ │ │ └── 2018-10-15T120654.496911.log │ │ ├── out │ │ └── res │ │ │ └── pic │ │ │ └── full │ │ │ ├── 033317f07b809f0cd06487b30b29eccb26d063b8.jpg │ │ │ ├── 0698af79b195349b838bdfeebbd11409f82f0f38.jpg │ │ │ ├── 092235104f84cb2f4de8808c10f655298313f65c.jpg │ │ │ ├── 2efd29b32c481136507115a3ee2e6181c122aa0b.jpg │ │ │ ├── 3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg │ │ │ ├── 4099096a19a0eaad0aef6782a206881d948ad775.jpg │ │ │ ├── 486c568e353051efd0959cc4a424ff9093cfceb9.jpg │ │ │ ├── 5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg │ │ │ ├── 599f27e7835da59b495c44297cce0553ee4a0b51.jpg │ │ │ ├── 86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg │ │ │ ├── 885648740905a26703e18c1ae24f23c480ecc822.jpg │ │ │ ├── 97e86cde9a308e626f537c107303537ec598903c.jpg │ │ │ ├── b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg │ │ │ ├── bca701f1923e317aa8a9be18125c2a894fc80780.jpg │ │ │ ├── bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg │ │ │ ├── c14461fb44425865b9afe6695ab5926e2001411c.jpg │ │ │ ├── cbba4b16b644659920ad93e10a6d3478270ce927.jpg │ │ │ ├── e254600d400f3c54c77171e02b021d46369788ae.jpg │ │ │ ├── e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg │ │ │ └── 
ed989d9c858c5290ca559cf2c462cace68e49362.jpg │ │ ├── requirements.txt │ │ ├── run.py │ │ └── scrapy.cfg └── 勘误.md ├── 8、Python爬虫框架Scrapy(下) ├── 代码 │ ├── jianshuspider │ │ ├── jianshuspider │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── items.cpython-37.pyc │ │ │ │ ├── middlewares.cpython-37.pyc │ │ │ │ ├── pipelines.cpython-37.pyc │ │ │ │ └── settings.cpython-37.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ └── jianshu.cpython-37.pyc │ │ │ │ └── jianshu.py │ │ ├── requirements.txt │ │ └── scrapy.cfg │ └── proxy_ips │ │ ├── proxy_ip_check.py │ │ ├── proxy_ips │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── settings.cpython-37.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── proxy_spider.cpython-37.pyc │ │ │ └── proxy_spider.py │ │ ├── proxy_server.py │ │ ├── run.py │ │ └── scrapy.cfg └── 勘误.md ├── 9、数据分析案例:Python岗位行情 ├── 代码 │ ├── 9_1.py │ └── 9_2.py └── 勘误.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ -------------------------------------------------------------------------------- /10、数据分析案例:某婚恋网站交友情况分析/代码/10_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | 抓取我主良缘妹子交友信息做数据分析 3 | """ 4 | 5 | import requests as rq 6 | import pandas as pd 7 | import time 8 | import random 9 | import os 10 | 11 | # 结果写入文件 12 | result_save_file = 'wzly.csv' 13 | 14 | # Ajax加载url 15 | ajax_url = "http://www.lovewzly.com/api/user/pc/list/search?" 
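# Hedged usage sketch (not from the original script; parameter meanings are inferred,
# not from any documented API): fetch_data() below sends form_data as GET query
# parameters, so a single page is equivalent to requesting
#   http://www.lovewzly.com/api/user/pc/list/search?gender=2&marry=1&page=1
#   resp = rq.get(ajax_url, params={'gender': '2', 'marry': '1', 'page': '1'})
#   profiles = resp.json()['data']['list']   # list of profile dicts for that page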
16 | 17 | # 模拟请求头 18 | ajax_headers = { 19 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 20 | 'Accept-Encoding': 'gzip, deflate, br', 21 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 22 | 'Connection': 'keep-alive', 23 | 'Host': 'www.lovewzly.com', 24 | 'Referer': 'http://www.lovewzly.com/jiaoyou.html', 25 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 ' 26 | 'Safari/537.36', 27 | 'X-Requested-With': 'XMLHttpRequest', 28 | } 29 | 30 | # post请求参数 31 | form_data = {'gender': '2', 'marry': '1', 'page': '1'} 32 | 33 | # csv表头 34 | csv_headers = [ 35 | '昵称', '用户id', '头像', '身高', '学历', '省份', 36 | '城市', '出生年份', '性别', '交友宣言' 37 | ] 38 | 39 | height_interval = ['140', '150', '160', '170', '180'] # 身高范围 40 | edu_interval = ['本科', '大专', '高中', '中专', '初中', '硕士', '博士', '院士'] # 学历范围 41 | age_interval = [ 42 | ('18-30', 8000), ('26-30', 8000), ('31-40', 8000), 43 | ('41-50', 8000), ('50以上', 8000), 44 | ] # 年龄范围 45 | 46 | 47 | # 获取每页交友信息 48 | def fetch_data(page): 49 | while True: 50 | try: 51 | form_data['page'] = page 52 | print("抓取第:" + str(page) + "页!") 53 | resp = rq.get(url=ajax_url, params=form_data, headers=ajax_headers) 54 | if resp.status_code == 200: 55 | data_json = resp.json()['data']['list'] 56 | if len(data_json) > 0: 57 | data_list = [] 58 | for data in data_json: 59 | data_list.append(( 60 | data['username'], data['userid'], data['avatar'], 61 | data['height'], data['education'], data['province'], 62 | data['city'], data['birthdayyear'], data['gender'], data['monolog'])) 63 | result = pd.DataFrame(data_list) 64 | if page == 1: 65 | result.to_csv(result_save_file, header=csv_headers, index=False, mode='a+', encoding='utf-8') 66 | else: 67 | result.to_csv(result_save_file, header=False, index=False, mode='a+', encoding='utf-8') 68 | return None 69 | except Exception as e: 70 | print(e) 71 | 72 | 73 | if __name__ == '__main__': 74 | if not os.path.exists(result_save_file): 75 | for i in range(1, 718): 76 | time.sleep(random.randint(2, 10)) 77 | fetch_data(i) 78 | -------------------------------------------------------------------------------- /10、数据分析案例:某婚恋网站交友情况分析/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/10、数据分析案例:某婚恋网站交友情况分析/勘误.md -------------------------------------------------------------------------------- /1、Python爬虫概念与Web基础/勘误.md: -------------------------------------------------------------------------------- 1 | ## 第1章 Python 爬虫概念与Web基础 2 | 3 | ### 1.1.7 爬虫的学习路线 4 | 5 | 学习路线图部分: 6 | 7 | 1、原文:利用urllib、**requestsy**库 8 | 9 | > 改为:利用urllib、**requests** 库 10 | 11 | 2、原文:利用文件、**CVS**、Excel 12 | 13 | > 改为:利用文件、**CSV**、Excel -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.request使用示例 3 | """ 4 | 5 | import urllib.request 6 | 7 | resp = urllib.request.urlopen("http://www.baidu.com") 8 | print("resp.geturl:", resp.geturl()) 9 | print("resp.msg:", resp.msg) 10 | print("resp.status:", resp.status) 11 | print("resp.version:", resp.version) 12 | print("resp.reason:", resp.reason) 13 | print("resp.debuglevel:", resp.debuglevel) 14 | print("resp.getheaders:", resp.getheaders()[0:2]) 15 | print(resp.read().decode('utf-8')) 16 | 17 | 18 | 19 | 
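# Hedged supplement to 2_1.py above (not part of the original file): urlopen() raises
# urllib.error.HTTPError / URLError on failure, so a more defensive variant of the same
# request can be written as:
import urllib.error
import urllib.request

try:
    resp = urllib.request.urlopen("http://www.baidu.com", timeout=10)
    print("resp.status:", resp.status)
except urllib.error.HTTPError as e:   # the server answered with a 4xx/5xx status
    print("HTTP error:", e.code, e.reason)
except urllib.error.URLError as e:    # DNS failure, refused connection, timeout, ...
    print("URL error:", e.reason)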
-------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_10.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.urlencode函数使用代码示例 3 | """ 4 | from urllib import parse 5 | 6 | params = { 7 | 'q': 'parse', 8 | 'check_keywords': 'yes', 9 | 'area': 'default' 10 | } 11 | url = 'https://docs.python.org/3/search.html?' + parse.urlencode(params) 12 | print("拼接后的URL:", url) 13 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_11.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.parse_qs和parse_qsl函数使用代码示例 3 | """ 4 | from urllib import parse 5 | 6 | params_str = 'q=parse&check_keywords=yes&area=default' 7 | 8 | print("parse_qs 反序列化结果:", parse.parse_qs(params_str)) 9 | print("parse_qsl 反序列化结果:", parse.parse_qsl(params_str)) 10 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_12.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.robotparser使用示例 3 | """ 4 | 5 | from urllib import robotparser 6 | import ssl 7 | ssl._create_default_https_context = ssl._create_unverified_context 8 | 9 | rp = robotparser.RobotFileParser() 10 | # 设置rebots.txt文件的链接 11 | rp.set_url('http://www.taobao.com/robots.txt') 12 | # 读取rebots.txt文件并进行分析 13 | rp.read() 14 | 15 | url = 'https://www.douban.com' 16 | user_agent = 'Baiduspider' 17 | op_info = rp.can_fetch(user_agent, url) 18 | print("Elsespider 代理用户访问情况:",op_info) 19 | 20 | bdp_info = rp.can_fetch(user_agent, url) 21 | print("Baiduspider 代理用户访问情况:",bdp_info) 22 | user_agent = 'Elsespider' -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_13.py: -------------------------------------------------------------------------------- 1 | """ 2 | 刷CSDN博客文章访问量的脚本 3 | """ 4 | import random 5 | import urllib.request 6 | import threading as t 7 | import os 8 | import ssl 9 | 10 | # 全局取消证书验证 11 | ssl._create_default_https_context = ssl._create_unverified_context 12 | 13 | # 代理ip文件 14 | proxy_ips_file = 'proxy_ips.txt' 15 | 16 | # 代理ip列表 17 | proxy_ips = [] 18 | 19 | # 文章地址 20 | article_url = 'https://blog.csdn.net/l1028386804/article/details/116191713' 21 | 22 | # 请求头 23 | headers = { 24 | 'Host': 'blog.csdn.net', 25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' 26 | } 27 | 28 | read_count = 0 29 | 30 | 31 | # 读取文件里的代理ip,返回一个列表 32 | def load_ips(file_path): 33 | if os.path.exists(file_path): 34 | data_list = [] 35 | with open(file_path, "r+", encoding='utf-8') as f: 36 | for ip in f: 37 | data_list.append(ip.replace("\n", "")) 38 | return data_list 39 | 40 | 41 | # 访问网页 42 | def read_article(): 43 | # 随机取出一枚代理ip 44 | proxy_ip = proxy_ips[random.randint(0, len(proxy_ips) - 1)] 45 | proxy_support = urllib.request.ProxyHandler({'http': proxy_ip}) 46 | opener = urllib.request.build_opener(proxy_support) 47 | urllib.request.install_opener(opener) 48 | try: 49 | req = urllib.request.Request(article_url, headers=headers) 50 | resp = urllib.request.urlopen(req, timeout=20) 51 | # 如果返回码是200代表访问成功 52 | if resp is not None and resp.status == 200: 53 | global read_count 54 | read_count += 1 55 | print("累计访问成功次数: %d" % read_count) 56 | return None 57 | except Exception as e: 58 | print(e) 59 | 60 | 61 | if __name__ == '__main__': 62 
| # 读取代理ip列表 63 | proxy_ips = load_ips(proxy_ips_file) 64 | read_article() 65 | if len(proxy_ips) > 0: 66 | for i in range(100): 67 | read_article() -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_14.py: -------------------------------------------------------------------------------- 1 | """ 2 | 爬取笔趣看的小说脚本示例 3 | """ 4 | 5 | import urllib 6 | import urllib.request 7 | import urllib.parse 8 | from lxml import etree 9 | from urllib import error 10 | import lxml.html 11 | import os 12 | import time 13 | 14 | # 小说站点的URL 15 | novel_base_url = 'http://www.biqukan.com' 16 | 17 | # 拉取小说的URL 18 | novel_url = urllib.parse.urljoin(novel_base_url, '/0_790/') 19 | 20 | # 每章小说的链接 21 | chapter_url_list = [] 22 | 23 | # 小说的保存文件夹 24 | novel_save_dir = os.path.join(os.getcwd(), 'novel_cache/') 25 | 26 | # 请求头 27 | headers = { 28 | 'Host': 'www.biqukan.com', 29 | 'Referer': 'http://www.biqukan.com/', 30 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' 31 | } 32 | 33 | # 获取章节链接列表 34 | def fetch_chapter_urls(): 35 | req = urllib.request.Request(url=novel_url, headers=headers) 36 | html = lxml.html.parse(urllib.request.urlopen(req)) 37 | hrefs = html.xpath('//dd/a/@href') 38 | # 过滤前面的最新章节列表和无用章节 39 | for href in hrefs[16:]: 40 | chapter_url_list.append(urllib.parse.urljoin(novel_base_url, href)) 41 | 42 | # 解析每个页面获得章节正文 43 | def parsing_chapter(url): 44 | req = urllib.request.Request(url=url, headers=headers) 45 | html = lxml.html.parse(urllib.request.urlopen(req)) 46 | title = html.xpath('//h1/text()')[0] 47 | contents = html.xpath('//*[@id="content"]/text()') 48 | content = '' 49 | for i in contents: 50 | content += i.strip() 51 | save_novel(title, content) 52 | 53 | # 把章节正文写到本地 54 | def save_novel(name, content): 55 | try: 56 | with open(novel_save_dir + name + '.txt', "w+") as f: 57 | f.write(content.strip()) 58 | except (error.HTTPError, OSError) as reason: 59 | print(str(reason)) 60 | else: 61 | print("下载完成:" + name) 62 | 63 | 64 | if __name__ == '__main__': 65 | # 判断存储的文件夹是否存在,不存在新建 66 | if not os.path.exists(novel_save_dir): 67 | os.mkdir(novel_save_dir) 68 | # 爬取小说文章链接列表 69 | fetch_chapter_urls() 70 | # 遍历抓取所有的小说内容 71 | for chapter in chapter_url_list: 72 | # 定时休眠1s防止ip被封 73 | time.sleep(1) 74 | parsing_chapter(chapter) -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib下载图片 3 | """ 4 | import urllib.request 5 | import ssl 6 | 7 | ssl._create_default_https_context = ssl._create_unverified_context 8 | 9 | # pic_url = "https://www.baidu.com/img/bd_logo1.png" 10 | # pic_resp = urllib.request.urlopen(pic_url,context=context) 11 | # pic = pic_resp.read() 12 | # with open("bg_logo.png", "wb") as f: 13 | # f.write(pic) 14 | 15 | urllib.request.urlretrieve('https://www.baidu.com/img/bd_logo1.png', 'bd_logo.png') 16 | 17 | 18 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | itchat模拟Get请求 3 | """ 4 | 5 | import urllib.request 6 | import json 7 | import ssl 8 | 9 | ssl._create_default_https_context = ssl._create_unverified_context 10 | 11 | get_url = "http://gank.io/api/data/" + urllib.request.quote("福利") + "/1/1" 12 | get_resp = urllib.request.urlopen(get_url) 
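# Illustrative note (not in the original file): urllib.request.quote() percent-encodes
# the Chinese keyword so the URL stays ASCII-safe, e.g.
#   urllib.request.quote("福利")  ->  '%E7%A6%8F%E5%88%A9'
# The response body is then decoded as UTF-8 and parsed with json.loads() below.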
13 | get_result = json.loads(get_resp.read().decode('utf-8')) 14 | # 这里后面的参数用于格式化Json输出格式 15 | get_result_format = json.dumps(get_result, indent=2, 16 | sort_keys=True, ensure_ascii=False) 17 | print(get_result_format) -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib模拟Post请求示例(伪代码,不能直接请求) 3 | """ 4 | import urllib.request 5 | import urllib.parse 6 | import json 7 | 8 | post_url = "http://xxx.xxx.login" 9 | phone = "13555555555" 10 | password = "111111" 11 | values = { 12 | 'phone': phone, 13 | 'password': password 14 | } 15 | data = urllib.parse.urlencode(values).encode(encoding='utf-8') 16 | req = urllib.request.Request(post_url, data) 17 | resp = urllib.request.urlopen(req) 18 | result = json.loads(resp.read()) # Byte结果转Json 19 | print(json.dumps(result, sort_keys=True, 20 | indent=2, ensure_ascii=False)) # 格式化输出Json 21 | 22 | 23 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_5.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib修改请求头代码示例 3 | """ 4 | import urllib.request 5 | 6 | # 修改头信息 7 | novel_url = "http://www.biqukxs.com/book/1.html" 8 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' 9 | 'AppleWebKit/537.36 (KHTML, like Gecko)' 10 | ' Chrome/63.0.3239.84 Safari/537.36', 11 | 'Host': 'www.biqukxs.com', 12 | 'Referer': 'http://www.biqukxs.com/', 13 | 'Connection': 'keep-alive'} 14 | novel_req = urllib.request.Request(novel_url, headers=headers) 15 | novel_resp = urllib.request.urlopen(novel_req) 16 | print(novel_resp.read().decode('gbk')) 17 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib配置代理示例 3 | """ 4 | 5 | import urllib.request 6 | 7 | # 使用ip代理 8 | ip_query_url = "http://ip.chinaz.com/" 9 | 10 | # 1.创建代理处理器,ProxyHandler参数是一个字典{类型:代理ip:端口},下述代理IP失效的话替换成可用的代理ip即可 11 | proxy_support = urllib.request.ProxyHandler({'http': '60.187.118.246:9000'}) 12 | 13 | # 2.定制,创建一个opener 14 | opener = urllib.request.build_opener(proxy_support) 15 | 16 | # 3.安装opener 17 | urllib.request.install_opener(opener) 18 | 19 | # 请求头 20 | headers = { 21 | 'User-Agent': 'User-Agent:Mozilla/5.0 (X11; Linux x86_64)' 22 | ' AppleWebKit/537.36 (KHTML, like Gecko)' 23 | ' Chrome/63.0.3239.84 Safari/537.36', 24 | 'Host': 'ip.chinaz.com' 25 | } 26 | 27 | req = urllib.request.Request(ip_query_url, headers=headers) 28 | resp = urllib.request.urlopen(req, timeout=20) 29 | html = resp.read().decode('utf-8') 30 | print(html) 31 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_7.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib使用cookie代码示例 3 | """ 4 | 5 | import urllib.request 6 | from http import cookiejar 7 | 8 | # ============ 获得Cookie ============ 9 | 10 | # 1.实例化CookieJar对象 11 | 12 | 13 | cookie = cookiejar.CookieJar() 14 | 15 | # 2.创建Cookie处理器 16 | handler = urllib.request.HTTPCookieProcessor(cookie) 17 | 18 | # 3.通过CookieHandler创建opener 19 | opener = urllib.request.build_opener(handler) 20 | 21 | # 4.打开网页 22 | resp = opener.open("http://www.baidu.com") 23 | 24 | for i in cookie: 25 | print("Name = %s" % i.name) 26 | print("Name = %s" % i.value) 27 | 28 | # 
============ 保存Cookie到文件 ============ 29 | # 1.用于保存cookie的文件 30 | cookie_file = "cookie.txt" 31 | 32 | # 2.创建MozillaCookieJar对象保存Cookie 33 | cookie = cookiejar.MozillaCookieJar(cookie_file) 34 | 35 | # 3.创建Cookie处理器 36 | handler = urllib.request.HTTPCookieProcessor(cookie) 37 | 38 | # 4.通过CookieHandler创建opener 39 | opener = urllib.request.build_opener(handler) 40 | 41 | # 5.打开网页 42 | resp = opener.open("http://www.baidu.com") 43 | 44 | # 6.保存Cookie到文件中,参数依次是: 45 | # ignore_discard:即使cookies将被丢弃也将它保存下来 46 | # ignore_expires:如果在该文件中cookies已存在,覆盖原文件写入 47 | cookie.save(ignore_discard=True, ignore_expires=True) 48 | 49 | # ============ 读取Cookie文件 ============ 50 | 51 | cookie_file = "cookie.txt" 52 | 53 | # 1.创建MozillaCookieJar对象保存Cookie 54 | cookie = cookiejar.MozillaCookieJar(cookie_file) 55 | 56 | # 2.从文件中读取cookie内容 57 | cookie.load(cookie_file, ignore_expires=True, ignore_discard=True) 58 | 59 | handler = urllib.request.HTTPCookieProcessor(cookie) 60 | opener = urllib.request.build_opener(handler) 61 | resp = opener.open("http://www.baidu.com") 62 | print(resp.read().decode('utf-8')) 63 | 64 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_8.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.urlparse和urlsplit函数使用示例 3 | """ 4 | import urllib.parse 5 | 6 | urp = urllib.parse.urlparse('https://docs.python.org/3/search.html?q=parse&check_keywords=yes&area=default') 7 | print('urlparse执行结果:', urp) 8 | # 可以通过.的方式获取某个部分 9 | print('urp.scheme:', urp.scheme) 10 | print('urp.netloc:', urp.netloc) 11 | 12 | urp = urllib.parse.urlsplit('https://docs.python.org/3/search.html?q=parse&check_keywords=yes&area=default') 13 | print('urlsplit执行结果:', urp) 14 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/2_9.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib.parse.urlunparse,urlunsplit和urljoin函数使用示例 3 | """ 4 | import urllib.parse 5 | 6 | url = urllib.parse.urlunparse(['https','docs.python.org', '/3/search.html', 'q=parse&check_keywords=yes&area=default' , '', '']) 7 | print('urlunparse函数拼接的URL:',url) 8 | 9 | url = urllib.parse.urlunsplit(['https','docs.python.org', '/3/search.html', 'q=parse&check_keywords=yes&area=default','']) 10 | print('urlunsplit函数拼接的URL:',url) 11 | 12 | url = urllib.parse.urljoin('https://docs.python.org','/3/search.html') 13 | url = urllib.parse.urljoin(url,'?q=parse&check_keywords=yes&area=default') 14 | print('urljoin函数拼接的URL:',url) -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/cookie.txt: -------------------------------------------------------------------------------- 1 | # Netscape HTTP Cookie File 2 | # http://curl.haxx.se/rfc/cookie_spec.html 3 | # This is a generated file! Do not edit. 
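# Field order per entry: domain, include-subdomains flag (TRUE/FALSE), path,
# secure flag (TRUE/FALSE), expiry timestamp, cookie name, cookie value.
# Entries with an empty expiry are session cookies kept because of ignore_discard=True.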
4 | 5 | .baidu.com TRUE / FALSE 3681539028 BAIDUID F16617940595A8E3EF9BB50E63AC0954:FG=1 6 | .baidu.com TRUE / FALSE 3681539028 BIDUPSID F16617940595A8E3EF9BB50E63AC0954 7 | .baidu.com TRUE / FALSE H_PS_PSSID 1442_21106_22074 8 | .baidu.com TRUE / FALSE 3681539028 PSTM 1534055381 9 | www.baidu.com FALSE / FALSE BDSVRTM 0 10 | www.baidu.com FALSE / FALSE BD_HOME 0 11 | www.baidu.com FALSE / FALSE 2480135321 delPer 0 12 | -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/代码/proxy_ips.txt: -------------------------------------------------------------------------------- 1 | 183.129.244.16:10080 2 | 219.141.153.39:80 3 | 119.180.140.9:8060 4 | 111.3.154.196:8060 5 | 123.117.250.127:8060 6 | 222.182.56.120:8118 7 | 123.114.200.43:8118 8 | 117.28.96.103:808 9 | 120.92.174.37:1080 10 | 39.137.69.10:80 11 | 106.56.102.78:8070 12 | 218.88.177.155:8908 13 | 221.2.175.238:8060 14 | 120.198.224.5:8000 15 | 119.180.131.39:8060 16 | 112.67.34.99:8118 17 | 123.114.200.72:8118 18 | 39.137.69.6:8080 19 | 163.125.235.73:8118 20 | 219.141.153.11:80 21 | 180.119.65.150:1133 22 | 221.14.140.66:80 23 | 119.180.142.175:8060 24 | 113.78.255.243:8118 25 | 119.180.172.222:8060 26 | 39.137.77.66:8080 27 | 61.171.0.40:9999 28 | 221.2.155.35:8060 29 | 118.190.94.254:9001 30 | 219.141.153.43:80 31 | 112.24.107.109:8908 32 | 222.186.45.139:65309 33 | 219.141.153.5:80 34 | 219.141.153.35:80 35 | 221.14.140.130:80 36 | 101.96.11.5:80 37 | 119.179.131.245:8060 38 | 121.14.159.150:9001 39 | 114.250.25.19:80 40 | 120.198.224.6:8088 41 | 223.96.95.229:3128 42 | 121.17.18.219:8060 43 | 117.190.90.20:8060 44 | 219.141.153.6:80 45 | 113.239.240.152:80 46 | 101.96.10.5:80 47 | 219.141.153.10:80 48 | 117.44.247.37:8908 49 | 115.213.103.150:8010 50 | 113.3.210.60:80 51 | 106.56.102.252:8070 52 | 183.246.84.229:8060 53 | 118.190.95.35:9001 54 | 219.141.153.41:80 55 | 58.247.46.123:8088 56 | 112.24.107.102:8908 57 | 223.93.145.186:8060 58 | 218.244.44.194:8060 59 | 120.198.224.7:8080 60 | 117.28.97.169:808 61 | 222.88.147.104:8060 62 | 218.88.177.149:8908 63 | 39.137.69.8:8080 64 | 119.179.147.68:8060 65 | 113.105.202.51:1133 66 | 219.141.153.12:8080 67 | 114.95.61.165:8118 68 | 222.186.34.212:65309 69 | 113.128.198.50:8060 70 | 219.141.153.2:8080 71 | 219.141.153.34:80 72 | 222.175.200.58:8060 73 | 117.131.235.198:8060 74 | 219.141.153.44:80 75 | 60.14.125.246:8908 76 | 119.180.137.134:8060 77 | 39.137.77.67:80 78 | 120.131.9.254:1080 79 | 106.56.102.17:8070 80 | 119.180.168.33:8060 81 | 221.2.174.99:8060 82 | 118.190.200.139:8080 83 | 222.88.149.32:8060 84 | 118.190.145.138:9001 85 | 221.2.174.6:8060 86 | 219.141.153.38:80 87 | 119.180.140.140:8060 88 | 123.158.175.102:1080 89 | 219.141.153.7:80 90 | 117.44.247.53:8908 91 | 124.128.76.142:8060 92 | 112.80.93.76:8118 93 | 119.180.131.16:8060 94 | 39.135.24.11:8080 95 | 222.222.236.207:8060 96 | 218.88.177.161:8908 97 | 119.179.132.101:8060 98 | 39.137.69.7:80 99 | 119.180.171.89:8060 100 | 118.190.95.43:9001 -------------------------------------------------------------------------------- /2、Python爬虫基本库的使用/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | - 2_4.py → 新增注释:伪代码,不能直接请求,只是用于演示用法; 4 | - 2_5.py → 将百度地址替换为小说地址,请求头内容替换; 5 | - 2_6.py → 新增注释:请求失败时,将时效代理IP替换为可用代理IP; 6 | - 2_12.py → 新增全局取消https证书验证; 7 | - 2_13.py → 替换失效博客地址; 8 | 9 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_1.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | requests抓取微信公众号文章的图片,音视频 3 | """ 4 | import requests 5 | from lxml import etree 6 | import time 7 | import os 8 | 9 | # 资源的保存文件夹 10 | save_dir = os.path.join(os.getcwd(), 'tmp') 11 | 12 | # 测试文章的URL 13 | test_url = 'https://mp.weixin.qq.com/s/4oLnJvfGCZneoErkrh0sHw' 14 | 15 | # 语音获取的基URL 16 | music_res_url = 'http://res.wx.qq.com/voice/getvoice' 17 | 18 | # 视频获取的接口URL 19 | video_parse_url = 'http://v.ranks.xin/video-parse.php' 20 | 21 | # 微信公众号文章请求头 22 | headers = { 23 | 'Host': 'mp.weixin.qq.com', 24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 ' 25 | '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' 26 | } 27 | 28 | # 视频获取接口的请求头 29 | video_parse_headers = { 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)', 31 | 'Host': 'v.ranks.xin', 32 | 'Referer': 'http://v.ranks.xin/', 33 | 'X-Requested-With': 'XMLHttpRequest' 34 | } 35 | 36 | 37 | # 获取标题 38 | def get_title(content): 39 | return content.xpath("//h2[@class='rich_media_title']/text()")[0].strip() 40 | 41 | 42 | # 解析下载图片 43 | def get_pic(content, path): 44 | img_list = content.xpath("//img/@data-src") 45 | for img in img_list: 46 | download_pic(img, path) 47 | 48 | 49 | # 解析获得音频 50 | def get_sound(content, path): 51 | sound_list = content.xpath("//mpvoice/@voice_encode_fileid") 52 | for sound in sound_list: 53 | download_sound(sound, path) 54 | 55 | 56 | # 解析获得视频 57 | def get_video(content, path): 58 | video_list = content.xpath("//iframe/@data-src") 59 | for video in video_list: 60 | download_video(video, path) 61 | 62 | 63 | # 下载图片的方法 64 | def download_pic(url, path): 65 | print("下载图片:" + url) 66 | try: 67 | pic_name = str(int(time.time())) # 使用当前时间戳作为图片名字 68 | fmt = url.split('=')[-1] # 图片格式 69 | img_resp = requests.get(url).content 70 | with open(path + pic_name + "." 
+ fmt, "wb+") as f: 71 | f.write(img_resp) 72 | except Exception as reason: 73 | print(str(reason)) 74 | 75 | 76 | # 下载音频的方法 77 | def download_sound(file_id, path): 78 | try: 79 | sound_resp = requests.get(music_res_url, params={'mediaid': file_id, 'voice_type': '1'}) 80 | if sound_resp is not None: 81 | music_name = str(int(time.time())) + '.mp3' # 使用当前时间戳作为音频名字 82 | print("开始下载音频: " + sound_resp.url) 83 | with open(path + music_name, "wb+") as f: 84 | f.write(sound_resp.content) 85 | print("音频下载完成:" + music_name) 86 | except Exception as reason: 87 | print(str(reason)) 88 | 89 | 90 | # 下载视频的方法 91 | def download_video(url, path): 92 | print("开始解析视频链接:" + url) 93 | video_resp = requests.get(video_parse_url, headers=video_parse_headers, params={'url': url}) 94 | if video_resp is not None: 95 | video_url = video_resp.json()['data'][0]['url'] 96 | print("解析完成,开始下载视频:" + video_url) 97 | try: 98 | video_name = str(int(time.time())) + '.mp4' # 使用当前时间戳作为视频名字 99 | video_resp = requests.get(video_url).content 100 | if video_resp is not None: 101 | with open(path + video_name, "wb+") as f: 102 | f.write(video_resp) 103 | print("视频下载完成:" + video_name) 104 | except Exception as reason: 105 | print(str(reason)) 106 | 107 | 108 | if __name__ == '__main__': 109 | while True: 110 | print("请输入你要抓取的微信文章链接:(输出Q回车或者按Ctrl+C可以退出~)") 111 | input_url = input() 112 | if input_url == 'Q': 113 | exit() 114 | else: 115 | resp = requests.get(url=input_url.strip(), headers=headers).text 116 | html = etree.HTML(resp) 117 | title = get_title(html) 118 | res_save_dir = os.path.join(save_dir, title) 119 | if not os.path.exists(res_save_dir): 120 | os.makedirs(res_save_dir) 121 | get_pic(html,res_save_dir) 122 | get_sound(html,res_save_dir) 123 | get_video(html,res_save_dir) 124 | print("所有资源下载完成!") -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Beautiful Soup使用示例,抓取壁纸网站的壁纸 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import os 7 | import time 8 | 9 | base_url = "http://www.win4000.com" 10 | theme_base_url = "http://www.win4000.com/zt/xiaoqingxin_" 11 | 12 | # 利用列表表达式生成每页链接列表 13 | theme_url_list = [theme_base_url + str(x) + '.html' for x in range(1, 6)] 14 | 15 | # 套图链接列表 16 | series_url_lists = [] 17 | 18 | # 保存文件名 19 | save_root_dir = os.path.join(os.getcwd(), 'tmp/') 20 | 21 | 22 | # 获取所有套图的链接列表 23 | def get_series_url_lists(url): 24 | resp = r.get(url) 25 | if resp is not None: 26 | result = resp.text 27 | bs = BeautifulSoup(result, 'html.parser') 28 | ul = bs.find('div', attrs={'class': 'tab_tj'}) 29 | a_s = ul.find_all('a') 30 | for a in a_s: 31 | series_url_lists.append(a.get('href')) 32 | 33 | 34 | # 获取某个套图里的所有图片 35 | def fetch_all_series_pic(url): 36 | cur_page = 1 37 | while True: 38 | current_url = url 39 | if cur_page > 1: 40 | current_url = url.replace('.html', '_' + str(cur_page) + '.html') 41 | resp = r.get(current_url) 42 | if resp.status_code == 404: 43 | break 44 | else: 45 | if resp is not None: 46 | result = resp.text 47 | bs = BeautifulSoup(result, 'lxml') 48 | # 使用lxml来获取标题,用作文件夹名 49 | title_name = bs.find('div', attrs={'class': 'ptitle'}).h1.text 50 | save_dir = os.path.join(save_root_dir, title_name) 51 | if not os.path.exists(save_dir): 52 | os.makedirs(save_dir) 53 | # 使用CSS选择器选择图片结点 54 | imgs = bs.select('img.pic-large') 55 | for img in imgs: 56 | download_pic(img.attrs.get('src'), save_dir) 57 | cur_page += 1 58 | 59 
| 60 | # 下载图片的方法 61 | def download_pic(url, path): 62 | print("下载图片:" + url) 63 | try: 64 | pic_name = url.split('/')[-1] 65 | img_resp = r.get(url).content 66 | with open(path + '/' +pic_name, "wb+") as f: 67 | f.write(img_resp) 68 | except Exception as reason: 69 | print(str(reason)) 70 | 71 | 72 | if __name__ == '__main__': 73 | for url in theme_url_list: 74 | get_series_url_lists(url) 75 | for url in series_url_lists: 76 | fetch_all_series_pic(url) 77 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | 正则使用示例 3 | """ 4 | 5 | import re 6 | 7 | ret = re.match(r'^(\d{4})-(\d{3,8})$', '0756-3890993') 8 | print(ret.group()) 9 | print(ret.group(0)) 10 | print(ret.group(1)) 11 | print(ret.group(2)) 12 | 13 | str_count = "您的网站被访问了10000次" 14 | match = re.match(r"^您的网站被访问了(\d{1,6})次$", str_count) 15 | print(match.group(1)) 16 | 17 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/代码/3_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | 正则表达式实战示例:采集所有城市编码 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import re 7 | import os 8 | 9 | base_url = 'http://www.weather.com.cn' 10 | city_referer_url = 'http://www.weather.com.cn/textFC/hb.shtml' 11 | 12 | # 获取城市编码的正则 13 | code_regex = re.compile('^.*?weather/(.*?).shtml$', re.S) 14 | # 城市编码的保存文件 15 | save_file_name = os.path.join(os.getcwd(), 'city_codes.txt') 16 | # 城市编码列表 17 | city_code_list = [] 18 | 19 | 20 | # 获取所有的城市列表 21 | def fetch_city_url_list(): 22 | city_url_list = [] 23 | resp = r.get(city_referer_url) 24 | resp.encoding = 'utf-8' 25 | bs = BeautifulSoup(resp.text, 'lxml') 26 | content = bs.find('div', attrs={'class': 'lqcontentBoxheader'}) 27 | if content is not None: 28 | a_s = content.find_all('a') 29 | if a_s is not None: 30 | for a in a_s: 31 | city_url_list.append(base_url + a.get('href')) 32 | return city_url_list 33 | 34 | 35 | # 获取城市天气跳转链接列表 36 | def fetch_city_weather_url_list(url): 37 | resp = r.get(url) 38 | resp.encoding = 'utf-8' 39 | bs = BeautifulSoup(resp.text, 'lxml') 40 | a_s = bs.select('div.conMidtab a') 41 | for a in a_s: 42 | if a.get("href") is not None and a.text != '详情' and a.text != '返回顶部': 43 | # 提取城市编码 44 | result = code_regex.match(a.get("href")) 45 | if result is not None: 46 | city_code_list.append(a.text + ":" + result.group(1)) 47 | 48 | 49 | # 把列表写入到文件中的方法 50 | def write_list_to_file(data): 51 | try: 52 | with open(save_file_name, "w+", encoding='utf-8') as f: 53 | for content in data: 54 | f.write(content + "\n") 55 | except OSError as reason: 56 | print(str(reason)) 57 | 58 | 59 | if __name__ == '__main__': 60 | city_list = fetch_city_url_list() 61 | for city in city_list: 62 | print("解析:", city) 63 | fetch_city_weather_url_list(city) 64 | write_list_to_file(city_code_list) 65 | -------------------------------------------------------------------------------- /3、Python爬虫抓包与数据解析/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/3、Python爬虫抓包与数据解析/勘误.md -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库使用代码示例 3 | """ 4 | 5 | import csv 6 | 
import os 7 | 8 | save_file_name_1 = os.path.join(os.getcwd(), '1.csv') 9 | save_file_name_2 = os.path.join(os.getcwd(), '2.csv') 10 | save_file_name_3 = os.path.join(os.getcwd(), '3.csv') 11 | 12 | data_1 = [['id', '姓名', '性别', '年龄', '工作'], 13 | [1, '小明', '男', '18', '学生'], 14 | [2, '小红', '女', '24', '老师'], 15 | [3, '小光', '男', '25', 'Python工程师']] 16 | 17 | headers = ['id', '姓名', '性别', '年龄', '工作'] 18 | data_2 = [{'id': 1, '姓名': '小明', '性别': '男', '年龄': '18', '工作': '学生'}, 19 | {'id': 2, '姓名': '小红', '性别': '女', '年龄': '24', '工作': '老师'}, 20 | {'id': 3, '姓名': '小光', '性别': '男', '年龄': '25', '工作': 'Python工程师'}] 21 | 22 | # 单行写入示例 23 | with open(save_file_name_1, 'w', newline='') as f: 24 | writer = csv.writer(f) 25 | for row in data_1: 26 | writer.writerow(row) 27 | 28 | # 多行写入 29 | with open(save_file_name_2, 'w', newline='') as f: 30 | writer = csv.writer(f) 31 | writer.writerows(data_1) 32 | 33 | # 字典写入 34 | with open(save_file_name_3, 'w', newline='') as f: 35 | # 标头在这里传入,作为第一行数据 36 | writer = csv.DictWriter(f, headers) 37 | writer.writeheader() 38 | for row in data_2: 39 | writer.writerow(row) 40 | 41 | 42 | 43 | if __name__ == '__main__': 44 | with open(save_file_name_1) as f: 45 | reader = csv.DictReader(f) 46 | for row in reader: 47 | print(row['姓名']) 48 | # reader = csv.reader(f) 49 | # print(list(reader)[0][1]) 50 | # for row in reader: 51 | # print(reader.line_num, row) 52 | 53 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_10.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库实战示例:爬取一号店关键词搜索结果保存到MongoDB中 3 | """ 4 | import pymongo 5 | import requests as r 6 | from lxml import etree 7 | 8 | search_word = "羽毛球" 9 | search_base_url = 'https://search.yhd.com/c0-0/k' 10 | 11 | 12 | def search_goods(key): 13 | data_list = [] 14 | resp = r.get(search_base_url + key) 15 | print(resp.url) 16 | resp.encoding = 'utf-8' 17 | html = etree.HTML(resp.text) 18 | ul_list = html.xpath('//div[@id="itemSearchList"]/div') 19 | for ul in ul_list: 20 | # 商品名称 21 | title = ul.xpath('div//p[@class="proName clearfix"]/a/@title')[0] 22 | # 商品链接 23 | link = ul.xpath('div//p[@class="proName clearfix"]/a/@href')[0] 24 | # 商品价格 25 | price = ul.xpath('div//p[@class="proPrice"]/em/@yhdprice')[0] 26 | # 店铺名称 27 | store = ul.xpath('div//p[@class="storeName limit_width"]/a/@title') 28 | store_name = store[0] if len(store) > 0 else '' 29 | # 评论数 30 | comment_count = ul.xpath('div//p[@class="proPrice"]/span[@class="comment"]/a/text()')[1] 31 | # 好评率 32 | favorable_rate = ul.xpath('div//span[@class="positiveRatio"]/text()')[0] 33 | data_list.append({'title': title, 'link': 'https:' + link, 'price': price, 'store_name': store_name, 'comment_count': comment_count, 34 | 'favorable_rate': favorable_rate}) 35 | return data_list 36 | 37 | 38 | if __name__ == '__main__': 39 | conn = pymongo.MongoClient(host='localhost', port=27017) 40 | search_goods(search_word) 41 | db = conn['yhd'] 42 | collection = db['羽毛球'] 43 | search_result_list = search_goods(search_word) 44 | collection.insert_many(search_result_list) 45 | conn.close() 46 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库实战示例:爬取星座运势 3 | """ 4 | import csv 5 | import requests as r 6 | from bs4 import BeautifulSoup 7 | import re 8 | import os 9 | 10 | # 抓取站点 11 | constellation_url = 
'http://www.xzw.com/fortune/' 12 | 13 | # 提取信息的正则 14 | fetch_regex = re.compile(r'^.*?(.*?)(.*?).*?width:(\d*)%.*?p>(.*)\[ 15') 83 | db.close() 84 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 爬取Gank.io API接口的数据到MySQL 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import pymysql 7 | 8 | # 接口地址 9 | search_api_base_url = 'https://gank.io/api/data/' 10 | 11 | # 各种分类的表名:Android,iOS,休息视频,福利,拓展资源,前端,瞎推荐,App 12 | category_list = ["android", "ios", "video", "meizi", "other", "fed", "random", "app"] 13 | 14 | # 图片表名 15 | pic_table_name = 'pics' 16 | 17 | # 请求分类字段列表 18 | type_list = ["Android", "iOS", "休息视频", "福利", "拓展资源", "前端", "瞎推荐", "App"] 19 | 20 | # 表字段名 21 | column_list = ('_id', 'createdAt', 'dsec', 'publishedAt', 'source', 'type', 'url', 'used', 'who') 22 | 23 | # 图片表字段名 24 | pic_column_list = ('_id', 'url') 25 | 26 | 27 | # 创建数据库 28 | def create_db(): 29 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306) 30 | cursor = conn.cursor() 31 | cursor.execute("Create Database If Not Exists gank Character Set UTF8MB4") 32 | conn.close() 33 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306, db='gank') 34 | return conn 35 | 36 | 37 | # 创建数据库表 38 | def init_tables(c, table): 39 | c.execute( 40 | ("CREATE TABLE IF Not Exists {table}" 41 | "(_id CHAR(24) PRIMARY KEY," 42 | "createdAt TEXT NOT NULL," 43 | "dsec TEXT NOT NULL," 44 | "publishedAt TEXT NOT NULL," 45 | "source TEXT NOT NULL," 46 | "type TEXT NOT NULL," 47 | "url TEXT NOT NULL," 48 | "used TEXT NOT NULL," 49 | "who TEXT NOT NULL)").format(table=table)) 50 | 51 | 52 | # 创建图表 53 | def init_pic_table(c, table): 54 | c.execute( 55 | ("CREATE TABLE IF Not Exists {table} " 56 | "(id INT AUTO_INCREMENT PRIMARY KEY," 57 | "_id CHAR(24)," 58 | "url TEXT NOT NULL)").format(table=table)) 59 | 60 | 61 | # 把数据插入到数据库中 62 | def insert_data(c, table, column, data): 63 | try: 64 | keys = ', '.join(column) 65 | values = ', '.join(['%s'] * len(data)) 66 | sql = 'INSERT INTO {table} ({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values) 67 | c.execute(sql, tuple(data)) 68 | db.commit() 69 | except Exception as e: 70 | print(e) 71 | db.rollback() 72 | 73 | 74 | # 查询数据库表的方法 75 | def query_data(c, table): 76 | try: 77 | sql = 'SELECT * FROM {table}'.format(table=table) 78 | c.execute(sql) 79 | print('共有 %d 行数据' % c.rowcount) 80 | row = c.fetchone() 81 | while row: 82 | print(row) 83 | row = c.fetchone() 84 | except Exception as e: 85 | print(e) 86 | 87 | 88 | # 爬取接口数据的方法 89 | def fetch_data(c, pos): 90 | page_count = 1 91 | while True: 92 | resp = r.get(search_api_base_url + type_list[pos] + '/50/' + str(page_count)) 93 | result_json = resp.json() 94 | print("抓取:", resp.url) 95 | if len(result_json['results']) > 0: 96 | for result in result_json['results']: 97 | data_list = [result['_id'], 98 | result['createdAt'], 99 | result['desc'], 100 | result['publishedAt'], 101 | result.get('source', ''), 102 | result['type'], 103 | result['url'], 104 | 1 if result['used'] else 0, 105 | result.get('who', '') if result.get('who', '') is not None else ''] 106 | insert_data(c, category_list[pos], column_list, data_list) 107 | if 'images' in result: 108 | for image in result['images']: 109 | insert_data(c, pic_table_name, pic_column_list, [result['_id'], image]) 110 | page_count += 1 111 | else: 112 | 
break 113 | 114 | 115 | if __name__ == '__main__': 116 | db = create_db() 117 | cursor = db.cursor() 118 | # for category in category_list: 119 | # init_tables(cursor, category) 120 | # init_pic_table(cursor, pic_table_name) 121 | # for i in range(0, len(category_list)): 122 | # fetch_data(cursor, i) 123 | query_data(cursor, 'Android') 124 | cursor.close() 125 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_7.py: -------------------------------------------------------------------------------- 1 | """ 2 | redis-py库的基本操作示例 3 | """ 4 | import redis 5 | 6 | # ====================== 连接Redis ============================ 7 | 8 | # 1.普通连接 9 | r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0) 10 | 11 | # 2.连接池(一般) 12 | # redis-py使用connection pool来管理对一个redis server的所有连接,避免每次建立、 13 | # 释放连接的开销。这种方式实现多个Redis实例共享一个连接池 14 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345') 15 | r = redis.StrictRedis(connection_pool=pool) 16 | 17 | # 3.管道 18 | # redis-py,默认情况下,每次都会进行连接池的连接和断开。若是想一次执行多条命令,进行 19 | # 事务性操作,就要用管道。(虽然有这个功能,但是不建议使用,慢而且没什么必要。) 20 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379) 21 | r = redis.StrictRedis(connection_pool=pool) 22 | pipe = r.pipeline(transaction=True) 23 | # 执行多条命令 24 | pipe.execute() 25 | 26 | # ====================== 通用操作 ============================ 27 | 28 | r.delete('name') # 根据键删除redis中的任意数据类型 29 | r.exists('name') # 检测redis的键是否存在 30 | r.keys(pattern='*') # 根据* ?等通配符匹配获取redis的键 31 | r.expire('name', time=3000) # 为某个键设置超时时间 32 | r.rename('name', 'name1') # 重命名键 33 | r.move('name', 'db1') # 将redis的某个值移动到指定的db下 34 | r.randomkey() # 随机获取一个redis的键(不删除) 35 | r.type('name') # 获取键对应值的类型 36 | r.dbsize() # 获得当前数据库中键的数目 37 | r.ttl('name') # 获得键的过期时间 38 | r.flushdb() # 删除当前选择数据库中所有的键 39 | r.flushall() # 删除所有数据库中的所有键 40 | 41 | 42 | # ====================== String操作 ============================ 43 | 44 | # 设置键值对,默认不存在则创建,存在则修改 45 | # set(name, value, ex=None, px=None, nx=False, xx=False) 46 | # ex,过期时间(秒) 47 | # px,过期时间(毫秒) 48 | # nx,如果设置为True,则只有name不存在时,当前set操作才执行,同setnx(name, value) 49 | # xx,如果设置为True,则只有name存在时,当前set操作才执行 50 | 51 | r.set('name', value) #设置值 52 | r.setnx('name',value) #如果name这个键不存在,把这个键对应的值设置为value 53 | r.setex('name', value, time) #设置值,并指定此键值的有效期 54 | r.setrange(name, offset, value) #修改字符串内容,从指定字符串索引开始向后替换 55 | r.mset({"name3":'xxx', "name4":'xxx'}) #批量设置值 56 | r.msetnx({"name3":'xxx', "name4":'xxx'}) #键都不存在是才批量赋值 57 | 58 | r.get('name') # 获取值 59 | r.getset('name', 'yyy') # 为键为name的值赋值为yyy,并返回上次的值xxx 60 | r.mget(['name1','name2']) # 返回多个键对应的值 61 | r.getrange(key, start, end) # 返回键为name的值的字符串,截取索引为start到end的字符 62 | r.strlen("name") #返回name对应值的字节长度(一个汉字3个字节) 63 | 64 | r.append('name',value) # 为键为name的值后追加value 65 | r.incr('name',amount) # 字符串转化为整型,再自增属性name对应的值,当属性name不存在时, 66 | # 则创建name=amount,否则,则自增,amount为自增数(整数) 67 | r.decr('name',amount) #自减name对应的值,当name不存在时,则创建name=amount, 68 | #否则,则自减,amount为自增数(整数) 69 | r.substr('name',start, end) # 返回键为name的值的字符串截取索引为start到end的字符 70 | 71 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_8.py: -------------------------------------------------------------------------------- 1 | """ 2 | 利用redis保存bilibili弹幕 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import re 7 | import redis 8 | 9 | video_url = 'https://www.bilibili.com/video/av28989880' 10 | cid_regex = re.compile(r'.*?cid=(\d*?)\&.*', re.S) 11 | xml_base_url = 
'http://comment.bilibili.com/' 12 | 13 | 14 | # 获取弹幕的cid 15 | def get_cid(): 16 | resp = r.get(video_url).text 17 | bs = BeautifulSoup(resp, 'lxml') 18 | src = bs.select('div.share-address ul li')[1].input 19 | cid = cid_regex.match(str(src)).group(1) 20 | print("获取到的cid:", cid) 21 | 22 | 23 | # 解析获取弹幕 24 | def analysis_d(cid): 25 | count = 1 26 | url = xml_base_url + cid + '.xml' 27 | resp = r.get(url) 28 | resp.encoding = 'utf-8' 29 | bs = BeautifulSoup(resp.text, 'lxml') 30 | d_s = bs.find_all('d') 31 | for d in d_s: 32 | dan_redis.set(str(count), d.text) 33 | count += 1 34 | 35 | 36 | if __name__ == '__main__': 37 | # 连接redis 38 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db = 0) 39 | dan_redis = redis.StrictRedis(connection_pool=pool) 40 | # analysis_d('50280136') 41 | results = dan_redis.mget(dan_redis.keys()) 42 | print("总共有%d条数据" % len(results)) 43 | for result in results: 44 | print(result.decode('utf-8')) -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/代码/4_9.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库的基本操作示例 3 | """ 4 | 5 | import pymongo 6 | 7 | # 1.连接MongoDB数据库(默认没有密码,如果设置了密码要调用db.auth("用户名","密码")) 8 | conn = pymongo.MongoClient(host='localhost', port=27017) 9 | # 或者采用MongoDB连接字符串的形式也可以: 10 | # conn = pymongo.MongoClient('mongodb://localhost:27017') 11 | 12 | # 2.选择数据库,也可以使用conn['test']这一的方式选择,等价 13 | # db = conn.test 14 | # 15 | # # 3.选择collection 16 | # collection = db.user 17 | # print(collection) 18 | 19 | 20 | # # 4.创建数据库 21 | # db = conn['test_db'] 22 | # 23 | # # 5.创建collection 24 | # collection = db['test_collection'] 25 | 26 | # 6.插入一条数据 27 | # db = conn['test_db'] 28 | # collection = db['test_collection'] 29 | # dic = {'id': '1', 'name': 'Jay'} 30 | # collection.insert_one(dic) 31 | 32 | db = conn.test_db 33 | collection = db.test_collection 34 | 35 | # 7.插入多条数据(传入一个字典的列表) 36 | # data_list = [{'id': '2', 'name': 'Tom'},{'id': '3', 'name': 'Jack'}] 37 | # collection.insert_many(data_list) 38 | 39 | 40 | # 8.查找数据 41 | 42 | # 查找一条 43 | # print(collection.find_one({'name': 'Tom'})) 44 | 45 | 46 | # 查找多条 47 | # data_list = [{'id': '4', 'name': 'Mary'},{'id': '4', 'name': 'Lucy'}] 48 | # collection.insert_many(data_list) 49 | # results = collection.find({'id':'4'}) 50 | # for result in results: 51 | # print(result) 52 | 53 | # 正则匹配 54 | # for result in collection.find({'name':{'$regex':'^J.*'}}): 55 | # print(result) 56 | 57 | # 9.修改数据 58 | 59 | # 方法一:需要整条记录参与 60 | # person = collection.find_one({'name':'Jack'}) 61 | # person['name'] = 'Jacky' 62 | # collection.update({'name':'Jack'}, person) 63 | 64 | # 方法二:部分修改字段内容的方式 65 | # result = collection.update_one({'name': 'Tom'}, {'$set': {"name": "Tony"}}) 66 | # print(result) 67 | # print("匹配的数据条数:",result.matched_count, "受影响的数据条数:",result.modified_count) 68 | 69 | # 10.删除数据 70 | # result = collection.delete_many({'id': {'$lte': 3}}) 71 | # print("删除的数据条数:", result.deleted_count) 72 | 73 | # 11.计数 74 | # print("数据库中有%d条记录。" % collection.find().count()) 75 | 76 | # 12.排序 77 | # data_list = [{'id': 2, 'name': 'Tom'},{'id': 3, 'name': 'Jack'},{'id': 5, 'name': 'Daisy'}] 78 | # collection.insert_many(data_list) 79 | # # 降序排列,升序可以传入pymongo.ASCENDING 80 | # results = collection.find().sort('id', pymongo.DESCENDING) 81 | # for result in results: 82 | # print(result) 83 | 84 | # 13.偏移 85 | results = collection.find().sort('id', pymongo.ASCENDING).skip(1) 86 | for result in results: 87 
| print(result) 88 | 89 | -------------------------------------------------------------------------------- /4、用CSV 和 Excel 存储数据/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | 4_10.py → 1号店已不提供H5版本,目前无解 -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库使用代码示例 3 | """ 4 | 5 | import csv 6 | import os 7 | 8 | save_file_name_1 = os.path.join(os.getcwd(), '1.csv') 9 | save_file_name_2 = os.path.join(os.getcwd(), '2.csv') 10 | save_file_name_3 = os.path.join(os.getcwd(), '3.csv') 11 | 12 | data_1 = [['id', '姓名', '性别', '年龄', '工作'], 13 | [1, '小明', '男', '18', '学生'], 14 | [2, '小红', '女', '24', '老师'], 15 | [3, '小光', '男', '25', 'Python工程师']] 16 | 17 | headers = ['id', '姓名', '性别', '年龄', '工作'] 18 | data_2 = [{'id': 1, '姓名': '小明', '性别': '男', '年龄': '18', '工作': '学生'}, 19 | {'id': 2, '姓名': '小红', '性别': '女', '年龄': '24', '工作': '老师'}, 20 | {'id': 3, '姓名': '小光', '性别': '男', '年龄': '25', '工作': 'Python工程师'}] 21 | 22 | # 单行写入示例 23 | with open(save_file_name_1, 'w', newline='') as f: 24 | writer = csv.writer(f) 25 | for row in data_1: 26 | writer.writerow(row) 27 | 28 | # 多行写入 29 | with open(save_file_name_2, 'w', newline='') as f: 30 | writer = csv.writer(f) 31 | writer.writerows(data_1) 32 | 33 | # 字典写入 34 | with open(save_file_name_3, 'w', newline='') as f: 35 | # 标头在这里传入,作为第一行数据 36 | writer = csv.DictWriter(f, headers) 37 | writer.writeheader() 38 | for row in data_2: 39 | writer.writerow(row) 40 | 41 | 42 | 43 | if __name__ == '__main__': 44 | with open(save_file_name_1) as f: 45 | reader = csv.DictReader(f) 46 | for row in reader: 47 | print(row['姓名']) 48 | # reader = csv.reader(f) 49 | # print(list(reader)[0][1]) 50 | # for row in reader: 51 | # print(reader.line_num, row) 52 | 53 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_10.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库实战示例:爬取一号店关键词搜索结果保存到MongoDB中 3 | """ 4 | import pymongo 5 | import requests as r 6 | from lxml import etree 7 | 8 | search_word = "羽毛球" 9 | search_base_url = 'https://search.yhd.com/c0-0/k' 10 | 11 | 12 | def search_goods(key): 13 | data_list = [] 14 | resp = r.get(search_base_url + key) 15 | resp.encoding = 'utf-8' 16 | html = etree.HTML(resp.text) 17 | ul_list = html.xpath('//div[@id="itemSearchList"]/div') 18 | for ul in ul_list: 19 | # 商品名称 20 | title = ul.xpath('div//p[@class="proName clearfix"]/a/@title')[0] 21 | # 商品链接 22 | link = ul.xpath('div//p[@class="proName clearfix"]/a/@href')[0] 23 | # 商品价格 24 | price = ul.xpath('div//p[@class="proPrice"]/em/@yhdprice')[0] 25 | # 店铺名称 26 | store = ul.xpath('div//p[@class="storeName limit_width"]/a/@title') 27 | store_name = store[0] if len(store) > 0 else '' 28 | # 评论数 29 | comment_count = ul.xpath('div//p[@class="proPrice"]/span[@class="comment"]/a/text()')[1] 30 | # 好评率 31 | favorable_rate = ul.xpath('div//span[@class="positiveRatio"]/text()')[0] 32 | data_list.append({'title': title, 'link': 'https:' + link, 'price': price, 'store_name': store_name, 'comment_count': comment_count, 33 | 'favorable_rate': favorable_rate}) 34 | return data_list 35 | 36 | 37 | if __name__ == '__main__': 38 | conn = pymongo.MongoClient(host='localhost', port=27017) 39 | search_goods(search_word) 40 | db = conn['yhd'] 41 | collection = db['羽毛球'] 42 | search_result_list = 
search_goods(search_word) 43 | collection.insert_many(search_result_list) 44 | conn.close() 45 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | csv库实战示例:爬取星座运势 3 | """ 4 | import csv 5 | import requests as r 6 | from bs4 import BeautifulSoup 7 | import re 8 | import os 9 | 10 | # 抓取站点 11 | constellation_url = 'http://www.xzw.com/fortune/' 12 | 13 | # 提取信息的正则 14 | fetch_regex = re.compile(r'^.*?(.*?)(.*?).*?width:(\d*)%.*?p>(.*)\[ 15') 83 | db.close() 84 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 爬取Gank.io API接口的数据到MySQL 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import pymysql 7 | 8 | # 接口地址 9 | search_api_base_url = 'https://gank.io/api/v2/data/' 10 | 11 | # 各种分类的表名:Android,iOS,休息视频,福利,拓展资源,前端,瞎推荐,App 12 | category_list = ["android", "ios", "video", "meizi", "other", "fed", "random", "app"] 13 | 14 | # 图片表名 15 | pic_table_name = 'pics' 16 | 17 | # 请求分类字段列表 18 | type_list = ["Android", "iOS", "休息视频", "福利", "拓展资源", "前端", "瞎推荐", "App"] 19 | 20 | # 表字段名 21 | column_list = ('_id', 'createdAt', 'dsec', 'publishedAt', 'source', 'type', 'url', 'used', 'who') 22 | 23 | # 图片表字段名 24 | pic_column_list = ('_id', 'url') 25 | 26 | 27 | # 创建数据库 28 | def create_db(): 29 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306) 30 | cursor = conn.cursor() 31 | cursor.execute("Create Database If Not Exists gank Character Set UTF8MB4") 32 | conn.close() 33 | conn = pymysql.connect(host='localhost', user='root', password='Zpj12345', port=3306, db='gank') 34 | return conn 35 | 36 | 37 | # 创建数据库表 38 | def init_tables(c, table): 39 | c.execute( 40 | ("CREATE TABLE IF Not Exists {table}" 41 | "(_id CHAR(24) PRIMARY KEY," 42 | "createdAt TEXT NOT NULL," 43 | "dsec TEXT NOT NULL," 44 | "publishedAt TEXT NOT NULL," 45 | "source TEXT NOT NULL," 46 | "type TEXT NOT NULL," 47 | "url TEXT NOT NULL," 48 | "used TEXT NOT NULL," 49 | "who TEXT NOT NULL)").format(table=table)) 50 | 51 | 52 | # 创建图表 53 | def init_pic_table(c, table): 54 | c.execute( 55 | ("CREATE TABLE IF Not Exists {table} " 56 | "(id INT AUTO_INCREMENT PRIMARY KEY," 57 | "_id CHAR(24)," 58 | "url TEXT NOT NULL)").format(table=table)) 59 | 60 | 61 | # 把数据插入到数据库中 62 | def insert_data(c, table, column, data): 63 | try: 64 | keys = ', '.join(column) 65 | values = ', '.join(['%s'] * len(data)) 66 | sql = 'INSERT INTO {table} ({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values) 67 | c.execute(sql, tuple(data)) 68 | db.commit() 69 | except Exception as e: 70 | print(e) 71 | db.rollback() 72 | 73 | 74 | # 查询数据库表的方法 75 | def query_data(c, table): 76 | try: 77 | sql = 'SELECT * FROM {table}'.format(table=table) 78 | c.execute(sql) 79 | print('共有 %d 行数据' % c.rowcount) 80 | row = c.fetchone() 81 | while row: 82 | print(row) 83 | row = c.fetchone() 84 | except Exception as e: 85 | print(e) 86 | 87 | 88 | # 爬取接口数据的方法 89 | def fetch_data(c, pos): 90 | page_count = 1 91 | while True: 92 | resp = r.get(search_api_base_url + type_list[pos] + '/50/' + str(page_count)) 93 | result_json = resp.json() 94 | print("抓取:", resp.url) 95 | if len(result_json['results']) > 0: 96 | for result in result_json['results']: 97 | data_list = [result['_id'], 98 | result['createdAt'], 99 | result['desc'], 100 | 
result['publishedAt'], 101 | result.get('source', ''), 102 | result['type'], 103 | result['url'], 104 | 1 if result['used'] else 0, 105 | result.get('who', '') if result.get('who', '') is not None else ''] 106 | insert_data(c, category_list[pos], column_list, data_list) 107 | if 'images' in result: 108 | for image in result['images']: 109 | insert_data(c, pic_table_name, pic_column_list, [result['_id'], image]) 110 | page_count += 1 111 | else: 112 | break 113 | 114 | 115 | if __name__ == '__main__': 116 | db = create_db() 117 | cursor = db.cursor() 118 | # for category in category_list: 119 | # init_tables(cursor, category) 120 | # init_pic_table(cursor, pic_table_name) 121 | # for i in range(0, len(category_list)): 122 | # fetch_data(cursor, i) 123 | query_data(cursor, 'Android') 124 | cursor.close() 125 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_7.py: -------------------------------------------------------------------------------- 1 | """ 2 | redis-py库的基本操作示例 3 | """ 4 | import redis 5 | 6 | # ====================== 连接Redis ============================ 7 | 8 | # 1.普通连接 9 | r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0) 10 | 11 | # 2.连接池(一般) 12 | # redis-py使用connection pool来管理对一个redis server的所有连接,避免每次建立、 13 | # 释放连接的开销。这种方式实现多个Redis实例共享一个连接池 14 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345') 15 | r = redis.StrictRedis(connection_pool=pool) 16 | 17 | # 3.管道 18 | # redis-py,默认情况下,每次都会进行连接池的连接和断开。若是想一次执行多条命令,进行 19 | # 事务性操作,就要用管道。(虽然有这个功能,但是不建议使用,慢而且没什么必要。) 20 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379) 21 | r = redis.StrictRedis(connection_pool=pool) 22 | pipe = r.pipeline(transaction=True) 23 | # 执行多条命令 24 | pipe.execute() 25 | 26 | # ====================== 通用操作 ============================ 27 | 28 | r.delete('name') # 根据键删除redis中的任意数据类型 29 | r.exists('name') # 检测redis的键是否存在 30 | r.keys(pattern='*') # 根据* ?等通配符匹配获取redis的键 31 | r.expire('name', time=3000) # 为某个键设置超时时间 32 | r.rename('name', 'name1') # 重命名键 33 | r.move('name', 'db1') # 将redis的某个值移动到指定的db下 34 | r.randomkey() # 随机获取一个redis的键(不删除) 35 | r.type('name') # 获取键对应值的类型 36 | r.dbsize() # 获得当前数据库中键的数目 37 | r.ttl('name') # 获得键的过期时间 38 | r.flushdb() # 删除当前选择数据库中所有的键 39 | r.flushall() # 删除所有数据库中的所有键 40 | 41 | 42 | # ====================== String操作 ============================ 43 | 44 | # 设置键值对,默认不存在则创建,存在则修改 45 | # set(name, value, ex=None, px=None, nx=False, xx=False) 46 | # ex,过期时间(秒) 47 | # px,过期时间(毫秒) 48 | # nx,如果设置为True,则只有name不存在时,当前set操作才执行,同setnx(name, value) 49 | # xx,如果设置为True,则只有name存在时,当前set操作才执行 50 | 51 | r.set('name', value) #设置值 52 | r.setnx('name',value) #如果name这个键不存在,把这个键对应的值设置为value 53 | r.setex('name', value, time) #设置值,并指定此键值的有效期 54 | r.setrange(name, offset, value) #修改字符串内容,从指定字符串索引开始向后替换 55 | r.mset({"name3":'xxx', "name4":'xxx'}) #批量设置值 56 | r.msetnx({"name3":'xxx', "name4":'xxx'}) #键都不存在是才批量赋值 57 | 58 | r.get('name') # 获取值 59 | r.getset('name', 'yyy') # 为键为name的值赋值为yyy,并返回上次的值xxx 60 | r.mget(['name1','name2']) # 返回多个键对应的值 61 | r.getrange(key, start, end) # 返回键为name的值的字符串,截取索引为start到end的字符 62 | r.strlen("name") #返回name对应值的字节长度(一个汉字3个字节) 63 | 64 | r.append('name',value) # 为键为name的值后追加value 65 | r.incr('name',amount) # 字符串转化为整型,再自增属性name对应的值,当属性name不存在时, 66 | # 则创建name=amount,否则,则自增,amount为自增数(整数) 67 | r.decr('name',amount) #自减name对应的值,当name不存在时,则创建name=amount, 68 | #否则,则自减,amount为自增数(整数) 69 | r.substr('name',start, end) # 返回键为name的值的字符串截取索引为start到end的字符 70 | 71 | 
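# Hedged, runnable supplement to the reference listing above (5_7.py uses placeholder
# names such as value/start/end and is not meant to run as-is; host/port/db below are
# assumptions matching the other examples in this chapter):
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
r.set('name', 'coderpig', ex=60)          # create the key with a 60-second TTL
print(r.get('name').decode('utf-8'))      # redis-py returns bytes -> 'coderpig'
print(r.ttl('name'))                      # seconds left before the key expires
r.delete('name')                          # clean up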
-------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_8.py: -------------------------------------------------------------------------------- 1 | """ 2 | 利用redis保存bilibili弹幕 3 | """ 4 | import requests as r 5 | from bs4 import BeautifulSoup 6 | import re 7 | import redis 8 | 9 | video_url = 'https://www.bilibili.com/video/av28989880' 10 | cid_regex = re.compile(r'cid=(\d{8})', re.S) 11 | xml_base_url = 'http://comment.bilibili.com/' 12 | 13 | 14 | # 获取弹幕的cid 15 | def get_cid(): 16 | resp = r.get(video_url).text 17 | cid = cid_regex.search(str(resp)).group(1).strip() 18 | print("获取到的cid:", cid) 19 | return cid 20 | 21 | 22 | # 解析获取弹幕 23 | def analysis_d(cid): 24 | count = 1 25 | url = xml_base_url + cid + '.xml' 26 | resp = r.get(url) 27 | resp.encoding = 'utf-8' 28 | bs = BeautifulSoup(resp.text, 'lxml') 29 | d_s = bs.find_all('d') 30 | for d in d_s: 31 | print(d.text) 32 | # dan_redis.set(str(count), d.text) 33 | count += 1 34 | 35 | 36 | if __name__ == '__main__': 37 | analysis_d(get_cid()) 38 | # 连接redis 39 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=0) 40 | dan_redis = redis.StrictRedis(connection_pool=pool) 41 | results = dan_redis.mget(dan_redis.keys()) 42 | print("总共有%d条数据" % len(results)) 43 | for result in results: 44 | print(result.decode('utf-8')) 45 | -------------------------------------------------------------------------------- /5、用数据库存储数据/代码/5_9.py: -------------------------------------------------------------------------------- 1 | """ 2 | PyMongo库的基本操作示例 3 | """ 4 | 5 | import pymongo 6 | 7 | # 1.连接MongoDB数据库(默认没有密码,如果设置了密码要调用db.auth("用户名","密码")) 8 | conn = pymongo.MongoClient(host='localhost', port=27017) 9 | # 或者采用MongoDB连接字符串的形式也可以: 10 | # conn = pymongo.MongoClient('mongodb://localhost:27017') 11 | 12 | # 2.选择数据库,也可以使用conn['test']这一的方式选择,等价 13 | # db = conn.test 14 | # 15 | # # 3.选择collection 16 | # collection = db.user 17 | # print(collection) 18 | 19 | 20 | # # 4.创建数据库 21 | # db = conn['test_db'] 22 | # 23 | # # 5.创建collection 24 | # collection = db['test_collection'] 25 | 26 | # 6.插入一条数据 27 | # db = conn['test_db'] 28 | # collection = db['test_collection'] 29 | # dic = {'id': '1', 'name': 'Jay'} 30 | # collection.insert_one(dic) 31 | 32 | db = conn.test_db 33 | collection = db.test_collection 34 | 35 | # 7.插入多条数据(传入一个字典的列表) 36 | # data_list = [{'id': '2', 'name': 'Tom'},{'id': '3', 'name': 'Jack'}] 37 | # collection.insert_many(data_list) 38 | 39 | 40 | # 8.查找数据 41 | 42 | # 查找一条 43 | # print(collection.find_one({'name': 'Tom'})) 44 | 45 | 46 | # 查找多条 47 | # data_list = [{'id': '4', 'name': 'Mary'},{'id': '4', 'name': 'Lucy'}] 48 | # collection.insert_many(data_list) 49 | # results = collection.find({'id':'4'}) 50 | # for result in results: 51 | # print(result) 52 | 53 | # 正则匹配 54 | # for result in collection.find({'name':{'$regex':'^J.*'}}): 55 | # print(result) 56 | 57 | # 9.修改数据 58 | 59 | # 方法一:需要整条记录参与 60 | # person = collection.find_one({'name':'Jack'}) 61 | # person['name'] = 'Jacky' 62 | # collection.update({'name':'Jack'}, person) 63 | 64 | # 方法二:部分修改字段内容的方式 65 | # result = collection.update_one({'name': 'Tom'}, {'$set': {"name": "Tony"}}) 66 | # print(result) 67 | # print("匹配的数据条数:",result.matched_count, "受影响的数据条数:",result.modified_count) 68 | 69 | # 10.删除数据 70 | # result = collection.delete_many({'id': {'$lte': 3}}) 71 | # print("删除的数据条数:", result.deleted_count) 72 | 73 | # 11.计数 74 | # print("数据库中有%d条记录。" % collection.find().count()) 75 | 76 | # 12.排序 77 | # data_list = 
[{'id': 2, 'name': 'Tom'},{'id': 3, 'name': 'Jack'},{'id': 5, 'name': 'Daisy'}] 78 | # collection.insert_many(data_list) 79 | # # 降序排列,升序可以传入pymongo.ASCENDING 80 | # results = collection.find().sort('id', pymongo.DESCENDING) 81 | # for result in results: 82 | # print(result) 83 | 84 | # 13.偏移 85 | results = collection.find().sort('id', pymongo.ASCENDING).skip(1) 86 | for result in results: 87 | print(result) 88 | 89 | -------------------------------------------------------------------------------- /5、用数据库存储数据/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | 5_6.py → 更新Gank.io的新API 4 | 5 | 5_8.py → 更新获取cid的正则,修改提取弹幕规则 6 | 7 | 5_10.py → 1号店已不提供H5版本,目前无解 -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | fake_useragent库使用示例 3 | """ 4 | 5 | from fake_useragent import UserAgent 6 | import random 7 | 8 | if __name__ == '__main__': 9 | ua = UserAgent(use_cache_server=False) 10 | print("Chrome浏览器:", ua.chrome) 11 | print("FireFox浏览器:", ua.firefox) 12 | print("Ubuntu FireFox浏览器:", ua.ff) 13 | print("IE浏览器:", ua.ie) 14 | print("Safari浏览器:", ua.safari) 15 | print("Mac Chrome:", ua.google) 16 | print("Opera浏览器:", ua.opera) 17 | print("随机:",ua.random) 18 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ajax动态加载数据应对策略例子:爬取花瓣网某个画板的所有风景图 3 | """ 4 | import requests as r 5 | import os 6 | import re 7 | import json 8 | 9 | # 图片URL拼接的前缀和后缀 10 | img_start_url = 'http://img.hb.aicdn.com/' 11 | img_end = '_fw658' 12 | 13 | # 图片key的保存文件 14 | pic_key_file = 'pin_ids.txt' 15 | 16 | # 获取pins的正则 17 | boards_pattern = re.compile(r'pins":(.*)};') 18 | 19 | # 修改pin_id的正则 20 | max_pattern = re.compile(r'(?<=max=)\d*(?=&limit)') 21 | 22 | # 图片保存路径 23 | pic_download_dir = os.path.join(os.getcwd(), 'HuaBan/') 24 | 25 | # Ajax模拟的请求头 26 | ajax_headers = { 27 | 'Host': 'huaban.com', 28 | 'Accept': 'application/json', 29 | 'X-Request': 'JSON', 30 | 'X-Requested-With': 'XMLHttpRequest' 31 | } 32 | 33 | 34 | # 以追加的形式往文件中写入内容 35 | def write_str_data(content, file_path): 36 | try: 37 | with open(file_path, 'a+', encoding='utf-8') as f: 38 | f.write(content + "\n", ) 39 | except OSError as reason: 40 | print(str(reason)) 41 | 42 | 43 | # 按行读取文件里的内容添加到列表中返回 44 | def load_data(file_path): 45 | if os.path.exists(file_path): 46 | data_list = [] 47 | with open(file_path, "r+", encoding='utf-8') as f: 48 | for ip in f: 49 | data_list.append(ip.replace("\n", "")) 50 | return data_list 51 | 52 | 53 | # 获得borads页数据,提取key列表写入到文件里,并返回最后一个pid用于后续查询 54 | def get_boards_index_data(url): 55 | print("请求:" + url) 56 | resp = r.get(url).text 57 | result = boards_pattern.search(resp) 58 | json_dict = json.loads(result.group(1)) 59 | for item in json_dict: 60 | write_str_data(item['file']['key'], pic_key_file) 61 | # 返回最后一个pin_id 62 | pin_id = json_dict[-1]['pin_id'] 63 | return pin_id 64 | 65 | 66 | # 模拟Ajax请求更多数据 67 | def get_json_list(url): 68 | print("请求:" + url) 69 | resp = r.get(url, headers=ajax_headers) 70 | if resp is None: 71 | return None 72 | else: 73 | json_dict = json.loads(resp.text) 74 | pins = json_dict['board']['pins'] 75 | if len(pins) == 0: 76 | return None 77 | else: 78 | for item in pins: 79 | write_str_data(item['file']['key'], pic_key_file) 80 | return 
pins[-1]['pin_id'] 81 | 82 | 83 | # 下载图片的方法 84 | def download_pic(key): 85 | url = img_start_url + key + img_end 86 | resp = r.get(url).content 87 | try: 88 | print("下载图片:" + url) 89 | pic_name = key + ".jpg" 90 | with open(pic_download_dir + pic_name, "wb+") as f: 91 | f.write(resp) 92 | except (OSError, r.HTTPError, r.ConnectionError, Exception) as reason: 93 | print(str(reason)) 94 | 95 | 96 | if __name__ == '__main__': 97 | if not os.path.exists(pic_download_dir): 98 | os.makedirs(pic_download_dir) 99 | # 判断图片key的保存文件是否存在,存在的话删除 100 | if os.path.exists(pic_key_file): 101 | os.remove(pic_key_file) 102 | # 一个画板链接,可自行替换 103 | boards_url = 'http://huaban.com/boards/279523/' 104 | board_last_pin_id = get_boards_index_data(boards_url) 105 | board_json_url = boards_url + '?jl58nz3i&max=43131274&limit=20&wfl=1' 106 | while True: 107 | board_last_pin_id = get_json_list(max_pattern.sub(str(board_last_pin_id), board_json_url)) 108 | if board_last_pin_id is None: 109 | break 110 | pic_url_list = load_data(pic_key_file) 111 | for key in pic_url_list: 112 | download_pic(key) 113 | print("所有图片下载完成~") 114 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | selenium使用示例 3 | """ 4 | from selenium import webdriver 5 | 6 | browser = webdriver.Chrome() # 调用本地的Chrome浏览器 7 | browser.get('http://www.baidu.com') # 请求页面,会打开一个浏览器窗口 8 | html_text = browser.page_source # 获得页面代码 9 | # browser.quit() # 关闭浏览器 10 | print(html_text) -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | selenium爬取简单网无聊图示例 3 | """ 4 | import os 5 | from selenium import webdriver 6 | import redis 7 | import requests as r 8 | from bs4 import BeautifulSoup 9 | 10 | # 请求基地址 11 | base_url = 'http://jandan.net/pic' 12 | # 图片的保存路径 13 | pic_save_path = os.path.join(os.getcwd(), 'JianDan/') 14 | # 图片需要,作为Reids键用 15 | pic_count = 0 16 | 17 | # 下载图片用headers 18 | pic_headers = { 19 | 'Host': 'wx2.sinaimg.cn', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 21 | 'Chrome/61.0.3163.100 Safari/537.36 ' 22 | } 23 | 24 | 25 | # 打开浏览器模拟请求 26 | def browser_get(): 27 | browser = webdriver.Chrome() 28 | browser.get(base_url) 29 | html_text = browser.page_source 30 | page_count = get_page_count(html_text) 31 | # 循环拼接URL访问 32 | for page in range(page_count, 0, -1): 33 | page_url = base_url + '/page-' + str(page) 34 | print('解析:' + page_url) 35 | browser.get(page_url) 36 | html = browser.page_source 37 | get_meizi_url(html) 38 | # 没有更多了关闭浏览器 39 | browser.quit() 40 | 41 | 42 | # 获取总页码 43 | def get_page_count(html): 44 | bs = BeautifulSoup(html, 'lxml') 45 | page_count = bs.find('span', attrs={'class': 'current-comment-page'}) 46 | return int(page_count.get_text()[1:-1]) - 1 47 | 48 | 49 | # 获取每页的图片 50 | def get_meizi_url(html): 51 | soup = BeautifulSoup(html, 'html.parser') 52 | ol = soup.find('ol', attrs={'class': 'commentlist'}) 53 | href = ol.findAll('a', attrs={'class': 'view_img_link'}) 54 | global pic_count 55 | for a in href: 56 | dan_redis.set(str(pic_count), a['href']) 57 | pic_count += 1 58 | 59 | 60 | # 下载图片 61 | def download_pic(url): 62 | correct_url = url 63 | if url.startswith('//'): 64 | correct_url = url[2:] 65 | if not url.startswith('http'): 66 | correct_url = 'http://' + correct_url 67 | 
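# at this point correct_url always carries an http(s):// scheme (note that both checks above test the original url, not correct_url)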
print("下载:", correct_url) 68 | try: 69 | resp = r.get(correct_url, headers=pic_headers).content 70 | pic_name = correct_url.split("/")[-1] 71 | with open(pic_save_path + pic_name, "wb+") as f: 72 | f.write(resp) 73 | except (OSError, r.ConnectionError, r.HTTPError, Exception) as reason: 74 | print(str(reason)) 75 | 76 | 77 | if __name__ == '__main__': 78 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, password='Zpj12345', db=1) 79 | dan_redis = redis.StrictRedis(connection_pool=pool) 80 | if not os.path.exists(pic_save_path): 81 | os.makedirs(pic_save_path) 82 | browser_get() 83 | results = dan_redis.mget(dan_redis.keys()) 84 | for result in results: 85 | download_pic(result.decode('utf-8')) 86 | print("图片下载完毕!") 87 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_5.py: -------------------------------------------------------------------------------- 1 | """ 2 | Selenium+Tesserocr实现自动登陆知乎 3 | """ 4 | import os 5 | from selenium import webdriver 6 | import requests as r 7 | import time 8 | from PIL import Image 9 | from aip import AipOcr 10 | from hashlib import md5 11 | import base64 12 | 13 | zhihu_login_url = 'https://www.zhihu.com/signup' 14 | 15 | config = { 16 | 'appId': 'd4ed8d211abd4f20b3xxe0f55xxx173f', 17 | 'apiKey': 'Nk3RSGAh0gFEGdoFC7GxxaCQ', 18 | 'secretKey': '63TyYDkI5R0x21tDsCxxBoF8EEmiDfEd' 19 | } 20 | client = AipOcr(**config) 21 | 22 | # 超级鹰参数 23 | cjy_params = { 24 | 'user': 'CoderPig', 25 | 'pass2': md5('zpj12345'.encode('utf8')).hexdigest(), 26 | 'softid': '897137', 27 | } 28 | 29 | # 超级鹰请求头 30 | cjy_headers = { 31 | 'Connection': 'Keep-Alive', 32 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)', 33 | } 34 | 35 | 36 | # 打开浏览器模拟请求 37 | def auto_login(): 38 | browser = webdriver.Chrome() 39 | while True: 40 | browser.get(zhihu_login_url) 41 | # 判断是否处于注册页(底部有登录字样,是的话点击跳转) 42 | signup_switch_bt = browser.find_element_by_xpath('//div[@class="SignContainer-switch"]/span') 43 | if signup_switch_bt.text == '登录': 44 | signup_switch_bt.click() 45 | # 输入用户名 46 | username_input = browser.find_element_by_xpath('//input[@name="username"]') 47 | username_input.send_keys('xx@qq.com') 48 | # 输入密码 49 | password_input = browser.find_element_by_xpath('//input[@name="password"]') 50 | password_input.send_keys('xxx') 51 | # 等待一会儿,等验证码刷出来 52 | time.sleep(5) 53 | # 判断是否包含英文字符验证码,是的话处理,否则跳出 54 | if is_elements_existed(browser, "//div[@class='Captcha-englishContainer']"): 55 | if len(browser.find_element_by_xpath("//img[@class='Captcha-englishImg']").get_attribute('src')) > 30: 56 | code_img = browser.find_element_by_xpath('//img[@alt="图形验证码"]') 57 | code = cjy_fetch_code(base64.b64decode(code_img.get_attribute('src')[22:].replace("%0A", "")), 1902) 58 | # 输入验证码 59 | code_input = browser.find_element_by_xpath('//input[@name="captcha"]') 60 | code_input.send_keys(code) 61 | time.sleep(2) 62 | # 点击登录按钮 63 | login_bt = browser.find_element_by_xpath('//button[@type="submit"]') 64 | login_bt.click() 65 | time.sleep(3) 66 | break 67 | else: 68 | continue 69 | time.sleep(10) 70 | # 打印当前的网页链接,以此判断是否跳转成功 71 | print(browser.current_url) 72 | 73 | 74 | # 判断xpath定位的元素是否存在 75 | def is_elements_existed(browser, element): 76 | flag = True 77 | try: 78 | browser.find_element_by_xpath(element) 79 | return flag 80 | except: 81 | flag = False 82 | return flag 83 | 84 | 85 | # 读取图片 86 | def get_file_content(file_path): 87 | with open(file_path, 'rb') as fp: 88 | return fp.read() 89 | 90 | 91 | # 百度OCR文字识别 92 | def 
baidu_ocr(file): 93 | image = get_file_content(file) 94 | # 调用通用文字识别, 图片参数为本地图片 95 | result = client.basicAccurate(image) 96 | print(result) 97 | if 'words_result' in result: 98 | return '\n'.join([w['words'] for w in result['words_result']]) 99 | 100 | 101 | # 重置图片大小,并进行灰度和二值化处理 102 | def resize_pic(file, width=1200, height=480): 103 | img = Image.open(file) 104 | try: 105 | new_img = img.resize((width, height), Image.BILINEAR) 106 | # 转灰度处理 107 | new_img = new_img.convert('L') 108 | # 二值化处理 109 | table = [] 110 | for i in range(256): 111 | if i < 150: 112 | table.append(0) 113 | else: 114 | table.append(1) 115 | # 通过表格转换为二进制图片 116 | new_img = new_img.point(table, "1") 117 | new_img.save(os.path.join(os.getcwd(), os.path.basename(file))) 118 | except Exception as e: 119 | print(e) 120 | 121 | 122 | # 超级鹰识别验证码 123 | def cjy_fetch_code(im, codetype): 124 | cjy_params.update({'codetype': codetype}) 125 | files = {'userfile': ('ccc.jpg', im)} 126 | resp = r.post('http://upload.chaojiying.net/Upload/Processing.php', data=cjy_params, files=files, 127 | headers=cjy_headers).json() 128 | print(resp) 129 | if resp.get('err_no', 0) == 0: 130 | return resp.get('pic_str') 131 | 132 | 133 | if __name__ == '__main__': 134 | # resize_pic('code.png') 135 | # baidu_ocr('code.png') 136 | # im = open('code.png', 'rb').read() 137 | # print(cjy_fetch_code(im, 1902)) 138 | auto_login() 139 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/代码/6_6.py: -------------------------------------------------------------------------------- 1 | """ 2 | 破解极验滑动验证码示例 3 | """ 4 | import time 5 | from selenium import webdriver 6 | from selenium.webdriver.support.wait import WebDriverWait 7 | from selenium.webdriver import ActionChains 8 | from lxml import etree 9 | import requests as r 10 | import re 11 | import PIL.Image as image 12 | 13 | full_image_file = 'full.jpg' 14 | cut_image_file = 'cut.jpg' 15 | bilibili_login_url = 'https://passport.bilibili.com/login' 16 | url_fetch_regex = re.compile('url\(\"(.*?)\"\);') 17 | bg_postion_regex = re.compile('position: (.*?)px (.*?)px;') 18 | 19 | 20 | def auto_login(): 21 | # 输入账号密码 22 | input_user = browser.find_element_by_xpath('//input[@id="login-username"]') 23 | input_user.send_keys("xxx") 24 | input_passwd = browser.find_element_by_xpath('//input[@id="login-passwd"]') 25 | input_passwd.send_keys("xxx") 26 | # 验证码自动验证 27 | location_lists = fetch_images() 28 | offset = (get_offset(restore_images(cut_image_file, location_lists[0]), 29 | restore_images(full_image_file, location_lists[1]))) 30 | print("滑块偏移量:", offset) 31 | b_track = get_track(offset - 6) 32 | b_slider = get_slider() 33 | move_slider(b_slider, b_track) 34 | time.sleep(1) 35 | # 点击登录 36 | login_bt = browser.find_element_by_xpath('//a[@class="btn btn-login"]') 37 | login_bt.click() 38 | 39 | 40 | # 下载缺失的图片,每个小方块的坐标 41 | def fetch_images(): 42 | html = etree.HTML(browser.page_source) 43 | cut_bg = html.xpath('//div[@class="gt_cut_bg gt_show"]/div') 44 | full_bg = html.xpath('//div[@class="gt_cut_fullbg gt_show"]/div') 45 | # 提取两个打乱后顺序的webp图片URL替换为jpg 46 | cut_bg_url = url_fetch_regex.search((cut_bg[0].get('style'))).group(1).replace('webp', 'jpg') 47 | full_bg_url = url_fetch_regex.search((full_bg[0].get('style'))).group(1).replace('webp', 'jpg') 48 | with open(cut_image_file, 'wb+') as f: f.write(r.get(cut_bg_url).content) 49 | with open(full_image_file, 'wb+') as f: f.write(r.get(full_bg_url).content) 50 | # 采集图片定位坐标 51 | cut_bg_location_list = [] 52 | 
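# both backgrounds are delivered as shuffled 10px-wide slices; the position offsets parsed below record where each slice really belongs, so restore_images() can stitch the two 260x116 originals back together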
full_bg_location_list = [] 53 | for cut in cut_bg: 54 | cut_result = bg_postion_regex.search(cut.get('style')) 55 | full_result = bg_postion_regex.search(cut.get('style')) 56 | cut_bg_location_list.append({'x': int(cut_result.group(1)), 'y': int(cut_result.group(2))}) 57 | full_bg_location_list.append({'x': int(full_result.group(1)), 'y': int(full_result.group(2))}) 58 | return cut_bg_location_list, full_bg_location_list 59 | 60 | 61 | # 合并还原图片 62 | def restore_images(file, location_list): 63 | im = image.open(file) 64 | # 分段分成上面的图和下面的图列表 65 | below_list = [] 66 | above_list = [] 67 | for location in location_list: 68 | if location['y'] == -58: 69 | above_list.append(im.crop((abs(location['x']), 58, abs(location['x']) + 10, 116))) 70 | if location['y'] == 0: 71 | below_list.append(im.crop((abs(location['x']), 0, abs(location['x']) + 10, 58))) 72 | 73 | # 创建一个一样大的图片 74 | new_im = image.new('RGB', (260, 116)) 75 | # 遍历坐标粘贴上面的图片 76 | x_offset = 0 77 | for im in above_list: 78 | new_im.paste(im, (x_offset, 0)) 79 | x_offset += im.size[0] 80 | # 遍历坐标粘贴下面的图片 81 | x_offset = 0 82 | for im in below_list: 83 | new_im.paste(im, (x_offset, 58)) 84 | x_offset += im.size[0] 85 | # 保存图片 86 | new_im.save(file) 87 | return new_im 88 | 89 | 90 | # 判断两个像素点是否相同 91 | def is_pixel_equal(img1, img2, x, y): 92 | pix1 = img1.load()[x, y] 93 | pix2 = img2.load()[x, y] 94 | scope = 20 # 像素阀值 95 | return abs(pix1[0] - pix2[0] < scope) and abs(pix1[1] - pix2[1] < scope) and abs(pix1[2] - pix2[2] < scope) 96 | 97 | 98 | # 获得缺口偏移量 99 | def get_offset(img1, img2): 100 | left = 60 101 | for x in range(left, img1.size[0]): 102 | for y in range(img1.size[1]): 103 | if not is_pixel_equal(img1, img2, x, y): 104 | return x 105 | return left 106 | 107 | 108 | # 获取滑块 109 | def get_slider(): 110 | while True: 111 | try: 112 | slider = browser.find_element_by_xpath("//div[@class='gt_slider_knob gt_show']") 113 | break 114 | except: 115 | time.sleep(0.5) 116 | return slider 117 | 118 | 119 | # 滑块匀速滑动轨迹构造 120 | def get_track(distance): 121 | track = [] 122 | current = 0 123 | while current < distance: 124 | move = distance / 4 125 | current += move 126 | track.append(round(move)) 127 | return track 128 | 129 | 130 | # 先加速后减速滑动轨迹构造 131 | def get_person_track(distance): 132 | track = [] 133 | current = 0 134 | mid = distance * 4 / 5 # 减速阈值 135 | t = 0.2 # 计算间隔 136 | v = 0 # 初速度 137 | while current < distance: 138 | a = 2 if current < mid else -3 139 | v0 = v # 初速度v0 140 | v = v0 + a * t # 当前速度 141 | move = v0 * t + 1 / 2 * a * t * t # 移动距离 142 | current += move 143 | track.append(round(move)) 144 | return track 145 | 146 | 147 | # 滑块滑动的方法 148 | def move_slider(slider, track): 149 | ActionChains(browser).click_and_hold(slider).perform() 150 | for x in track: 151 | ActionChains(browser).move_by_offset(xoffset=x, yoffset=0).perform() 152 | time.sleep(0.05) 153 | ActionChains(browser).release().perform() 154 | 155 | 156 | if __name__ == '__main__': 157 | browser = webdriver.Chrome() 158 | wait = WebDriverWait(browser, 20) 159 | browser.get(bilibili_login_url) 160 | # 休眠2秒等待登录页加载完毕 161 | time.sleep(1) 162 | auto_login() 163 | time.sleep(5) 164 | print(browser.current_url) 165 | browser.quit() 166 | -------------------------------------------------------------------------------- /6、Python应对反爬虫策略/勘误.md: -------------------------------------------------------------------------------- 1 | 2021.4.29 更新内容: 2 | 3 | 6_5.py → 知乎登录不再使用文字验证,而是使用滑动验证 4 | 5 | 6_6.py → B站登录不再使用验证,而是使用文件识别验证 
-------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__init__.py -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/middlewares.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/middlewares.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class BcyItem(scrapy.Item): 11 | author = scrapy.Field() 12 | pic_url = scrapy.Field() 13 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | import 
os 10 | import random 11 | import logging 12 | 13 | 14 | class FirstspiderSpiderMiddleware(object): 15 | # Not all methods need to be defined. If a method is not defined, 16 | # scrapy acts as if the spider middleware does not modify the 17 | # passed objects. 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | # This method is used by Scrapy to create your spiders. 22 | s = cls() 23 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 24 | return s 25 | 26 | def process_spider_input(self, response, spider): 27 | # Called for each response that goes through the spider 28 | # middleware and into the spider. 29 | 30 | # Should return None or raise an exception. 31 | return None 32 | 33 | def process_spider_output(self, response, result, spider): 34 | # Called with the results returned from the Spider, after 35 | # it has processed the response. 36 | 37 | # Must return an iterable of Request, dict or Item objects. 38 | for i in result: 39 | yield i 40 | 41 | def process_spider_exception(self, response, exception, spider): 42 | # Called when a spider or process_spider_input() method 43 | # (from other spider middleware) raises an exception. 44 | 45 | # Should return either None or an iterable of Response, dict 46 | # or Item objects. 47 | pass 48 | 49 | def process_start_requests(self, start_requests, spider): 50 | # Called with the start requests of the spider, and works 51 | # similarly to the process_spider_output() method, except 52 | # that it doesn’t have a response associated. 53 | 54 | # Must return only requests (not items). 55 | for r in start_requests: 56 | yield r 57 | 58 | def spider_opened(self, spider): 59 | spider.logger.info('Spider opened: %s' % spider.name) 60 | 61 | 62 | class FirstspiderDownloaderMiddleware(object): 63 | # Not all methods need to be defined. If a method is not defined, 64 | # scrapy acts as if the downloader middleware does not modify the 65 | # passed objects. 66 | 67 | @classmethod 68 | def from_crawler(cls, crawler): 69 | # This method is used by Scrapy to create your spiders. 70 | s = cls() 71 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 72 | return s 73 | 74 | def process_request(self, request, spider): 75 | # Called for each request that goes through the downloader 76 | # middleware. 77 | 78 | # Must either: 79 | # - return None: continue processing this request 80 | # - or return a Response object 81 | # - or return a Request object 82 | # - or raise IgnoreRequest: process_exception() methods of 83 | # installed downloader middleware will be called 84 | return None 85 | 86 | def process_response(self, request, response, spider): 87 | # Called with the response returned from the downloader. 88 | 89 | # Must either; 90 | # - return a Response object 91 | # - return a Request object 92 | # - or raise IgnoreRequest 93 | return response 94 | 95 | def process_exception(self, request, exception, spider): 96 | # Called when a download handler or a process_request() 97 | # (from other downloader middleware) raises an exception. 
98 | 99 | # Must either: 100 | # - return None: continue processing this exception 101 | # - return a Response object: stops process_exception() chain 102 | # - return a Request object: stops process_exception() chain 103 | pass 104 | 105 | def spider_opened(self, spider): 106 | spider.logger.info('Spider opened: %s' % spider.name) 107 | 108 | 109 | class ProxyMiddleware(object): 110 | def __init__(self): 111 | self.proxy_ip_list = self.load_list_from_file() 112 | 113 | @staticmethod 114 | def load_list_from_file(): 115 | data_list = [] 116 | with open(os.path.join(os.getcwd(), 'proxy_ip.txt'), "r+", encoding='utf-8') as f: 117 | for ip in f: 118 | data_list.append(ip.replace("\n", "")) 119 | return data_list 120 | 121 | def process_request(self, request, spider): 122 | if request.meta.get('retry_times'): 123 | proxy = self.proxy_ip_list[random.randint(0, 175)] 124 | if proxy: 125 | proxy_ip = 'https://{proxy}'.format(proxy=proxy) 126 | logging.debug("使用了代理:", proxy_ip) 127 | request.meta['proxy'] = proxy_ip 128 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymysql 8 | 9 | 10 | class FirstspiderPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MySQLPipeline(): 16 | def __init__(self): 17 | self.host = 'localhost' 18 | self.database = 'bcy' 19 | self.user = 'root' 20 | self.password = 'Jay12345' 21 | self.port = 3306 22 | 23 | def open_spider(self, spider): 24 | self.db = pymysql.connect(self.host, self.user, self.password, self.database, charset='utf8', port=self.port) 25 | self.cursor = self.db.cursor() 26 | 27 | def close_spider(self, spider): 28 | self.db.close() 29 | 30 | def process_item(self, item, spider): 31 | data = dict(item) 32 | keys = ', '.join(data.keys()) 33 | values = ', '.join(["%s"] * len(data)) 34 | sql = "INSERT INTO draw (%s) VALUES (%s)" % (keys, values) 35 | self.cursor.execute(sql, tuple(data.values())) 36 | self.db.commit() 37 | return item 38 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/proxy_ip.txt: -------------------------------------------------------------------------------- 1 | 125.39.9.34:9000 2 | 183.129.244.17:21776 3 | 120.131.9.254:1080 4 | 117.28.97.64:808 5 | 120.92.174.37:1080 6 | 119.254.103.43:8000 7 | 219.141.153.4:80 8 | 103.205.14.254:53281 9 | 222.175.200.58:8060 10 | 60.13.187.162:63000 11 | 119.180.136.223:8060 12 | 111.47.192.141:8888 13 | 219.141.153.40:80 14 | 171.11.77.4:45592 15 | 221.2.174.28:8060 16 | 14.149.68.120:1080 17 | 61.150.113.74:8908 18 | 119.179.135.114:8060 19 | 39.135.24.12:80 20 | 183.2.203.24:9000 21 | 123.7.177.20:9999 22 | 125.72.70.46:8060 23 | 114.250.25.19:80 24 | 101.248.64.74:80 25 | 60.8.42.132:8908 26 | 119.179.133.58:8060 27 | 140.207.95.94:8060 28 | 123.249.88.153:9000 29 | 219.141.153.2:8080 30 | 119.179.175.60:8060 31 | 61.135.180.27:9000 32 | 112.24.107.102:8908 33 | 121.8.98.196:80 34 | 222.88.149.32:8060 35 | 121.8.98.198:80 36 | 183.234.38.213:63000 37 | 27.154.240.222:8060 38 | 123.161.62.150:9000 39 | 118.190.200.139:8080 40 | 219.150.189.212:9999 41 | 
219.145.197.203:8908 42 | 183.15.121.120:3128 43 | 219.141.153.44:80 44 | 221.14.140.130:80 45 | 121.8.98.197:80 46 | 221.2.175.214:8060 47 | 113.87.202.97:53281 48 | 113.128.198.50:8060 49 | 111.3.154.196:8060 50 | 60.13.156.45:8060 51 | 39.137.77.67:8080 52 | 222.222.243.124:8060 53 | 120.194.61.62:8060 54 | 221.1.205.74:8060 55 | 118.190.94.254:9001 56 | 123.161.62.151:9000 57 | 119.52.116.114:80 58 | 61.150.109.70:8908 59 | 101.81.48.234:1028 60 | 117.158.174.164:8060 61 | 222.208.208.33:8060 62 | 106.56.102.219:8070 63 | 124.118.27.3:8060 64 | 39.137.69.8:80 65 | 117.141.99.38:53281 66 | 183.63.101.62:55555 67 | 123.117.166.166:8060 68 | 163.125.114.218:8118 69 | 171.10.31.67:8080 70 | 223.93.145.186:8060 71 | 223.96.95.229:3128 72 | 61.150.113.27:8908 73 | 219.141.153.3:80 74 | 222.88.147.121:8060 75 | 120.236.128.201:8060 76 | 221.234.192.220:8010 77 | 61.150.113.75:8908 78 | 183.163.41.62:41766 79 | 221.2.174.99:8060 80 | 218.60.8.83:3129 81 | 125.39.9.35:9000 82 | 180.168.113.204:1080 83 | 111.205.6.206:8088 84 | 60.8.42.134:8908 85 | 219.141.153.35:80 86 | 61.135.18.206:8888 87 | 218.201.55.74:63000 88 | 183.246.84.229:8060 89 | 116.228.236.219:8080 90 | 121.17.18.218:8060 91 | 112.16.28.103:8060 92 | 61.149.137.110:80 93 | 175.10.87.16:8060 94 | 60.30.19.131:10010 95 | 39.137.69.10:8080 96 | 117.28.96.109:808 97 | 125.46.245.93:53281 98 | 211.136.127.125:80 99 | 219.141.153.41:80 100 | 180.119.141.11:8118 101 | 124.238.248.4:80 102 | 175.174.85.171:80 103 | 123.122.225.134:8888 104 | 221.194.108.8:8060 105 | 119.180.173.64:8060 106 | 119.179.135.132:8060 107 | 101.227.5.36:9000 108 | 61.150.113.28:8908 109 | 111.43.139.151:80 110 | 124.128.76.142:8060 111 | 112.24.107.109:8908 112 | 119.180.178.70:8060 113 | 106.12.3.84:80 114 | 111.3.122.245:8060 115 | 39.135.24.11:80 116 | 42.236.123.17:80 117 | 222.222.236.207:8060 118 | 113.231.247.131:80 119 | 39.137.69.7:80 120 | 120.92.142.64:8080 121 | 114.225.169.226:53128 122 | 112.24.107.101:8908 123 | 106.58.252.76:80 124 | 58.49.73.141:8888 125 | 116.196.105.136:80 126 | 221.193.177.45:8060 127 | 117.44.247.53:8908 128 | 221.2.174.6:8060 129 | 118.190.95.35:9001 130 | 39.137.69.9:8080 131 | 119.180.138.69:8060 132 | 221.2.174.3:8060 133 | 222.223.203.109:8060 134 | 117.66.167.30:8118 135 | 1.197.117.27:8060 136 | 221.176.206.29:8060 137 | 219.141.153.39:80 138 | 39.137.77.68:8080 139 | 58.49.72.141:8888 140 | 222.88.154.56:8060 141 | 39.137.77.66:80 142 | 59.48.237.6:8060 143 | 119.48.189.100:80 144 | 222.89.85.130:8060 145 | 106.12.22.41:8118 146 | 202.103.215.23:80 147 | 60.8.42.36:8908 148 | 117.177.243.6:80 149 | 218.244.44.194:8060 150 | 118.190.95.43:9001 151 | 219.141.153.34:80 152 | 106.56.102.35:8070 153 | 103.205.26.57:21776 154 | 117.131.235.198:8060 155 | 183.129.207.74:11493 156 | 58.247.46.123:8088 157 | 60.8.42.137:8908 158 | 117.156.234.3:8060 159 | 223.68.190.130:8181 160 | 222.88.147.104:8060 161 | 183.220.43.78:8080 162 | 123.146.216.14:80 163 | 60.8.42.15:8908 164 | 221.14.140.66:80 165 | 175.155.24.10:1133 166 | 119.180.161.173:8060 167 | 175.9.177.63:8060 168 | 182.254.145.163:1080 169 | 119.187.120.118:8060 170 | 202.100.83.139:80 171 | 183.129.207.73:13846 172 | 120.236.168.19:8060 173 | 219.141.153.6:80 174 | 211.159.171.58:80 175 | 221.1.84.241:8197 176 | 60.14.125.246:8908 -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/run.py: -------------------------------------------------------------------------------- 1 | from scrapy 
import cmdline 2 | 3 | cmdline.execute(["scrapy", "crawl", "bcy"]) 4 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for FirstSpider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'FirstSpider' 13 | 14 | SPIDER_MODULES = ['FirstSpider.spiders'] 15 | NEWSPIDER_MODULE = 'FirstSpider.spiders' 16 | 17 | ROBOTSTXT_OBEY = False 18 | 19 | 20 | DEFAULT_REQUEST_HEADERS = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 22 | 'Chrome/68.0.3440.106 Safari/537.36', 23 | 'Host': 'bcy.net', 24 | 'Origin': 'https://bcy.net', 25 | } 26 | 27 | DOWNLOADER_MIDDLEWARES = { 28 | 'FirstSpider.middlewares.ProxyMiddleware': 555 29 | } 30 | 31 | ITEM_PIPELINES = { 32 | 'FirstSpider.pipelines.MySQLPipeline': 300, 33 | } 34 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/bcy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/__pycache__/bcy.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/FirstSpider/spiders/bcy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Request, Spider, Selector 3 | import datetime 4 | 5 | from FirstSpider.items import * 6 | 7 | 8 | def parse_index(response): 9 | items = response.xpath('//li[@class="js-smallCards _box"]') 10 | for item in items: 11 | bcy_item = BcyItem() 12 | bcy_item['author'] = item.xpath('a[@class="db posr ovf"]/@title').extract_first() 13 | bcy_item['pic_url'] = item.xpath('a/img/@src').extract_first().replace('/2X3', '') 14 | yield bcy_item 15 | 16 | 17 | class BcySpider(Spider): 18 | name = 'bcy' 19 | allowed_domains = ['bcy.net'] 20 | 21 | index_url = 'https://bcy.net/illust/toppost100?type=lastday&date={d}' 22 | 23 | 
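# both URL templates take a {d} date placeholder; start_requests() below fills it with every date in date_list (built by init_date_list)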
ajax_url = 'https://bcy.net/illust/index/ajaxloadtoppost?p=1&type=lastday&date={d}' 24 | 25 | date_list = [] # 日期范围列表 26 | 27 | ajax_headers = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 29 | 'Chrome/68.0.3440.106 Safari/537.36', 30 | 'Host': 'bcy.net', 31 | 'Origin': 'https://bcy.net', 32 | 'X-Requested-With': 'XMLHttpRequest' 33 | } 34 | 35 | def start_requests(self): 36 | self.init_date_list() 37 | for date in self.date_list: 38 | yield Request(self.index_url.format(d=date), callback=parse_index) 39 | for date in self.date_list: 40 | yield Request(self.ajax_url.format(d=date), callback=parse_index) 41 | 42 | # 构造一个日期列表 43 | def init_date_list(self): 44 | begin_date = datetime.datetime.strptime("20150918", "%Y%m%d") 45 | end_date = datetime.datetime.strptime("20180827", "%Y%m%d") 46 | while begin_date <= end_date: 47 | date_str = begin_date.strftime("%Y%m%d") 48 | self.date_list.append(date_str) 49 | begin_date += datetime.timedelta(days=1) 50 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/FirstSpider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = FirstSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = FirstSpider 12 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | ENV PATH /usr/local/bin:$PATH 3 | ADD . 
/code 4 | WORKDIR /code 5 | RUN pip3 install -r requirements.txt 6 | CMD scrapy crawl BingWallpaper -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing.json -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__init__.py -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BingItem(scrapy.Item): 12 | image_urls = scrapy.Field() 13 | images = scrapy.Field() 14 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BingSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class BingDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BingPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for bing project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'bing' 13 | 14 | SPIDER_MODULES = ['bing.spiders'] 15 | NEWSPIDER_MODULE = 'bing.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'bing (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 31 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | # DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | # } 45 | 46 | # Enable or disable spider middlewares 47 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'bing.middlewares.BingSpiderMiddleware': 543, 50 | # } 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 54 | # DOWNLOADER_MIDDLEWARES = { 55 | # 'bing.middlewares.BingDownloaderMiddleware': 543, 56 | # } 57 | 58 | # Enable or disable extensions 59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 60 | # EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | # } 63 | 64 | # Configure item pipelines 65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 66 | # ITEM_PIPELINES = { 67 | # 'bing.pipelines.BingPipeline': 300, 68 | # } 69 | 70 | 
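# Added note (not from the original settings): scrapy.pipelines.images.ImagesPipeline
# requires Pillow; it names each downloaded file after the SHA1 hash of its URL and
# stores it under a full/ sub-directory of IMAGES_STORE, which is why the saved
# wallpapers end up as out/res/pic/full/<sha1>.jpg.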
ITEM_PIPELINES = { 71 | # 引入Scrapy提供的ImagesPipeline组件 72 | 'scrapy.pipelines.images.ImagesPipeline': 300, 73 | } 74 | 75 | # ImagesPipeline辅助配置项 76 | # 图片存储路径(绝对路径或相对路径) 77 | IMAGES_STORE = 'out/res/pic/' 78 | # BingItem中定义的存储图片链接的image_urls字段 79 | IMAGES_URLS_FIELD = 'image_urls' 80 | # BingItem中定义的的images字段 81 | IMAGES_RESULT_FIELD='images' 82 | # 过期时间,单位:天(可选) 83 | IMAGES_EXPIRES = 120 84 | # 过滤小图片(可选) 85 | # IMAGES_MIN_HEIGHT = 110 86 | # IMAGES_MIN_WIDTH = 110 87 | # 是否允许重定向(可选) 88 | # MEDIA_ALLOW_REDIRECTS = True 89 | # 生成缩略图(可选) 90 | # IMAGES_THUMBS = { 91 | # 'small': (50, 50), 92 | # 'big': (270, 270), 93 | # } 94 | 95 | # Enable and configure the AutoThrottle extension (disabled by default) 96 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 97 | # AUTOTHROTTLE_ENABLED = True 98 | # The initial download delay 99 | # AUTOTHROTTLE_START_DELAY = 5 100 | # The maximum download delay to be set in case of high latencies 101 | # AUTOTHROTTLE_MAX_DELAY = 60 102 | # The average number of requests Scrapy should be sending in parallel to 103 | # each remote server 104 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 105 | # Enable showing throttling stats for every response received: 106 | # AUTOTHROTTLE_DEBUG = False 107 | 108 | # Enable and configure HTTP caching (disabled by default) 109 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 110 | # HTTPCACHE_ENABLED = True 111 | # HTTPCACHE_EXPIRATION_SECS = 0 112 | # HTTPCACHE_DIR = 'httpcache' 113 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 114 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 115 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/BingWallpaper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | import time 4 | import json 5 | 6 | from bing.items import BingItem 7 | 8 | 9 | class BingWallpaperSpider(Spider): 10 | name = 'BingWallpaper' 11 | allowed_domains = ['cn.bing.com'] 12 | 13 | def start_requests(self): 14 | yield Request( 15 | 'https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc={ts}&pid=hp'.format(ts=int(time.time())), 16 | callback=self.parse) 17 | 18 | def parse(self, response): 19 | json_result = json.loads(response.body.decode('utf8')) 20 | images = json_result['images'] 21 | if images is not None: 22 | item = BingItem() 23 | url_list = [] 24 | for image in images: 25 | url_list.append('https://cn.bing.com' + image['url']) 26 | item['image_urls'] = url_list 27 | yield item 28 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/Test.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | print(int(time.time())) -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/BingWallpaper.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/BingWallpaper.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/Test.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/Test.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/bing/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104228.709049.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:42:29 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:42:29 [scrapy] ERROR: Spider error processing (referer: None) 3 | Traceback (most recent call last): 4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback 5 | yield next(it) 6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output 7 | for x in result: 8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in 9 | return (_set_referer(r) for r in result or ()) 10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in 11 | return (r for r in result or () if _filter(r)) 12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in 13 | return (r for r in result or () if _filter(r)) 14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse 15 | json_result = json.loads(response.body.decode('utf8')) 16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads 17 | return _default_decoder.decode(s) 18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode 19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode 21 | raise JSONDecodeError("Expecting value", s, err.value) from None 22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 23 | 2018-10-15 10:42:29 [scrapy] INFO: Closing spider (finished) 24 | 2018-10-15 10:42:29 [scrapy] INFO: Dumping Scrapy stats: 25 | {'downloader/request_bytes': 210, 26 | 'downloader/request_count': 1, 27 | 
'downloader/request_method_count/GET': 1, 28 | 'downloader/response_bytes': 48356, 29 | 'downloader/response_count': 1, 30 | 'downloader/response_status_count/200': 1, 31 | 'finish_reason': 'finished', 32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 42, 29, 916818), 33 | 'log_count/DEBUG': 1, 34 | 'log_count/ERROR': 1, 35 | 'log_count/INFO': 6, 36 | 'response_received_count': 1, 37 | 'scheduler/dequeued': 1, 38 | 'scheduler/dequeued/memory': 1, 39 | 'scheduler/enqueued': 1, 40 | 'scheduler/enqueued/memory': 1, 41 | 'spider_exceptions/JSONDecodeError': 1, 42 | 'start_time': datetime.datetime(2018, 10, 15, 2, 42, 29, 334376)} 43 | 2018-10-15 10:42:29 [scrapy] INFO: Spider closed (finished) 44 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104303.655633.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:43:03 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:43:04 [scrapy] ERROR: Spider error processing (referer: None) 3 | Traceback (most recent call last): 4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback 5 | yield next(it) 6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output 7 | for x in result: 8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in 9 | return (_set_referer(r) for r in result or ()) 10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in 11 | return (r for r in result or () if _filter(r)) 12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in 13 | return (r for r in result or () if _filter(r)) 14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse 15 | json_result = json.loads(response.body.decode('utf8')) 16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads 17 | return _default_decoder.decode(s) 18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode 19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode 21 | raise JSONDecodeError("Expecting value", s, err.value) from None 22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 23 | 2018-10-15 10:43:04 [scrapy] INFO: Closing spider (finished) 24 | 2018-10-15 10:43:04 [scrapy] INFO: Dumping Scrapy stats: 25 | {'downloader/request_bytes': 210, 26 | 'downloader/request_count': 1, 27 | 'downloader/request_method_count/GET': 1, 28 | 'downloader/response_bytes': 48361, 29 | 'downloader/response_count': 1, 30 | 'downloader/response_status_count/200': 1, 31 | 'finish_reason': 'finished', 32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 43, 4, 73509), 33 | 'log_count/DEBUG': 1, 34 | 'log_count/ERROR': 1, 35 | 'log_count/INFO': 6, 36 | 'response_received_count': 1, 37 | 'scheduler/dequeued': 1, 38 | 'scheduler/dequeued/memory': 1, 39 | 'scheduler/enqueued': 1, 40 | 'scheduler/enqueued/memory': 1, 41 | 'spider_exceptions/JSONDecodeError': 1, 42 | 
'start_time': datetime.datetime(2018, 10, 15, 2, 43, 3, 665598)} 43 | 2018-10-15 10:43:04 [scrapy] INFO: Spider closed (finished) 44 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104348.228406.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:43:48 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:43:48 [scrapy] ERROR: Spider error processing (referer: None) 3 | Traceback (most recent call last): 4 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback 5 | yield next(it) 6 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 30, in process_spider_output 7 | for x in result: 8 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in 9 | return (_set_referer(r) for r in result or ()) 10 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in 11 | return (r for r in result or () if _filter(r)) 12 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in 13 | return (r for r in result or () if _filter(r)) 14 | File "E:\Code\Python\bing\bing\spiders\BingWallpaper.py", line 17, in parse 15 | json_result = json.loads(response.body.decode('utf8')) 16 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\__init__.py", line 348, in loads 17 | return _default_decoder.decode(s) 18 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 337, in decode 19 | obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 20 | File "c:\users\coderpig\appdata\local\programs\python\python37-32\lib\json\decoder.py", line 355, in raw_decode 21 | raise JSONDecodeError("Expecting value", s, err.value) from None 22 | json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) 23 | 2018-10-15 10:43:48 [scrapy] INFO: Closing spider (finished) 24 | 2018-10-15 10:43:48 [scrapy] INFO: Dumping Scrapy stats: 25 | {'downloader/request_bytes': 210, 26 | 'downloader/request_count': 1, 27 | 'downloader/request_method_count/GET': 1, 28 | 'downloader/response_bytes': 48360, 29 | 'downloader/response_count': 1, 30 | 'downloader/response_status_count/200': 1, 31 | 'finish_reason': 'finished', 32 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 43, 48, 681197), 33 | 'log_count/DEBUG': 1, 34 | 'log_count/ERROR': 1, 35 | 'log_count/INFO': 6, 36 | 'response_received_count': 1, 37 | 'scheduler/dequeued': 1, 38 | 'scheduler/dequeued/memory': 1, 39 | 'scheduler/enqueued': 1, 40 | 'scheduler/enqueued/memory': 1, 41 | 'spider_exceptions/JSONDecodeError': 1, 42 | 'start_time': datetime.datetime(2018, 10, 15, 2, 43, 48, 238379)} 43 | 2018-10-15 10:43:48 [scrapy] INFO: Spider closed (finished) 44 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104841.872511.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:48:42 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:48:42 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:48:42 [scrapy] 
DEBUG: Crawled (200) (referer: None) 4 | 2018-10-15 10:48:42 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 5 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 6 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 7 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 8 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 9 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 10 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 11 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 12 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 13 | 2018-10-15 10:48:43 [scrapy] DEBUG: Crawled (200) (referer: None) 14 | 2018-10-15 10:48:43 [scrapy] DEBUG: File (downloaded): Downloaded file from referred in 15 | 2018-10-15 10:48:43 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp> 16 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 17 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 18 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 19 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 20 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 21 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 22 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 23 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 24 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 26 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 27 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 29 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 30 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 32 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 33 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 35 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 36 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 38 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 39 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 40 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 41 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 42 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 43 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 44 | 2018-10-15 10:48:43 [scrapy] INFO: Closing spider (finished) 45 | 2018-10-15 10:48:43 [scrapy] INFO: Dumping Scrapy stats: 46 | {'downloader/request_bytes': 3614, 47 | 'downloader/request_count': 7, 48 | 'downloader/request_method_count/GET': 7, 49 | 'downloader/response_bytes': 1842731, 50 | 
'downloader/response_count': 7, 51 | 'downloader/response_status_count/200': 7, 52 | 'file_count': 7, 53 | 'file_status_count/downloaded': 6, 54 | 'file_status_count/uptodate': 1, 55 | 'finish_reason': 'finished', 56 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 48, 43, 629811), 57 | 'item_scraped_count': 1, 58 | 'log_count/DEBUG': 15, 59 | 'log_count/INFO': 6, 60 | 'response_received_count': 7, 61 | 'scheduler/dequeued': 1, 62 | 'scheduler/dequeued/memory': 1, 63 | 'scheduler/enqueued': 1, 64 | 'scheduler/enqueued/memory': 1, 65 | 'start_time': datetime.datetime(2018, 10, 15, 2, 48, 41, 884479)} 66 | 2018-10-15 10:48:43 [scrapy] INFO: Spider closed (finished) 67 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T104922.591600.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:49:22 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 10:49:22 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 10:49:23 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 
'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 10:49:23 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 10:49:23 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 49, 23, 63339), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 49, 22, 600576)} 59 | 2018-10-15 10:49:23 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T105002.320386.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:50:02 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 10:50:02 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 10:50:02 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539571348&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 
'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 10:50:02 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 10:50:02 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 50, 2, 820051), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 50, 2, 331356)} 59 | 2018-10-15 10:50:02 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T105902.809743.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 10:59:03 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 10:59:03 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 10:59:03 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539572342&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 
'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 10:59:03 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 10:59:03 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 2, 59, 3, 536799), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 2, 59, 3, 51096)} 59 | 2018-10-15 10:59:03 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T113038.987323.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 11:30:39 [scrapy] DEBUG: Crawled (200) (referer: None) 2 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 11:30:39 [scrapy] DEBUG: File (uptodate): Downloaded image 
from referred in 9 | 2018-10-15 11:30:39 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539574239&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 11:30:39 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 11:30:39 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 3, 30, 39, 713384), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 3, 30, 39, 227680)} 59 | 2018-10-15 11:30:39 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/logs/BingWallpaper/2018-10-15T120654.496911.log: -------------------------------------------------------------------------------- 1 | 2018-10-15 12:06:54 [scrapy] DEBUG: Crawled (200) (referer: None) 2 
| 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 3 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 4 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 5 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 6 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 7 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 8 | 2018-10-15 12:06:55 [scrapy] DEBUG: File (uptodate): Downloaded image from referred in 9 | 2018-10-15 12:06:55 [scrapy] DEBUG: Scraped from <200 https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7&nc=1539575848&pid=hp> 10 | {'image_urls': ['https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg', 11 | 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg', 12 | 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg', 13 | 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg', 14 | 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg', 15 | 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg', 16 | 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'], 17 | 'images': [{'checksum': 'dc935f1115f39bf76b9f983e6affe29f', 18 | 'path': 'full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg', 19 | 'url': 'https://cn.bing.com/az/hprichbg/rb/BodeBerlin_ZH-CN6982399462_1920x1080.jpg'}, 20 | {'checksum': 'adbcc3f3fa26188db600654137117e2a', 21 | 'path': 'full/e254600d400f3c54c77171e02b021d46369788ae.jpg', 22 | 'url': 'https://cn.bing.com/az/hprichbg/rb/ZeroDegrees_ZH-CN10117368234_1920x1080.jpg'}, 23 | {'checksum': '092f09cdb791bedf29913ad3d1940960', 24 | 'path': 'full/c14461fb44425865b9afe6695ab5926e2001411c.jpg', 25 | 'url': 'https://cn.bing.com/az/hprichbg/rb/LascauxCavePainting_ZH-CN11733576571_1920x1080.jpg'}, 26 | {'checksum': 'a8cfd0a149f520f7bc18dd237ce264d7', 27 | 'path': 'full/97e86cde9a308e626f537c107303537ec598903c.jpg', 28 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SchoolGirls_ZH-CN10666418108_1920x1080.jpg'}, 29 | {'checksum': 'a0c7d5e136ef2e06407bc42e86873fc1', 30 | 'path': 'full/4099096a19a0eaad0aef6782a206881d948ad775.jpg', 31 | 'url': 'https://cn.bing.com/az/hprichbg/rb/HubbleSaturn_ZH-CN12020278371_1920x1080.jpg'}, 32 | {'checksum': 'fb6823a2a509458d285fcfc2d4f96b99', 33 | 'path': 'full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg', 34 | 'url': 'https://cn.bing.com/az/hprichbg/rb/MarshallPoint_ZH-CN9062933060_1920x1080.jpg'}, 35 | {'checksum': '6ebfdb5210fce5986b88d07053ac94af', 36 | 'path': 'full/885648740905a26703e18c1ae24f23c480ecc822.jpg', 37 | 'url': 'https://cn.bing.com/az/hprichbg/rb/SandiaSunrise_ZH-CN11155504388_1920x1080.jpg'}]} 38 | 2018-10-15 12:06:55 [scrapy] INFO: Closing spider (finished) 39 | 2018-10-15 12:06:55 [scrapy] INFO: Dumping Scrapy stats: 40 | {'downloader/request_bytes': 270, 41 | 'downloader/request_count': 1, 42 | 'downloader/request_method_count/GET': 1, 43 | 'downloader/response_bytes': 2711, 44 | 'downloader/response_count': 1, 45 | 'downloader/response_status_count/200': 1, 46 | 'file_count': 7, 47 | 'file_status_count/uptodate': 7, 48 | 'finish_reason': 'finished', 49 | 'finish_time': datetime.datetime(2018, 10, 15, 4, 6, 55, 222970), 50 | 'item_scraped_count': 1, 51 | 'log_count/DEBUG': 9, 52 | 'log_count/INFO': 6, 53 | 
'response_received_count': 1, 54 | 'scheduler/dequeued': 1, 55 | 'scheduler/dequeued/memory': 1, 56 | 'scheduler/enqueued': 1, 57 | 'scheduler/enqueued/memory': 1, 58 | 'start_time': datetime.datetime(2018, 10, 15, 4, 6, 54, 748238)} 59 | 2018-10-15 12:06:55 [scrapy] INFO: Spider closed (finished) 60 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/033317f07b809f0cd06487b30b29eccb26d063b8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/033317f07b809f0cd06487b30b29eccb26d063b8.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/0698af79b195349b838bdfeebbd11409f82f0f38.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/0698af79b195349b838bdfeebbd11409f82f0f38.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/092235104f84cb2f4de8808c10f655298313f65c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/092235104f84cb2f4de8808c10f655298313f65c.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/2efd29b32c481136507115a3ee2e6181c122aa0b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/2efd29b32c481136507115a3ee2e6181c122aa0b.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/3a573eb605fef87faaf91ad8ad421d1a24d0bc6b.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/4099096a19a0eaad0aef6782a206881d948ad775.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/4099096a19a0eaad0aef6782a206881d948ad775.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/486c568e353051efd0959cc4a424ff9093cfceb9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/486c568e353051efd0959cc4a424ff9093cfceb9.jpg -------------------------------------------------------------------------------- 
/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/5295941635a2aa9c67cebf27c7bdbfc9a27230e9.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/599f27e7835da59b495c44297cce0553ee4a0b51.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/599f27e7835da59b495c44297cce0553ee4a0b51.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/86fd225ce368589a9b5e7454e6583cf77aedb0d4.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/885648740905a26703e18c1ae24f23c480ecc822.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/885648740905a26703e18c1ae24f23c480ecc822.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/97e86cde9a308e626f537c107303537ec598903c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/97e86cde9a308e626f537c107303537ec598903c.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/b7e4ba8cba538b44e31132d175479c7ec37284fd.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bca701f1923e317aa8a9be18125c2a894fc80780.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/bfa7e5e22268f27d7a195390abf6ef9ee45a6c29.jpg 
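The 10:42 and 10:43 runs in this chapter's logs/BingWallpaper logs above end with spider_exceptions/JSONDecodeError: line 17 of BingWallpaper.py calls json.loads(response.body.decode('utf8')) and the response body was not JSON, so those runs finish without scraping anything; the later runs (10:48 onwards) parse the HPImageArchive response successfully. The sketch below shows one way such a parse step can guard against a non-JSON body. It is an illustration only, with an assumed class name and assumed JSON field names, and is not the project's actual BingWallpaper.py.

import json

import scrapy


class BingWallpaperSketch(scrapy.Spider):
    """Hypothetical, simplified stand-in for bing/spiders/BingWallpaper.py."""
    name = 'BingWallpaperSketch'
    start_urls = ['https://cn.bing.com/HPImageArchive.aspx?format=js&idx=1&n=7']

    def parse(self, response):
        try:
            data = json.loads(response.text)  # response.text decodes the body for us
        except json.JSONDecodeError:
            # This is the failure mode visible in the 10:42/10:43 logs above.
            self.logger.error('Non-JSON response from %s', response.url)
            return
        # Assumed response shape: {"images": [{"url": "/az/hprichbg/rb/....jpg", ...}, ...]}
        yield {
            'image_urls': ['https://cn.bing.com' + img['url'] for img in data.get('images', [])]
        }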
-------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/c14461fb44425865b9afe6695ab5926e2001411c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/c14461fb44425865b9afe6695ab5926e2001411c.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/cbba4b16b644659920ad93e10a6d3478270ce927.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/cbba4b16b644659920ad93e10a6d3478270ce927.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e254600d400f3c54c77171e02b021d46369788ae.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e254600d400f3c54c77171e02b021d46369788ae.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/e7fc4de75bcafe18f64b68072bf5cc6ece6084a8.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/ed989d9c858c5290ca559cf2c462cace68e49362.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/代码/bing/out/res/pic/full/ed989d9c858c5290ca559cf2c462cace68e49362.jpg -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.5.1 2 | Pillow==5.2.0 -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/run.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute(["scrapy", "crawl", "BingWallpaper"]) 4 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/代码/bing/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bing.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bing 12 | -------------------------------------------------------------------------------- /7、Python爬虫框架Scrapy(上)/勘误.md: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/7、Python爬虫框架Scrapy(上)/勘误.md -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__init__.py -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/middlewares.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/middlewares.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class JianshuspiderItem(Item): 12 | title = Field() 13 | content = Field() 14 | url = Field() 15 | nickname = Field() 16 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | from selenium import webdriver 8 | from scrapy import signals 9 | from scrapy.http import HtmlResponse 10 | 11 | 12 | class JianshuspiderSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(self, start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.logger.info('Spider opened: %s' % spider.name) 58 | 59 | 60 | class JianshuspiderDownloaderMiddleware(object): 61 | # Not all methods need to be defined. If a method is not defined, 62 | # scrapy acts as if the downloader middleware does not modify the 63 | # passed objects. 64 | 65 | @classmethod 66 | def from_crawler(cls, crawler): 67 | # This method is used by Scrapy to create your spiders. 68 | s = cls() 69 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 70 | return s 71 | 72 | def process_request(self, request, spider): 73 | # Called for each request that goes through the downloader 74 | # middleware. 75 | 76 | # Must either: 77 | # - return None: continue processing this request 78 | # - or return a Response object 79 | # - or return a Request object 80 | # - or raise IgnoreRequest: process_exception() methods of 81 | # installed downloader middleware will be called 82 | return None 83 | 84 | def process_response(self, request, response, spider): 85 | # Called with the response returned from the downloader. 86 | 87 | # Must either; 88 | # - return a Response object 89 | # - return a Request object 90 | # - or raise IgnoreRequest 91 | return response 92 | 93 | def process_exception(self, request, exception, spider): 94 | # Called when a download handler or a process_request() 95 | # (from other downloader middleware) raises an exception. 
96 | 97 | # Must either: 98 | # - return None: continue processing this exception 99 | # - return a Response object: stops process_exception() chain 100 | # - return a Request object: stops process_exception() chain 101 | pass 102 | 103 | def spider_opened(self, spider): 104 | spider.logger.info('Spider opened: %s' % spider.name) 105 | 106 | 107 | class JSSeleniumMiddleware: 108 | def __init__(self): 109 | self.browser = webdriver.Chrome() 110 | 111 | def __del__(self): 112 | self.browser.close() 113 | 114 | def process_request(self, request, spider): 115 | self.browser.get("https://www.jianshu.com/") 116 | return HtmlResponse(url='https://www.jianshu.com/', body=self.browser.page_source, request=request, 117 | encoding='utf-8', status=200) 118 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | 10 | class JianshuspiderPipeline(object): 11 | def process_item(self, item, spider): 12 | return item 13 | 14 | 15 | class MongoPipeline(object): 16 | def open_spider(self, spider): 17 | self.client = pymongo.MongoClient(host='localhost', port=27017) 18 | self.db = self.client['js'] 19 | 20 | def process_item(self, item, spider): 21 | self.db['index_article'].insert(dict(item)) 22 | 23 | def close_spider(self, spider): 24 | self.client.close() 25 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jianshuspider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jianshuspider' 13 | 14 | SPIDER_MODULES = ['jianshuspider.spiders'] 15 | NEWSPIDER_MODULE = 'jianshuspider.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'jianshuspider (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'jianshuspider.middlewares.JianshuspiderSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'jianshuspider.middlewares.JSSeleniumMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'jianshuspider.pipelines.MongoPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/jianshu.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/__pycache__/jianshu.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/jianshuspider/spiders/jianshu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | 4 | from jianshuspider.items import JianshuspiderItem 5 | 6 | 7 | class JianshuSpider(Spider): 8 | name = 'jianshu' 9 | allowed_domains = ['www.jianshu.com'] 10 | start_urls = ['http://www.jianshu.com/'] 11 | 12 | def start_requests(self): 13 | yield Request('https://www.jianshu.com', callback=self.parse) 14 | 15 | def parse(self, response): 16 | li_s = response.xpath('//ul[@class="note-list"]/li') 17 | for li in li_s: 18 | item = JianshuspiderItem() 19 | item['title'] = li.xpath('.//div/a[@class="title"]/text()').extract_first() 20 | item['content'] = str(li.xpath('.//div/p[@class="abstract"]/text()').extract_first()).replace( 21 | " ", "").replace( 22 | "\n", "") 23 | item['url'] = 'https://www.jianshu.com/p/' + str( 24 | li.xpath('.//div/a[@class="title"]/@href').extract_first()) 25 | item['nickname'] = li.xpath('.//div/a[@class="nickname"]/text()').extract_first() 26 | yield item 27 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.4.4 2 | APScheduler==3.5.3 3 | asn1crypto==0.24.0 4 | async-timeout==3.0.1 5 | attrs==18.2.0 6 | Automat==0.7.0 7 | certifi==2018.8.24 8 | cffi==1.11.5 9 | chardet==3.0.4 10 | Click==7.0 11 | constantly==15.1.0 12 | cryptography==2.3.1 13 | cssselect==1.0.3 14 | demjson==2.2.4 15 | docopt==0.6.2 16 | Flask==1.0.2 17 | hyperlink==18.0.0 18 | idna==2.7 19 | incremental==17.5.0 20 | itsdangerous==0.24 21 | Jinja2==2.10 22 | lxml==4.2.5 23 | MarkupSafe==1.0 24 | multidict==4.4.2 25 | parsel==1.5.0 26 | Pillow==5.2.0 27 | pipreqs==0.4.9 28 | pyasn1==0.4.4 29 | pyasn1-modules==0.2.2 30 | pycparser==2.19 31 | PyDispatcher==2.0.5 32 | PyHamcrest==1.9.0 33 | pymongo==3.7.2 34 | PyMySQL==0.9.2 35 | pyOpenSSL==18.0.0 36 | pytz==2018.5 37 | pywin32==223 38 | queuelib==1.5.0 39 | redis==2.10.6 40 | requests==2.19.1 41 | Scrapy==1.5.1 42 | scrapyrt==0.10 43 | selenium==3.14.1 44 | service-identity==17.0.0 45 | six==1.11.0 46 | 
Twisted==18.7.0 47 | tzlocal==1.5.1 48 | urllib3==1.23 49 | w3lib==1.19.0 50 | Werkzeug==0.14.1 51 | yarg==0.1.9 52 | yarl==1.2.6 53 | zope.interface==4.5.0 54 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/jianshuspider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jianshuspider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jianshuspider 12 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ip_check.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import aiohttp 4 | from aiohttp import ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError 5 | from redis import StrictRedis 6 | 7 | test_url = 'https://ip.cn/' 8 | 9 | 10 | class ProxyCheck: 11 | def __init__(self): 12 | self.redis_db = StrictRedis( 13 | host="127.0.0.1", 14 | port=6379, 15 | password="Jay12345", 16 | db=0 17 | ) 18 | 19 | # 检测代理IP是否可用 20 | async def check_ip(self, proxy_ip): 21 | conn = aiohttp.TCPConnector(ssl=False) 22 | async with aiohttp.ClientSession(connector=conn) as session: 23 | try: 24 | async with session.get(test_url, proxy=proxy_ip.replace("https", "http"), headers={ 25 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 26 | 'Chrome/68.0.3440.106 Safari/537.36' 27 | }) as resp: 28 | if resp.status in [200]: 29 | print("代理可用:", proxy_ip) 30 | else: 31 | print("移除不可用代理ip:", proxy_ip) 32 | self.redis_db.srem('proxy_ips:proxy_pool', proxy_ip) 33 | except (ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError, TimeoutError, 34 | AttributeError): 35 | print("代理请求失败移除代理ip:", proxy_ip) 36 | self.redis_db.srem('proxy_ips:proxy_pool', proxy_ip) 37 | 38 | def check_all_ip(self): 39 | print("开始检测代理ip是否可用") 40 | loop = asyncio.get_event_loop() 41 | tasks = [] 42 | for ip in self.redis_db.smembers('proxy_ips:proxy_pool'): 43 | tasks.append(self.check_ip(ip.decode())) 44 | loop.run_until_complete(asyncio.wait(tasks)) 45 | 46 | 47 | if __name__ == '__main__': 48 | ProxyCheck().check_all_ip() 49 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__init__.py -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ProxyIpsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ProxyIpsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ProxyIpsDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 
74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ProxyIpsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for proxy_ips project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'proxy_ips' 13 | 14 | SPIDER_MODULES = ['proxy_ips.spiders'] 15 | NEWSPIDER_MODULE = 'proxy_ips.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'proxy_ips (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | # DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 45 | 'Chrome/68.0.3440.106 Safari/537.36', 46 | 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'proxy_ips.middlewares.ProxyIpsSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'proxy_ips.middlewares.ProxyIpsDownloaderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | # EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | # } 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | # ITEM_PIPELINES = { 70 | # 'proxy_ips.pipelines.ProxyIpsPipeline': 300, 71 | # } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | # AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | # AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | # AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | # HTTPCACHE_ENABLED = True 89 | # HTTPCACHE_EXPIRATION_SECS = 0 90 | # HTTPCACHE_DIR = 'httpcache' 91 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 
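The DOWNLOADER_MIDDLEWARES block above is left commented out, so nothing in the proxy_ips project itself routes outgoing requests through the proxy pool that the fetch_ip spider builds. Below is a minimal sketch, not part of the original code, of how that could be wired up: it assumes the Flask service in proxy_server.py is running at its default address http://127.0.0.1:5000/, and the class name RandomProxyMiddleware (placed in proxy_ips/middlewares.py) is purely illustrative.

import requests


class RandomProxyMiddleware(object):
    """Downloader middleware that asks the local pool service for a proxy on each request."""

    def process_request(self, request, spider):
        try:
            # proxy_server.py returns one random pool entry such as "http://1.2.3.4:8080"
            proxy = requests.get('http://127.0.0.1:5000/', timeout=3).text.strip()
            if proxy:
                request.meta['proxy'] = proxy
        except requests.RequestException:
            spider.logger.warning("Proxy pool service unreachable; sending the request without a proxy")

It would then be enabled by uncommenting the DOWNLOADER_MIDDLEWARES setting above and pointing it at the class, e.g. 'proxy_ips.middlewares.RandomProxyMiddleware': 543.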
-------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/proxy_spider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/__pycache__/proxy_spider.cpython-37.pyc -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_ips/spiders/proxy_spider.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import aiohttp 4 | from aiohttp import ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError 5 | from redis import StrictRedis 6 | from scrapy import Spider, Request 7 | import time 8 | 9 | test_url = 'https://ip.cn/' 10 | 11 | 12 | # 获取代理IP的爬虫 13 | class FetchIpSpider(Spider): 14 | name = "fetch_ip" 15 | 16 | def __init__(self, **kwargs): 17 | super().__init__(**kwargs) 18 | self.redis_db = StrictRedis( 19 | host="127.0.0.1", 20 | port=6379, 21 | password="Jay12345", 22 | db=0 23 | ) 24 | 25 | def start_requests(self): 26 | # for i in range(1, 5): 27 | # yield Request(url="http://www.xicidaili.com/nn/" + str(i), callback=self.parse_xici, headers={ 28 | # 'Host': 'www.xicidaili.com', 29 | # 'Referer': 'http://www.xicidaili.com/', 30 | # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 31 | # 'Chrome/68.0.3440.106 Safari/537.36' 32 | # }) 33 | 34 | for i in range(1, 5): 35 | time.sleep(3) 36 | yield Request(url='https://www.kuaidaili.com/free/inha/' + str(i) + '/', callback=self.parse_kuaidaili, 37 | headers={ 38 | 'Host': 'www.kuaidaili.com', 39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, ' 40 | 'like Gecko) ' 41 | 'Chrome/68.0.3440.106 Safari/537.36' 42 | }) 43 | 44 | def parse_xici(self, response): 45 | loop = asyncio.get_event_loop() 46 | proxy_ips = [] 47 | for tr in response.css('#ip_list tr'): 48 | td_list = tr.css('td::text') 49 | if len(td_list) < 3: 50 | continue 51 | ip_address = td_list[0].extract() # IP 52 | port = td_list[1].extract() # 端口 53 | if len(td_list) == 11: 54 | proto = td_list[4].extract() 55 | else: 56 | proto = td_list[5].extract() # 协议类型 57 | proxy_ip = '%s://%s:%s' % (proto.lower(), ip_address, port) 58 | # 获取响应时间,超过2s的丢弃 59 | latency = tr.css('div.bar::attr(title)').re_first('(\d+\.\d+)') 60 | if float(latency) > 2: 61 | self.logger.info("跳过慢速代理:%s 响应时间:%s" % (proxy_ip, latency)) 62 | else: 63 | 
self.logger.info("可用代理加入队列:%s 响应时间:%s" % (proxy_ip, latency)) 64 | proxy_ips.append(proxy_ip) 65 | tasks = [] 66 | for ip in proxy_ips: 67 | tasks.append(self.check_ip(ip)) 68 | loop.run_until_complete(asyncio.wait(tasks)) 69 | 70 | 71 | def parse_kuaidaili(self, response): 72 | loop = asyncio.get_event_loop() 73 | proxy_ips = [] 74 | for tr in response.css('tbody tr'): 75 | td_list = tr.css('td::text') 76 | ip_address = td_list[0].extract() # IP 77 | port = td_list[1].extract() # 端口 78 | proto = td_list[3].extract() # 协议 79 | proxy_ip = '%s://%s:%s' % (proto.lower(), ip_address, port) 80 | # 获取响应时间,超过2s的丢弃 81 | latency = float((td_list[5].extract())[:-1]) 82 | if float(latency) > 2: 83 | self.logger.info("跳过慢速代理:%s 响应时间:%s" % (proxy_ip, latency)) 84 | else: 85 | self.logger.info("可用代理加入队列:%s 响应时间:%s" % (proxy_ip, latency)) 86 | proxy_ips.append(proxy_ip) 87 | tasks = [] 88 | for ip in proxy_ips: 89 | tasks.append(self.check_ip(ip)) 90 | loop.run_until_complete(asyncio.wait(tasks)) 91 | 92 | # 检测代理IP是否可用 93 | async def check_ip(self, proxy_ip): 94 | conn = aiohttp.TCPConnector(ssl=False) 95 | async with aiohttp.ClientSession(connector=conn) as session: 96 | try: 97 | async with session.get(test_url, proxy=proxy_ip.replace("https", "http")) as resp: 98 | if resp.status in [200]: 99 | print("代理可用:", proxy_ip) 100 | self.redis_db.sadd('proxy_ips:proxy_pool', proxy_ip) 101 | else: 102 | print("代理不可用:", proxy_ip) 103 | except (ClientError, ClientConnectionError, ClientHttpProxyError, ServerDisconnectedError, TimeoutError, 104 | AttributeError): 105 | print("代理请求失败:", proxy_ip) 106 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/proxy_server.py: -------------------------------------------------------------------------------- 1 | # coding =utf-8 2 | from flask import Flask 3 | from redis import StrictRedis 4 | import random 5 | 6 | app = Flask(__name__) 7 | 8 | 9 | @app.route("/") 10 | def fetch_ip(): 11 | ip_list = list(redis_db.smembers("proxy_ips:proxy_pool")) 12 | return random.choice(ip_list).decode() 13 | 14 | 15 | if __name__ == '__main__': 16 | redis_db = StrictRedis( 17 | host="127.0.0.1", 18 | port=6379, 19 | password="Jay12345", 20 | db=0 21 | ) 22 | app.run() 23 | 24 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from apscheduler.schedulers.blocking import BlockingScheduler 5 | from redis import StrictRedis 6 | 7 | fetch_ip_time = 0 8 | 9 | redis_db = StrictRedis( 10 | host="127.0.0.1", 11 | port=6379, 12 | password="Jay12345", 13 | db=0 14 | ) 15 | 16 | 17 | def check_ip(): 18 | global fetch_ip_time 19 | proxy_poll = redis_db.smembers("proxy_ips:proxy_pool") 20 | if len(proxy_poll) == 0: 21 | print("可用代理IP数目为0,激活爬虫...") 22 | os.system("scrapy crawl fetch_ip") 23 | fetch_ip_time = int(time.time()) 24 | else: 25 | if len(proxy_poll) < 5: 26 | if int(time.time() - fetch_ip_time) < 600: 27 | if len(proxy_poll) == 0: 28 | print("虽然处于保护状态,但是可用代理IP数目为0,激活爬虫...") 29 | os.system("scrapy crawl fetch_ip") 30 | fetch_ip_time = int(time.time()) 31 | else: 32 | print("当前可用代理IP少于5,但是还处于保护状态,不激活爬虫") 33 | else: 34 | print("当前可用代理IP少于5,且处于非保护状态,激活爬虫...") 35 | os.system("scrapy crawl fetch_ip") 36 | fetch_ip_time = int(time.time()) 37 | else: 38 | print("日常自检...") 39 | os.system("python proxy_ip_check.py") 40 | 41 | 42 | if __name__ == 
'__main__': 43 | check_ip() 44 | scheduler = BlockingScheduler() 45 | # 每隔20s执行一次 46 | scheduler.add_job(check_ip, 'interval', max_instances=10, seconds=20) 47 | scheduler.start() 48 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/代码/proxy_ips/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = proxy_ips.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = proxy_ips 12 | -------------------------------------------------------------------------------- /8、Python爬虫框架Scrapy(下)/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/8、Python爬虫框架Scrapy(下)/勘误.md -------------------------------------------------------------------------------- /9、数据分析案例:Python岗位行情/代码/9_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | numpy使用代码示例 3 | """ 4 | 5 | import numpy as np 6 | 7 | print("1.生成一个一维数组:\n %s" % np.array([1, 2])) 8 | print("2.生成一个二维数组:\n %s" % np.array([[1, 2], [3, 4]])) 9 | print("3.生成一个元素初始值都为0的,4行3列矩阵:\n %s" % np.zeros((4, 3))) 10 | print("4.生成一个元素初始值都为1的,3行4列矩阵:\n %s" % np.ones((3, 4))) 11 | print("5.创建一个空数组,元素为随机值:\n %s" % np.empty([2, 3], dtype=int)) 12 | a1 = np.arange(0, 30, 2) 13 | print("6.生成一个等间隔数字的数组:\n %s" % a1) 14 | a2 = a1.reshape(3, 5) 15 | print("7.转换数组的维度,比如把一维的转为3行5列的数组:\n %s" % a2) 16 | 17 | # ndarray常用属性 18 | print("8.a1的维度: %d \t a2的维度:%d" % (a1.ndim, a2.ndim)) 19 | print("9.a1的行列数:%s \t a2的行列数:%s" % (a1.shape, a2.shape)) 20 | print("10.a1的元素个数:%d \t a2的元素个数:%d" % (a1.size, a2.size)) 21 | print("11.a1的元素数据类型:%s 数据类型大小:%s" % (a1.dtype, a1.itemsize)) 22 | -------------------------------------------------------------------------------- /9、数据分析案例:Python岗位行情/代码/9_2.py: -------------------------------------------------------------------------------- 1 | # 拉勾网Android招聘数据分析 2 | import html 3 | import random 4 | import re 5 | import time 6 | import urllib.parse 7 | from collections import Counter 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import pandas as pd 12 | import requests 13 | from scipy.misc import imread 14 | from wordcloud import WordCloud, ImageColorGenerator 15 | 16 | import config as c 17 | import tools as t 18 | 19 | max_page = 1 20 | result_save_file = c.outputs_logs_path + 'result.csv' 21 | pic_save_path = c.outputs_pictures_path + 'LaGou/' 22 | default_font = c.res_documents + 'wryh.ttf' # 生成词云用的默认字体 23 | default_mask = c.res_pictures + 'default_mask.jpg' # 默认遮罩图片 24 | 25 | # Ajax加载url 26 | ajax_url = "https://www.lagou.com/jobs/positionAjax.json?" 
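# --- Added annotation (not part of the original 9_2.py) ---
# The Lagou position list is loaded via Ajax: fetch_data() below POSTs to the URL
# above (plus the query-string parameters in request_params), with the search
# keyword ('kd') and the page number ('pn') carried in form_data. On the first
# page, the total page count is derived from the "totalCount" value matched by
# page_pattern (15 results per page). Note that the comment inside fetch_data()
# mentions a random 5-15 s pause between requests, but no time.sleep() call is
# actually made; adding time.sleep(random.randint(5, 15)) before requests.post()
# would make the behaviour match that comment (both modules are already imported).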
27 | 28 | # url拼接参数 29 | request_params = {'needAddtionalResult': 'false'} 30 | 31 | # post提交参数 32 | form_data = {'first': 'false', 'pn': '1', 'kd': 'Python'} 33 | 34 | # 获得页数的正则 35 | page_pattern = re.compile('"totalCount":(\d*),', re.S) 36 | 37 | # csv表头 38 | csv_headers = [ 39 | '公司id', '城市', '职位名称', '工作年限', '学历', '职位性质', '薪资', 40 | '融资状态', '行业领域', '招聘岗位id', '公司优势', '公司规模', 41 | '公司标签', '所在区域', '技能标签', '公司经度', '公司纬度', '公司全名' 42 | ] 43 | 44 | # 模拟请求头 45 | ajax_headers = { 46 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 47 | 'Accept-Encoding': 'gzip, deflate, br', 48 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 49 | 'Connection': 'keep-alive', 50 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 51 | 'Host': 'www.lagou.com', 52 | 'Origin': 'https://www.lagou.com', 53 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 ' 54 | 'Safari/537.36', 55 | 'X-Anit-Forge-Code': '0', 56 | 'X-Anit-Forge-Token': 'None', 57 | 'X-Requested-With': 'XMLHttpRequest', 58 | 'Referer': 'https://www.lagou.com/jobs/list_android?labelWords=&fromSearch=true&suginput=' 59 | } 60 | 61 | 62 | # 获取每页招聘信息 63 | def fetch_data(page): 64 | fetch_url = ajax_url + urllib.parse.urlencode(request_params) 65 | global max_page 66 | while True: 67 | try: 68 | form_data['pn'] = page 69 | print("抓取第:" + str(page) + "页!") 70 | # 随机休眠5-15s,避免因为访问过于频繁ip被封 71 | resp = requests.post(url=fetch_url, data=form_data, headers=ajax_headers) 72 | if resp.status_code == 200: 73 | if page == 1: 74 | max_page = int(int(page_pattern.search(resp.text).group(1)) / 15) 75 | print("总共有:" + str(max_page) + "页") 76 | data_json = resp.json()['content']['positionResult']['result'] 77 | data_list = [] 78 | for data in data_json: 79 | data_list.append((data['companyId'], 80 | data['city'], 81 | html.unescape(data['positionName']), 82 | data['workYear'], 83 | data['education'], 84 | data['jobNature'], 85 | data['salary'], 86 | data['financeStage'], 87 | data['industryField'], 88 | data['positionId'], 89 | html.unescape(data['positionAdvantage']), 90 | data['companySize'], 91 | data['companyLabelList'], 92 | data['district'], 93 | html.unescape(data['positionLables']), 94 | data['longitude'], 95 | data['latitude'], 96 | html.unescape(data['companyFullName']))) 97 | result = pd.DataFrame(data_list) 98 | if page == 1: 99 | result.to_csv(result_save_file, header=csv_headers, index=False, mode='a+') 100 | else: 101 | result.to_csv(result_save_file, header=False, index=False, mode='a+') 102 | return None 103 | except Exception as e: 104 | print(e) 105 | 106 | 107 | # 生成词云文件 108 | def make_wc(content, file_name, mask_pic=default_mask, font=default_font): 109 | bg_pic = imread(mask_pic) 110 | pic_colors = ImageColorGenerator(bg_pic) 111 | wc = WordCloud(font_path=font, background_color='white', margin=2, max_font_size=250, 112 | width=2000, height=2000, 113 | min_font_size=30, max_words=1000) 114 | wc.generate_from_frequencies(content) 115 | wc.to_file(file_name) 116 | 117 | 118 | # 数据分析方法(生成相关文件) 119 | def data_analysis(data): 120 | # 1.分析招聘公司的相关信息 121 | # 行业领域 122 | industry_field_list = [] 123 | for industry_field in data['行业领域']: 124 | for field in industry_field.strip().replace(" ", ",").replace("、", ",").split(','): 125 | industry_field_list.append(field) 126 | counter = dict(Counter(industry_field_list)) 127 | counter.pop('') 128 | make_wc(counter, pic_save_path + "wc_1.jpg") 129 | 130 | # 公司规模 131 | plt.figure(1) 132 | 
data['公司规模'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=np.linspace(0, 0.5, 6)) 133 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 134 | bottom=0.17, top=0.84) 135 | plt.savefig(pic_save_path + 'result_1.jpg') 136 | plt.close(1) 137 | # 融资状态 138 | plt.figure(2) 139 | data['融资状态'].value_counts().plot(kind='pie', autopct='%1.1f%%') 140 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 141 | bottom=0.17, top=0.84) 142 | plt.savefig(pic_save_path + 'result_2.jpg') 143 | plt.close(2) 144 | # 所在区域 145 | plt.figure(3) 146 | data['所在区域'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=[0, 0, 0, 0, 0, 0, 0, 1, 1.5]) 147 | plt.subplots_adjust(left=0.31, right=0.74, wspace=0.20, hspace=0.20, 148 | bottom=0.26, top=0.84) 149 | plt.savefig(pic_save_path + 'result_3.jpg') 150 | plt.close(3) 151 | # 公司标签 152 | tags_list = [] 153 | for tags in data['公司标签']: 154 | for tag in tags.strip().replace("[", "").replace("]", "").replace("'", "").split(','): 155 | tags_list.append(tag) 156 | counter = dict(Counter(tags_list)) 157 | counter.pop('') 158 | make_wc(counter, pic_save_path + "wc_2.jpg") 159 | # 公司优势 160 | advantage_list = [] 161 | for advantage_field in data['公司优势']: 162 | for field in advantage_field.strip().replace(" ", ",").replace("、", ",").replace(",", ",").replace("+", ",") \ 163 | .split(','): 164 | industry_field_list.append(field) 165 | counter = dict(Counter(industry_field_list)) 166 | counter.pop('') 167 | counter.pop('移动互联网') 168 | make_wc(counter, pic_save_path + "wc_3.jpg") 169 | 170 | # 2.分析招聘需求 171 | # 工作年限要求 172 | # 横向条形图 173 | plt.figure(4) 174 | data['工作年限'].value_counts().plot(kind='barh', rot=0) 175 | plt.title("工作经验直方图") 176 | plt.xlabel("年限/年") 177 | plt.ylabel("公司/个") 178 | plt.savefig(pic_save_path + 'result_4.jpg') 179 | plt.close(4) 180 | # 饼图 181 | plt.figure(5) 182 | data['工作年限'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=np.linspace(0, 0.75, 6)) 183 | plt.title("工作经验饼图") 184 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 185 | bottom=0.17, top=0.84) 186 | plt.savefig(pic_save_path + 'result_5.jpg') 187 | plt.close(5) 188 | # 学历要求 189 | plt.figure(6) 190 | data['学历'].value_counts().plot(kind='pie', autopct='%1.1f%%', explode=(0, 0.1, 0.2)) 191 | plt.title("学历饼图") 192 | plt.subplots_adjust(left=0.22, right=0.74, wspace=0.20, hspace=0.20, 193 | bottom=0.17, top=0.84) 194 | plt.savefig(pic_save_path + 'result_6.jpg') 195 | plt.close(6) 196 | 197 | # 薪资(先去掉后部分的最大工资,过滤掉kK以上词汇,获取索引按照整数生序排列) 198 | plt.figure(7) 199 | salary = data['薪资'].str.split('-').str.get(0).str.replace('k|K|以上', "").value_counts() 200 | salary_index = list(salary.index) 201 | salary_index.sort(key=lambda x: int(x)) 202 | final_salary = salary.reindex(salary_index) 203 | plt.title("薪资条形图") 204 | final_salary.plot(kind='bar', rot=0) 205 | plt.xlabel("薪资/K") 206 | plt.ylabel("公司/个") 207 | plt.savefig(pic_save_path + 'result_7.jpg') 208 | plt.close(7) 209 | 210 | # 技能标签 211 | skill_list = [] 212 | for skills in data['技能标签']: 213 | for skill in skills.strip().replace("[", "").replace("]", "").replace("'", "").split(','): 214 | skill_list.append(skill) 215 | counter = dict(Counter(skill_list)) 216 | counter.pop('') 217 | counter.pop('Android') 218 | make_wc(counter, pic_save_path + "wc_4.jpg") 219 | 220 | 221 | # 处理数据 222 | if __name__ == '__main__': 223 | t.is_dir_existed(pic_save_path) 224 | if not t.is_dir_existed(result_save_file, mkdir=False): 225 | fetch_data(1) 226 | for cur_page in range(2, max_page 
+ 1): 227 | fetch_data(cur_page) 228 | else: 229 | raw_data = pd.read_csv(result_save_file) 230 | data_analysis(raw_data) 231 | # 筛选电子商务公司 232 | dzsw_result = raw_data.loc[raw_data["行业领域"].str.find("电子商务") != -1, ["行业领域", "公司全名"]] 233 | dzsw_result.to_csv(c.outputs_logs_path + "dzsw.csv", header=False, index=False, mode='a+') 234 | # 筛选所在区域为龙华新区的公司 235 | p_num_result = raw_data.loc[raw_data["所在区域"] == "龙华新区", ["所在区域", "公司全名"]] 236 | p_num_result.to_csv(c.outputs_logs_path + "lhxq.csv", header=False, index=False, mode='a+') 237 | -------------------------------------------------------------------------------- /9、数据分析案例:Python岗位行情/勘误.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coder-pig/PythonSpiderBook/46f3479c22d3f91096bc065b14061cb2e401ab27/9、数据分析案例:Python岗位行情/勘误.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 《Python网络爬虫从入门到实践》 Errata and Companion Source Code 2 | 3 | 4 | --- 5 | 6 | ![][1] 7 | 8 | It seems the publisher forgot to include the companion source code in the book... 9 | 10 | And since my computer was reinstalled, I could only find a fairly old backup... 11 | 12 | Please make do with it... 13 | 14 | If you spot problems in the book (typos, code errors, printing errors, etc.), you are welcome to **open an issue**; much appreciated. 15 | 16 | Also, the book was originally made up of two parts: **Python basics** and **Python web crawlers**. This volume is the crawler part; 17 | the basics part was not published for various practical reasons, so I have released that content on my WeChat official account, **for free**!!! 18 | Interested readers can find it there, thanks~ 19 | 20 | ![][2] 21 | 22 | Finally, for anything else (joining the reader group, business cooperation, etc.), you can leave a message on the official account or add my personal WeChat~ 23 | 24 | ![][3] 25 | 26 | 27 | [1]: http://static.zybuluo.com/coder-pig/ionx6je52iwlhxbgba3t1x51/12121.png 28 | [2]: http://static.zybuluo.com/coder-pig/1jpu7nalyfp3kvaxfm4q0h8y/20190524181102821.jpg 29 | [3]: http://static.zybuluo.com/coder-pig/whqf2oblwvzqempi2eec32xy/1111.png --------------------------------------------------------------------------------
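A closing note on the salary handling in 9_2.py above: the final_salary block keeps only the lower bound of each salary range, strips the k/K/以上 suffix, and re-sorts the counted values numerically. The snippet below is a small standalone sketch of that same transformation with made-up sample values; it passes regex=True explicitly, which newer pandas releases require for pattern replacement (the original code relies on the older default).

import pandas as pd

# Hypothetical sample of the raw '薪资' (salary) column scraped from Lagou
salary_raw = pd.Series(['10k-20k', '15K-25K', '8k以上', '10k-18k'])

# Keep the lower bound of each range, drop the k/K/以上 suffix, then count occurrences
salary = salary_raw.str.split('-').str.get(0).str.replace('k|K|以上', '', regex=True).value_counts()

# Sort the index numerically rather than lexicographically ('8' would otherwise sort after '10')
salary = salary.reindex(sorted(salary.index, key=int))
print(salary)  # 8 -> 1, 10 -> 2, 15 -> 1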